Compare commits

...

32 Commits

Author SHA1 Message Date
Ricardo Garcia
554bbdc48c Bump version number 2010-10-31 11:26:57 +01:00
Ricardo Garcia
37dfa1e0df Also try el=vevo on YouTube if everything else fails (fixes issue #115) 2010-10-31 11:26:57 +01:00
Ricardo Garcia
4dd63be193 Bump version number 2010-10-31 11:26:53 +01:00
Ricardo Garcia
7d8d06122d Add the "ord" template parameter (fixes issue #101) 2010-10-31 11:26:53 +01:00
Ricardo Garcia
9177ce4d8c Support new playlist style URL (fixes issue #114) 2010-10-31 11:26:52 +01:00
Ricardo Garcia
ce5cafea40 Change method to detect end of playlist (fixes issue #113) 2010-10-31 11:26:52 +01:00
Ricardo Garcia
ae3fc475eb Bump version number 2010-10-31 11:26:48 +01:00
Ricardo Garcia
d063db3810 Try el=detailpage if el=embedded fails for YouTube 2010-10-31 11:26:48 +01:00
Ricardo Garcia
6194531831 Add Yahoo! Video InfoExtractor, merged from "obeythepenguin" 2010-10-31 11:26:48 +01:00
Ricardo Garcia
2ed1ddd0a0 Request video info webpage using "embedded" instead of "detailpage"
In the request for get_video_info, use el=embedded instead of el=detailpage, as
if the request was coming from an embedded video player instead of the video
webpage. This created problems for some videos, with YouTube replying with
"Invalid parameters". This fixes issue #109 and fixes issue #110.
2010-10-31 11:26:48 +01:00
Ricardo Garcia
eaf4a7288d Solve minor aesthetical problem in rtmpdump error messages 2010-10-31 11:26:48 +01:00
Ricardo Garcia
6ba562b0e4 Added --all-format option from tweaked patch (fixes issue #102) 2010-10-31 11:26:48 +01:00
Ricardo Garcia
131bc7651a Make the "-" output file name equivalent to /dev/stdout (fixes issue #103) 2010-10-31 11:26:48 +01:00
Ricardo Garcia
5caacaddc6 Bump version number 2010-10-31 11:26:42 +01:00
Ricardo Garcia
79f193e5d8 Do not use the final URL for -g 2010-10-31 11:26:38 +01:00
Ricardo Garcia
44e16fa17f Bump version number 2010-10-31 11:26:34 +01:00
Ricardo Garcia
d983524781 Add --no-progress option (fixes issue #98) 2010-10-31 11:26:34 +01:00
Ricardo Garcia
1392f3f52c Give preference to format 34 before format 5 in quality list 2010-10-31 11:26:34 +01:00
Ricardo Garcia
43ab0ca432 Do not error out on problems printing the file name 2010-10-31 11:26:34 +01:00
Ricardo Garcia
31cbdaafd4 Properly support simple titles in the newest InfoExtractors 2010-10-31 11:26:34 +01:00
Ricardo Garcia
bd3cdf6dc4 Do not pass URLs around in Unicode form (fixes issue #92) 2010-10-31 11:26:34 +01:00
Ricardo Garcia
8cc468de75 Bump version number 2010-10-31 11:26:31 +01:00
Ricardo Garcia
31bcb48001 Tweak final filename in the open attempt, to be platform and filename-agnostic 2010-10-31 11:26:30 +01:00
Ricardo Garcia
c201ebc915 Fix SyntaxError triggered by mistake in user-agent commit 2010-10-31 11:26:30 +01:00
Ricardo Garcia
ce9c6a3097 Fix problem with sanitize_title not replacing Windows directory separator 2010-10-31 11:26:30 +01:00
Ricardo Garcia
4cfeb46544 Update user-agent string 2010-10-31 11:26:30 +01:00
Ricardo Garcia
490fd7aea7 Cherry-pick obeythepenguin's changes and merge them into main branch 2010-10-31 11:26:30 +01:00
Ricardo Garcia
c05fc6a345 Support simplest new URLs in YouTube 2010-10-31 11:26:30 +01:00
Ricardo Garcia
91bce611c7 Bump version number 2010-10-31 11:26:26 +01:00
Ricardo Garcia
1c1821f8eb Improve rtmpdump support 2010-10-31 11:25:09 +01:00
Ricardo Garcia
60f8049d05 Only verify the URL when it's an HTTP download 2010-10-31 11:25:08 +01:00
obeythepenguin@gmail.com
49c0028a7a patched to add Google Video and Photobucket support 2010-10-31 11:25:08 +01:00
2 changed files with 549 additions and 61 deletions

View File

@@ -1 +1 @@
2010.01.06
2010.04.04

View File

@@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# License: Public domain code
import htmlentitydefs
import httplib
@@ -26,7 +27,7 @@ except ImportError:
from cgi import parse_qs
std_headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',
@@ -50,6 +51,61 @@ def preferredencoding():
yield pref
return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the leading "&" and the trailing ";".

    Named entities are looked up in htmlentitydefs.name2codepoint. Numeric
    character references are accepted both in decimal ("#233") and in
    hexadecimal ("#xe9" or "#XE9") form. Anything else, including a numeric
    reference whose code point cannot be converted by unichr(), is returned
    unchanged in its literal "&...;" form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The pattern is anchored at the end so
    # that something like "#12abc" is not silently decoded as "#12", and
    # the hexadecimal branch accepts the digits a-f.
    mobj = re.match(u'#(?:[xX]([0-9A-Fa-f]+)|([0-9]+))$', entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        if hex_digits is not None:
            codepoint = int(hex_digits, 16)
        else:
            codepoint = int(dec_digits, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point not representable: fall through to the literal
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename."""
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special name u'-' stands for standard output: sys.stdout is
    returned and nothing is opened. On Windows, standard output is switched
    to binary mode first so that newline translation does not alter the
    data written to it.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == u'-':
        if sys.platform == 'win32':
            # Imported here because msvcrt only exists on Windows
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)

    try:
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[<>:"\\|\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
"""Download Error exception.
@@ -138,18 +194,21 @@ class FileDownloader(object):
ratelimit: Download speed limit, in bytes/sec.
nooverwrites: Prevent overwriting files.
continuedl: Try to continue downloads if possible.
noprogress: Do not print the progress bar.
"""
params = None
_ies = []
_pps = []
_download_retcode = None
_num_downloads = None
def __init__(self, params):
"""Create a FileDownloader object with the given options."""
self._ies = []
self._pps = []
self._download_retcode = 0
self._num_downloads = 0
self.params = params
@staticmethod
@@ -246,11 +305,15 @@ class FileDownloader(object):
self._pps.append(pp)
pp.set_downloader(self)
def to_stdout(self, message, skip_eol=False):
def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
"""Print message to stdout if not in quiet mode."""
if not self.params.get('quiet', False):
print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
try:
if not self.params.get('quiet', False):
print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
sys.stdout.flush()
except (UnicodeEncodeError), err:
if not ignore_encoding_errors:
raise
def to_stderr(self, message):
"""Print message to stderr."""
@@ -288,10 +351,12 @@ class FileDownloader(object):
def report_destination(self, filename):
"""Report destination filename."""
self.to_stdout(u'[download] Destination: %s' % filename)
self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
"""Report download progress."""
if self.params.get('noprogress', False):
return
self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
@@ -301,7 +366,10 @@ class FileDownloader(object):
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
self.to_stdout(u'[download] %s has already been downloaded' % file_name)
try:
self.to_stdout(u'[download] %s has already been downloaded' % file_name)
except (UnicodeEncodeError), err:
self.to_stdout(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
"""Report it was impossible to resume download."""
@@ -309,28 +377,34 @@ class FileDownloader(object):
def report_finish(self):
"""Report download finished."""
self.to_stdout(u'')
if self.params.get('noprogress', False):
self.to_stdout(u'[download] Download completed')
else:
self.to_stdout(u'')
def process_info(self, info_dict):
"""Process a single dictionary returned by an InfoExtractor."""
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
try:
info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
raise UnavailableFormatError
# Verify URL if it's an HTTP one
if info_dict['url'].startswith('http'):
try:
self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
raise UnavailableFormatError
# Forced printings
if self.params.get('forcetitle', False):
print info_dict['title'].encode(preferredencoding())
print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
if self.params.get('forceurl', False):
print info_dict['url'].encode(preferredencoding())
print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
return
try:
template_dict = dict(info_dict)
template_dict['epoch'] = unicode(long(time.time()))
template_dict['ord'] = unicode('%05d' % self._num_downloads)
filename = self.params['outtmpl'] % template_dict
except (ValueError, KeyError), err:
self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
@@ -410,16 +484,17 @@ class FileDownloader(object):
# Download using rtmpdump. rtmpdump returns exit code 2 when
# the connection was interrupted and resuming appears to be
# possible. This is part of rtmpdump's normal usage, AFAIK.
retval = subprocess.call(['rtmpdump', '-q', '-r', url, '-o', filename] + [[], ['-e']][self.params.get('continuedl', False)])
while retval == 2:
basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
while retval == 2 or retval == 1:
self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
time.sleep(2.0) # This seems to be needed
retval = subprocess.call(['rtmpdump', '-q', '-e', '-r', url, '-o', filename])
retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
if retval == 0:
self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
return True
else:
self.trouble('ERROR: rtmpdump exited with code %d' % retval)
self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
return False
def _do_download(self, filename, url):
@@ -481,8 +556,9 @@ class FileDownloader(object):
# Open file just in time
if stream is None:
try:
stream = open(filename, open_mode)
(stream, filename) = sanitize_open(filename, open_mode)
self.report_destination(filename)
self._num_downloads += 1
except (OSError, IOError), err:
self.trouble('ERROR: unable to open for writing: %s' % str(err))
return False
@@ -521,6 +597,7 @@ class InfoExtractor(object):
title: Literal title.
stitle: Simplified title.
ext: Video filename extension.
format: Video format.
Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods, as well as the suitable() static method.
@@ -567,12 +644,12 @@ class InfoExtractor(object):
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
_available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
_video_extensions = {
'13': '3gp',
'17': 'mp4',
@@ -585,29 +662,6 @@ class YoutubeIE(InfoExtractor):
def suitable(url):
return (re.match(YoutubeIE._VALID_URL, url) is not None)
@staticmethod
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a Unicode character."""
entity = matchobj.group(1)
# Known non-numeric HTML entity
if entity in htmlentitydefs.name2codepoint:
return unichr(htmlentitydefs.name2codepoint[entity])
# Unicode character
mobj = re.match(ur'(?u)#(x?\d+)', entity)
if mobj is not None:
numstr = mobj.group(1)
if numstr.startswith(u'x'):
base = 16
numstr = u'0%s' % numstr
else:
base = 10
return unichr(long(numstr, base))
# Unknown entity in name, return its literal representation
return (u'&%s;' % entity)
def report_lang(self):
"""Report attempt to set language."""
self._downloader.to_stdout(u'[youtube] Setting language')
@@ -715,6 +769,7 @@ class YoutubeIE(InfoExtractor):
# Downloader parameters
best_quality = False
all_formats = False
format_param = None
quality_index = 0
if self._downloader is not None:
@@ -723,21 +778,28 @@ class YoutubeIE(InfoExtractor):
if format_param == '0':
format_param = self._available_formats[quality_index]
best_quality = True
elif format_param == '-1':
format_param = self._available_formats[quality_index]
all_formats = True
while True:
# Extension
video_extension = self._video_extensions.get(format_param, 'flv')
# Get video info
video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
request = urllib2.Request(video_info_url, None, std_headers)
try:
self.report_video_info_webpage_download(video_id)
video_info_webpage = urllib2.urlopen(request).read()
video_info = parse_qs(video_info_webpage)
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
return
self.report_video_info_webpage_download(video_id)
for el_type in ['embedded', 'detailpage', 'vevo']:
video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
request = urllib2.Request(video_info_url, None, std_headers)
try:
video_info_webpage = urllib2.urlopen(request).read()
video_info = parse_qs(video_info_webpage)
if 'token' in video_info:
break
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
return
self.report_information_extraction(video_id)
# "t" param
@@ -774,8 +836,7 @@ class YoutubeIE(InfoExtractor):
return
video_title = urllib.unquote_plus(video_info['title'][0])
video_title = video_title.decode('utf-8')
video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
video_title = video_title.replace(os.sep, u'%')
video_title = sanitize_title(video_title)
# simplified title
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
@@ -790,20 +851,35 @@ class YoutubeIE(InfoExtractor):
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
})
if all_formats:
if quality_index == len(self._available_formats) - 1:
# None left to get
return
else:
quality_index += 1
format_param = self._available_formats[quality_index]
if format_param == None:
return
continue
return
except UnavailableFormatError, err:
if best_quality:
if best_quality or all_formats:
if quality_index == len(self._available_formats) - 1:
# I don't ever expect this to happen
self._downloader.trouble(u'ERROR: no known formats available for video')
if not all_formats:
self._downloader.trouble(u'ERROR: no known formats available for video')
return
else:
self.report_unavailable_format(video_id, format_param)
quality_index += 1
format_param = self._available_formats[quality_index]
if format_param == None:
return
continue
else:
self._downloader.trouble('ERROR: format not available for video')
@@ -915,6 +991,7 @@ class MetacafeIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
if mobj is None:
@@ -931,6 +1008,396 @@ class MetacafeIE(InfoExtractor):
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
'format': u'NA',
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class GoogleIE(InfoExtractor):
"""Information extractor for video.google.com."""
_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_id = mobj.group(1)
video_extension = 'mp4'
# Retrieve video webpage to extract further information
request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
mobj = re.search(r"download_url:'([^']+)'", webpage)
if mobj is None:
video_extension = 'flv'
mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract media URL')
return
mediaURL = urllib.unquote(mobj.group(1))
mediaURL = mediaURL.replace('\\x3d', '\x3d')
mediaURL = mediaURL.replace('\\x26', '\x26')
video_url = mediaURL
mobj = re.search(r'<title>(.*)</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': u'NA',
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
'format': u'NA',
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class PhotobucketIE(InfoExtractor):
"""Information extractor for photobucket.com."""
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_id = mobj.group(1)
video_extension = 'flv'
# Retrieve video webpage to extract further information
request = urllib2.Request(url)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract media URL')
return
mediaURL = urllib.unquote(mobj.group(1))
video_url = mediaURL
mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
video_uploader = mobj.group(2).decode('utf-8')
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
'format': u'NA',
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class YahooIE(InfoExtractor):
"""Information extractor for video.yahoo.com."""
# _VALID_URL matches all Yahoo! Video URLs
# _VPAGE_URL matches only the extractable '/watch/' URLs
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return (re.match(YahooIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_id = mobj.group(2)
video_extension = 'flv'
# Rewrite valid but non-extractable URLs as
# extractable English language /watch/ URLs
if re.match(self._VPAGE_URL, url) is None:
request = urllib2.Request(url)
try:
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: Unable to extract id field')
return
yahoo_id = mobj.group(1)
mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: Unable to extract vid field')
return
yahoo_vid = mobj.group(1)
url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
return self._real_extract(url)
# Retrieve video webpage to extract further information
request = urllib2.Request(url)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
# Extract uploader and title from webpage
self.report_extraction(video_id)
mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
video_title = mobj.group(1).decode('utf-8')
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video uploader')
return
video_uploader = mobj.group(1).decode('utf-8')
# Extract video height and width
mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video height')
return
yv_video_height = mobj.group(1)
mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video width')
return
yv_video_width = mobj.group(1)
# Retrieve video playlist to extract media URL
# I'm not completely sure what all these options are, but we
# seem to need most of them, otherwise the server sends a 401.
yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
yv_bitrate = '700' # according to Wikipedia this is hard-coded
request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
# Extract media URL from playlist XML
mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: Unable to extract media URL')
return
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url,
'uploader': video_uploader,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class GenericIE(InfoExtractor):
"""Generic last-resort information extractor."""
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return True
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
video_id = url.split('/')[-1]
request = urllib2.Request(url)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
except ValueError, err:
# since this is the last-resort InfoExtractor, if
# this error is thrown, it'll be thrown here
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
# Broaden the search a little bit
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
# It's possible that one of the regexes
# matched, but returned an empty group:
if mobj.group(1) is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_url = urllib.unquote(mobj.group(1))
video_id = os.path.basename(video_url)
# here's a fun little line of code for you:
video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
mobj = re.search(r'<title>(.*)</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# video uploader is domain name
mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_uploader = mobj.group(1).decode('utf-8')
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
'format': u'NA',
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
@@ -955,6 +1422,7 @@ class YoutubeSearchIE(InfoExtractor):
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
query = query.decode(preferredencoding())
self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
def _real_initialize(self):
@@ -968,6 +1436,7 @@ class YoutubeSearchIE(InfoExtractor):
prefix, query = query.split(':')
prefix = prefix[8:]
query = query.encode('utf-8')
if prefix == '':
self._download_n_results(query, 1)
return
@@ -1028,10 +1497,10 @@ class YoutubeSearchIE(InfoExtractor):
class YoutubePlaylistIE(InfoExtractor):
"""Information Extractor for YouTube playlists."""
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
_MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
_youtube_ie = None
def __init__(self, youtube_ie, downloader=None):
@@ -1077,7 +1546,7 @@ class YoutubePlaylistIE(InfoExtractor):
ids_in_page.append(mobj.group(1))
video_ids.extend(ids_in_page)
if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
break
pagenum = pagenum + 1
@@ -1217,7 +1686,7 @@ if __name__ == '__main__':
# Parse command line
parser = optparse.OptionParser(
usage='Usage: %prog [options] url...',
version='2010.01.06',
version='2010.04.04',
conflict_handler='resolve',
)
@@ -1250,6 +1719,8 @@ if __name__ == '__main__':
action='store_const', dest='format', help='alias for -f 17', const='17')
video_format.add_option('-d', '--high-def',
action='store_const', dest='format', help='alias for -f 22', const='22')
video_format.add_option('--all-formats',
action='store_const', dest='format', help='download all available video formats', const='-1')
parser.add_option_group(video_format)
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
@@ -1261,6 +1732,8 @@ if __name__ == '__main__':
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--no-progress',
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
parser.add_option_group(verbosity)
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
@@ -1314,6 +1787,10 @@ if __name__ == '__main__':
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
generic_ie = GenericIE()
# File downloader
fd = FileDownloader({
@@ -1326,6 +1803,9 @@ if __name__ == '__main__':
'simulate': (opts.simulate or opts.geturl or opts.gettitle),
'format': opts.format,
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
or u'%(id)s.%(ext)s'),
@@ -1333,12 +1813,20 @@ if __name__ == '__main__':
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
'continuedl': opts.continue_dl,
'noprogress': opts.noprogress,
})
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
# Update version
if opts.update_self: