Compare commits

...

28 Commits

Author SHA1 Message Date
8cc468de75 Bump version number 2010-10-31 11:26:31 +01:00
31bcb48001 Tweak final filename in the open attempt, to be platform and filename-agnostic 2010-10-31 11:26:30 +01:00
c201ebc915 Fix SyntaxError triggered by mistake in user-agent commit 2010-10-31 11:26:30 +01:00
ce9c6a3097 Fix problem with sanitize_title not replacing Windows directory separator 2010-10-31 11:26:30 +01:00
4cfeb46544 Update user-agent string 2010-10-31 11:26:30 +01:00
490fd7aea7 Cherry-pick obeythepenguin's changes and merge them into main branch 2010-10-31 11:26:30 +01:00
c05fc6a345 Support simplest new URLs in YouTube 2010-10-31 11:26:30 +01:00
91bce611c7 Bump version number 2010-10-31 11:26:26 +01:00
1c1821f8eb Improve rtmpdump support 2010-10-31 11:25:09 +01:00
60f8049d05 Only verify the URL when it's an HTTP download 2010-10-31 11:25:08 +01:00
49c0028a7a patched to add Google Video and Photobucket support 2010-10-31 11:25:08 +01:00
f1b4bee09d Bump version number 2010-10-31 11:25:05 +01:00
a04e80a481 Add flexibility importing the "parse_qs" function (fixes issue #81) 2010-10-31 11:25:05 +01:00
fe788f2c6f Bump version number 2010-10-31 11:25:01 +01:00
75a4cf3c97 Fix minor problems with Youtube user InfoExtractor 2010-10-31 11:25:01 +01:00
0487b407a1 Add support for using rtmpdump 2010-10-31 11:25:01 +01:00
a692ca7c49 Bump version number 2010-10-31 11:24:57 +01:00
9c457d2a20 Handle file open mode correctly (fixes issue #76) 2010-10-31 11:24:56 +01:00
c39c05cdd7 Added support to download all of a user's videos! 2010-10-31 11:24:56 +01:00
29f0756805 Fix detection of uploader nickname in metacafe (fixes issue #67) 2010-10-31 11:24:56 +01:00
d9bc015b3c Take format 37 into account (fixes issue #65) 2010-10-31 11:24:56 +01:00
4bec29ef4b Add self-updating code 2010-10-31 11:24:56 +01:00
ab1f697827 Use unquote_plus to decode video title 2010-10-31 11:24:56 +01:00
583c714fde Allow empty titles because they do appear in some videos (fixes issue #53) 2010-10-31 11:24:56 +01:00
850ab76560 Use default values for "continuedl" and "nooverwrites" downloader parameters 2010-10-31 11:24:56 +01:00
f5a5bec351 Avoid using Unicode strings when forming URL requests (fixes issue #50) 2010-10-31 11:24:56 +01:00
f94b636c3e Improve preferred encoding detection method 2010-10-31 11:24:56 +01:00
0833f1eb83 Restore INTERNAL version number 2010-10-31 11:24:56 +01:00
2 changed files with 501 additions and 68 deletions

View File

@ -1 +1 @@
2009.09.13
2010.02.13

View File

@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# License: Public domain code
import htmlentitydefs
import httplib
@ -13,13 +14,20 @@ import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
from urlparse import parse_qs
except ImportError:
from cgi import parse_qs
std_headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',
@ -33,15 +41,68 @@ def preferredencoding():
Returns the best encoding scheme for the system, based on
locale.getpreferredencoding() and some further tweaks.
"""
def yield_preferredencoding():
try:
pref = locale.getpreferredencoding()
u'TEST'.encode(pref)
except:
pref = 'UTF-8'
while True:
yield pref
return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a Unicode character.
This function receives a match object and is intended to be used with
the re.sub() function.
"""
entity = matchobj.group(1)
# Known non-numeric HTML entity
if entity in htmlentitydefs.name2codepoint:
return unichr(htmlentitydefs.name2codepoint[entity])
# Unicode character
mobj = re.match(ur'(?u)#(x?\d+)', entity)
if mobj is not None:
numstr = mobj.group(1)
if numstr.startswith(u'x'):
base = 16
numstr = u'0%s' % numstr
else:
base = 10
return unichr(long(numstr, base))
# Unknown entity in name, return its literal representation
return (u'&%s;' % entity)
def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename."""
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
"""Try to open the given filename, and slightly tweak it if this fails.
Attempts to open the given filename. If this fails, it tries to change
the filename slightly, step by step, until it's either able to open it
or it fails and raises a final exception, like the standard open()
function.
It returns the tuple (stream, definitive_file_name).
"""
try:
pref = locale.getpreferredencoding()
# Mac OSX systems have this problem sometimes
if pref == '':
return 'UTF-8'
return pref
except:
sys.stderr.write('WARNING: problem obtaining preferred encoding. Falling back to UTF-8.\n')
return 'UTF-8'
stream = open(filename, open_mode)
return (stream, filename)
except (IOError, OSError), err:
# In case of error, try to remove win32 forbidden chars
filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
# An exception here should be caught in the caller
stream = open(filename, open_mode)
return (stream, filename)
class DownloadError(Exception):
"""Download Error exception.
@ -308,16 +369,18 @@ class FileDownloader(object):
"""Process a single dictionary returned by an InfoExtractor."""
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
try:
info_dict['url'] = self.verify_url(info_dict['url'])
except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
raise UnavailableFormatError
# Verify URL if it's an HTTP one
if info_dict['url'].startswith('http'):
try:
info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
raise UnavailableFormatError
# Forced printings
if self.params.get('forcetitle', False):
print info_dict['title'].encode(preferredencoding())
print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
if self.params.get('forceurl', False):
print info_dict['url'].encode(preferredencoding())
print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
return
@ -327,7 +390,7 @@ class FileDownloader(object):
filename = self.params['outtmpl'] % template_dict
except (ValueError, KeyError), err:
self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
if self.params['nooverwrites'] and os.path.exists(filename):
if self.params.get('nooverwrites', False) and os.path.exists(filename):
self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
return
@ -338,7 +401,7 @@ class FileDownloader(object):
return
try:
success = self._do_download(filename, info_dict['url'])
success = self._do_download(filename, info_dict['url'].encode('utf-8'))
except (OSError, IOError), err:
raise UnavailableFormatError
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -390,21 +453,53 @@ class FileDownloader(object):
if info is None:
break
def _do_download(self, filename, url):
stream = None
open_mode = 'ab'
def _download_with_rtmpdump(self, filename, url):
    """Download an RTMP stream into filename with the external rtmpdump tool.

    Returns True once rtmpdump exits with code 0. Returns False, after
    reporting through self.trouble(), when rtmpdump cannot be run or when
    it exits with a code other than 0, 1 or 2.
    """
    self.report_destination(filename)

    # Check for rtmpdump first. The null-device handle is closed again
    # right after the probe so it is not leaked for the whole download.
    try:
        devnull = open(os.path.devnull, 'w')
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
        finally:
            devnull.close()
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        return False

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
    if self.params.get('continuedl', False):
        first_run_args = ['-e', '-k', '1']
    else:
        first_run_args = []
    retval = subprocess.call(basic_args + first_run_args)

    # Exit codes 1 and 2 are both retried in resume mode (-e); only a
    # retry after exit code 1 additionally passes '-k 1'.
    while retval == 2 or retval == 1:
        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
        time.sleep(2.0) # This seems to be needed
        if retval == 1:
            retry_args = ['-e', '-k', '1']
        else:
            retry_args = ['-e']
        retval = subprocess.call(basic_args + retry_args)

    if retval == 0:
        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
        return True
    else:
        self.trouble('ERROR: rtmpdump exited with code %d' % retval)
        return False
def _do_download(self, filename, url):
# Attempt to download using rtmpdump
if url.startswith('rtmp'):
return self._download_with_rtmpdump(filename, url)
stream = None
open_mode = 'wb'
basic_request = urllib2.Request(url, None, std_headers)
request = urllib2.Request(url, None, std_headers)
# Attempt to resume download with "continuedl" option
# Establish possible resume length
if os.path.isfile(filename):
resume_len = os.path.getsize(filename)
else:
resume_len = 0
if self.params['continuedl'] and resume_len != 0:
# Request parameters in case of being able to resume
if self.params.get('continuedl', False) and resume_len != 0:
self.report_resuming_byte(resume_len)
request.add_header('Range','bytes=%d-' % resume_len)
open_mode = 'ab'
# Establish connection
try:
@ -412,12 +507,16 @@ class FileDownloader(object):
except (urllib2.HTTPError, ), err:
if err.code != 416: # 416 is 'Requested range not satisfiable'
raise
# Unable to resume
data = urllib2.urlopen(basic_request)
content_length = data.info()['Content-Length']
if content_length is not None and long(content_length) == resume_len:
# Because the file had already been fully downloaded
self.report_file_already_downloaded(filename)
return True
else:
# Because the server didn't let us
self.report_unable_to_resume()
open_mode = 'wb'
@ -439,7 +538,7 @@ class FileDownloader(object):
# Open file just in time
if stream is None:
try:
stream = open(filename, open_mode)
(stream, filename) = sanitize_open(filename, open_mode)
self.report_destination(filename)
except (OSError, IOError), err:
self.trouble('ERROR: unable to open for writing: %s' % str(err))
@ -525,46 +624,24 @@ class InfoExtractor(object):
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
_available_formats = ['22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
_available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
_video_extensions = {
'13': '3gp',
'17': 'mp4',
'18': 'mp4',
'22': 'mp4',
'37': 'mp4',
}
@staticmethod
def suitable(url):
return (re.match(YoutubeIE._VALID_URL, url) is not None)
@staticmethod
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a Unicode character."""
entity = matchobj.group(1)
# Known non-numeric HTML entity
if entity in htmlentitydefs.name2codepoint:
return unichr(htmlentitydefs.name2codepoint[entity])
# Unicode character
mobj = re.match(ur'(?u)#(x?\d+)', entity)
if mobj is not None:
numstr = mobj.group(1)
if numstr.startswith(u'x'):
base = 16
numstr = u'0%s' % numstr
else:
base = 10
return unichr(long(numstr, base))
# Unknown entity in name, return its literal representation
return (u'&%s;' % entity)
def report_lang(self):
"""Report attempt to set language."""
self._downloader.to_stdout(u'[youtube] Setting language')
@ -589,6 +666,10 @@ class YoutubeIE(InfoExtractor):
"""Report extracted video URL."""
self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
"""Indicate the download will use the RTMP protocol."""
self._downloader.to_stdout(u'[youtube] RTMP download detected')
def _real_initialize(self):
if self._downloader is None:
return
@ -687,46 +768,47 @@ class YoutubeIE(InfoExtractor):
try:
self.report_video_info_webpage_download(video_id)
video_info_webpage = urllib2.urlopen(request).read()
video_info = parse_qs(video_info_webpage)
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
return
self.report_information_extraction(video_id)
# "t" param
mobj = re.search(r'(?m)&token=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
if 'token' not in video_info:
# Attempt to see if YouTube has issued an error message
mobj = re.search(r'(?m)&reason=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
if 'reason' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
stream.write(video_info_webpage)
stream.close()
else:
reason = urllib.unquote_plus(mobj.group(1))
reason = urllib.unquote_plus(video_info['reason'][0])
self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
return
token = urllib.unquote(mobj.group(1))
token = urllib.unquote_plus(video_info['token'][0])
video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
if format_param is not None:
video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
# Check possible RTMP download
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
video_real_url = video_info['conn'][0]
# uploader
mobj = re.search(r'(?m)&author=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
if 'author' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
return
video_uploader = urllib.unquote(mobj.group(1))
video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
mobj = re.search(r'(?m)&title=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
if 'title' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
video_title = urllib.unquote(mobj.group(1))
video_title = urllib.unquote_plus(video_info['title'][0])
video_title = video_title.decode('utf-8')
video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
video_title = video_title.replace(os.sep, u'%')
video_title = sanitize_title(video_title)
# simplified title
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
@ -866,8 +948,9 @@ class MetacafeIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
return
@ -887,6 +970,257 @@ class MetacafeIE(InfoExtractor):
self._downloader.trouble(u'ERROR: format not available for video')
class GoogleIE(InfoExtractor):
"""Information extractor for video.google.com."""
_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_id = mobj.group(1)
video_extension = 'mp4'
# Retrieve video webpage to extract further information
request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
mobj = re.search(r"download_url:'([^']+)'", webpage)
if mobj is None:
video_extension = 'flv'
mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract media URL')
return
mediaURL = urllib.unquote(mobj.group(1))
mediaURL = mediaURL.replace('\\x3d', '\x3d')
mediaURL = mediaURL.replace('\\x26', '\x26')
video_url = mediaURL
mobj = re.search(r'<title>(.*)</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
# Google Video doesn't show uploader nicknames?
video_uploader = 'NA'
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'title': video_title,
'stitle': video_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class PhotobucketIE(InfoExtractor):
"""Information extractor for photobucket.com."""
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_id = mobj.group(1)
video_extension = 'flv'
# Retrieve video webpage to extract further information
request = urllib2.Request(url)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract media URL')
return
mediaURL = urllib.unquote(mobj.group(1))
video_url = mediaURL
mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
video_uploader = mobj.group(2).decode('utf-8')
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': video_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class GenericIE(InfoExtractor):
"""Generic last-resort information extractor."""
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
@staticmethod
def suitable(url):
return True
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
def _real_initialize(self):
return
def _real_extract(self, url):
video_id = url.split('/')[-1]
request = urllib2.Request(url)
try:
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
return
except ValueError, err:
# since this is the last-resort InfoExtractor, if
# this error is thrown, it'll be thrown here
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
# Broaden the search a little bit
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
# It's possible that one of the regexes
# matched, but returned an empty group:
if mobj.group(1) is None:
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
video_url = urllib.unquote(mobj.group(1))
video_id = os.path.basename(video_url)
# here's a fun little line of code for you:
video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
mobj = re.search(r'<title>(.*)</title>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
# video uploader is domain name
mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_uploader = mobj.group(1).decode('utf-8')
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': video_title,
'ext': video_extension.decode('utf-8'),
})
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
class YoutubeSearchIE(InfoExtractor):
"""Information Extractor for YouTube search queries."""
_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
@ -906,6 +1240,7 @@ class YoutubeSearchIE(InfoExtractor):
def report_download_page(self, query, pagenum):
    """Announce that result page number pagenum of query is being fetched."""
    # Decode the byte-string query before formatting it into the message
    readable_query = query.decode(preferredencoding())
    message = u'[youtube] query "%s": Downloading page %s' % (readable_query, pagenum)
    self._downloader.to_stdout(message)
def _real_initialize(self):
@ -919,6 +1254,7 @@ class YoutubeSearchIE(InfoExtractor):
prefix, query = query.split(':')
prefix = prefix[8:]
query = query.encode('utf-8')
if prefix == '':
self._download_n_results(query, 1)
return
@ -1036,6 +1372,61 @@ class YoutubePlaylistIE(InfoExtractor):
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
return
class YoutubeUserIE(InfoExtractor):
"""Information Extractor for YouTube users."""
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
_VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
_youtube_ie = None
def __init__(self, youtube_ie, downloader=None):
InfoExtractor.__init__(self, downloader)
self._youtube_ie = youtube_ie
@staticmethod
def suitable(url):
return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
def report_download_page(self, username):
"""Report attempt to download user page."""
self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
def _real_initialize(self):
self._youtube_ie.initialize()
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid url: %s' % url)
return
# Download user page
username = mobj.group(1)
video_ids = []
pagenum = 1
self.report_download_page(username)
request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
return
# Extract video identifiers
ids_in_page = []
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
if mobj.group(1) not in ids_in_page:
ids_in_page.append(mobj.group(1))
video_ids.extend(ids_in_page)
for id in video_ids:
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
return
class PostProcessor(object):
"""Post Processor class.
@ -1089,6 +1480,22 @@ if __name__ == '__main__':
import getpass
import optparse
# Function to update the program file with the latest version from bitbucket.org
def update_self(downloader, filename):
# Note: downloader only used for options
if not os.access (filename, os.W_OK):
sys.exit('ERROR: no write permissions on %s' % filename)
downloader.to_stdout('Updating to latest stable version...')
latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
latest_version = urllib.urlopen(latest_url).read().strip()
prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
newcontent = urllib.urlopen(prog_url).read()
stream = open(filename, 'w')
stream.write(newcontent)
stream.close()
downloader.to_stdout('Updated to version %s' % latest_version)
# General configuration
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
@ -1097,7 +1504,7 @@ if __name__ == '__main__':
# Parse command line
parser = optparse.OptionParser(
usage='Usage: %prog [options] url...',
version='2009.09.13',
version='2010.02.13',
conflict_handler='resolve',
)
@ -1105,6 +1512,8 @@ if __name__ == '__main__':
action='help', help='print this help text and exit')
parser.add_option('-v', '--version',
action='version', help='print program version and exit')
parser.add_option('-U', '--update',
action='store_true', dest='update_self', help='update this program to latest stable version')
parser.add_option('-i', '--ignore-errors',
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
parser.add_option('-r', '--rate-limit',
@ -1157,7 +1566,7 @@ if __name__ == '__main__':
parser.add_option_group(filesystem)
(opts, args) = parser.parse_args()
# Batch file verification
batchurls = []
if opts.batchfile is not None:
@ -1169,9 +1578,11 @@ if __name__ == '__main__':
sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
# Make sure all URLs are in our preferred encoding
for i in range(0, len(all_urls)):
all_urls[i] = unicode(all_urls[i], preferredencoding())
# Conflicting, missing and erroneous options
if len(all_urls) < 1:
parser.error(u'you must provide at least one URL')
if opts.usenetrc and (opts.username is not None or opts.password is not None):
parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
@ -1192,7 +1603,11 @@ if __name__ == '__main__':
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
photobucket_ie = PhotobucketIE()
generic_ie = GenericIE()
# File downloader
fd = FileDownloader({
@ -1215,8 +1630,26 @@ if __name__ == '__main__':
})
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(photobucket_ie)
# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
# Update version
if opts.update_self:
update_self(fd, sys.argv[0])
# Maybe do nothing
if len(all_urls) < 1:
if not opts.update_self:
parser.error(u'you must provide at least one URL')
else:
sys.exit()
retcode = fd.download(all_urls)
sys.exit(retcode)