From 369a759acc9d12590355c6d9f96ef7852153570f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 28 Oct 2013 16:54:38 +0100 Subject: [PATCH 001/132] setup.py: Make sure the setuptools_available variable is set Otherwise it would crash if it can't import setuptools. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f14f96377..aa7cfca08 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ try: setuptools_available = True except ImportError: from distutils.core import setup + setuptools_available = False try: # This will create an exe that needs Microsoft Visual C++ 2008 From 32a35e441874ad9daba10c29a6a33f13a4953fbb Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 28 Oct 2013 17:35:01 +0100 Subject: [PATCH 002/132] Add support for http://www.extremetube.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/extremetube.py | 52 +++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 youtube_dl/extractor/extremetube.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0d933986f..5eed1eebd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -39,6 +39,7 @@ from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE from .exfm import ExfmIE +from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fktv import ( diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py new file mode 100644 index 000000000..981de430d --- /dev/null +++ b/youtube_dl/extractor/extremetube.py @@ -0,0 +1,52 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, +) + +class ExtremeTubeIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Pextremetube\.com/video/.+?(?P[0-9]+))(?:[/?&]|$)' + _TEST = { + u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', + u'file': u'652431.mp4', + u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0', + u'info_dict': { + u"title": u"Music Video 14 british euro brit european cumshots swallow", + u"uploader": u"unknown", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'

]*?title="([^"]+)"[^>]*>\1<', webpage, u'title') + uploader = self._html_search_regex(r'>Posted by:(?=<)(\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False) + video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[5].split('_')[:2] + format = "-".join( format ) + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'title': video_title, + 'uploader': uploader, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + 'age_limit': age_limit, + } From 77ae65877e7b4b71d446ea928fd14f973826f07b Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 28 Oct 2013 18:18:58 +0100 Subject: [PATCH 003/132] Add support for http://www.mofosex.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mofosex.py | 49 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 youtube_dl/extractor/mofosex.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0d933986f..045d4447a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -81,6 +81,7 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE +from .mofosex import MofosexIE from .mtv import MTVIE from .muzu import MuzuTVIE from .myspass import MySpassIE diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py new file mode 100644 index 000000000..a0c926cd1 --- /dev/null +++ b/youtube_dl/extractor/mofosex.py @@ -0,0 +1,49 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, +) + +class MofosexIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Pmofosex\.com/videos/(?P[0-9]+)/.*?\.html)' + _TEST = { + u'url': u'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', + u'file': u'5018.mp4', + u'md5': u'1b2eb47ac33cc75d4a80e3026b613c5a', + u'info_dict': { + u"title": u"Japanese Teen Music Video", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'

(.+?)<', webpage, u'title') + video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, u'video_url')) + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[5].split('_')[:2] + format = "-".join( format ) + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'title': video_title, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + 'age_limit': age_limit, + } From 2bc67c35acece68a75284b88fcb03d69f267a63c Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 28 Oct 2013 18:22:55 +0100 Subject: [PATCH 004/132] [KeezMoviesIE] Detect URLs with numbers in the SEO part correct --- youtube_dl/extractor/keezmovies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 5e05900da..786924445 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -12,7 +12,7 @@ from ..aes import ( ) class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Pkeezmovies\.com/video/.+?(?P[0-9]+))' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Pkeezmovies\.com/video/.+?(?P[0-9]+))(?:[/?&]|$)' _TEST = { u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', u'file': u'1214711.mp4', From dcc2a706ef7df65839aa40ce5fda61f8cea36645 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 28 Oct 2013 19:23:48 +0100 Subject: [PATCH 005/132] Add support for http://www.xtube.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/xtube.py | 54 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 youtube_dl/extractor/xtube.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0d933986f..7efd097e4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -149,6 +149,7 @@ from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE +from .xtube import XTubeIE from .yahoo import YahooIE, YahooSearchIE from .youjizz import YouJizzIE from .youku import YoukuIE diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py new file mode 100644 index 000000000..7d06a7021 --- /dev/null +++ b/youtube_dl/extractor/xtube.py @@ -0,0 +1,54 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, +) + +class XTubeIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Pxtube\.com/watch\.php\?v=(?P[^/?&]+))' + _TEST = { + u'url': u'http://www.xtube.com/watch.php?v=kVTUy_G222_', + u'file': u'kVTUy_G222_.mp4', + u'md5': u'092fbdd3cbe292c920ef6fc6a8a9cdab', + u'info_dict': { + u"title": u"strange erotica", + u"uploader": u"greenshowers", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'
([^<]+)', webpage, u'description', default=None) + video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/') + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[5].split('_')[:2] + format[0] += 'p' + format[1] += 'k' + format = "-".join( format ) + + return { + 'id': video_id, + 'title': video_title, + 'uploader': video_uploader, + 'description': video_description, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + 'age_limit': 18, + } From 702665c0854af6fb317600c4825c0b00e2a4c981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 28 Oct 2013 22:01:37 +0100 Subject: [PATCH 006/132] tests: build the filename from the info_dict if the 'file' key is missing It will need to have the 'id' and 'ext' keys to work. --- test/test_download.py | 39 +++++++++++++++++++++++---------------- youtube_dl/YoutubeDL.py | 2 +- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index b9a9be11d..f136176b1 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -60,9 +60,12 @@ def generator(test_case): if not ie._WORKING: print_skipping('IE marked as not _WORKING') return - if 'playlist' not in test_case and not test_case['file']: - print_skipping('No output file specified') - return + if 'playlist' not in test_case: + info_dict = test_case.get('info_dict', {}) + if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')): + print_skipping('The output file cannot be know, the "file" ' + 'key is missing or the info_dict is incomplete') + return if 'skip' in test_case: print_skipping(test_case['skip']) return @@ -77,11 +80,17 @@ def generator(test_case): finished_hook_called.add(status['filename']) ydl.fd.add_progress_hook(_hook) + def get_tc_filename(tc): + return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {})) + test_cases = test_case.get('playlist', [test_case]) - for tc in test_cases: - try_rm(tc['file']) - try_rm(tc['file'] + '.part') - try_rm(tc['file'] + '.info.json') + def try_rm_tcs_files(): + for tc in test_cases: + tc_filename = get_tc_filename(tc) + try_rm(tc_filename) + try_rm(tc_filename + '.part') + try_rm(tc_filename + '.info.json') + try_rm_tcs_files() try: for retry in range(1, RETRIES + 1): try: @@ -98,14 +107,15 @@ def generator(test_case): break for tc in test_cases: + tc_filename = get_tc_filename(tc) if not test_case.get('params', {}).get('skip_download', False): - self.assertTrue(os.path.exists(tc['file']), msg='Missing file ' + tc['file']) - self.assertTrue(tc['file'] in finished_hook_called) - self.assertTrue(os.path.exists(tc['file'] + '.info.json')) + self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename) + self.assertTrue(tc_filename in finished_hook_called) + self.assertTrue(os.path.exists(tc_filename + '.info.json')) if 'md5' in tc: - md5_for_file = _file_md5(tc['file']) + md5_for_file = _file_md5(tc_filename) self.assertEqual(md5_for_file, tc['md5']) - with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: + with io.open(tc_filename + '.info.json', encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, expected) in tc.get('info_dict', {}).items(): if isinstance(expected, compat_str) and expected.startswith('md5:'): @@ -126,10 +136,7 @@ def generator(test_case): for key in ('id', 'url', 'title', 'ext'): self.assertTrue(key in info_dict.keys() and info_dict[key]) finally: - for tc in test_cases: - try_rm(tc['file']) - try_rm(tc['file'] + '.part') - try_rm(tc['file'] + '.info.json') + try_rm_tcs_files() return test_template diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 313295839..060678e9b 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -272,7 +272,7 @@ class YoutubeDL(object): autonumber_size = 5 autonumber_templ = u'%0' + str(autonumber_size) + u'd' template_dict['autonumber'] = autonumber_templ % self._num_downloads - if template_dict['playlist_index'] is not None: + if template_dict.get('playlist_index') is not None: template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index'] sanitize = lambda k, v: sanitize_filename( From 2563bcc85cc09382d7e731709b2c8a4ad96c7ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 28 Oct 2013 22:02:17 +0100 Subject: [PATCH 007/132] Add an extractor for MySpace (closes #1666) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/myspace.py | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 youtube_dl/extractor/myspace.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0d933986f..caaf54456 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE from .muzu import MuzuTVIE +from .myspace import MySpaceIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .naver import NaverIE diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py new file mode 100644 index 000000000..050f54a5a --- /dev/null +++ b/youtube_dl/extractor/myspace.py @@ -0,0 +1,48 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_str, +) + + +class MySpaceIE(InfoExtractor): + _VALID_URL = r'https?://myspace\.com/([^/]+)/video/[^/]+/(?P\d+)' + + _TEST = { + u'url': u'https://myspace.com/coldplay/video/viva-la-vida/100008689', + u'info_dict': { + u'id': u'100008689', + u'ext': u'flv', + u'title': u'Viva La Vida', + u'description': u'The official Viva La Vida video, directed by Hype Williams', + u'uploader': u'Coldplay', + u'uploader_id': u'coldplay', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + context = json.loads(self._search_regex(r'context = ({.*?});', webpage, + u'context')) + video = context['video'] + rtmp_url, play_path = video['streamUrl'].split(';', 1) + + return { + 'id': compat_str(video['mediaId']), + 'title': video['title'], + 'url': rtmp_url, + 'play_path': play_path, + 'ext': 'flv', + 'description': video['description'], + 'thumbnail': video['imageUrl'], + 'uploader': video['artistName'], + 'uploader_id': video['artistUsername'], + } From dd508b7c4f0dd8881de07a4e8593d4fcdef9bae7 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 28 Oct 2013 18:03:26 -0400 Subject: [PATCH 008/132] [tests] don't fail on network errors This is suboptimal, but at least this way we will need to look at the logs only to check for network errors that happen too often, instead of parsing a ton of lines each time to see if there is some true test failing --- test/helper.py | 17 +++++++++++++++++ test/test_download.py | 22 +++++++++++++++++----- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/test/helper.py b/test/helper.py index 777119ea5..d7bf7a828 100644 --- a/test/helper.py +++ b/test/helper.py @@ -5,9 +5,11 @@ import json import os.path import re import types +import sys import youtube_dl.extractor from youtube_dl import YoutubeDL +from youtube_dl.utils import preferredencoding def global_setup(): @@ -33,6 +35,21 @@ def try_rm(filename): raise +def report_warning(message): + ''' + Print the message to stderr, it will be prefixed with 'WARNING:' + If stderr is a tty file the 'WARNING:' will be colored + ''' + if sys.stderr.isatty() and os.name != 'nt': + _msg_header = u'\033[0;33mWARNING:\033[0m' + else: + _msg_header = u'WARNING:' + output = u'%s %s\n' % (_msg_header, message) + if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3: + output = output.encode(preferredencoding()) + sys.stderr.write(output) + + class FakeYDL(YoutubeDL): def __init__(self, override=None): # Different instances of the downloader can't share the same dictionary diff --git a/test/test_download.py b/test/test_download.py index f136176b1..565afa1b5 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -6,7 +6,14 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, get_testcases, global_setup, try_rm, md5 +from test.helper import ( + get_params, + get_testcases, + global_setup, + try_rm, + md5, + report_warning +) global_setup() @@ -92,17 +99,22 @@ def generator(test_case): try_rm(tc_filename + '.info.json') try_rm_tcs_files() try: - for retry in range(1, RETRIES + 1): + try_num = 1 + while True: try: ydl.download([test_case['url']]) except (DownloadError, ExtractorError) as err: - if retry == RETRIES: raise - # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): raise - print('Retrying: {0} failed tries\n\n##########\n\n'.format(retry)) + if try_num == RETRIES: + report_warning(u'Failed due to network errors, skipping...') + return + + print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num)) + + try_num += 1 else: break From 646e17a53d3885b84b03045728b3add3d50f513c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 23:18:13 +0100 Subject: [PATCH 009/132] Fix YouTubeDL test --- test/test_YoutubeDL.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f8cd1bdce..ffebb4ae5 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -62,10 +62,10 @@ class TestFormatSelection(unittest.TestCase): def test_format_limit(self): formats = [ - {u'format_id': u'meh'}, - {u'format_id': u'good'}, - {u'format_id': u'great'}, - {u'format_id': u'excellent'}, + {u'format_id': u'meh', u'url': u'http://example.com/meh'}, + {u'format_id': u'good', u'url': u'http://example.com/good'}, + {u'format_id': u'great', u'url': u'http://example.com/great'}, + {u'format_id': u'excellent', u'url': u'http://example.com/exc'}, ] info_dict = { u'formats': formats, u'extractor': u'test', 'id': 'testvid'} From 321a01f97110c3048e9d9c360a099d1ec8cd4479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 28 Oct 2013 23:37:01 +0100 Subject: [PATCH 010/132] [mtv] Remove the templates from the mediagen url --- youtube_dl/extractor/mtv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e520e2bb4..e96d3952c 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -80,6 +80,8 @@ class MTVIE(InfoExtractor): video_id = self._id_from_uri(uri) self.report_extraction(video_id) mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] + # Remove the templates, like &device={device} + mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&acceptMethods=fms' mediagen_page = self._download_webpage(mediagen_url, video_id, From f6cc16f5d821a50df173b865164e4fa9cbe854af Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 28 Oct 2013 19:07:16 -0400 Subject: [PATCH 011/132] [tests] a HTTP 503 is a transient issue --- test/test_download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 565afa1b5..dfb04d010 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -26,6 +26,7 @@ import youtube_dl.YoutubeDL from youtube_dl.utils import ( compat_str, compat_urllib_error, + compat_HTTPError, DownloadError, ExtractorError, UnavailableVideoError, @@ -105,7 +106,7 @@ def generator(test_case): ydl.download([test_case['url']]) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one - if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): raise if try_num == RETRIES: From 795f28f871074aca2a74dfe67e1e75252b525c4c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 29 Oct 2013 06:45:54 +0100 Subject: [PATCH 012/132] [youtube] Fix login (Fixes #1681) --- youtube_dl/extractor/youtube.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d05d0a8c1..f3a2a32b4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -74,14 +74,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) return False - galx = None - dsh = None - match = re.search(re.compile(r' Date: Tue, 29 Oct 2013 06:48:39 +0100 Subject: [PATCH 013/132] release 2013.10.29 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 048afc8e7..1a94003bc 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.28' +__version__ = '2013.10.29' From 912cbf5d4ef5b131af88e63815863c389083d077 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 29 Oct 2013 14:00:01 +0100 Subject: [PATCH 014/132] [vevo] Fix timestamp handling ( / 1000 is implicit float division ) --- youtube_dl/extractor/vevo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 1c1cc418d..26ec9fa1b 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -58,9 +58,9 @@ class VevoIE(InfoExtractor): 'width': int(attr['frameWidth']), }) - date_epoch = int(self._search_regex( - r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000 - upload_date = datetime.datetime.fromtimestamp(date_epoch) + timestamp_ms = int(self._search_regex( + r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date')) + upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000) info = { 'id': video_id, 'title': video_info['title'], From 57dd9a8f2f5885fb3d909c4905adb69b4749491c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 29 Oct 2013 15:09:45 +0100 Subject: [PATCH 015/132] Nicer --list-formats output --- youtube_dl/YoutubeDL.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 060678e9b..260cd2809 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -759,6 +759,8 @@ class YoutubeDL(object): @staticmethod def format_resolution(format, default='unknown'): + if format.get('_resolution') is not None: + return format['_resolution'] if format.get('height') is not None: if format.get('width') is not None: res = u'%sx%s' % (format['width'], format['height']) @@ -769,19 +771,22 @@ class YoutubeDL(object): return res def list_formats(self, info_dict): - formats_s = [] - for format in info_dict.get('formats', [info_dict]): - formats_s.append(u'%-15s%-7s %-15s%s' % ( + def line(format): + return (u'%-15s%-10s%-12s%s' % ( format['format_id'], format['ext'], - format.get('format_note', ''), self.format_resolution(format), + format.get('format_note', ''), ) ) + + formats_s = list(map(line, info_dict.get('formats', [info_dict]))) if len(formats_s) != 1: - formats_s[0] += ' (worst)' - formats_s[-1] += ' (best)' - formats_s = "\n".join(formats_s) - self.to_screen(u'[info] Available formats for %s:\n' - u'format code extension note resolution\n%s' % ( - info_dict['id'], formats_s)) + formats_s[0] += (' ' if formats_s[0] else '') + '(worst)' + formats_s[-1] += (' ' if formats_s[-1] else '') + '(best)' + + header_line = line({ + 'format_id': u'format code', 'ext': u'extension', + '_resolution': u'resolution', 'format_note': u'note'}) + self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % + (info_dict['id'], header_line, u"\n".join(formats_s))) From e54fd4b23b8110779e8caff805d3078dcf042d0b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 29 Oct 2013 15:10:09 +0100 Subject: [PATCH 016/132] [vevo] Add more format details --- youtube_dl/extractor/vevo.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 26ec9fa1b..4d9f2a843 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -50,10 +50,11 @@ class VevoIE(InfoExtractor): # Already sorted from worst to best quality for rend in renditions.findall('rendition'): attr = rend.attrib - f_url = attr['url'] + format_note = '%(videoCodec)s@%(videoBitrate)4sK, %(audioCodec)s@%(audioBitrate)3sK' % attr formats.append({ - 'url': f_url, - 'ext': determine_ext(f_url), + 'url': attr['url'], + 'format_id': attr['name'], + 'format_note': format_note, 'height': int(attr['frameheight']), 'width': int(attr['frameWidth']), }) @@ -71,7 +72,4 @@ class VevoIE(InfoExtractor): 'duration': video_info['duration'], } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info From 21c924f4068692786e0c5435689d10f3d17ef612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 29 Oct 2013 20:58:49 +0100 Subject: [PATCH 017/132] [arte] Download the 'Originalversion' version if it's the only one available (fixes #1682) --- youtube_dl/extractor/arte.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index d39b48951..e10c74c11 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -158,7 +158,9 @@ class ArteTVPlus7IE(InfoExtractor): 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } - formats = player_info['VSR'].values() + all_formats = player_info['VSR'].values() + # Some formats use the m3u8 protocol + all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats)) def _match_lang(f): if f.get('versionCode') is None: return True @@ -170,11 +172,16 @@ class ArteTVPlus7IE(InfoExtractor): regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] return any(re.match(r, f['versionCode']) for r in regexes) # Some formats may not be in the same language as the url - formats = filter(_match_lang, formats) - # Some formats use the m3u8 protocol - formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats) - # We order the formats by quality + formats = filter(_match_lang, all_formats) formats = list(formats) # in python3 filter returns an iterator + if not formats: + # Some videos are only available in the 'Originalversion' + # they aren't tagged as being in French or German + if all(f['versionCode'] == 'VO' for f in all_formats): + formats = all_formats + else: + raise ExtractorError(u'The formats list is empty') + # We order the formats by quality if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) else: From b9a836515fad5df57a86412b2cd41c49869ec0d6 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Tue, 29 Oct 2013 16:44:35 -0400 Subject: [PATCH 018/132] Update the Vimeo test vector md5 confirmed that this is indeed the first 10241 (we went off by one with byte range 0-10240) of the full, playing mp4, so they probably reencoded or something --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index b4dbcd2ee..c7d864a2b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -27,7 +27,7 @@ class VimeoIE(InfoExtractor): { u'url': u'http://vimeo.com/56015672#at=0', u'file': u'56015672.mp4', - u'md5': u'ae7a1d8b183758a0506b0622f37dfa14', + u'md5': u'8879b6cc097e987f02484baf890129e5', u'info_dict': { u"upload_date": u"20121220", u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", From 94badb2599e54bfd711b38f3a74c552ff652d6d3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:09:26 +0100 Subject: [PATCH 019/132] Fix output indenting for --list-formats --- youtube_dl/YoutubeDL.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 260cd2809..898533496 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -780,10 +780,11 @@ class YoutubeDL(object): ) ) - formats_s = list(map(line, info_dict.get('formats', [info_dict]))) - if len(formats_s) != 1: - formats_s[0] += (' ' if formats_s[0] else '') + '(worst)' - formats_s[-1] += (' ' if formats_s[-1] else '') + '(best)' + formats = info_dict.get('formats', [info_dict]) + formats_s = list(map(line, formats)) + if len(formats) > 1: + formats_s[0] += (' ' if formats[0].get('format_note') else '') + '(worst)' + formats_s[-1] += (' ' if formats[-1].get('format_note') else '') + '(best)' header_line = line({ 'format_id': u'format code', 'ext': u'extension', From b5d0d817bc8a23ef6dc2a00d1af6fad893143206 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:09:44 +0100 Subject: [PATCH 020/132] Remove superfluous space --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ce349fe20..cef4dce85 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -63,7 +63,7 @@ class InfoExtractor(object): * ext Will be calculated from url if missing * format A human-readable description of the format ("mp4 container with h264/opus"). - Calculated from the format_id, width, height + Calculated from the format_id, width, height. and format_note fields if missing. * format_id A short description of the format ("mp4_h264_opus" or "19") From 72321ead7b176824d1a8b2895ad4926555e41b88 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:14:17 +0100 Subject: [PATCH 021/132] [vevo] Readd support for SMIL (Fixes #1683) --- youtube_dl/extractor/vevo.py | 80 +++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 4d9f2a843..3f6020f74 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,7 +5,7 @@ import datetime from .common import InfoExtractor from ..utils import ( - determine_ext, + compat_HTTPError, ExtractorError, ) @@ -16,26 +16,22 @@ class VevoIE(InfoExtractor): (currently used by MTVIE) """ _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P.*?)(\?|$)' - _TEST = { + _TESTS = [{ u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', + u"md5": u"06bea460acb744eab74a9d7dcb4bfd61", u'info_dict': { u"upload_date": u"20130624", u"uploader": u"Hurts", u"title": u"Somebody to Die For", - u'duration': 230, + u"duration": 230, + u"width": 1920, + u"height": 1080, } - } + }] + _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - info_json = self._download_webpage(json_url, video_id, u'Downloading json info') - - self.report_extraction(video_id) - video_info = json.loads(info_json)['video'] + def _formats_from_json(self, video_info): last_version = {'version': -1} for version in video_info['videoVersions']: # These are the HTTP downloads, other types are for different manifests @@ -50,7 +46,7 @@ class VevoIE(InfoExtractor): # Already sorted from worst to best quality for rend in renditions.findall('rendition'): attr = rend.attrib - format_note = '%(videoCodec)s@%(videoBitrate)4sK, %(audioCodec)s@%(audioBitrate)3sK' % attr + format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr formats.append({ 'url': attr['url'], 'format_id': attr['name'], @@ -58,6 +54,62 @@ class VevoIE(InfoExtractor): 'height': int(attr['frameheight']), 'width': int(attr['frameWidth']), }) + return formats + + def _formats_from_smil(self, smil_xml): + formats = [] + smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) + els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') + for el in els: + src = el.attrib['src'] + m = re.match(r'''(?xi) + (?P[a-z0-9]+): + (?P + [/a-z0-9]+ # The directory and main part of the URL + _(?P[0-9]+)k + _(?P[0-9]+)x(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + \.[a-z0-9]+ # File extension + )''', src) + if not m: + continue + + format_url = self._SMIL_BASE_URL + m.group('path') + format_note = ('%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' % + m.groupdict()) + formats.append({ + 'url': format_url, + 'format_id': u'SMIL_' + m.group('cbr'), + 'format_note': format_note, + 'ext': m.group('ext'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + return formats + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id + info_json = self._download_webpage(json_url, video_id, u'Downloading json info') + video_info = json.loads(info_json)['video'] + + formats = self._formats_from_json(video_info) + try: + smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( + self._SMIL_BASE_URL, video_id, video_id.lower()) + smil_xml = self._download_webpage(smil_url, video_id, + u'Downloading SMIL info') + formats.extend(self._formats_from_smil(smil_xml)) + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError): + raise + self._downloader.report_warning( + u'Cannot download SMIL information, falling back to JSON ..') timestamp_ms = int(self._search_regex( r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date')) From 7193498811cb17a66ca57569a8588adb28ba2b27 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:17:00 +0100 Subject: [PATCH 022/132] Use index in formt string (Fixes vevo test on Python 2.6) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 898533496..7f73ea360 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -482,7 +482,7 @@ class YoutubeDL(object): format['format'] = u'{id} - {res}{note}'.format( id=format['format_id'], res=self.format_resolution(format), - note=u' ({})'.format(format['format_note']) if format.get('format_note') is not None else '', + note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', ) # Automatically determine file extension if missing if 'ext' not in format: From 33b1d9595d853893b5d732863dc2f5eabd939637 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:17:20 +0100 Subject: [PATCH 023/132] release 2013.10.30 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1a94003bc..e8eade7ad 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.29' +__version__ = '2013.10.30' From 9f1109a56424d118263963062bc5185d8415835e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 31 Oct 2013 00:20:49 +0100 Subject: [PATCH 024/132] [dailymotion] Fix support for age-restricted videos (Fixes #1688) --- youtube_dl/extractor/dailymotion.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 4c0488245..355b4ed0a 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -21,6 +21,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): """Build a request with the family filter disabled""" request = compat_urllib_request.Request(url) request.add_header('Cookie', 'family_filter=off') + request.add_header('Cookie', 'ff=off') return request class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): @@ -61,6 +62,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): }, u'skip': u'VEVO is only available in some countries', }, + # age-restricted video + { + u'url': u'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', + u'file': u'xyh2zz.mp4', + u'md5': u'0d667a7b9cebecc3c89ee93099c4159d', + u'info_dict': { + u'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', + u'uploader': 'HotWaves1012', + u'age_limit': 18, + } + + } ] def _real_extract(self, url): @@ -90,7 +103,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) @@ -132,15 +146,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id) return - return [{ + return { 'id': video_id, 'formats': formats, 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'subtitles': video_subtitles, - 'thumbnail': info['thumbnail_url'] - }] + 'thumbnail': info['thumbnail_url'], + 'age_limit': age_limit, + } def _get_available_subtitles(self, video_id): try: From 0ef7ad5cd49d527a24c62e831cf80f2eb443276f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 31 Oct 2013 07:55:03 +0100 Subject: [PATCH 025/132] Fix the test for dailymotion subtitles The extractor returns a single info_dict now. --- test/test_dailymotion_subtitles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index c596415c4..ba3580ea4 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -22,7 +22,7 @@ class TestDailymotionSubtitles(unittest.TestCase): return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict[0]['subtitles'] + return info_dict['subtitles'] def test_no_writesubtitles(self): subtitles = self.getSubtitles() self.assertEqual(subtitles, None) From 5f1ea943ab6814c2f8ca2a383f990e3f4c9e5f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 31 Oct 2013 08:07:26 +0100 Subject: [PATCH 026/132] [livestream] fix the extraction of events It now uses a json dictionary from the webpage. --- youtube_dl/extractor/livestream.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index d04da98c8..4531fd6ab 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -40,13 +40,9 @@ class LivestreamIE(InfoExtractor): if video_id is None: # This is an event page: - player = get_meta_content('twitter:player', webpage) - if player is None: - raise ExtractorError('Couldn\'t extract event api url') - api_url = player.replace('/player', '') - api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url) - info = json.loads(self._download_webpage(api_url, event_name, - u'Downloading event info')) + config_json = self._search_regex(r'window.config = ({.*?});', + webpage, u'window config') + info = json.loads(config_json)['event'] videos = [self._extract_video_info(video_data['data']) for video_data in info['feed']['data'] if video_data['type'] == u'video'] return self.playlist_result(videos, info['id'], info['full_name']) From ac2547f5ffc30a352207336194e7bbb0435d01a7 Mon Sep 17 00:00:00 2001 From: Alex Van't Hof Date: Thu, 31 Oct 2013 01:57:22 -0400 Subject: [PATCH 027/132] [teamcoco] Fix video url extraction for some videos Video url extraction failed for some videos, e.g. http://teamcoco.com/video/old-time-baseball The url extracted was also occasionally suboptimal quality, e.g. http://teamcoco.com/video/louis-ck-interview-george-w-bush --- youtube_dl/extractor/teamcoco.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index c910110ca..76246c7cc 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -3,6 +3,7 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + RegexNotFoundError, ) @@ -11,7 +12,7 @@ class TeamcocoIE(InfoExtractor): _TEST = { u'url': u'http://teamcoco.com/video/louis-ck-interview-george-w-bush', u'file': u'19705.mp4', - u'md5': u'27b6f7527da5acf534b15f21b032656e', + u'md5': u'cde9ba0fa3506f5f017ce11ead928f9a', u'info_dict': { u"description": u"Louis C.K. got starstruck by George W. Bush, so what? Part one.", u"title": u"Louis C.K. Interview Pt. 1 11/3/11" @@ -33,8 +34,21 @@ class TeamcocoIE(InfoExtractor): data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id data = self._download_webpage(data_url, video_id, 'Downloading data webpage') - video_url = self._html_search_regex(r']*type="high".*?>(.*?)', - data, u'video URL') + + qualities = [ '1080p', '720p', '1000k', '480p', '500k' ] + best_quality_idx = len(qualities)+1 # First regex match may not be optimal + for idx, quality in enumerate(qualities): + regex = r']*type="(?:high|standard)".*?>(.*%s.*)' % quality + try: + url = self._html_search_regex(regex, data, u'video URL') + if idx < best_quality_idx: + video_url = url + best_quality_idx = idx + except RegexNotFoundError: + # Just catch fatal exc. Don't want the fatal=False warning + continue + if not video_url: + raise RegexNotFoundError(u'Unable to extract video URL') return [{ 'id': video_id, From ab4e15134719e6c01a3a9768f21a0f361e4b781d Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Fri, 1 Nov 2013 01:24:23 +0100 Subject: [PATCH 028/132] [CinemassacreIE] Support more embed urls --- youtube_dl/extractor/cinemassacre.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 2fe1033f0..8f9396d6b 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -41,7 +41,7 @@ class CinemassacreIE(InfoExtractor): webpage_url = u'http://' + mobj.group('url') webpage = self._download_webpage(webpage_url, None) # Don't know video id yet video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') - mobj = re.search(r'src="(?Phttp://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P.+?))"', webpage) + mobj = re.search(r'src="(?Phttp://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P.+?))"', webpage) if not mobj: raise ExtractorError(u'Can\'t extract embed url and video id') playerdata_url = mobj.group(u'embed_url') From 66cf3ac3426b62fb960b4de770c4ea8203a0e205 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Nov 2013 11:55:35 +0100 Subject: [PATCH 029/132] [metacafe] Fix support for age-restricted videos (fixes #1696) The 'Content-Type' header must be set for disabling the family filter. The 'flashversion' cookie is only needed for AnyClip videos. Added tests for standard metacafe videos and for age-restricted videos. Also set the 'age_limit' field. --- youtube_dl/extractor/metacafe.py | 51 ++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 234b9e80f..91480ba87 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -20,7 +20,9 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = u'metacafe' - _TESTS = [{ + _TESTS = [ + # Youtube video + { u"add_ie": ["Youtube"], u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", u"file": u"_aUehQsCQtM.mp4", @@ -32,15 +34,42 @@ class MetacafeIE(InfoExtractor): u"uploader_id": u"PBS" } }, + # Normal metacafe video + { + u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', + u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad', + u'info_dict': { + u'id': u'11121940', + u'ext': u'mp4', + u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4', + u'uploader': u'ign', + u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', + }, + }, + # AnyClip video { u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/", u"file": u"an-dVVXnuY7Jh77J.mp4", u"info_dict": { u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3", u"uploader": u"anyclip", - u"description": u"md5:38c711dd98f5bb87acf973d573442e67" - } - }] + u"description": u"md5:38c711dd98f5bb87acf973d573442e67", + }, + }, + # age-restricted video + { + u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', + u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09', + u'info_dict': { + u'id': u'5186653', + u'ext': u'mp4', + u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', + u'uploader': u'Dwayne Pipe', + u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b', + u'age_limit': 18, + }, + }, + ] def report_disclaimer(self): @@ -62,6 +91,7 @@ class MetacafeIE(InfoExtractor): 'submit': "Continue - I'm over 18", } request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') try: self.report_age_confirmation() compat_urllib_request.urlopen(request).read() @@ -83,7 +113,12 @@ class MetacafeIE(InfoExtractor): # Retrieve video webpage to extract further information req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) - req.headers['Cookie'] = 'flashVersion=0;' + + # AnyClip videos require the flashversion cookie so that we get the link + # to the mp4 file + mobj_an = re.match(r'^an-(.*?)$', video_id) + if mobj_an: + req.headers['Cookie'] = 'flashVersion=0;' webpage = self._download_webpage(req, video_id) # Extract URL, uploader and title from webpage @@ -125,6 +160,11 @@ class MetacafeIE(InfoExtractor): r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, u'uploader nickname', fatal=False) + if re.search(r'"contentRating":"restricted"', webpage) is not None: + age_limit = 18 + else: + age_limit = 0 + return { '_type': 'video', 'id': video_id, @@ -134,4 +174,5 @@ class MetacafeIE(InfoExtractor): 'upload_date': None, 'title': video_title, 'ext': video_ext, + 'age_limit': age_limit, } From 60d142aa8d896674ca2b062a53b3d18c644192ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Nov 2013 22:28:51 +0100 Subject: [PATCH 030/132] Add an extractor for vk.com (closes #1635) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/vk.py | 45 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 youtube_dl/extractor/vk.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index caaf54456..bcf1cce7f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -142,6 +142,7 @@ from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .vk import VKIE from .wat import WatIE from .websurg import WeBSurgIE from .weibo import WeiboIE diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py new file mode 100644 index 000000000..90d8a6d07 --- /dev/null +++ b/youtube_dl/extractor/vk.py @@ -0,0 +1,45 @@ +# encoding: utf-8 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_str, + unescapeHTML, +) + + +class VKIE(InfoExtractor): + IE_NAME = u'vk.com' + _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P.*?)(?:\?|%2F|$)' + + _TEST = { + u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + u'md5': u'0deae91935c54e00003c2a00646315f0', + u'info_dict': { + u'id': u'162222515', + u'ext': u'flv', + u'title': u'ProtivoGunz - Хуёвая песня', + u'uploader': u'Noize MC', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id + info_page = self._download_webpage(info_url, video_id) + m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page) + if m_yt is not None: + self.to_screen(u'Youtube video detected') + return self.url_result(m_yt.group(1), 'Youtube') + vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars') + vars = json.loads(vars_json) + + return { + 'id': compat_str(vars['vid']), + 'url': vars['url240'], + 'title': unescapeHTML(vars['md_title']), + 'thumbnail': vars['jpg'], + 'uploader': vars['md_author'], + } From 8eddf3e91ddab3bb766bc5176edb3120be5743ea Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 2 Nov 2013 11:21:05 +0100 Subject: [PATCH 031/132] [youtube] Encode subtitle track name in request (Fixes #1700) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f3a2a32b4..dc601de52 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1111,7 +1111,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'lang': lang, 'v': video_id, 'fmt': self._downloader.params.get('subtitlesformat'), - 'name': l[0], + 'name': l[0].encode('utf-8'), }) url = u'http://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url From aa2484e390d8a5e74d740fda61b4062a4a8c1d0e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 2 Nov 2013 11:21:36 +0100 Subject: [PATCH 032/132] release 2013.11.02 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e8eade7ad..75a46a2d5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.30' +__version__ = '2013.11.02' From 31366066bd18cfd32de901264f53f42fe96f55c2 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 2 Nov 2013 18:08:16 +0100 Subject: [PATCH 033/132] Add support for live parameter to rtmpdump --- youtube_dl/FileDownloader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 8ecabab1a..0804dfbe1 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -267,7 +267,7 @@ class FileDownloader(object): self.to_screen(u'\r%s[download] 100%% of %s in %s' % (clear_line, data_len_str, self.format_seconds(tot_time))) - def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): + def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live): self.report_destination(filename) tmpfilename = self.temp_name(filename) test = self.params.get('test', False) @@ -294,6 +294,8 @@ class FileDownloader(object): basic_args += ['--tcUrl', url] if test: basic_args += ['--stop', '1'] + if live: + basic_args += ['--live'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if self.params.get('verbose', False): try: @@ -411,7 +413,8 @@ class FileDownloader(object): info_dict.get('player_url', None), info_dict.get('page_url', None), info_dict.get('play_path', None), - info_dict.get('tc_url', None)) + info_dict.get('tc_url', None), + info_dict.get('live', False)) # Attempt to download using mplayer if url.startswith('mms') or url.startswith('rtsp'): From 0a43ddf3209e13f5e87b07c440e03a45deea3e57 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 2 Nov 2013 18:08:35 +0100 Subject: [PATCH 034/132] [CinemassacreIE] Add live paramter to extracted info as a workaround --- youtube_dl/extractor/cinemassacre.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 2fe1033f0..79d879ced 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -65,6 +65,7 @@ class CinemassacreIE(InfoExtractor): { 'url': url, 'play_path': 'mp4:' + sd_file, + 'live': True, # workaround 'ext': 'flv', 'format': 'sd', 'format_id': 'sd', @@ -72,6 +73,7 @@ class CinemassacreIE(InfoExtractor): { 'url': url, 'play_path': 'mp4:' + hd_file, + 'live': True, # workaround 'ext': 'flv', 'format': 'hd', 'format_id': 'hd', From 72a5b4f70216fe1a5b1c22be34653ae0ff81058a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Nov 2013 19:01:01 +0100 Subject: [PATCH 035/132] Add an extractor for bambuser.com (#1702) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bambuser.py | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 youtube_dl/extractor/bambuser.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bcf1cce7f..a1e35eb46 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,6 +9,7 @@ from .arte import ( ArteTVFutureIE, ) from .auengine import AUEngineIE +from .bambuser import BambuserIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py new file mode 100644 index 000000000..cf8da22e3 --- /dev/null +++ b/youtube_dl/extractor/bambuser.py @@ -0,0 +1,42 @@ +import re +import json + +from .common import InfoExtractor + + +class BambuserIE(InfoExtractor): + _VALID_URL = r'https?://bambuser\.com/v/(?P\d+)' + _API_KEY = '005f64509e19a868399060af746a00aa' + + _TEST = { + u'url': u'http://bambuser.com/v/4050584', + u'md5': u'fba8f7693e48fd4e8641b3fd5539a641', + u'info_dict': { + u'id': u'4050584', + u'ext': u'flv', + u'title': u'Education engineering days - lightning talks', + u'duration': 3741, + u'uploader': u'pixelversity', + u'uploader_id': u'344706', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = ('http://player-c.api.bambuser.com/getVideo.json?' + '&api_key=%s&vid=%s' % (self._API_KEY, video_id)) + info_json = self._download_webpage(info_url, video_id) + info = json.loads(info_json)['result'] + + return { + 'id': video_id, + 'title': info['title'], + 'url': info['url'], + 'thumbnail': info['preview'], + 'duration': int(info['length']), + 'view_count': int(info['views_total']), + 'uploader': info['username'], + 'uploader_id': info['uid'], + } + From 1f343eaabbb9e0daf67363b7737833cf5e2a3e16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Sat, 2 Nov 2013 18:01:05 +0100 Subject: [PATCH 036/132] [subtitles] refactor to support websites with subtitle information the webpage. I added the parameter webpage, so now it's similar to the way automatic captions are handled. This is an improvement needed for websites like TED. --- youtube_dl/extractor/dailymotion.py | 6 +++--- youtube_dl/extractor/subtitles.py | 12 ++++++------ youtube_dl/extractor/youtube.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7d8353946..3aef82bcf 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -113,9 +113,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): video_url = info[max_quality] # subtitles - video_subtitles = self.extract_subtitles(video_id) + video_subtitles = self.extract_subtitles(video_id, webpage) if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id) + self._list_available_subtitles(video_id, webpage) return return [{ @@ -129,7 +129,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): 'thumbnail': info['thumbnail_url'] }] - def _get_available_subtitles(self, video_id): + def _get_available_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 90de7de3a..4b4c5235d 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -12,9 +12,9 @@ class SubtitlesInfoExtractor(InfoExtractor): return any([self._downloader.params.get('writesubtitles', False), self._downloader.params.get('writeautomaticsub')]) - def _list_available_subtitles(self, video_id, webpage=None): + def _list_available_subtitles(self, video_id, webpage): """ outputs the available subtitles for the video """ - sub_lang_list = self._get_available_subtitles(video_id) + sub_lang_list = self._get_available_subtitles(video_id, webpage) auto_captions_list = self._get_available_automatic_caption(video_id, webpage) sub_lang = ",".join(list(sub_lang_list.keys())) self.to_screen(u'%s: Available subtitles for video: %s' % @@ -23,7 +23,7 @@ class SubtitlesInfoExtractor(InfoExtractor): self.to_screen(u'%s: Available automatic captions for video: %s' % (video_id, auto_lang)) - def extract_subtitles(self, video_id, video_webpage=None): + def extract_subtitles(self, video_id, webpage): """ returns {sub_lang: sub} ,{} if subtitles not found or None if the subtitles aren't requested. @@ -32,9 +32,9 @@ class SubtitlesInfoExtractor(InfoExtractor): return None available_subs_list = {} if self._downloader.params.get('writeautomaticsub', False): - available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) + available_subs_list.update(self._get_available_automatic_caption(video_id, webpage)) if self._downloader.params.get('writesubtitles', False): - available_subs_list.update(self._get_available_subtitles(video_id)) + available_subs_list.update(self._get_available_subtitles(video_id, webpage)) if not available_subs_list: # error, it didn't get the available subtitles return {} @@ -74,7 +74,7 @@ class SubtitlesInfoExtractor(InfoExtractor): return return sub - def _get_available_subtitles(self, video_id): + def _get_available_subtitles(self, video_id, webpage): """ returns {sub_lang: url} or {} if not available Must be redefined by the subclasses diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4347651d7..d7c9b38f9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1099,7 +1099,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _get_available_subtitles(self, video_id): + def _get_available_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, From a9a3876d55be943a7eaf505cbeb8fb862514db6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Sat, 2 Nov 2013 19:48:39 +0100 Subject: [PATCH 037/132] [ted] Added support for subtitle download --- test/test_ted_subtitles.py | 63 +++++++++++++++++++++++++++++++++++++ youtube_dl/extractor/ted.py | 28 ++++++++++++++--- 2 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 test/test_ted_subtitles.py diff --git a/test/test_ted_subtitles.py b/test/test_ted_subtitles.py new file mode 100644 index 000000000..3283253ab --- /dev/null +++ b/test/test_ted_subtitles.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python + +import sys +import unittest +import hashlib + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import TEDIE +from youtube_dl.utils import * +from helper import FakeYDL + +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() + +class TestTedSubtitles(unittest.TestCase): + def setUp(self): + self.DL = FakeYDL() + self.url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' + def getInfoDict(self): + IE = TEDIE(self.DL) + info_dict = IE.extract(self.url) + return info_dict + def getSubtitles(self): + info_dict = self.getInfoDict() + return info_dict[0]['subtitles'] + def test_no_writesubtitles(self): + subtitles = self.getSubtitles() + self.assertEqual(subtitles, None) + def test_subtitles(self): + self.DL.params['writesubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '2154f31ff9b9f89a0aa671537559c21d') + def test_subtitles_lang(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitleslangs'] = ['fr'] + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['fr']), '7616cbc6df20ec2c1204083c83871cf6') + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles.keys()), 28) + def test_list_subtitles(self): + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() + self.assertEqual(info_dict, [None]) + def test_automatic_captions(self): + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslang'] = ['en'] + subtitles = self.getSubtitles() + self.assertTrue(len(subtitles.keys()) == 0) + def test_multiple_langs(self): + self.DL.params['writesubtitles'] = True + langs = ['es', 'fr', 'de'] + self.DL.params['subtitleslangs'] = langs + subtitles = self.getSubtitles() + for lang in langs: + self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index dfa1176a3..239e2a448 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -1,10 +1,9 @@ import json import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor - -class TEDIE(InfoExtractor): +class TEDIE(SubtitlesInfoExtractor): _VALID_URL=r'''http://www\.ted\.com/ ( ((?Pplaylists)/(?P\d+)) # We have a playlist @@ -82,11 +81,21 @@ class TEDIE(InfoExtractor): 'url': stream['file'], 'format': stream['id'] } for stream in info['htmlStreams']] + + video_id = info['id'] + + # subtitles + video_subtitles = self.extract_subtitles(video_id, webpage) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, webpage) + return + info = { - 'id': info['id'], + 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'description': desc, + 'subtitles': video_subtitles, 'formats': formats, } @@ -94,3 +103,14 @@ class TEDIE(InfoExtractor): info.update(info['formats'][-1]) return info + + def _get_available_subtitles(self, video_id, webpage): + options = self._search_regex(r'(?:)', webpage, 'subtitles_language_select', flags=re.DOTALL) + languages = re.findall(r'(?: