release 2016.02.10

[YoutubeDL] Sanitize format_id (Closes #8494 )
[pbs] Switch to portal player by default (Closes #8491 )
2016-02-10 16:17:38 +01:00 · 2016-02-10 21:16:58 +06:00 · 2016-02-10 20:46:38 +06:00 · 2016-02-10 20:05:17 +06:00 · 2016-02-10 04:43:00 +06:00 · 2016-02-09 20:12:36 +01:00
9 changed files with 151 additions and 48 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -1288,6 +1288,9 @@ class YoutubeDL(object):

            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
+            else:
+                # Sanitize format_id from characters used in format selector expression
+                format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
            format_id = format['format_id']
            if format_id not in formats_dict:
                formats_dict[format_id] = []
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1186,6 +1186,7 @@ class InfoExtractor(object):
        http_count = 0
        m3u8_count = 0

+        src_urls = []
        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
@ -1222,6 +1223,9 @@ class InfoExtractor(object):
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+            if src_url in src_urls:
+                continue
+            src_urls.append(src_url)

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
@ -1267,11 +1271,13 @@ class InfoExtractor(object):
        return formats

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        urls = []
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
-            if not src:
+            if not src or src in urls:
                continue
+            urls.append(src)
            ext = textstream.get('ext') or determine_ext(src)
            if not ext:
                type_ = textstream.get('type')
@ -1434,6 +1440,8 @@ class InfoExtractor(object):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
+                        url_el = representation.find(_add_ns('BaseURL'))
+                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        f = {
                            'format_id': mpd_id or representation_id,
                            'url': base_url,
@ -1446,6 +1454,7 @@ class InfoExtractor(object):
                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
+                            'filesize': filesize,
                        }
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
--- a/youtube_dl/extractor/fox.py
+++ b/youtube_dl/extractor/fox.py
@ -9,6 +9,7 @@ class FOXIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.fox.com/watch/255180355939/7684182528',
+        'md5': 'ebd296fcc41dd4b19f8115d8461a3165',
        'info_dict': {
            'id': '255180355939',
            'ext': 'mp4',
@ -17,10 +18,6 @@ class FOXIE(InfoExtractor):
            'duration': 129,
        },
        'add_ie': ['ThePlatform'],
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
    }

    def _real_extract(self, url):
@ -29,7 +26,7 @@ class FOXIE(InfoExtractor):

        release_url = self._parse_json(self._search_regex(
            r'"fox_pdk_player"\s*:\s*({[^}]+?})', webpage, 'fox_pdk_player'),
-            video_id)['release_url'] + '&manifest=m3u'
+            video_id)['release_url'] + '&switch=http'

        return {
            '_type': 'url_transparent',
--- a/youtube_dl/extractor/hotstar.py
+++ b/youtube_dl/extractor/hotstar.py
@ -10,8 +10,8 @@ from ..utils import (


 class HotStarIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?hotstar\.com/.*?[/-](?P<id>\d{10})'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})'
+    _TESTS = [{
        'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273',
        'info_dict': {
            'id': '1000076273',
@ -26,7 +26,13 @@ class HotStarIE(InfoExtractor):
            # m3u8 download
            'skip_download': True,
        }
-    }
+    }, {
+        'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.hotstar.com/1000000515',
+        'only_matching': True,
+    }]

    _GET_CONTENT_TEMPLATE = 'http://account.hotstar.com/AVS/besc?action=GetAggregatedContentDetails&channel=PCTV&contentId=%s'
    _GET_CDN_TEMPLATE = 'http://getcdn.hotstar.com/AVS/besc?action=GetCDN&asJson=Y&channel=%s&id=%s&type=%s'
--- a/youtube_dl/extractor/mailru.py
+++ b/youtube_dl/extractor/mailru.py
@ -4,6 +4,10 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_end,
+)


 class MailRuIE(InfoExtractor):
@ -34,14 +38,30 @@ class MailRuIE(InfoExtractor):
                'id': '46843144_1263',
                'ext': 'mp4',
                'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',
-                'timestamp': 1397217632,
-                'upload_date': '20140411',
-                'uploader': 'hitech',
+                'timestamp': 1397039888,
+                'upload_date': '20140409',
+                'uploader': 'hitech@corp.mail.ru',
                'uploader_id': 'hitech@corp.mail.ru',
                'duration': 245,
            },
            'skip': 'Not accessible from Travis CI server',
        },
+        {
+            # only available via metaUrl API
+            'url': 'http://my.mail.ru/mail/720pizle/video/_myvideo/502.html',
+            'md5': '3b26d2491c6949d031a32b96bd97c096',
+            'info_dict': {
+                'id': '56664382_502',
+                'ext': 'mp4',
+                'title': ':8336',
+                'timestamp': 1449094163,
+                'upload_date': '20151202',
+                'uploader': '720pizle@mail.ru',
+                'uploader_id': '720pizle@mail.ru',
+                'duration': 6001,
+            },
+            'skip': 'Not accessible from Travis CI server',
+        }
    ]

    def _real_extract(self, url):
@ -51,32 +71,55 @@ class MailRuIE(InfoExtractor):
        if not video_id:
            video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')

-        video_data = self._download_json(
-            'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON')
+        webpage = self._download_webpage(url, video_id)

-        author = video_data['author']
-        uploader = author['name']
-        uploader_id = author.get('id') or author.get('email')
-        view_count = video_data.get('views_count')
+        video_data = None
+
+        page_config = self._parse_json(self._search_regex(
+            r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>',
+            webpage, 'page config', default='{}'), video_id, fatal=False)
+        if page_config:
+            meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl')
+            if meta_url:
+                video_data = self._download_json(
+                    meta_url, video_id, 'Downloading video meta JSON', fatal=False)
+
+        # Fallback old approach
+        if not video_data:
+            video_data = self._download_json(
+                'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
+                video_id, 'Downloading video JSON')
+
+        formats = []
+        for f in video_data['videos']:
+            video_url = f.get('url')
+            if not video_url:
+                continue
+            format_id = f.get('key')
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'height': height,
+            })
+        self._sort_formats(formats)

        meta_data = video_data['meta']
-        content_id = '%s_%s' % (
-            meta_data.get('accId', ''), meta_data['itemId'])
-        title = meta_data['title']
-        if title.endswith('.mp4'):
-            title = title[:-4]
-        thumbnail = meta_data['poster']
-        duration = meta_data['duration']
-        timestamp = meta_data['timestamp']
+        title = remove_end(meta_data['title'], '.mp4')

-        formats = [
-            {
-                'url': video['url'],
-                'format_id': video['key'],
-                'height': int(video['key'].rstrip('p'))
-            } for video in video_data['videos']
-        ]
-        self._sort_formats(formats)
+        author = video_data.get('author')
+        uploader = author.get('name')
+        uploader_id = author.get('id') or author.get('email')
+        view_count = int_or_none(video_data.get('viewsCount') or video_data.get('views_count'))
+
+        acc_id = meta_data.get('accId')
+        item_id = meta_data.get('itemId')
+        content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id
+
+        thumbnail = meta_data.get('poster')
+        duration = int_or_none(meta_data.get('duration'))
+        timestamp = int_or_none(meta_data.get('timestamp'))

        return {
            'id': content_id,
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@ -8,6 +8,7 @@ from ..utils import (
    ExtractorError,
    determine_ext,
    int_or_none,
+    js_to_json,
    strip_jsonp,
    unified_strdate,
    US_RATINGS,
@ -432,9 +433,20 @@ class PBSIE(InfoExtractor):
                for vid_id in video_id]
            return self.playlist_result(entries, display_id)

+        player = self._download_webpage(
+            'http://player.pbs.org/portalplayer/%s' % video_id, display_id)
+
+        info = self._parse_json(
+            self._search_regex(
+                r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
+                player, 'video data', default='{}'),
+            display_id, transform_source=js_to_json, fatal=False)
+
+        # Fallback to old videoInfo API
+        if not info:
            info = self._download_json(
                'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
-            display_id)
+                display_id, 'Downloading video info JSON')

        formats = []
        for encoding_name in ('recommended_encoding', 'alternate_encoding'):
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@ -1,6 +1,10 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urlparse,
+)
 from ..utils import (
    float_or_none,
    int_or_none,
@ -12,10 +16,10 @@ class ViddlerIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
    _TESTS = [{
        'url': 'http://www.viddler.com/v/43903784',
-        'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4',
+        'md5': '9eee21161d2c7f5b39690c3e325fab2f',
        'info_dict': {
            'id': '43903784',
-            'ext': 'mp4',
+            'ext': 'mov',
            'title': 'Video Made Easy',
            'description': 'md5:6a697ebd844ff3093bd2e82c37b409cd',
            'uploader': 'viddler',
@ -29,10 +33,10 @@ class ViddlerIE(InfoExtractor):
        }
    }, {
        'url': 'http://www.viddler.com/v/4d03aad9/',
-        'md5': 'faa71fbf70c0bee7ab93076fd007f4b0',
+        'md5': 'f12c5a7fa839c47a79363bfdf69404fb',
        'info_dict': {
            'id': '4d03aad9',
-            'ext': 'mp4',
+            'ext': 'ts',
            'title': 'WALL-TO-GORTAT',
            'upload_date': '20150126',
            'uploader': 'deadspin',
@ -42,10 +46,10 @@ class ViddlerIE(InfoExtractor):
        }
    }, {
        'url': 'http://www.viddler.com/player/221ebbbd/0/',
-        'md5': '0defa2bd0ea613d14a6e9bd1db6be326',
+        'md5': '740511f61d3d1bb71dc14a0fe01a1c10',
        'info_dict': {
            'id': '221ebbbd',
-            'ext': 'mp4',
+            'ext': 'mov',
            'title': 'LETeens-Grammar-snack-third-conditional',
            'description': ' ',
            'upload_date': '20140929',
@ -54,16 +58,42 @@ class ViddlerIE(InfoExtractor):
            'view_count': int,
            'comment_count': int,
        }
+    }, {
+        # secret protected
+        'url': 'http://www.viddler.com/v/890c0985?secret=34051570',
+        'info_dict': {
+            'id': '890c0985',
+            'ext': 'mp4',
+            'title': 'Complete Property Training - Traineeships',
+            'description': ' ',
+            'upload_date': '20130606',
+            'uploader': 'TiffanyBowtell',
+            'timestamp': 1370496993,
+            'view_count': int,
+            'comment_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

-        json_url = (
-            'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' %
-            video_id)
+        query = {
+            'video_id': video_id,
+            'key': 'v0vhrt7bg2xq1vyxhkct',
+        }
+
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        secret = qs.get('secret', [None])[0]
+        if secret:
+            query['secret'] = secret
+
        headers = {'Referer': 'http://static.cdn-ec.viddler.com/js/arpeggio/v2/embed.html'}
-        request = sanitized_Request(json_url, None, headers)
+        request = sanitized_Request(
+            'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?%s'
+            % compat_urllib_parse.urlencode(query), None, headers)
        data = self._download_json(request, video_id)['video']

        formats = []
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@ -229,6 +229,9 @@ class YoukuIE(InfoExtractor):
            if error_note is not None and '因版权原因无法观看此视频' in error_note:
                raise ExtractorError(
                    'Youku said: Sorry, this video is available in China only', expected=True)
+            elif error_note and '该视频被设为私密' in error_note:
+                raise ExtractorError(
+                    'Youku said: Sorry, this video is private', expected=True)
            else:
                msg = 'Youku server reported error %i' % error.get('code')
                if error_note is not None:
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2016.02.09'
+__version__ = '2016.02.10'
Author	SHA1	Message	Date
Philipp Hagemeister	f817d9bec1	release 2016.02.10	2016-02-10 16:17:38 +01:00
Sergey M․	e2effb08a4	[YoutubeDL] Sanitize format_id (Closes #8494 )	2016-02-10 21:16:58 +06:00
Sergey M․	7fcea295c5	[pbs] Switch to portal player by default (Closes #8491 )	2016-02-10 20:46:38 +06:00
Sergey M․	cc799437ea	[youku] Report private videos (Closes #8498 )	2016-02-10 20:05:17 +06:00
Sergey M․	89d23f37f2	[hotstar] Relax _VALID_URL (Closes #8487 )	2016-02-10 04:43:00 +06:00
Philipp Hagemeister	b92071ef00	release 2016.02.09.1	2016-02-09 20:12:36 +01:00
Sergey M․	47246ae26c	[viddler] Update tests	2016-02-10 01:12:47 +06:00
Sergey M․	9c15869c28	[viddler] Add support for secret videos (Closes #8481 )	2016-02-10 01:09:07 +06:00
remitamine	51e9094f4a	[extractor/common] extract youtube dash formats filesize (fixes #8480)	2016-02-09 20:05:39 +01:00
remitamine	5e3a6fec33	[fox] update test	2016-02-09 17:30:42 +01:00
remitamine	d413095f7e	[extractor/common] remove duplicated formats and subtitles in smil manifests	2016-02-09 17:15:41 +01:00
remitamine	1bedf4de06	[fox] extract http formats	2016-02-09 17:12:34 +01:00
Sergey M․	3967a761f4	[mailru] Fix tests	2016-02-09 21:31:51 +06:00
Sergey M․	b081350bd9	[mailru] Improve and modernize	2016-02-09 21:30:48 +06:00
Sergey M․	16f1430ba6	[mailru] Prefer metaUrl API (Closes #8474 )	2016-02-09 21:14:02 +06:00