Compare commits


9 Commits

10 changed files with 291 additions and 106 deletions

test/test_utils.py

@@ -127,6 +127,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_strdate('8/7/2009'), '20090708')
         self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
         self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
+        self.assertEqual(unified_strdate('1968-12-10'), '19681210')

     def test_find_xpath_attr(self):
         testxml = u'''<root>

youtube_dl/extractor/__init__.py

@@ -115,6 +115,7 @@ from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
 from .keek import KeekIE
+from .kontrtube import KontrTubeIE
 from .la7 import LA7IE
 from .lifenews import LifeNewsIE
 from .liveleak import LiveLeakIE

youtube_dl/extractor/bbc.py

@@ -2,29 +2,160 @@ from __future__ import unicode_literals

 import re

-from .common import InfoExtractor
+from .subtitles import SubtitlesInfoExtractor
 from ..utils import ExtractorError


-class BBCCoUkIE(InfoExtractor):
+class BBCCoUkIE(SubtitlesInfoExtractor):
     IE_NAME = 'bbc.co.uk'
-    IE_DESC = 'BBC - iPlayer Radio'
+    IE_DESC = 'BBC iPlayer'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})'

-    _TEST = {
-        'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
-        'info_dict': {
-            'id': 'p01q7wz4',
-            'ext': 'flv',
-            'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
-            'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
-            'duration': 1936,
-        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
-    }
+    _TESTS = [
+        {
+            'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
+            'info_dict': {
+                'id': 'p01q7wz4',
+                'ext': 'flv',
+                'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
+                'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
+                'duration': 1936,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
+            'info_dict': {
+                'id': 'b00yng1d',
+                'ext': 'flv',
+                'title': 'The Man in Black: Series 3: The Printed Name',
+                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
+                'duration': 1800,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
+            'info_dict': {
+                'id': 'b00yng1d',
+                'ext': 'flv',
+                'title': 'The Voice UK: Series 3: Blind Auditions 5',
+                'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
+                'duration': 5100,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+        }
+    ]
+
+    def _extract_asx_playlist(self, connection, programme_id):
+        asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
+        return [ref.get('href') for ref in asx.findall('./Entry/ref')]
+
+    def _extract_connection(self, connection, programme_id):
+        formats = []
+        protocol = connection.get('protocol')
+        supplier = connection.get('supplier')
+        if protocol == 'http':
+            href = connection.get('href')
+            # ASX playlist
+            if supplier == 'asx':
+                for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
+                    formats.append({
+                        'url': ref,
+                        'format_id': 'ref%s_%s' % (i, supplier),
+                    })
+            # Direct link
+            else:
+                formats.append({
+                    'url': href,
+                    'format_id': supplier,
+                })
+        elif protocol == 'rtmp':
+            application = connection.get('application', 'ondemand')
+            auth_string = connection.get('authString')
+            identifier = connection.get('identifier')
+            server = connection.get('server')
+            formats.append({
+                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
+                'play_path': identifier,
+                'app': '%s?%s' % (application, auth_string),
+                'page_url': 'http://www.bbc.co.uk',
+                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
+                'rtmp_live': False,
+                'ext': 'flv',
+                'format_id': supplier,
+            })
+        return formats
+
+    def _extract_items(self, playlist):
+        return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
+
+    def _extract_medias(self, media_selection):
+        return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
+
+    def _extract_connections(self, media):
+        return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
+
+    def _extract_video(self, media, programme_id):
+        formats = []
+        vbr = int(media.get('bitrate'))
+        vcodec = media.get('encoding')
+        service = media.get('service')
+        width = int(media.get('width'))
+        height = int(media.get('height'))
+        file_size = int(media.get('media_file_size'))
+        for connection in self._extract_connections(media):
+            conn_formats = self._extract_connection(connection, programme_id)
+            for format in conn_formats:
+                format.update({
+                    'format_id': '%s_%s' % (service, format['format_id']),
+                    'width': width,
+                    'height': height,
+                    'vbr': vbr,
+                    'vcodec': vcodec,
+                    'filesize': file_size,
+                })
+            formats.extend(conn_formats)
+        return formats
+
+    def _extract_audio(self, media, programme_id):
+        formats = []
+        abr = int(media.get('bitrate'))
+        acodec = media.get('encoding')
+        service = media.get('service')
+        for connection in self._extract_connections(media):
+            conn_formats = self._extract_connection(connection, programme_id)
+            for format in conn_formats:
+                format.update({
+                    'format_id': '%s_%s' % (service, format['format_id']),
+                    'abr': abr,
+                    'acodec': acodec,
+                })
+            formats.extend(conn_formats)
+        return formats
+
+    def _extract_captions(self, media, programme_id):
+        subtitles = {}
+        for connection in self._extract_connections(media):
+            captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
+            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
+            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
+            srt = ''
+            for pos, p in enumerate(ps):
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
+                                                          p.text.strip() if p.text is not None else '')
+            subtitles[lang] = srt
+        return subtitles

     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -33,84 +164,54 @@ class BBCCoUkIE(InfoExtractor):
         playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
                                       'Downloading playlist XML')

-        item = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}item')
-        if item is None:
-            no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
-            if no_items is not None:
-                reason = no_items.get('reason')
-                if reason == 'preAvailability':
-                    msg = 'Episode %s is not yet available' % group_id
-                elif reason == 'postAvailability':
-                    msg = 'Episode %s is no longer available' % group_id
-                else:
-                    msg = 'Episode %s is not available: %s' % (group_id, reason)
-                raise ExtractorError(msg, expected=True)
-            raise ExtractorError('Failed to extract media for episode %s' % group_id, expected=True)
-
-        title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
-        description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
-
-        radio_programme_id = item.get('identifier')
-        duration = int(item.get('duration'))
-
-        media_selection = self._download_xml(
-            'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % radio_programme_id,
-            radio_programme_id, 'Downloading media selection XML')
+        no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
+        if no_items is not None:
+            reason = no_items.get('reason')
+            if reason == 'preAvailability':
+                msg = 'Episode %s is not yet available' % group_id
+            elif reason == 'postAvailability':
+                msg = 'Episode %s is no longer available' % group_id
+            else:
+                msg = 'Episode %s is not available: %s' % (group_id, reason)
+            raise ExtractorError(msg, expected=True)

         formats = []
-        for media in media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media'):
-            bitrate = int(media.get('bitrate'))
-            encoding = media.get('encoding')
-            service = media.get('service')
-            connection = media.find('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
-            protocol = connection.get('protocol')
-            priority = connection.get('priority')
-            supplier = connection.get('supplier')
-            if protocol == 'http':
-                href = connection.get('href')
-                # ASX playlist
-                if supplier == 'asx':
-                    asx = self._download_xml(href, radio_programme_id, 'Downloading %s ASX playlist' % service)
-                    for i, ref in enumerate(asx.findall('./Entry/ref')):
-                        formats.append({
-                            'url': ref.get('href'),
-                            'format_id': '%s_ref%s' % (service, i),
-                            'abr': bitrate,
-                            'acodec': encoding,
-                            'preference': priority,
-                        })
-                    continue
-                # Direct link
-                formats.append({
-                    'url': href,
-                    'format_id': service,
-                    'abr': bitrate,
-                    'acodec': encoding,
-                    'preference': priority,
-                })
-            elif protocol == 'rtmp':
-                application = connection.get('application', 'ondemand')
-                auth_string = connection.get('authString')
-                identifier = connection.get('identifier')
-                server = connection.get('server')
-                formats.append({
-                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
-                    'play_path': identifier,
-                    'app': '%s?%s' % (application, auth_string),
-                    'rtmp_live': False,
-                    'ext': 'flv',
-                    'format_id': service,
-                    'abr': bitrate,
-                    'acodec': encoding,
-                    'preference': priority,
-                })
+        subtitles = None
+
+        for item in self._extract_items(playlist):
+            kind = item.get('kind')
+            if kind != 'programme' and kind != 'radioProgramme':
+                continue
+
+            title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
+            description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
+
+            programme_id = item.get('identifier')
+            duration = int(item.get('duration'))
+
+            media_selection = self._download_xml(
+                'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
+                programme_id, 'Downloading media selection XML')
+
+            for media in self._extract_medias(media_selection):
+                kind = media.get('kind')
+                if kind == 'audio':
+                    formats.extend(self._extract_audio(media, programme_id))
+                elif kind == 'video':
+                    formats.extend(self._extract_video(media, programme_id))
+                elif kind == 'captions':
+                    subtitles = self._extract_captions(media, programme_id)
+
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(programme_id, subtitles)
+            return

         self._sort_formats(formats)

         return {
-            'id': radio_programme_id,
+            'id': programme_id,
             'title': title,
             'description': description,
             'duration': duration,
             'formats': formats,
+            'subtitles': subtitles,
         }
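Note: the new `_extract_captions` converts the BBC's TTML (`ttaf1` namespace) captions document into SRT by hand. A minimal standalone sketch of the same conversion, using an inline sample document instead of a real mediaselector response (namespace and element layout are taken from the code above; the zero-based cue numbering and dotted timestamps mirror the extractor rather than strict SRT):

    import xml.etree.ElementTree as ET

    TTAF = '{http://www.w3.org/2006/10/ttaf1}'
    SAMPLE = (
        '<tt xmlns="http://www.w3.org/2006/10/ttaf1"><body><div>'
        '<p begin="00:00:01.000" end="00:00:02.500">Hello</p>'
        '<p begin="00:00:03.000" end="00:00:04.000">World</p>'
        '</div></body></tt>'
    )

    captions = ET.fromstring(SAMPLE)
    srt = ''
    for pos, p in enumerate(captions.findall('./{0}body/{0}div/{0}p'.format(TTAF))):
        # One SRT cue per <p>: index, "begin --> end" timing line, then the text
        srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (
            pos, p.get('begin'), p.get('end'),
            p.text.strip() if p.text is not None else '')
    print(srt)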

youtube_dl/extractor/bloomberg.py

@@ -24,5 +24,7 @@ class BloombergIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         name = mobj.group('name')
         webpage = self._download_webpage(url, name)
-        ooyala_url = self._twitter_search_player(webpage)
-        return self.url_result(ooyala_url, OoyalaIE.ie_key())
+        embed_code = self._search_regex(
+            r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
+            'embed code')
+        return OoyalaIE._build_url_result(embed_code)
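Note: instead of resolving the Ooyala URL via the Twitter player card, the extractor now reads the embed code straight out of a `<source src=...>` URL, capturing the third path segment after the host. A quick illustration against a made-up URL of that shape (the real Bloomberg page markup may differ):

    import re

    # Hypothetical <source> tag shaped the way the new regex expects
    webpage = '<source src="http://ooyalahd2-f.akamaihd.net/hds/ipad/RxZG5qWTrxd0SrEagg6QIM/manifest.f4m" />'
    embed_code = re.search(
        r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage).group(1)
    print(embed_code)  # RxZG5qWTrxd0SrEagg6QIM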

youtube_dl/extractor/common.py

@@ -271,8 +271,11 @@ class InfoExtractor(object):

     def _download_json(self, url_or_request, video_id,
                        note=u'Downloading JSON metadata',
-                       errnote=u'Unable to download JSON metadata'):
+                       errnote=u'Unable to download JSON metadata',
+                       transform_source=None):
         json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        if transform_source:
+            json_string = transform_source(json_string)
         try:
             return json.loads(json_string)
         except ValueError as ve:
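Note: `transform_source` gives callers a hook to repair a response body before it reaches `json.loads`; the YouTube channel change below passes `uppercase_escape` through it. A self-contained sketch of the pattern, with a stub `fetch` standing in for `_download_webpage`:

    import json

    def download_json(fetch, url, transform_source=None):
        json_string = fetch(url)  # fetch() stands in for _download_webpage()
        if transform_source:
            # Let the caller fix up the raw body before parsing
            json_string = transform_source(json_string)
        return json.loads(json_string)

    # '\U0001F600' is not a valid JSON escape, so parsing the raw body would
    # raise ValueError; a transform_source can rewrite it first.
    body = '{"smiley": "\\U0001F600"}'
    print(download_json(lambda url: body, 'http://example.com/api',
                        transform_source=lambda s: s.replace('\\U0001F600', ':D')))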

youtube_dl/extractor/kontrtube.py

@@ -0,0 +1,66 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class KontrTubeIE(InfoExtractor):
+    IE_NAME = 'kontrtube'
+    IE_DESC = 'KontrTube.ru - Труба зовёт'
+    _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+'
+
+    _TEST = {
+        'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
+        'md5': '975a991a4926c9a85f383a736a2e6b80',
+        'info_dict': {
+            'id': '2678',
+            'ext': 'mp4',
+            'title': 'Над олимпийской деревней в Сочи поднят российский флаг',
+            'description': 'md5:80edc4c613d5887ae8ccf1d59432be41',
+            'thumbnail': 'http://www.kontrtube.ru/contents/videos_screenshots/2000/2678/preview.mp4.jpg',
+            'duration': 270,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id, 'Downloading page')
+
+        video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
+        thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
+        title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
+            'video title')
+        description = self._html_search_meta('description', webpage, 'video description')
+
+        mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
+            webpage)
+        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+
+        view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
+            'view count', fatal=False)
+        view_count = int(view_count) if view_count is not None else None
+
+        comment_count = None
+        comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
+            fatal=False)
+        if comment_str.startswith('комментариев нет'):
+            comment_count = 0
+        else:
+            mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
+            if mobj:
+                comment_count = int(mobj.group('total'))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+        }
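Note: KontrTube publishes the duration as a Russian "Xм:Yс" (minutes:seconds) label, which the new extractor converts to seconds. The parsing step in isolation:

    import re

    # 'Длительность: 4м:30с' means 'Duration: 4m:30s'
    mobj = re.search(r'(?P<minutes>\d+)м:(?P<seconds>\d+)с', 'Длительность: 4м:30с')
    duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
    print(duration)  # 270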

youtube_dl/extractor/slideshare.py

@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 import json

@@ -12,11 +14,12 @@ class SlideshareIE(InfoExtractor):
     _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'

     _TEST = {
-        u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
-        u'file': u'25665706.mp4',
-        u'info_dict': {
-            u'title': u'Managing Scale and Complexity',
-            u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix',
+        'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
+        'info_dict': {
+            'id': '25665706',
+            'ext': 'mp4',
+            'title': 'Managing Scale and Complexity',
+            'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
         },
     }

@@ -26,15 +29,17 @@ class SlideshareIE(InfoExtractor):
         webpage = self._download_webpage(url, page_title)
         slideshare_obj = self._search_regex(
             r'var slideshare_object = ({.*?}); var user_info =',
-            webpage, u'slideshare object')
+            webpage, 'slideshare object')
         info = json.loads(slideshare_obj)
-        if info['slideshow']['type'] != u'video':
-            raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
+        if info['slideshow']['type'] != 'video':
+            raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)

         doc = info['doc']
         bucket = info['jsplayer']['video_bucket']
         ext = info['jsplayer']['video_extension']
         video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
+        description = self._html_search_regex(
+            r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description')

         return {
             '_type': 'video',
@@ -43,5 +48,5 @@ class SlideshareIE(InfoExtractor):
             'ext': ext,
             'url': video_url,
             'thumbnail': info['slideshow']['pin_image_url'],
-            'description': self._og_search_description(webpage),
+            'description': description,
         }

youtube_dl/extractor/youtube.py

@@ -34,6 +34,7 @@ from ..utils import (
     unified_strdate,
     orderedSet,
     write_json_file,
+    uppercase_escape,
 )

 class YoutubeBaseInfoExtractor(InfoExtractor):
@@ -136,7 +137,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                      (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                      (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                         (?:www\.)?deturl\.com/www\.youtube\.com/|
-                        (?:www\.)?pwnyoutube\.com|
+                        (?:www\.)?pwnyoutube\.com/|
                         tube\.majestyc\.net/|
                         youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                      (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
@@ -1590,10 +1591,9 @@ class YoutubeChannelIE(InfoExtractor):
         # Download all channel pages using the json-based channel_ajax query
         for pagenum in itertools.count(1):
             url = self._MORE_PAGES_URL % (pagenum, channel_id)
-            page = self._download_webpage(url, channel_id,
-                                          u'Downloading page #%s' % pagenum)
-
-            page = json.loads(page)
+            page = self._download_json(
+                url, channel_id, note=u'Downloading page #%s' % pagenum,
+                transform_source=uppercase_escape)

             ids_in_page = self.extract_videos_from_page(page['content_html'])
             video_ids.extend(ids_in_page)

youtube_dl/utils.py

@@ -756,9 +756,9 @@ def unified_strdate(date_str):
     """Return a string with the date in the format YYYYMMDD"""
     upload_date = None
     #Replace commas
-    date_str = date_str.replace(',',' ')
+    date_str = date_str.replace(',', ' ')
     # %z (UTC offset) is only supported in python>=3.2
-    date_str = re.sub(r' ?(\+|-)[0-9:]*$', '', date_str)
+    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
     format_expressions = [
         '%d %B %Y',
         '%B %d %Y',
@@ -1214,3 +1214,9 @@ class PagedList(object):
             if end == nextfirstid:
                 break
         return res
+
+
+def uppercase_escape(s):
+    return re.sub(
+        r'\\U([0-9a-fA-F]{8})',
+        lambda m: compat_chr(int(m.group(1), base=16)), s)
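Note: two behavioural points in the utils changes, shown in isolation. The tightened regex strips only a real UTC offset (" +0000" or " +00:00"), so ISO dates such as 1968-12-10 now survive, which is exactly what the new test_utils assertion checks; and `uppercase_escape` decodes the non-standard `\UXXXXXXXX` escapes that the YouTube channel AJAX response needs fixed (built-in `chr` stands in for youtube-dl's `compat_chr` here):

    import re

    def uppercase_escape(s):
        # chr stands in for compat_chr (the Python 2/3 shim used in utils.py)
        return re.sub(
            r'\\U([0-9a-fA-F]{8})',
            lambda m: chr(int(m.group(1), base=16)), s)

    print(uppercase_escape('\\U0001F600'))  # prints the grinning-face character

    # The old pattern r' ?(\+|-)[0-9:]*$' also swallowed the '-10' in '1968-12-10';
    # the new one matches only a plausible trailing offset.
    for date in ('2012/10/11 01:56:38 +0000', '1968-12-10'):
        print(re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date))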

youtube_dl/version.py

@@ -1,2 +1,2 @@

-__version__ = '2014.02.08.2'
+__version__ = '2014.02.10'