Compare commits


14 Commits

Author SHA1 Message Date
a1ab553858 release 2013.09.10 2013-09-10 11:25:11 +02:00
07463ea162 Add an extractor for Slideshare (closes #1400) 2013-09-10 11:19:58 +02:00
6d2d21f713 [sohu] add support for my.tv.sohu.com urls (fixes #1398) 2013-09-09 19:56:16 +02:00
061b2889a9 Fix the minutes part in FileDownloader.format_seconds (fixed #1397)
It printed the result of (seconds // 60) for the minutes part (see the sketch below)
2013-09-09 10:38:54 +02:00
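A minimal sketch of the bug and the fix (the exact output format string is assumed here; it is not shown in the diff further down):

def format_seconds_old(seconds):
    (mins, secs) = divmod(seconds, 60)
    (hours, eta_mins) = divmod(mins, 60)   # eta_mins was never used
    return '%02d:%02d:%02d' % (hours, mins, secs)  # assumed output format

def format_seconds_fixed(seconds):
    (mins, secs) = divmod(seconds, 60)
    (hours, mins) = divmod(mins, 60)
    return '%02d:%02d:%02d' % (hours, mins, secs)

print(format_seconds_old(3725))    # '01:62:05' -- the minutes slot got seconds // 60
print(format_seconds_fixed(3725))  # '01:02:05'
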
8963d9c266 [youtube] Modify the regex to match ids of length 11 (fixes #1396)
In urls like http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930 the query string can't be split reliably, and video ids always have that length (see the sketch below).
2013-09-09 10:33:12 +02:00
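A standalone illustration of why the fixed length matters (this is not the full _VALID_URL, just the id group):

import re

broken_url = 'http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930'
# An unbounded ([0-9A-Za-z_-]+) would swallow the trailing 'sharePLED17F32AD9753930'
# junk; ids are always exactly 11 characters, so the bounded group stops in time.
video_id = re.search(r'v=([0-9A-Za-z_-]{11})', broken_url).group(1)
assert video_id == 'BaW_jenozKc'
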
890f62e868 Revert "[youtube] Fix detection of tags from HLS videos."
They have undone the change

This reverts commit 0638ad9999.
2013-09-08 18:50:07 +02:00
8f362589a5 release 2013.09.07 2013-09-07 22:29:15 +02:00
a27a2470cd Merge branch 'master' of github.com:rg3/youtube-dl 2013-09-07 22:28:54 +02:00
a7130543fa [generic] If the url doesn't specify the protocol, try to extract it after prepending 'http://' 2013-09-06 18:39:35 +02:00
a490fda746 [dailymotion] accept embed urls (fixes #1386) 2013-09-06 18:36:07 +02:00
7e77275293 Add an extractor for Metacritic 2013-09-06 18:08:07 +02:00
e3ea479087 [youtube] Fix some issues with the detection of playlist/channel urls (reported in #1374)
They were being caught by YoutubeUserIE; it now only extracts a url if none of the other extractors are suitable (see the sketch below).
The url tests now check that each url can only be extracted by one specific extractor.
2013-09-06 16:24:24 +02:00
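The idea, sketched with made-up classes (the real code iterates over every *IE class in the module, as the youtube.py diff below shows):

# Hypothetical sketch: a catch-all extractor only claims a url when no
# more specific extractor is suitable for it.
class PlaylistLikeIE(object):
    @staticmethod
    def suitable(url):
        return 'youtube.com/playlist' in url

class UserLikeIE(object):
    @staticmethod
    def suitable(url):
        if PlaylistLikeIE.suitable(url):
            return False
        return 'youtube.com/' in url

assert not UserLikeIE.suitable('https://www.youtube.com/playlist?list=PL123')
assert UserLikeIE.suitable('https://www.youtube.com/SomeUser')
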
faab1d3836 [youtube] Fix detection of feeds urls (fixes #1294)
Urls like https://www.youtube.com/feed/watch_later were being detected as users (and, before the last changes to YoutubeUserIE, as videos)
2013-09-06 14:45:49 +02:00
8851a574a3 Fix add-versions 2013-09-06 11:07:34 +02:00
11 changed files with 164 additions and 30 deletions

View File

@@ -3,7 +3,8 @@
import json
import sys
import hashlib
import urllib.request
import os.path
if len(sys.argv) <= 1:
print('Specify the version number as parameter')
@@ -25,6 +26,7 @@ filenames = {
'tar': 'youtube-dl-%s.tar.gz' % version}
build_dir = os.path.join('..', '..', 'build', version)
for key, filename in filenames.items():
url = 'https://yt-dl.org/downloads/%s/%s' % (version, filename)
fn = os.path.join(build_dir, filename)
with open(fn, 'rb') as f:
data = f.read()

View File

@@ -21,14 +21,15 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertEqual(self.matching_ies(url), ie_list)
def test_youtube_playlist_matching(self):
self.assertTrue(YoutubePlaylistIE.suitable(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8'))
self.assertTrue(YoutubePlaylistIE.suitable(u'UUBABnxM4Ar9ten8Mdjj1j0Q')) #585
self.assertTrue(YoutubePlaylistIE.suitable(u'PL63F0C78739B09958'))
self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q'))
self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8'))
self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC'))
self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
self.assertFalse(YoutubePlaylistIE.suitable(u'PLtS2H6bU1M'))
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
assertPlaylist(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist(u'UUBABnxM4Ar9ten8Mdjj1j0Q') #585
assertPlaylist(u'PL63F0C78739B09958')
assertPlaylist(u'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist(u'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M'))
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))
@@ -37,13 +38,23 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
def test_youtube_channel_matching(self):
self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM'))
self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec'))
self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM/videos'))
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
def test_youtube_user_matching(self):
self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
def test_youtube_feeds(self):
self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
def test_youtube_show_matching(self):
self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
def test_justin_tv_channelid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
@@ -61,10 +72,13 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361"))
def test_youtube_extract(self):
self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')
self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')
self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc'), 'BaW_jenozKc')
self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch_popup?v=BaW_jenozKc'), 'BaW_jenozKc')
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE()._extract_id(url), id)
assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc')
assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc')
assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc')
assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc')
assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc')
assertExtractId('BaW_jenozKc', 'BaW_jenozKc')
def test_no_duplicates(self):
ies = gen_extractors()

View File

@@ -66,7 +66,7 @@ class FileDownloader(object):
@staticmethod
def format_seconds(seconds):
(mins, secs) = divmod(seconds, 60)
(hours, eta_mins) = divmod(mins, 60)
(hours, mins) = divmod(mins, 60)
if hours > 99:
return '--:--:--'
if hours == 0:

View File

@@ -52,6 +52,7 @@ from .keek import KeekIE
from .liveleak import LiveLeakIE
from .livestream import LivestreamIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mit import TechTVMITIE, MITIE
from .mixcloud import MixcloudIE
from .mtv import MTVIE
@@ -74,6 +75,7 @@ from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE
from .sina import SinaIE
from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE
from .spiegel import SpiegelIE

View File

@@ -14,7 +14,7 @@ from ..utils import (
class DailymotionIE(InfoExtractor):
"""Information Extractor for Dailymotion"""
_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
IE_NAME = u'dailymotion'
_TEST = {
u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
@@ -33,6 +33,7 @@ class DailymotionIE(InfoExtractor):
video_id = mobj.group(1).split('_')[0].split('?')[0]
video_extension = 'mp4'
url = 'http://www.dailymotion.com/video/%s' % video_id
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url)

View File

@@ -109,6 +109,11 @@ class GenericIE(InfoExtractor):
return new_url
def _real_extract(self, url):
parsed_url = compat_urlparse.urlparse(url)
if not parsed_url.scheme:
self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url)
try:
new_url = self._test_redirect(url)
if new_url:
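The scheme check relies on urlparse returning an empty scheme for protocol-less input; a quick illustration using the standard library module behind compat_urlparse:

from urllib.parse import urlparse

# A url without an explicit protocol parses with an empty scheme,
# which is what triggers the 'http://' fallback above.
assert urlparse('example.com/watch/1234').scheme == ''
assert urlparse('http://example.com/watch/1234').scheme == 'http'
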

View File

@@ -0,0 +1,55 @@
import re
import xml.etree.ElementTree
import operator
from .common import InfoExtractor
class MetacriticIE(InfoExtractor):
_VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
u'file': u'3698222.mp4',
u'info_dict': {
u'title': u'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors',
u'description': u'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
u'duration': 221,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
# The xml is not well-formed: it contains raw '&' characters
info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
video_id, u'Downloading info xml').replace('&', '&amp;')
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = []
for videoFile in clip.findall('httpURI/videoFile'):
rate_str = videoFile.find('rate').text
video_url = videoFile.find('filePath').text
formats.append({
'url': video_url,
'ext': 'mp4',
'format_id': rate_str,
'rate': int(rate_str),
})
formats.sort(key=operator.itemgetter('rate'))
description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
webpage, u'description', flags=re.DOTALL)
info = {
'id': video_id,
'title': clip.find('title').text,
'formats': formats,
'description': description,
'duration': int(clip.find('duration').text),
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info
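The .replace('&', '&amp;') workaround above exists because ElementTree refuses raw ampersands; a small illustration with a made-up snippet:

import xml.etree.ElementTree as ET

broken = '<clip><title>Smoke & Mirrors</title></clip>'  # raw '&' is not well-formed XML
try:
    ET.fromstring(broken)
except ET.ParseError:
    pass  # 'not well-formed (invalid token)'

# Escaping the ampersand, as the extractor does, lets the parse succeed,
# and the entity is decoded back on read.
assert ET.fromstring(broken.replace('&', '&amp;')).find('title').text == 'Smoke & Mirrors'
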

View File

@@ -0,0 +1,47 @@
import re
import json
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
ExtractorError,
)
class SlideshareIE(InfoExtractor):
_VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'
_TEST = {
u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
u'file': u'25665706.mp4',
u'info_dict': {
u'title': u'Managing Scale and Complexity',
u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex(
r'var slideshare_object = ({.*?}); var user_info =',
webpage, u'slideshare object')
info = json.loads(slideshare_obj)
if info['slideshow']['type'] != u'video':
raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
doc = info['doc']
bucket = info['jsplayer']['video_bucket']
ext = info['jsplayer']['video_extension']
video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
return {
'_type': 'video',
'id': info['slideshow']['id'],
'title': info['slideshow']['title'],
'ext': ext,
'url': video_url,
'thumbnail': info['slideshow']['pin_image_url'],
'description': self._og_search_description(webpage),
}
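How the final video url is assembled from the player JSON, with hypothetical bucket/doc values for illustration:

from urllib.parse import urljoin  # compat_urlparse.urljoin in the extractor

bucket = 'http://s3.amazonaws.com/videos.slideshare.net/'  # made-up value
doc = 'keynote-presentation-130901'                        # made-up value
ext = 'mp4'
assert urljoin(bucket, doc + '-SD.' + ext) == \
    'http://s3.amazonaws.com/videos.slideshare.net/keynote-presentation-130901-SD.mp4'
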

View File

@@ -8,7 +8,7 @@ from ..utils import ExtractorError
class SohuIE(InfoExtractor):
_VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'
_VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
_TEST = {
u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
@@ -21,7 +21,10 @@ class SohuIE(InfoExtractor):
def _real_extract(self, url):
def _fetch_data(vid_id):
def _fetch_data(vid_id, mytv=False):
if mytv:
base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid='
else:
base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid='
data_url = base_data_url + str(vid_id)
data_json = self._download_webpage(
@@ -31,15 +34,16 @@ class SohuIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
mytv = mobj.group('mytv') is not None
webpage = self._download_webpage(url, video_id)
raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>',
webpage, u'video title')
title = raw_title.partition('-')[0].strip()
vid = self._html_search_regex(r'var vid="(\d+)"', webpage,
vid = self._html_search_regex(r'var vid ?= ?["\'](\d+)["\']', webpage,
u'video path')
data = _fetch_data(vid)
data = _fetch_data(vid, mytv)
QUALITIES = ('ori', 'super', 'high', 'nor')
vid_ids = [data['data'][q + 'Vid']
@@ -51,7 +55,7 @@ class SohuIE(InfoExtractor):
# For now, we just pick the highest available quality
vid_id = vid_ids[-1]
format_data = data if vid == vid_id else _fetch_data(vid_id)
format_data = data if vid == vid_id else _fetch_data(vid_id, mytv)
part_count = format_data['data']['totalBlocks']
allot = format_data['allot']
prot = format_data['prot']
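The new _VALID_URL relies on a conditional group: (?(mytv)|n) requires the literal 'n' before the numeric id only when the 'my.' subdomain group did not match. A standalone check (the my.tv.sohu.com url is made up):

import re

pattern = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml'
# tv.sohu.com urls keep the 'n' prefix before the id ...
assert re.match(pattern, 'http://tv.sohu.com/20130724/n382479172.shtml#super').group('id') == '382479172'
# ... while my.tv.sohu.com urls use a bare numeric id (hypothetical url):
assert re.match(pattern, 'http://my.tv.sohu.com/us/63377437/78693464.shtml').group('id') == '78693464'
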

View File

@@ -150,7 +150,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|youtu\.be/ # just youtu.be/xxxx
)
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]+) # here is it! the YouTube video ID
([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow
$"""
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
@@ -386,7 +386,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
if YoutubePlaylistIE.suitable(url): return False
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_video_webpage_download(self, video_id):
@@ -643,7 +643,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
formats_urls = _get_urls(manifest)
for format_url in formats_urls:
itag = self._search_regex(r'itag%3D(\d+?)/', format_url, 'itag')
itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
url_map[itag] = format_url
return url_map
@@ -1015,14 +1015,18 @@ class YoutubeChannelIE(InfoExtractor):
class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)([A-Za-z0-9_-]+)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
IE_NAME = u'youtube:user'
@classmethod
def suitable(cls, url):
if YoutubeIE.suitable(url): return False
# Don't return True if the url can be extracted with another youtube
# extractor; the regex is too permissive and it would match.
other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
if any(ie.suitable(url) for ie in other_ies): return False
else: return super(YoutubeUserIE, cls).suitable(url)
def _real_extract(self, url):

View File

@@ -1,2 +1,2 @@
__version__ = '2013.09.06.1'
__version__ = '2013.09.10'