release 2013.09.06.1

Merge remote-tracking branch 'origin/master'
Do not re-download files for hashsum generation (Fixes #1383 )
2013-09-06 10:53:35 +02:00 · 2013-09-06 10:53:24 +02:00 · 2013-09-06 10:51:53 +02:00 · 2013-09-06 10:43:02 +02:00 · 2013-09-06 10:25:31 +02:00 · 2013-09-06 10:13:33 +02:00
10 changed files with 206 additions and 32 deletions
--- a/devscripts/gh-pages/add-version.py
+++ b/devscripts/gh-pages/add-version.py
@@ -23,10 +23,13 @@ filenames = {
    'bin': 'youtube-dl',
    'exe': 'youtube-dl.exe',
    'tar': 'youtube-dl-%s.tar.gz' % version}
+build_dir = os.path.join('..', '..', 'build', version)
 for key, filename in filenames.items():
-    print('Downloading and checksumming %s...' % filename)
-    url = 'https://yt-dl.org/downloads/%s/%s' % (version, filename)
-    data = urllib.request.urlopen(url).read()
+    fn = os.path.join(build_dir, filename)
+    with open(fn, 'rb') as f:
+        data = f.read()
+    if not data:
+        raise ValueError('File %s is empty!' % fn)
    sha256sum = hashlib.sha256(data).hexdigest()
    new_version[key] = (url, sha256sum)

--- a/devscripts/youtube_genalgo.py
+++ b/devscripts/youtube_genalgo.py
@@ -20,9 +20,9 @@ tests = [
    # 87
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<",
     "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"),
-    # 86 - vflHOr_nV 2013/08/30
+    # 86 - vfluy6kdb 2013/09/06
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<",
-     "?;}|[{=+._)(*&^%$#@!MNBqCXZASDFGHJKLPOIUYTREWQ<987654321mnbvcxzasdfghjklpoiuytrew"),
+     "yuioplkjhgfdsazxcvbnm12345678q0QWrRTYUIOELKJHGFD-AZXCVBNM!@#$%^&*()_<+={[|};?/>.S"),
    # 85
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<",
     ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"),
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -11,6 +11,15 @@ from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE,
 from helper import get_testcases

 class TestAllURLsMatching(unittest.TestCase):
+    def setUp(self):
+        self.ies = gen_extractors()
+
+    def matching_ies(self, url):
+        return [ie.IE_NAME for ie in self.ies if ie.suitable(url) and ie.IE_NAME != 'generic']
+
+    def assertMatch(self, url, ie_list):
+        self.assertEqual(self.matching_ies(url), ie_list)
+
    def test_youtube_playlist_matching(self):
        self.assertTrue(YoutubePlaylistIE.suitable(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8'))
        self.assertTrue(YoutubePlaylistIE.suitable(u'UUBABnxM4Ar9ten8Mdjj1j0Q')) #585
@@ -24,12 +33,17 @@ class TestAllURLsMatching(unittest.TestCase):
    def test_youtube_matching(self):
        self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))
        self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
+        self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
+        self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])

    def test_youtube_channel_matching(self):
        self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM'))
        self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec'))
        self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM/videos'))

+    def test_youtube_user_matching(self):
+        self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
+
    def test_justin_tv_channelid_matching(self):
        self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
        self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
@@ -63,15 +77,12 @@ class TestAllURLsMatching(unittest.TestCase):
                    self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url))

    def test_keywords(self):
-        ies = gen_extractors()
-        matching_ies = lambda url: [ie.IE_NAME for ie in ies
-                                    if ie.suitable(url) and ie.IE_NAME != 'generic']
-        self.assertEqual(matching_ies(':ytsubs'), ['youtube:subscriptions'])
-        self.assertEqual(matching_ies(':ytsubscriptions'), ['youtube:subscriptions'])
-        self.assertEqual(matching_ies(':thedailyshow'), ['ComedyCentral'])
-        self.assertEqual(matching_ies(':tds'), ['ComedyCentral'])
-        self.assertEqual(matching_ies(':colbertreport'), ['ComedyCentral'])
-        self.assertEqual(matching_ies(':cr'), ['ComedyCentral'])
+        self.assertMatch(':ytsubs', ['youtube:subscriptions'])
+        self.assertMatch(':ytsubscriptions', ['youtube:subscriptions'])
+        self.assertMatch(':thedailyshow', ['ComedyCentral'])
+        self.assertMatch(':tds', ['ComedyCentral'])
+        self.assertMatch(':colbertreport', ['ComedyCentral'])
+        self.assertMatch(':cr', ['ComedyCentral'])


 if __name__ == '__main__':
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -28,6 +28,7 @@ __authors__  = (
    'Axel Noack',
    'Albert Kim',
    'Pierre Rudloff',
+    'Huarong Huo',
 )

 __license__ = 'Public Domain'
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -18,6 +18,7 @@ from .condenast import CondeNastIE
 from .criterion import CriterionIE
 from .cspan import CSpanIE
 from .dailymotion import DailymotionIE, DailymotionPlaylistIE
+from .daum import DaumIE
 from .depositfiles import DepositFilesIE
 from .dotsub import DotsubIE
 from .dreisat import DreiSatIE
@@ -57,6 +58,7 @@ from .mtv import MTVIE
 from .muzu import MuzuTVIE
 from .myspass import MySpassIE
 from .myvideo import MyVideoIE
+from .naver import NaverIE
 from .nba import NBAIE
 from .nbc import NBCNewsIE
 from .ooyala import OoyalaIE
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -0,0 +1,74 @@
+# encoding: utf-8
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    determine_ext,
+)
+
+
+class DaumIE(InfoExtractor):
+    _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
+    IE_NAME = u'daum.net'
+
+    _TEST = {
+        u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
+        u'file': u'52554690.mp4',
+        u'info_dict': {
+            u'title': u'DOTA 2GETHER 시즌2 6회 - 2부',
+            u'description': u'DOTA 2GETHER 시즌2 6회 - 2부',
+            u'upload_date': u'20130831',
+            u'duration': 3868,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+        canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
+        webpage = self._download_webpage(canonical_url, video_id)
+        full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"',
+            webpage, u'full id')
+        query = compat_urllib_parse.urlencode({'vid': full_id})
+        info_xml = self._download_webpage(
+            'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
+            u'Downloading video info')
+        urls_xml = self._download_webpage(
+            'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
+            video_id, u'Downloading video formats info')
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+        urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
+
+        self.to_screen(u'%s: Getting video urls' % video_id)
+        formats = []
+        for format_el in urls.findall('result/output_list/output_list'):
+            profile = format_el.attrib['profile']
+            format_query = compat_urllib_parse.urlencode({
+                'vid': full_id,
+                'profile': profile,
+            })
+            url_xml = self._download_webpage(
+                'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query,
+                video_id, note=False)
+            url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8'))
+            format_url = url_doc.find('result/url').text
+            formats.append({
+                'url': format_url,
+                'ext': determine_ext(format_url),
+                'format_id': profile,
+            })
+
+        info = {
+            'id': video_id,
+            'title': info.find('TITLE').text,
+            'formats': formats,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': info.find('CONTENTS').text,
+            'duration': int(info.find('DURATION').text),
+            'upload_date': info.find('REGDTTM').text[:8],
+        }
+        # TODO: Remove when #980 has been merged
+        info.update(formats[-1])
+        return info
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -153,7 +153,7 @@ class GenericIE(InfoExtractor):
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            # HTML5 video
-            mobj = re.search(r'<video[^<]*>.*?<source .*?src="([^"]+)"', webpage, flags=re.DOTALL)
+            mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

@@ -162,9 +162,9 @@ class GenericIE(InfoExtractor):
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

-        video_url = compat_urllib_parse.unquote(mobj.group(1))
+        video_url = mobj.group(1)
        video_url = compat_urlparse.urljoin(url, video_url)
-        video_id = os.path.basename(video_url)
+        video_id = compat_urllib_parse.unquote(os.path.basename(video_url))

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -0,0 +1,73 @@
+# encoding: utf-8
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    ExtractorError,
+)
+
+
+class NaverIE(InfoExtractor):
+    _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://tvcast.naver.com/v/81652',
+        u'file': u'81652.mp4',
+        u'info_dict': {
+            u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
+            u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
+            u'upload_date': u'20130903',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+        webpage = self._download_webpage(url, video_id)
+        m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
+            webpage)
+        if m_id is None:
+            raise ExtractorError(u'couldn\'t extract vid and key')
+        vid = m_id.group(1)
+        key = m_id.group(2)
+        query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,})
+        query_urls = compat_urllib_parse.urlencode({
+            'masterVid': vid,
+            'protocol': 'p2p',
+            'inKey': key,
+        })
+        info_xml = self._download_webpage(
+            'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
+            video_id, u'Downloading video info')
+        urls_xml = self._download_webpage(
+            'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
+            video_id, u'Downloading video formats info')
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+        urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
+
+        formats = []
+        for format_el in urls.findall('EncodingOptions/EncodingOption'):
+            domain = format_el.find('Domain').text
+            if domain.startswith('rtmp'):
+                continue
+            formats.append({
+                'url': domain + format_el.find('uri').text,
+                'ext': 'mp4',
+                'width': int(format_el.find('width').text),
+                'height': int(format_el.find('height').text),
+            })
+
+        info = {
+            'id': video_id,
+            'title': info.find('Subject').text,
+            'formats': formats,
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'upload_date': info.find('WriteDate').text.replace('.', ''),
+            'view_count': int(info.find('PlayCount').text),
+        }
+        # TODO: Remove when #980 has been merged
+        info.update(formats[-1])
+        return info
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -135,7 +135,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
-                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
+                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
@@ -146,7 +146,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
-                         )?                                                   # optional -> youtube.com/xxxx is OK
+                         ))
+                         |youtu\.be/                                          # just youtu.be/xxxx
+                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
@@ -434,7 +436,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        elif len(s) == 87:
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
        elif len(s) == 86:
-            return s[81:73:-1] + s[84] + s[72:58:-1] + s[0] + s[57:35:-1] + s[85] + s[34:0:-1]
+            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
        elif len(s) == 85:
            return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27]
        elif len(s) == 84:
@@ -641,7 +643,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
-            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
+            itag = self._search_regex(r'itag%3D(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
        return url_map

@@ -943,8 +945,11 @@ class YoutubePlaylistIE(InfoExtractor):

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
-                if 'media$group' in entry and 'media$player' in entry['media$group']:
-                    videos.append((index, entry['media$group']['media$player']['url']))
+                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
+                    videos.append((
+                        index,
+                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
+                    ))

        videos = [v[1] for v in sorted(videos)]

@@ -1010,13 +1015,16 @@ class YoutubeChannelIE(InfoExtractor):

 class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
-    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
+    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
-    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
-    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
+    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

+    def suitable(cls, url):
+        if YoutubeIE.suitable(url): return False
+        else: return super(YoutubeUserIE, cls).suitable(url)
+
    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
@@ -1039,13 +1047,15 @@ class YoutubeUserIE(InfoExtractor):
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

+            try:
+                response = json.loads(page)
+            except ValueError as err:
+                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
+
            # Extract video identifiers
            ids_in_page = []
-
-            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
-                if mobj.group(1) not in ids_in_page:
-                    ids_in_page.append(mobj.group(1))
-
+            for entry in response['feed']['entry']:
+                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@

-__version__ = '2013.09.04'
+__version__ = '2013.09.06.1'
Author	SHA1	Message	Date
Philipp Hagemeister	59282080c8	release 2013.09.06.1	2013-09-06 10:53:35 +02:00
Philipp Hagemeister	98f3da4040	Merge remote-tracking branch 'origin/master'	2013-09-06 10:53:24 +02:00
Philipp Hagemeister	1d213233cd	Do not re-download files for hashsum generation (Fixes #1383 )	2013-09-06 10:51:53 +02:00
Jaime Marquínez Ferrándiz	fd9cf73836	[youtube] Users: download from the api in json to simplify extraction (fixes #1358 ) There could be duplicate videos or other videos if the description have links.	2013-09-06 10:43:02 +02:00
Jaime Marquínez Ferrándiz	0638ad9999	[youtube] Fix detection of tags from HLS videos.	2013-09-06 10:25:31 +02:00
Philipp Hagemeister	1eb527692a	release 2013.09.06	2013-09-06 10:13:33 +02:00
Jaime Marquínez Ferrándiz	09bb17e108	Merge pull request #1378 from patrickslin/patch-6 Vevo sig changed again, please update for us! Thanks very much! (fixes #...	2013-09-06 09:53:23 +02:00
patrickslin	1cf911bc82	Vevo sig changed again, please update for us! Thanks very much! (fixes #1375 )	2013-09-05 17:38:03 -07:00
Jaime Marquínez Ferrándiz	f4b052321b	[youtube] Urls like youtube.com/NASA are now interpreted as users (fixes #1069 ) Video urls like http://youtube.com/BaW_jenozKc are not valid, but http://youtu.be/BaW_jenozKc is correct.	2013-09-05 22:39:15 +02:00
Philipp Hagemeister	a636203ea5	release 2013.09.05	2013-09-05 22:30:50 +02:00
Jaime Marquínez Ferrándiz	c215217e39	[youtube] Playlists: extract the videos id from ['media$group']['yt$videoid'] (fixes #1374 ) 'media$player' is not defined for private videos.	2013-09-05 21:40:04 +02:00
Jaime Marquínez Ferrándiz	08e291b54d	[generic] Recognize html5 video in the format '<video src=".+?"' and only unquote the url when extracting the id (fixes #1372 )	2013-09-05 18:02:17 +02:00
Jaime Marquínez Ferrándiz	6b95b065be	Add extractor for tvcast.naver.com (closes #1331 )	2013-09-05 10:53:40 +02:00
Jaime Marquínez Ferrándiz	9363169b67	[daum] Get the video page from a canonical url to extract the full id (fixes #1373 ) and extract description.	2013-09-05 10:08:17 +02:00
Jaime Marquínez Ferrándiz	085bea4513	Credit @Huarong for tv.sohu.com	2013-09-04 22:09:22 +02:00
Jaime Marquínez Ferrándiz	150f20828b	Add extractor for daum.net (closes #1330 )	2013-09-04 22:06:50 +02:00