Compare commits

..

24 Commits

Author SHA1 Message Date
41fa1b627d release 2014.02.06.3 2014-02-07 01:41:01 +01:00
3587159614 [nfb] Add encode POST data 2014-02-07 02:13:04 +07:00
d67cc9fa7c [youtube:playlist] Recognize ‘top tracks’ urls (closes #2332)
The list parameter starts with ‘MC’ and can have more characters after it, including dots
2014-02-06 19:46:26 +01:00
bf3a2fe923 [elpais] Fix typo 2014-02-07 00:38:29 +07:00
e9ea0bf123 [ndr] Add support for ndr.de (Closes #2325) 2014-02-07 00:35:26 +07:00
63424b6233 release 2014.02.06.2 2014-02-06 15:45:47 +01:00
0bf35c5cf5 [nfb] Add support for onf.ca URLs 2014-02-06 21:41:31 +07:00
95c29381eb [mooshare] Fix bogus video page URL 2014-02-06 21:26:12 +07:00
94c4abce7f [nfb] Add support for nfb.ca (Closes #2069) 2014-02-06 21:19:13 +07:00
df872ec4e7 release 2014.02.06.1 2014-02-06 11:30:00 +01:00
5de90176d9 [elpais] Add extractor 2014-02-06 11:29:46 +01:00
dcf3eec47a [test_download] Skip over BadStatusLine errors
An error like https://travis-ci.org/rg3/youtube-dl/jobs/18317799#L449 is almost certainly the server's fault.
2014-02-06 04:19:57 +01:00
e9e4f30d26 [pbs] Remove unused import 2014-02-06 04:19:43 +01:00
83cebd73d4 [collegehumor] We only get shortened descriptions now 2014-02-06 04:16:22 +01:00
1df4229bd7 [mtv/gametrailers] Change order of title preference
It looks like the plain title is better again
2014-02-06 04:15:12 +01:00
3c995527e9 release 2014.02.06 2014-02-06 03:30:30 +01:00
7c62b568a2 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-06 03:30:18 +01:00
ccf9114e84 [googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 03:29:10 +01:00
d8061908bb [ina] Improve _VALID_URL regex (fixes #2328)
Accept all letters in upper case and don’t require anything after the id
2014-02-05 23:01:24 +01:00
211e17dd43 release 2014.02.05 2014-02-05 21:23:28 +01:00
6cb38a9994 [firstpost] Add extractor (Fixes #2324) 2014-02-05 21:23:21 +01:00
fa7df757a7 [thisav] Simplify and use unicode literals 2014-02-05 19:13:06 +07:00
8c82077619 [toutv] Use unicode literals 2014-02-05 19:02:03 +07:00
e5d1f9e50a [m6] Add support for m6.fr (Closes #2313) 2014-02-05 17:38:17 +07:00
21 changed files with 436 additions and 71 deletions

View File

@ -37,6 +37,8 @@ class TestAllURLsMatching(unittest.TestCase):
assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668 assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M')) self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self): def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M')) self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))

View File

@ -22,6 +22,7 @@ import socket
import youtube_dl.YoutubeDL import youtube_dl.YoutubeDL
from youtube_dl.utils import ( from youtube_dl.utils import (
compat_http_client,
compat_str, compat_str,
compat_urllib_error, compat_urllib_error,
compat_HTTPError, compat_HTTPError,
@ -110,7 +111,7 @@ def generator(test_case):
ydl.download([test_case['url']]) ydl.download([test_case['url']])
except (DownloadError, ExtractorError) as err: except (DownloadError, ExtractorError) as err:
# Check if the exception is not a network related one # Check if the exception is not a network related one
if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
raise raise
if try_num == RETRIES: if try_num == RETRIES:

View File

@ -34,6 +34,7 @@ from youtube_dl.extractor import (
KhanAcademyIE, KhanAcademyIE,
EveryonesMixtapeIE, EveryonesMixtapeIE,
RutubeChannelIE, RutubeChannelIE,
GoogleSearchIE,
GenericIE, GenericIE,
) )
@ -240,6 +241,14 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'Always/Never: A Little-Seen Movie About Nuclear Command and Control : The New Yorker') self.assertEqual(result['title'], 'Always/Never: A Little-Seen Movie About Nuclear Command and Control : The New Yorker')
self.assertEqual(len(result['entries']), 3) self.assertEqual(len(result['entries']), 3)
def test_GoogleSearch(self):
    # Ask the Google video search extractor for exactly 15 results and
    # check that the returned playlist is labelled with the bare query.
    dl = FakeYDL()
    ie = GoogleSearchIE(dl)
    result = ie.extract('gvsearch15:python language')
    self.assertIsPlaylist(result)
    self.assertEqual(result['id'], 'python language')
    self.assertEqual(result['title'], 'python language')
    # assertEqual (rather than assertTrue on a comparison) reports the
    # actual number of entries when the check fails.
    self.assertEqual(len(result['entries']), 15)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -117,6 +117,13 @@ class TestYoutubeLists(unittest.TestCase):
original_video = entries[0] original_video = entries[0]
self.assertEqual(original_video['id'], 'rjFaenf1T-Y') self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
def test_youtube_toptracks(self):
    # A "top tracks" playlist id (prefix MC) is expected to yield
    # exactly 100 entries.
    extractor = YoutubePlaylistIE(FakeYDL())
    playlist = extractor.extract('https://www.youtube.com/playlist?list=MCUS')
    self.assertEqual(len(playlist['entries']), 100)
def test_youtube_toplist(self): def test_youtube_toplist(self):
dl = FakeYDL() dl = FakeYDL()
ie = YoutubeTopListIE(dl) ie = YoutubeTopListIE(dl)

View File

@ -54,12 +54,14 @@ from .ebaumsworld import EbaumsWorldIE
from .ehow import EHowIE from .ehow import EHowIE
from .eighttracks import EightTracksIE from .eighttracks import EightTracksIE
from .eitb import EitbIE from .eitb import EitbIE
from .elpais import ElPaisIE
from .escapist import EscapistIE from .escapist import EscapistIE
from .everyonesmixtape import EveryonesMixtapeIE from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE from .exfm import ExfmIE
from .extremetube import ExtremeTubeIE from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE from .facebook import FacebookIE
from .faz import FazIE from .faz import FazIE
from .firstpost import FirstpostIE
from .fktv import ( from .fktv import (
FKTVIE, FKTVIE,
FKTVPosteckeIE, FKTVPosteckeIE,
@ -118,6 +120,7 @@ from .lynda import (
LyndaIE, LyndaIE,
LyndaCourseIE LyndaCourseIE
) )
from .m6 import M6IE
from .macgamestore import MacGameStoreIE from .macgamestore import MacGameStoreIE
from .malemotion import MalemotionIE from .malemotion import MalemotionIE
from .mdr import MDRIE from .mdr import MDRIE
@ -139,8 +142,10 @@ from .myvideo import MyVideoIE
from .naver import NaverIE from .naver import NaverIE
from .nba import NBAIE from .nba import NBAIE
from .nbc import NBCNewsIE from .nbc import NBCNewsIE
from .ndr import NDRIE
from .ndtv import NDTVIE from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE from .newgrounds import NewgroundsIE
from .nfb import NFBIE
from .nhl import NHLIE, NHLVideocenterIE from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE from .niconico import NiconicoIE
from .ninegag import NineGagIE from .ninegag import NineGagIE

View File

@ -4,6 +4,7 @@ import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import int_or_none
class CollegeHumorIE(InfoExtractor): class CollegeHumorIE(InfoExtractor):
@ -11,22 +12,25 @@ class CollegeHumorIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
'file': '6902724.mp4',
'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd', 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
'info_dict': { 'info_dict': {
'id': '6902724',
'ext': 'mp4',
'title': 'Comic-Con Cosplay Catastrophe', 'title': 'Comic-Con Cosplay Catastrophe',
'description': 'Fans get creative this year at San Diego. Too', 'description': 'Fans get creative this year',
'age_limit': 13, 'age_limit': 13,
}, },
}, },
{ {
'url': 'http://www.collegehumor.com/video/3505939/font-conference', 'url': 'http://www.collegehumor.com/video/3505939/font-conference',
'file': '3505939.mp4',
'md5': '72fa701d8ef38664a4dbb9e2ab721816', 'md5': '72fa701d8ef38664a4dbb9e2ab721816',
'info_dict': { 'info_dict': {
'id': '3505939',
'ext': 'mp4',
'title': 'Font Conference', 'title': 'Font Conference',
'description': 'This video wasn\'t long enough, so we made it double-spaced.', 'description': 'This video wasn\'t long enough,',
'age_limit': 10, 'age_limit': 10,
'duration': 179,
}, },
}, },
# embedded youtube video # embedded youtube video
@ -82,6 +86,8 @@ class CollegeHumorIE(InfoExtractor):
}) })
self._sort_formats(formats) self._sort_formats(formats)
duration = int_or_none(vdata.get('duration'), 1000)
return { return {
'id': video_id, 'id': video_id,
'title': vdata['title'], 'title': vdata['title'],
@ -89,4 +95,5 @@ class CollegeHumorIE(InfoExtractor):
'thumbnail': vdata.get('thumbnail'), 'thumbnail': vdata.get('thumbnail'),
'formats': formats, 'formats': formats,
'age_limit': age_limit, 'age_limit': age_limit,
'duration': duration,
} }

View File

@ -0,0 +1,58 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import unified_strdate
class ElPaisIE(InfoExtractor):
    """Information extractor for videos embedded in elpais.com pages."""

    _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
    IE_DESC = 'El País'

    _TEST = {
        'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
        'md5': '98406f301f19562170ec071b83433d55',
        'info_dict': {
            'id': 'tiempo-nuevo-recetas-viejas',
            'ext': 'mp4',
            'title': 'Tiempo nuevo, recetas viejas',
            'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
            'upload_date': '20140206',
        }
    }

    def _real_extract(self, url):
        """Download the page at ``url`` and return the video info dict.

        The id is the final path component of the URL without ``.html``.
        Thumbnail and upload date are optional and are None when the page
        does not contain them.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        # The page script builds media URLs as url_cache + <suffix>.
        base_url = self._html_search_regex(
            r'var url_cache = "([^"]+)";', page, 'URL prefix')
        media_path = self._search_regex(
            r"URLMediaFile = url_cache \+ '([^']+)'", page, 'video URL')
        video_url = base_url + media_path

        still_path = self._search_regex(
            r"URLMediaStill = url_cache \+ '([^']+)'", page, 'thumbnail URL',
            fatal=False)
        if still_path is None:
            thumbnail = None
        else:
            thumbnail = base_url + still_path

        title = self._html_search_regex(
            '<h2 class="entry-header entry-title.*?>(.*?)</h2>',
            page, 'title')

        raw_date = self._search_regex(
            r'<p class="date-header date-int updated"\s+title="([^"]+)">',
            page, 'upload date', fatal=False)
        if raw_date is None:
            upload_date = None
        else:
            upload_date = unified_strdate(raw_date)

        description = self._og_search_description(page)

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }

View File

@ -0,0 +1,38 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class FirstpostIE(InfoExtractor):
    """Information extractor for video articles on firstpost.com."""

    IE_NAME = 'Firstpost.com'

    _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html'

    _TEST = {
        'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html',
        'md5': 'ee9114957692f01fb1263ed87039112a',
        'info_dict': {
            'id': '1025403',
            'ext': 'mp4',
            'title': 'India to launch indigenous aircraft carrier INS Vikrant today',
            'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.',
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the article video at ``url``.

        The numeric id comes from the end of the URL; the media URL is read
        from the ``flashvars`` attribute of the ``div_video`` element, and
        the remaining metadata from the page's Open Graph tags.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        media_url = self._html_search_regex(
            r'<div.*?name="div_video".*?flashvars="([^"]+)">',
            page, 'video URL')

        # Look the Open Graph fields up in a fixed order: title first,
        # then description, then thumbnail.
        title = self._og_search_title(page)
        description = self._og_search_description(page)
        thumbnail = self._og_search_thumbnail(page)

        return {
            'id': video_id,
            'url': media_url,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import itertools import itertools
import re import re
@ -8,32 +10,42 @@ from ..utils import (
class GoogleSearchIE(SearchInfoExtractor): class GoogleSearchIE(SearchInfoExtractor):
IE_DESC = u'Google Video search' IE_DESC = 'Google Video search'
_MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
_MAX_RESULTS = 1000 _MAX_RESULTS = 1000
IE_NAME = u'video.google:search' IE_NAME = 'video.google:search'
_SEARCH_KEY = 'gvsearch' _SEARCH_KEY = 'gvsearch'
def _get_n_results(self, query, n): def _get_n_results(self, query, n):
"""Get a specified number of results for a query""" """Get a specified number of results for a query"""
entries = []
res = { res = {
'_type': 'playlist', '_type': 'playlist',
'id': query, 'id': query,
'entries': [] 'title': query,
} }
for pagenum in itertools.count(1): for pagenum in itertools.count():
result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) result_url = (
webpage = self._download_webpage(result_url, u'gvsearch:' + query, 'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
note='Downloading result page ' + str(pagenum)) % (compat_urllib_parse.quote_plus(query), pagenum * 10))
for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage): webpage = self._download_webpage(
e = { result_url, 'gvsearch:' + query,
note='Downloading result page ' + str(pagenum + 1))
for hit_idx, mobj in enumerate(re.finditer(
r'<h3 class="r"><a href="([^"]+)"', webpage)):
# Skip playlists
if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
continue
entries.append({
'_type': 'url', '_type': 'url',
'url': mobj.group(1) 'url': mobj.group(1)
} })
res['entries'].append(e)
if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage): if (len(entries) >= n) or not re.search(r'class="pn" id="pnnext"', webpage):
res['entries'] = entries[:n]
return res return res

View File

@ -7,7 +7,7 @@ from .common import InfoExtractor
class InaIE(InfoExtractor): class InaIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-F0-9]+)/.*' _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
_TEST = { _TEST = {
'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
'md5': 'a667021bf2b41f8dc6049479d9bb38a3', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3',

View File

@ -0,0 +1,56 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class M6IE(InfoExtractor):
IE_NAME = 'm6'
_VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html'
_TEST = {
'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html',
'md5': '242994a87de2c316891428e0176bcb77',
'info_dict': {
'id': '11323908',
'ext': 'mp4',
'title': 'Emeline est la Reine du Shopping sur le thème « Ma fête danniversaire ! »',
'description': 'md5:1212ae8fb4b7baa4dc3886c5676007c2',
'duration': 100,
}
}
def _real_extract(self, url):
    """Return the info dict for the m6.fr video page at ``url``.

    The numeric id from the URL is used to fetch an RSS-style XML
    document from ws.m6.fr; all metadata and the per-quality media URLs
    are read from its ``channel/item`` element.

    Only the title is required. Description, thumbnail, duration and
    view count are None when the corresponding element is missing
    instead of aborting the extraction. A duration or view count that is
    present but not an integer still raises ValueError.
    """
    mobj = re.match(self._VALID_URL, url)
    video_id = mobj.group('id')

    rss = self._download_xml(
        'http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
        'Downloading video RSS')

    def item_text(tag):
        """Return the text of channel/item/<tag>, or None if absent."""
        element = rss.find('./channel/item/%s' % tag)
        return None if element is None else element.text

    def item_int(tag):
        """Return channel/item/<tag> as an int, or None if absent/empty."""
        text = item_text(tag)
        return int(text) if text else None

    # The title is mandatory, so a missing element is left to fail loudly.
    title = rss.find('./channel/item/title').text
    description = item_text('description')
    thumbnail = item_text('visuel_clip_big')
    duration = item_int('duration')
    view_count = item_int('nombre_vues')

    formats = []
    # Listed in the original order: lq, sd, hq, hd.
    for format_id in ['lq', 'sd', 'hq', 'hd']:
        video_url = item_text('url_video_%s' % format_id)
        # Skip qualities that are missing or whose element is empty,
        # so no format without a URL is returned.
        if not video_url:
            continue
        formats.append({
            'url': video_url,
            'format_id': format_id,
        })

    return {
        'id': video_id,
        'title': title,
        'description': description,
        'thumbnail': thumbnail,
        'duration': duration,
        'view_count': view_count,
        'formats': formats,
    }

View File

@ -61,7 +61,7 @@ class MooshareIE(InfoExtractor):
} }
request = compat_urllib_request.Request( request = compat_urllib_request.Request(
'http://mooshare.biz/8dqtk4bjbp8g', compat_urllib_parse.urlencode(download_form)) 'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded') request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.to_screen('%s: Waiting for timeout' % video_id) self.to_screen('%s: Waiting for timeout' % video_id)
@ -111,4 +111,4 @@ class MooshareIE(InfoExtractor):
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'duration': duration, 'duration': duration,
'formats': formats, 'formats': formats,
} }

View File

@ -82,10 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
title_el = find_xpath_attr( title_el = find_xpath_attr(
itemdoc, './/{http://search.yahoo.com/mrss/}category', itemdoc, './/{http://search.yahoo.com/mrss/}category',
'scheme', 'urn:mtvn:video_title') 'scheme', 'urn:mtvn:video_title')
if title_el is None:
title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
if title_el is None: if title_el is None:
title_el = itemdoc.find('.//title') title_el = itemdoc.find('.//title')
if title_el.text is None:
title_el = None
if title_el is None:
title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
title = title_el.text title = title_el.text
if title is None: if title is None:
raise ExtractorError('Could not find video title') raise ExtractorError('Could not find video title')

View File

@ -0,0 +1,89 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class NDRIE(InfoExtractor):
    """Information extractor for video and audio pages on www.ndr.de."""

    IE_NAME = 'ndr'
    IE_DESC = 'NDR.de - Mediathek'
    _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'

    _TESTS = [
        # video
        {
            'url': 'http://www.ndr.de/fernsehen/sendungen/hallo_niedersachsen/media/hallonds19925.html',
            'md5': '20eba151ff165f386643dad9c1da08f7',
            'info_dict': {
                'id': '19925',
                'ext': 'mp4',
                'title': 'Hallo Niedersachsen ',
                'description': 'Bei Hallo Niedersachsen um 19:30 Uhr erfahren Sie alles, was am Tag in Niedersachsen los war.',
                'duration': 1722,
            },
        },
        # audio
        {
            'url': 'http://www.ndr.de/903/audio191719.html',
            'md5': '41ed601768534dd18a9ae34d84798129',
            'info_dict': {
                'id': '191719',
                'ext': 'mp3',
                'title': '"Es war schockierend"',
                'description': 'md5:ed7ff8364793545021a6355b97e95f10',
                'duration': 112,
            }
        }
    ]

    def _real_extract(self, url):
        """Return the info dict for the NDR media page at ``url``.

        Collects an mp3 format when the page's player configuration lists
        one, and three mp4 formats (lo, hi, hq) when it lists a video.
        Raises ExtractorError if neither is found.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id, 'Downloading page')

        title = self._og_search_title(page)
        description = self._og_search_description(page)

        # Duration is shown on the page as minutes:seconds; convert to seconds.
        duration = None
        duration_match = re.search(
            r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
            page)
        if duration_match:
            minutes = int(duration_match.group('minutes'))
            seconds = int(duration_match.group('seconds'))
            duration = minutes * 60 + seconds

        formats = []

        audio_match = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
        if audio_match:
            formats.append({
                'url': audio_match.group('audio'),
                'format_id': 'mp3',
            })

        thumbnail = None

        video_match = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
        if video_match:
            thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
                page, 'thumbnail', fatal=False)
            if thumbnail:
                # The poster path is prefixed with the site origin.
                thumbnail = 'http://www.ndr.de' + thumbnail
            # The matched URL minus ".hi.mp4" is the common stem of all
            # three quality variants.
            stem = video_match.group('video')
            formats.extend([
                {
                    'url': '%s.%s.mp4' % (stem, quality),
                    'format_id': quality,
                }
                for quality in ['lo', 'hi', 'hq']
            ])

        if not formats:
            raise ExtractorError('No media links available for %s' % video_id)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }

View File

@ -0,0 +1,76 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
compat_urllib_parse,
)
class NFBIE(InfoExtractor):
    """Information extractor for film pages on nfb.ca / onf.ca."""

    IE_NAME = 'nfb'
    IE_DESC = 'National Film Board of Canada'
    _VALID_URL = r'https?://(?:www\.)?(nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'

    _TEST = {
        'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
        'info_dict': {
            'id': 'qallunaat_why_white_people_are_funny',
            'ext': 'mp4',
            'title': 'Qallunaat! Why White People Are Funny ',
            'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
            'duration': 3128,
            'uploader': 'Mark Sandiford',
            'uploader_id': 'mark-sandiford',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the film page at ``url``.

        The film slug from the URL is always resolved against www.nfb.ca,
        whichever of the two domains the URL used. Director data is scraped
        from the film page; title, description, duration, thumbnail and
        the RTMP formats come from the player configuration XML, which is
        requested with a POST.

        Only the video node and its title are required. Thumbnail,
        duration and description are None when absent from the config
        instead of aborting the extraction.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(
            'https://www.nfb.ca/film/%s' % video_id, video_id,
            'Downloading film page')

        uploader_id = self._html_search_regex(
            r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
            page, 'director id', fatal=False)
        uploader = self._html_search_regex(
            r'<em class="director-name" itemprop="name">([^<]+)</em>',
            page, 'director name', fatal=False)

        # Passing a data payload makes this a POST request.
        request = compat_urllib_request.Request(
            'https://www.nfb.ca/film/%s/player_config' % video_id,
            compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')

        config = self._download_xml(request, video_id, 'Downloading player config XML')

        # The poster image is optional metadata: tolerate its absence.
        thumbnail_el = config.find(
            "./player/stream/media[@type='posterImage']/assets/asset[@quality='high']/default/url")
        thumbnail = None if thumbnail_el is None else thumbnail_el.text

        video = config.find("./player/stream/media[@type='video']")

        duration_str = video.get('duration')
        duration = int(duration_str) if duration_str else None

        title = video.find('title').text

        description_el = video.find('description')
        description = None if description_el is None else description_el.text

        # It seems assets always go from lower to better quality, so no need to sort
        formats = [{
            'url': x.find('default/streamerURI').text + '/',
            'play_path': x.find('default/url').text,
            'rtmp_live': False,
            'ext': 'mp4',
            'format_id': x.get('quality'),
        } for x in video.findall('assets/asset')]

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'formats': formats,
        }

View File

@ -1,7 +1,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor

View File

@ -1,22 +1,23 @@
#coding: utf-8 #coding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import determine_ext
determine_ext,
)
class ThisAVIE(InfoExtractor): class ThisAVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
_TEST = { _TEST = {
u"url": u"http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html", 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html',
u"file": u"47734.flv", 'md5': '0480f1ef3932d901f0e0e719f188f19b',
u"md5": u"0480f1ef3932d901f0e0e719f188f19b", 'info_dict': {
u"info_dict": { 'id': '47734',
u"title": u"高樹マリア - Just fit", 'ext': 'flv',
u"uploader": u"dj7970", 'title': '高樹マリア - Just fit',
u"uploader_id": u"dj7970" 'uploader': 'dj7970',
'uploader_id': 'dj7970'
} }
} }
@ -25,19 +26,18 @@ class ThisAVIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, u'title') title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, 'title')
video_url = self._html_search_regex( video_url = self._html_search_regex(
r"addVariable\('file','([^']+)'\);", webpage, u'video url') r"addVariable\('file','([^']+)'\);", webpage, 'video url')
uploader = self._html_search_regex( uploader = self._html_search_regex(
r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>',
webpage, u'uploader name', fatal=False) webpage, 'uploader name', fatal=False)
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(
r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
webpage, u'uploader id', fatal=False) webpage, 'uploader id', fatal=False)
ext = determine_ext(video_url) ext = determine_ext(video_url)
return { return {
'_type': 'video',
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': uploader, 'uploader': uploader,

View File

@ -1,4 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -9,25 +11,25 @@ from ..utils import (
class TouTvIE(InfoExtractor): class TouTvIE(InfoExtractor):
IE_NAME = u'tou.tv' IE_NAME = 'tou.tv'
_VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))' _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
_TEST = { _TEST = {
u'url': u'http://www.tou.tv/30-vies/S04E41', 'url': 'http://www.tou.tv/30-vies/S04E41',
u'file': u'30-vies_S04E41.mp4', 'file': '30-vies_S04E41.mp4',
u'info_dict': { 'info_dict': {
u'title': u'30 vies Saison 4 / Épisode 41', 'title': '30 vies Saison 4 / Épisode 41',
u'description': u'md5:da363002db82ccbe4dafeb9cab039b09', 'description': 'md5:da363002db82ccbe4dafeb9cab039b09',
u'age_limit': 8, 'age_limit': 8,
u'uploader': u'Groupe des Nouveaux Médias', 'uploader': 'Groupe des Nouveaux Médias',
u'duration': 1296, 'duration': 1296,
u'upload_date': u'20131118', 'upload_date': '20131118',
u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg', 'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
}, },
u'params': { 'params': {
u'skip_download': True, # Requires rtmpdump 'skip_download': True, # Requires rtmpdump
}, },
u'skip': 'Only available in Canada' 'skip': 'Only available in Canada'
} }
def _real_extract(self, url): def _real_extract(self, url):
@ -36,25 +38,25 @@ class TouTvIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
mediaId = self._search_regex( mediaId = self._search_regex(
r'"idMedia":\s*"([^"]+)"', webpage, u'media ID') r'"idMedia":\s*"([^"]+)"', webpage, 'media ID')
streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId
streams_doc = self._download_xml( streams_doc = self._download_xml(
streams_url, video_id, note=u'Downloading stream list') streams_url, video_id, note='Downloading stream list')
video_url = next(n.text video_url = next(n.text
for n in streams_doc.findall('.//choice/url') for n in streams_doc.findall('.//choice/url')
if u'//ad.doubleclick' not in n.text) if '//ad.doubleclick' not in n.text)
if video_url.endswith('/Unavailable.flv'): if video_url.endswith('/Unavailable.flv'):
raise ExtractorError( raise ExtractorError(
u'Access to this video is blocked from outside of Canada', 'Access to this video is blocked from outside of Canada',
expected=True) expected=True)
duration_str = self._html_search_meta( duration_str = self._html_search_meta(
'video:duration', webpage, u'duration') 'video:duration', webpage, 'duration')
duration = int(duration_str) if duration_str else None duration = int(duration_str) if duration_str else None
upload_date_str = self._html_search_meta( upload_date_str = self._html_search_meta(
'video:release_date', webpage, u'upload date') 'video:release_date', webpage, 'upload date')
upload_date = unified_strdate(upload_date_str) if upload_date_str else None upload_date = unified_strdate(upload_date_str) if upload_date_str else None
return { return {

View File

@ -1422,7 +1422,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
class YoutubePlaylistIE(YoutubeBaseInfoExtractor): class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists' IE_DESC = u'YouTube.com playlists'
_VALID_URL = r"""(?: _VALID_URL = r"""(?x)(?:
(?:https?://)? (?:https?://)?
(?:\w+\.)? (?:\w+\.)?
youtube\.com/ youtube\.com/
@ -1431,7 +1431,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
\? (?:.*?&)*? (?:p|a|list)= \? (?:.*?&)*? (?:p|a|list)=
| p/ | p/
) )
((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}) (
(?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
.* .*
| |
((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
@ -1441,11 +1445,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
_VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)' _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist' IE_NAME = u'youtube:playlist'
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
@ -1469,7 +1468,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
# Extract playlist id # Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE) mobj = re.match(self._VALID_URL, url)
if mobj is None: if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url) raise ExtractorError(u'Invalid URL: %s' % url)
playlist_id = mobj.group(1) or mobj.group(2) playlist_id = mobj.group(1) or mobj.group(2)

View File

@ -751,13 +751,14 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
https_request = http_request https_request = http_request
https_response = http_response https_response = http_response
def unified_strdate(date_str): def unified_strdate(date_str):
"""Return a string with the date in the format YYYYMMDD""" """Return a string with the date in the format YYYYMMDD"""
upload_date = None upload_date = None
#Replace commas #Replace commas
date_str = date_str.replace(',',' ') date_str = date_str.replace(',',' ')
# %z (UTC offset) is only supported in python>=3.2 # %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) date_str = re.sub(r' ?(\+|-)[0-9:]*$', '', date_str)
format_expressions = [ format_expressions = [
'%d %B %Y', '%d %B %Y',
'%B %d %Y', '%B %d %Y',
@ -771,11 +772,12 @@ def unified_strdate(date_str):
'%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%fZ',
'%Y-%m-%dT%H:%M:%S.%f0Z', '%Y-%m-%dT%H:%M:%S.%f0Z',
'%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%S',
'%Y-%m-%dT%H:%M',
] ]
for expression in format_expressions: for expression in format_expressions:
try: try:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
except: except ValueError:
pass pass
if upload_date is None: if upload_date is None:
timetuple = email.utils.parsedate_tz(date_str) timetuple = email.utils.parsedate_tz(date_str)

View File

@ -1,2 +1,2 @@
__version__ = '2014.02.04.1' __version__ = '2014.02.06.3'