Compare commits

...

29 Commits

Author SHA1 Message Date
3c995527e9 release 2014.02.06 2014-02-06 03:30:30 +01:00
7c62b568a2 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-06 03:30:18 +01:00
ccf9114e84 [googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 03:29:10 +01:00
d8061908bb [ina] Improve _VALID_URL regex (fixes #2328)
Accept all letters in upper case and don’t require anything after the id
2014-02-05 23:01:24 +01:00
211e17dd43 release 2014.02.05 2014-02-05 21:23:28 +01:00
6cb38a9994 [firstpost] Add extractor (Fixes #2324) 2014-02-05 21:23:21 +01:00
fa7df757a7 [thisav] Simplify and use unicode literals 2014-02-05 19:13:06 +07:00
8c82077619 [toutv] Use unicode literals 2014-02-05 19:02:03 +07:00
e5d1f9e50a [m6] Add support for m6.fr (Closes #2313) 2014-02-05 17:38:17 +07:00
7ee50ae7b5 release 2014.02.04.1 2014-02-04 23:26:55 +01:00
de563c9da0 [ina] Simplify
Download the feed with ‘_download_xml’ to make the extraction easier
2014-02-04 23:15:36 +01:00
50451f2a18 [vbox7] simplify 2014-02-04 23:02:53 +01:00
9bc70948e1 [statigram] Simplify 2014-02-04 22:52:27 +01:00
5dc733f071 [vine] Simplify 2014-02-04 22:02:15 +01:00
bc4850908c [test/youtube_signature] Add a test with the last player
To verify it correctly handles function with “$” in their names.
2014-02-04 21:56:17 +01:00
20650c8654 [youtube] signatures: Recognize javascript functions that contain “$” (fixes #2304) 2014-02-04 21:38:50 +01:00
56dced2670 remove accidentally duplicated test file 2014-02-04 16:35:22 +01:00
eef726c04b release 2014.02.04 2014-02-04 16:33:19 +01:00
acf1555d76 Merge remote-tracking branch 'origin/master' 2014-02-04 16:33:06 +01:00
22e7f1a6ec [pbs] Add support for article pages (Fixes #870) 2014-02-04 16:31:00 +01:00
3c49325658 [lifenews] Fix video URL extraction (Closes #2302) 2014-02-04 21:31:25 +07:00
bb1cd2bea1 [mooshare] Add support for mooshare.biz (Closes #2149) 2014-02-04 20:53:46 +07:00
fdf1f8d4ce [collegehumor] Adapt test to changed video description 2014-02-04 10:37:01 +01:00
117c8c6b97 [bliptv] Remove unused imports 2014-02-04 10:25:19 +01:00
5cef4ff09b [subtittles] Check that the result is not empty 2014-02-04 10:24:17 +01:00
91264ce572 [iprima] Use centralized format sorting 2014-02-04 10:24:00 +01:00
c79ef8e1ae Merge remote-tracking branch 'pulpe/_iprima' 2014-02-04 10:21:42 +01:00
58d915df51 [traileraddict] mark as broken
traileraddict has changed their URL encoding scheme.
I'm working on restoring support, but that may take some time.
2014-02-04 10:13:52 +01:00
7881a64499 [iprima] Add support for play.iprima.cz 2014-02-04 07:45:41 +01:00
22 changed files with 523 additions and 161 deletions

View File

@ -34,6 +34,7 @@ from youtube_dl.extractor import (
KhanAcademyIE, KhanAcademyIE,
EveryonesMixtapeIE, EveryonesMixtapeIE,
RutubeChannelIE, RutubeChannelIE,
GoogleSearchIE,
GenericIE, GenericIE,
) )
@ -240,6 +241,14 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'Always/Never: A Little-Seen Movie About Nuclear Command and Control : The New Yorker') self.assertEqual(result['title'], 'Always/Never: A Little-Seen Movie About Nuclear Command and Control : The New Yorker')
self.assertEqual(len(result['entries']), 3) self.assertEqual(len(result['entries']), 3)
def test_GoogleSearch(self):
dl = FakeYDL()
ie = GoogleSearchIE(dl)
result = ie.extract('gvsearch15:python language')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'python language')
self.assertEqual(result['title'], 'python language')
self.assertTrue(len(result['entries']) == 15)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -27,6 +27,12 @@ _TESTS = [
85, 85,
u'3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', u'3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@',
), ),
(
u'https://s.ytimg.com/yts/jsbin/html5player-vfle-mVwz.js',
u'js',
90,
u']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876',
),
] ]

View File

@ -60,6 +60,7 @@ from .exfm import ExfmIE
from .extremetube import ExtremeTubeIE from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE from .facebook import FacebookIE
from .faz import FazIE from .faz import FazIE
from .firstpost import FirstpostIE
from .fktv import ( from .fktv import (
FKTVIE, FKTVIE,
FKTVPosteckeIE, FKTVPosteckeIE,
@ -96,6 +97,7 @@ from .ina import InaIE
from .infoq import InfoQIE from .infoq import InfoQIE
from .instagram import InstagramIE from .instagram import InstagramIE
from .internetvideoarchive import InternetVideoArchiveIE from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE
from .ivi import ( from .ivi import (
IviIE, IviIE,
IviCompilationIE IviCompilationIE
@ -117,6 +119,7 @@ from .lynda import (
LyndaIE, LyndaIE,
LyndaCourseIE LyndaCourseIE
) )
from .m6 import M6IE
from .macgamestore import MacGameStoreIE from .macgamestore import MacGameStoreIE
from .malemotion import MalemotionIE from .malemotion import MalemotionIE
from .mdr import MDRIE from .mdr import MDRIE
@ -126,6 +129,7 @@ from .mit import TechTVMITIE, MITIE
from .mixcloud import MixcloudIE from .mixcloud import MixcloudIE
from .mpora import MporaIE from .mpora import MporaIE
from .mofosex import MofosexIE from .mofosex import MofosexIE
from .mooshare import MooshareIE
from .mtv import ( from .mtv import (
MTVIE, MTVIE,
MTVIggyIE, MTVIggyIE,

View File

@ -1,19 +1,14 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import datetime import datetime
import json
import re import re
import socket
from .common import InfoExtractor from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor from .subtitles import SubtitlesInfoExtractor
from ..utils import ( from ..utils import (
compat_http_client,
compat_str, compat_str,
compat_urllib_error,
compat_urllib_request, compat_urllib_request,
ExtractorError,
unescapeHTML, unescapeHTML,
) )

View File

@ -38,7 +38,7 @@ class CollegeHumorIE(InfoExtractor):
'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]', 'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
'uploader': 'Funnyplox TV', 'uploader': 'Funnyplox TV',
'uploader_id': 'funnyploxtv', 'uploader_id': 'funnyploxtv',
'description': 'md5:506f69f7a297ed698ced3375f2363b0e', 'description': 'md5:11812366244110c3523968aa74f02521',
'upload_date': '20140128', 'upload_date': '20140128',
}, },
'params': { 'params': {

View File

@ -0,0 +1,38 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class FirstpostIE(InfoExtractor):
IE_NAME = 'Firstpost.com'
_VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html'
_TEST = {
'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html',
'md5': 'ee9114957692f01fb1263ed87039112a',
'info_dict': {
'id': '1025403',
'ext': 'mp4',
'title': 'India to launch indigenous aircraft carrier INS Vikrant today',
'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
r'<div.*?name="div_video".*?flashvars="([^"]+)">',
webpage, 'video URL')
return {
'id': video_id,
'url': video_url,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import itertools import itertools
import re import re
@ -8,32 +10,42 @@ from ..utils import (
class GoogleSearchIE(SearchInfoExtractor): class GoogleSearchIE(SearchInfoExtractor):
IE_DESC = u'Google Video search' IE_DESC = 'Google Video search'
_MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
_MAX_RESULTS = 1000 _MAX_RESULTS = 1000
IE_NAME = u'video.google:search' IE_NAME = 'video.google:search'
_SEARCH_KEY = 'gvsearch' _SEARCH_KEY = 'gvsearch'
def _get_n_results(self, query, n): def _get_n_results(self, query, n):
"""Get a specified number of results for a query""" """Get a specified number of results for a query"""
entries = []
res = { res = {
'_type': 'playlist', '_type': 'playlist',
'id': query, 'id': query,
'entries': [] 'title': query,
} }
for pagenum in itertools.count(1): for pagenum in itertools.count():
result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) result_url = (
webpage = self._download_webpage(result_url, u'gvsearch:' + query, 'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
note='Downloading result page ' + str(pagenum)) % (compat_urllib_parse.quote_plus(query), pagenum * 10))
for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage): webpage = self._download_webpage(
e = { result_url, 'gvsearch:' + query,
note='Downloading result page ' + str(pagenum + 1))
for hit_idx, mobj in enumerate(re.finditer(
r'<h3 class="r"><a href="([^"]+)"', webpage)):
# Skip playlists
if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
continue
entries.append({
'_type': 'url', '_type': 'url',
'url': mobj.group(1) 'url': mobj.group(1)
} })
res['entries'].append(e)
if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage): if (len(entries) >= n) or not re.search(r'class="pn" id="pnnext"', webpage):
res['entries'] = entries[:n]
return res return res

View File

@ -1,39 +1,36 @@
# encoding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
class InaIE(InfoExtractor): class InaIE(InfoExtractor):
"""Information Extractor for Ina.fr""" _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
_VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I?[A-F0-9]+)/.*'
_TEST = { _TEST = {
u'url': u'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
u'file': u'I12055569.mp4', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
u'md5': u'a667021bf2b41f8dc6049479d9bb38a3', 'info_dict': {
u'info_dict': { 'id': 'I12055569',
u"title": u"Fran\u00e7ois Hollande \"Je crois que c'est clair\"" 'ext': 'mp4',
'title': 'François Hollande "Je crois que c\'est clair"',
} }
} }
def _real_extract(self,url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
video_extension = 'mp4' info_doc = self._download_xml(mrss_url, video_id)
webpage = self._download_webpage(mrss_url, video_id)
self.report_extraction(video_id) self.report_extraction(video_id)
video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', video_url = info_doc.find('.//{http://search.yahoo.com/mrss/}player').attrib['url']
webpage, u'video URL')
video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', return {
webpage, u'title') 'id': video_id,
'url': video_url,
return [{ 'title': info_doc.find('.//title').text,
'id': video_id, }
'url': video_url,
'ext': video_extension,
'title': video_title,
}]

View File

@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from random import random
from math import floor
from .common import InfoExtractor
from ..utils import compat_urllib_request
class IPrimaIE(InfoExtractor):
_VALID_URL = r'https?://play\.iprima\.cz/(?P<videogroup>.+)/(?P<videoid>.+)'
_TESTS = [{
'url': 'http://play.iprima.cz/particka/particka-92',
'info_dict': {
'id': '39152',
'ext': 'flv',
'title': 'Partička (92)',
'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6',
'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
},
'params': {
'skip_download': True,
},
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
webpage = self._download_webpage(url, video_id)
player_url = 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % (
floor(random()*1073741824),
floor(random()*1073741824))
req = compat_urllib_request.Request(player_url)
req.add_header('Referer', url)
playerpage = self._download_webpage(req, video_id)
base_url = ''.join(re.findall(r"embed\['stream'\] = '(.+?)'.+'(\?auth=)'.+'(.+?)';", playerpage)[1])
zoneGEO = self._html_search_regex(r'"zoneGEO":(.+?),', webpage, 'zoneGEO')
if zoneGEO != '0':
base_url = base_url.replace('token', 'token_'+zoneGEO)
formats = []
for format_id in ['lq', 'hq', 'hd']:
filename = self._html_search_regex(r'"%s_id":(.+?),' % format_id, webpage, 'filename')
if filename == 'null':
continue
real_id = self._search_regex(r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id')
if format_id == 'lq':
quality = 0
elif format_id == 'hq':
quality = 1
elif format_id == 'hd':
quality = 2
filename = 'hq/'+filename
formats.append({
'format_id': format_id,
'url': base_url,
'quality': quality,
'play_path': 'mp4:'+filename.replace('"', '')[:-4],
'rtmp_live': True,
'ext': 'flv',
})
self._sort_formats(formats)
return {
'id': real_id,
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
'description': self._og_search_description(webpage),
}

View File

@ -31,7 +31,7 @@ class LifeNewsIE(InfoExtractor):
webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page') webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page')
video_url = self._html_search_regex( video_url = self._html_search_regex(
r'<video.*?src="([^"]+)"></video>', webpage, 'video URL') r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL')
thumbnail = self._html_search_regex( thumbnail = self._html_search_regex(
r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail') r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail')

View File

@ -0,0 +1,56 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class M6IE(InfoExtractor):
IE_NAME = 'm6'
_VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html'
_TEST = {
'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html',
'md5': '242994a87de2c316891428e0176bcb77',
'info_dict': {
'id': '11323908',
'ext': 'mp4',
'title': 'Emeline est la Reine du Shopping sur le thème « Ma fête danniversaire ! »',
'description': 'md5:1212ae8fb4b7baa4dc3886c5676007c2',
'duration': 100,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
'Downloading video RSS')
title = rss.find('./channel/item/title').text
description = rss.find('./channel/item/description').text
thumbnail = rss.find('./channel/item/visuel_clip_big').text
duration = int(rss.find('./channel/item/duration').text)
view_count = int(rss.find('./channel/item/nombre_vues').text)
formats = []
for format_id in ['lq', 'sd', 'hq', 'hd']:
video_url = rss.find('./channel/item/url_video_%s' % format_id)
if video_url is None:
continue
formats.append({
'url': video_url.text,
'format_id': format_id,
})
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
'formats': formats,
}

View File

@ -0,0 +1,114 @@
from __future__ import unicode_literals
import re
import time
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_request,
compat_urllib_parse,
)
class MooshareIE(InfoExtractor):
IE_NAME = 'mooshare'
IE_DESC = 'Mooshare.biz'
_VALID_URL = r'http://mooshare\.biz/(?P<id>[\da-z]{12})'
_TESTS = [
{
'url': 'http://mooshare.biz/8dqtk4bjbp8g',
'md5': '4e14f9562928aecd2e42c6f341c8feba',
'info_dict': {
'id': '8dqtk4bjbp8g',
'ext': 'mp4',
'title': 'Comedy Football 2011 - (part 1-2)',
'duration': 893,
},
},
{
'url': 'http://mooshare.biz/aipjtoc4g95j',
'info_dict': {
'id': 'aipjtoc4g95j',
'ext': 'mp4',
'title': 'Orange Caramel Dashing Through the Snow',
'duration': 212,
},
'params': {
# rtmp download
'skip_download': True,
}
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
if re.search(r'>Video Not Found or Deleted<', page) is not None:
raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
hash_key = self._html_search_regex(r'<input type="hidden" name="hash" value="([^"]+)">', page, 'hash')
title = self._html_search_regex(r'(?m)<div class="blockTitle">\s*<h2>Watch ([^<]+)</h2>', page, 'title')
download_form = {
'op': 'download1',
'id': video_id,
'hash': hash_key,
}
request = compat_urllib_request.Request(
'http://mooshare.biz/8dqtk4bjbp8g', compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.to_screen('%s: Waiting for timeout' % video_id)
time.sleep(5)
video_page = self._download_webpage(request, video_id, 'Downloading video page')
thumbnail = self._html_search_regex(r'image:\s*"([^"]+)",', video_page, 'thumbnail', fatal=False)
duration_str = self._html_search_regex(r'duration:\s*"(\d+)",', video_page, 'duration', fatal=False)
duration = int(duration_str) if duration_str is not None else None
formats = []
# SD video
mobj = re.search(r'(?m)file:\s*"(?P<url>[^"]+)",\s*provider:', video_page)
if mobj is not None:
formats.append({
'url': mobj.group('url'),
'format_id': 'sd',
'format': 'SD',
})
# HD video
mobj = re.search(r'\'hd-2\': { file: \'(?P<url>[^\']+)\' },', video_page)
if mobj is not None:
formats.append({
'url': mobj.group('url'),
'format_id': 'hd',
'format': 'HD',
})
# rtmp video
mobj = re.search(r'(?m)file: "(?P<playpath>[^"]+)",\s*streamer: "(?P<rtmpurl>rtmp://[^"]+)",', video_page)
if mobj is not None:
formats.append({
'url': mobj.group('rtmpurl'),
'play_path': mobj.group('playpath'),
'rtmp_live': False,
'ext': 'mp4',
'format_id': 'rtmp',
'format': 'HD',
})
return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re import re
import json import json
@ -5,30 +7,63 @@ from .common import InfoExtractor
class PBSIE(InfoExtractor): class PBSIE(InfoExtractor):
_VALID_URL = r'https?://video\.pbs\.org/video/(?P<id>\d+)/?' _VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
video\.pbs\.org/video/(?P<id>[0-9]+)/? |
# Article with embedded player
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
# Player
video\.pbs\.org/partnerplayer/(?P<player_id>[^/]+)/
)
'''
_TEST = { _TEST = {
u'url': u'http://video.pbs.org/video/2365006249/', 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
u'file': u'2365006249.mp4', 'md5': 'ce1888486f0908d555a8093cac9a7362',
u'md5': 'ce1888486f0908d555a8093cac9a7362', 'info_dict': {
u'info_dict': { 'id': '2365006249',
u'title': u'A More Perfect Union', 'ext': 'mp4',
u'description': u'md5:ba0c207295339c8d6eced00b7c363c6a', 'title': 'A More Perfect Union',
u'duration': 3190, 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
'duration': 3190,
}, },
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
presumptive_id = mobj.group('presumptive_id')
display_id = presumptive_id
if presumptive_id:
webpage = self._download_webpage(url, display_id)
url = self._search_regex(
r'<iframe\s+id=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
webpage, 'player URL')
mobj = re.match(self._VALID_URL, url)
player_id = mobj.group('player_id')
if not display_id:
display_id = player_id
if player_id:
player_page = self._download_webpage(
url, display_id, note='Downloading player page',
errnote='Could not download player page')
video_id = self._search_regex(
r'<div\s+id="video_([0-9]+)"', player_page, 'video ID')
else:
video_id = mobj.group('id')
display_id = video_id
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info_page = self._download_webpage(info_url, video_id) info = self._download_json(info_url, display_id)
info =json.loads(info_page)
return {'id': video_id, return {
'title': info['title'], 'id': video_id,
'url': info['alternate_encoding']['url'], 'title': info['title'],
'ext': 'mp4', 'url': info['alternate_encoding']['url'],
'description': info['program'].get('description'), 'ext': 'mp4',
'thumbnail': info.get('image_url'), 'description': info['program'].get('description'),
'duration': info.get('duration'), 'thumbnail': info.get('image_url'),
} 'duration': info.get('duration'),
}

View File

@ -1,36 +1,38 @@
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
class StatigramIE(InfoExtractor): class StatigramIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' _VALID_URL = r'https?://(www\.)?statigr\.am/p/(?P<id>[^/]+)'
_TEST = { _TEST = {
u'url': u'http://statigr.am/p/522207370455279102_24101272', 'url': 'http://statigr.am/p/522207370455279102_24101272',
u'file': u'522207370455279102_24101272.mp4', 'md5': '6eb93b882a3ded7c378ee1d6884b1814',
u'md5': u'6eb93b882a3ded7c378ee1d6884b1814', 'info_dict': {
u'info_dict': { 'id': '522207370455279102_24101272',
u'uploader_id': u'aguynamedpatrick', 'ext': 'mp4',
u'title': u'Instagram photo by @aguynamedpatrick (Patrick Janelle)', 'uploader_id': 'aguynamedpatrick',
'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
}, },
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1) video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
html_title = self._html_search_regex( html_title = self._html_search_regex(
r'<title>(.+?)</title>', r'<title>(.+?)</title>',
webpage, u'title') webpage, 'title')
title = re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title) title = re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title)
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(
r'@([^ ]+)', title, u'uploader name', fatal=False) r'@([^ ]+)', title, 'uploader name', fatal=False)
ext = 'mp4'
return [{ return {
'id': video_id, 'id': video_id,
'url': self._og_search_video_url(webpage), 'url': self._og_search_video_url(webpage),
'ext': ext, 'title': title,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id' : uploader_id 'uploader_id': uploader_id
}] }

View File

@ -68,13 +68,14 @@ class SubtitlesInfoExtractor(InfoExtractor):
def _request_subtitle_url(self, sub_lang, url): def _request_subtitle_url(self, sub_lang, url):
""" makes the http request for the subtitle """ """ makes the http request for the subtitle """
try: try:
return self._download_subtitle_url(sub_lang, url) sub = self._download_subtitle_url(sub_lang, url)
except ExtractorError as err: except ExtractorError as err:
self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
return return
if not sub: if not sub:
self._downloader.report_warning(u'Did not fetch video subtitles') self._downloader.report_warning(u'Did not fetch video subtitles')
return return
return sub
def _get_available_subtitles(self, video_id, webpage): def _get_available_subtitles(self, video_id, webpage):
""" """

View File

@ -1,22 +1,23 @@
#coding: utf-8 #coding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import determine_ext
determine_ext,
)
class ThisAVIE(InfoExtractor): class ThisAVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
_TEST = { _TEST = {
u"url": u"http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html", 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html',
u"file": u"47734.flv", 'md5': '0480f1ef3932d901f0e0e719f188f19b',
u"md5": u"0480f1ef3932d901f0e0e719f188f19b", 'info_dict': {
u"info_dict": { 'id': '47734',
u"title": u"高樹マリア - Just fit", 'ext': 'flv',
u"uploader": u"dj7970", 'title': '高樹マリア - Just fit',
u"uploader_id": u"dj7970" 'uploader': 'dj7970',
'uploader_id': 'dj7970'
} }
} }
@ -25,19 +26,18 @@ class ThisAVIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, u'title') title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, 'title')
video_url = self._html_search_regex( video_url = self._html_search_regex(
r"addVariable\('file','([^']+)'\);", webpage, u'video url') r"addVariable\('file','([^']+)'\);", webpage, 'video url')
uploader = self._html_search_regex( uploader = self._html_search_regex(
r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>',
webpage, u'uploader name', fatal=False) webpage, 'uploader name', fatal=False)
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(
r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
webpage, u'uploader id', fatal=False) webpage, 'uploader id', fatal=False)
ext = determine_ext(video_url) ext = determine_ext(video_url)
return { return {
'_type': 'video',
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': uploader, 'uploader': uploader,

View File

@ -1,4 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -9,25 +11,25 @@ from ..utils import (
class TouTvIE(InfoExtractor): class TouTvIE(InfoExtractor):
IE_NAME = u'tou.tv' IE_NAME = 'tou.tv'
_VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))' _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
_TEST = { _TEST = {
u'url': u'http://www.tou.tv/30-vies/S04E41', 'url': 'http://www.tou.tv/30-vies/S04E41',
u'file': u'30-vies_S04E41.mp4', 'file': '30-vies_S04E41.mp4',
u'info_dict': { 'info_dict': {
u'title': u'30 vies Saison 4 / Épisode 41', 'title': '30 vies Saison 4 / Épisode 41',
u'description': u'md5:da363002db82ccbe4dafeb9cab039b09', 'description': 'md5:da363002db82ccbe4dafeb9cab039b09',
u'age_limit': 8, 'age_limit': 8,
u'uploader': u'Groupe des Nouveaux Médias', 'uploader': 'Groupe des Nouveaux Médias',
u'duration': 1296, 'duration': 1296,
u'upload_date': u'20131118', 'upload_date': '20131118',
u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg', 'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
}, },
u'params': { 'params': {
u'skip_download': True, # Requires rtmpdump 'skip_download': True, # Requires rtmpdump
}, },
u'skip': 'Only available in Canada' 'skip': 'Only available in Canada'
} }
def _real_extract(self, url): def _real_extract(self, url):
@ -36,25 +38,25 @@ class TouTvIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
mediaId = self._search_regex( mediaId = self._search_regex(
r'"idMedia":\s*"([^"]+)"', webpage, u'media ID') r'"idMedia":\s*"([^"]+)"', webpage, 'media ID')
streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId
streams_doc = self._download_xml( streams_doc = self._download_xml(
streams_url, video_id, note=u'Downloading stream list') streams_url, video_id, note='Downloading stream list')
video_url = next(n.text video_url = next(n.text
for n in streams_doc.findall('.//choice/url') for n in streams_doc.findall('.//choice/url')
if u'//ad.doubleclick' not in n.text) if '//ad.doubleclick' not in n.text)
if video_url.endswith('/Unavailable.flv'): if video_url.endswith('/Unavailable.flv'):
raise ExtractorError( raise ExtractorError(
u'Access to this video is blocked from outside of Canada', 'Access to this video is blocked from outside of Canada',
expected=True) expected=True)
duration_str = self._html_search_meta( duration_str = self._html_search_meta(
'video:duration', webpage, u'duration') 'video:duration', webpage, 'duration')
duration = int(duration_str) if duration_str else None duration = int(duration_str) if duration_str else None
upload_date_str = self._html_search_meta( upload_date_str = self._html_search_meta(
'video:release_date', webpage, u'upload date') 'video:release_date', webpage, 'upload date')
upload_date = unified_strdate(upload_date_str) if upload_date_str else None upload_date = unified_strdate(upload_date_str) if upload_date_str else None
return { return {

View File

@ -6,6 +6,7 @@ from .common import InfoExtractor
class TrailerAddictIE(InfoExtractor): class TrailerAddictIE(InfoExtractor):
_WORKING = False
_VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)'
_TEST = { _TEST = {
'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer',

View File

@ -1,3 +1,6 @@
# encoding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -10,45 +13,44 @@ from ..utils import (
class Vbox7IE(InfoExtractor): class Vbox7IE(InfoExtractor):
"""Information Extractor for Vbox7""" _VALID_URL = r'http://(www\.)?vbox7\.com/play:(?P<id>[^/]+)'
_VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
_TEST = { _TEST = {
u'url': u'http://vbox7.com/play:249bb972c2', 'url': 'http://vbox7.com/play:249bb972c2',
u'file': u'249bb972c2.flv', 'md5': '99f65c0c9ef9b682b97313e052734c3f',
u'md5': u'99f65c0c9ef9b682b97313e052734c3f', 'info_dict': {
u'info_dict': { 'id': '249bb972c2',
u"title": u"\u0421\u043c\u044f\u0445! \u0427\u0443\u0434\u043e - \u0447\u0438\u0441\u0442 \u0437\u0430 \u0441\u0435\u043a\u0443\u043d\u0434\u0438 - \u0421\u043a\u0440\u0438\u0442\u0430 \u043a\u0430\u043c\u0435\u0440\u0430" 'ext': 'flv',
} 'title': 'Смях! Чудо - чист за секунди - Скрита камера',
},
} }
def _real_extract(self,url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None: video_id = mobj.group('id')
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(1)
redirect_page, urlh = self._download_webpage_handle(url, video_id) redirect_page, urlh = self._download_webpage_handle(url, video_id)
new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location') new_location = self._search_regex(r'window\.location = \'(.*)\';',
redirect_page, 'redirect location')
redirect_url = urlh.geturl() + new_location redirect_url = urlh.geturl() + new_location
webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page') webpage = self._download_webpage(redirect_url, video_id,
'Downloading redirect page')
title = self._html_search_regex(r'<title>(.*)</title>', title = self._html_search_regex(r'<title>(.*)</title>',
webpage, u'title').split('/')[0].strip() webpage, 'title').split('/')[0].strip()
ext = "flv"
info_url = "http://vbox7.com/play/magare.do" info_url = "http://vbox7.com/play/magare.do"
data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}) data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
info_request = compat_urllib_request.Request(info_url, data) info_request = compat_urllib_request.Request(info_url, data)
info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage')
if info_response is None: if info_response is None:
raise ExtractorError(u'Unable to extract the media url') raise ExtractorError('Unable to extract the media url')
(final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
return [{ return {
'id': video_id, 'id': video_id,
'url': final_url, 'url': final_url,
'ext': ext, 'ext': 'flv',
'title': title, 'title': title,
'thumbnail': thumbnail_url, 'thumbnail': thumbnail_url,
}] }

View File

@ -1,18 +1,21 @@
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
class VineIE(InfoExtractor): class VineIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)' _VALID_URL = r'https?://(?:www\.)?vine\.co/v/(?P<id>\w+)'
_TEST = { _TEST = {
u'url': u'https://vine.co/v/b9KOOWX7HUx', 'url': 'https://vine.co/v/b9KOOWX7HUx',
u'file': u'b9KOOWX7HUx.mp4', 'md5': '2f36fed6235b16da96ce9b4dc890940d',
u'md5': u'2f36fed6235b16da96ce9b4dc890940d', 'info_dict': {
u'info_dict': { 'id': 'b9KOOWX7HUx',
u"uploader": u"Jack Dorsey", 'ext': 'mp4',
u"title": u"Chicken." 'uploader': 'Jack Dorsey',
} 'title': 'Chicken.',
},
} }
def _real_extract(self, url): def _real_extract(self, url):
@ -24,17 +27,17 @@ class VineIE(InfoExtractor):
self.report_extraction(video_id) self.report_extraction(video_id)
video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"', video_url = self._html_search_meta('twitter:player:stream', webpage,
webpage, u'video URL') 'video URL')
uploader = self._html_search_regex(r'<p class="username">(.*?)</p>', uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
webpage, u'uploader', fatal=False, flags=re.DOTALL) webpage, 'uploader', fatal=False, flags=re.DOTALL)
return [{ return {
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'ext': 'mp4', 'ext': 'mp4',
'title': self._og_search_title(webpage), 'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'uploader': uploader, 'uploader': uploader,
}] }

View File

@ -502,7 +502,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return a % b return a % b
m = re.match( m = re.match(
r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr) r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
if m: if m:
fname = m.group('func') fname = m.group('func')
if fname not in functions: if fname not in functions:

View File

@ -1,2 +1,2 @@
__version__ = '2014.02.03.1' __version__ = '2014.02.06'