Compare commits

...

9 Commits

9 changed files with 170 additions and 51 deletions

View File

@ -34,6 +34,7 @@ from youtube_dl.extractor import (
KhanAcademyIE, KhanAcademyIE,
EveryonesMixtapeIE, EveryonesMixtapeIE,
RutubeChannelIE, RutubeChannelIE,
GoogleSearchIE,
GenericIE, GenericIE,
) )
@ -240,6 +241,14 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'Always/Never: A Little-Seen Movie About Nuclear Command and Control : The New Yorker') self.assertEqual(result['title'], 'Always/Never: A Little-Seen Movie About Nuclear Command and Control : The New Yorker')
self.assertEqual(len(result['entries']), 3) self.assertEqual(len(result['entries']), 3)
def test_GoogleSearch(self):
    """A gvsearchN: query must produce a playlist of exactly N entries."""
    dl = FakeYDL()
    ie = GoogleSearchIE(dl)
    # 'gvsearch15' asks the extractor for exactly 15 results.
    result = ie.extract('gvsearch15:python language')
    self.assertIsPlaylist(result)
    # Search playlists use the query string itself as both id and title.
    self.assertEqual(result['id'], 'python language')
    self.assertEqual(result['title'], 'python language')
    # assertEqual (not assertTrue on ==) so a failure reports both values,
    # matching the style of the other length checks in this test class.
    self.assertEqual(len(result['entries']), 15)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -60,6 +60,7 @@ from .exfm import ExfmIE
from .extremetube import ExtremeTubeIE from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE from .facebook import FacebookIE
from .faz import FazIE from .faz import FazIE
from .firstpost import FirstpostIE
from .fktv import ( from .fktv import (
FKTVIE, FKTVIE,
FKTVPosteckeIE, FKTVPosteckeIE,
@ -118,6 +119,7 @@ from .lynda import (
LyndaIE, LyndaIE,
LyndaCourseIE LyndaCourseIE
) )
from .m6 import M6IE
from .macgamestore import MacGameStoreIE from .macgamestore import MacGameStoreIE
from .malemotion import MalemotionIE from .malemotion import MalemotionIE
from .mdr import MDRIE from .mdr import MDRIE

View File

@ -0,0 +1,38 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class FirstpostIE(InfoExtractor):
    """Extractor for video pages on firstpost.com."""

    IE_NAME = 'Firstpost.com'
    _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html'

    _TEST = {
        'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html',
        'md5': 'ee9114957692f01fb1263ed87039112a',
        'info_dict': {
            'id': '1025403',
            'ext': 'mp4',
            'title': 'India to launch indigenous aircraft carrier INS Vikrant today',
            'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.',
        }
    }

    def _real_extract(self, url):
        # The numeric video id sits at the end of the URL path.
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        # The direct media URL is stashed in the player div's flashvars attribute.
        video_url = self._html_search_regex(
            r'<div.*?name="div_video".*?flashvars="([^"]+)">',
            page, 'video URL')

        info = {
            'id': video_id,
            'url': video_url,
            'title': self._og_search_title(page),
            'description': self._og_search_description(page),
            'thumbnail': self._og_search_thumbnail(page),
        }
        return info

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import itertools import itertools
import re import re
@ -8,32 +10,42 @@ from ..utils import (
class GoogleSearchIE(SearchInfoExtractor): class GoogleSearchIE(SearchInfoExtractor):
IE_DESC = u'Google Video search' IE_DESC = 'Google Video search'
_MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
_MAX_RESULTS = 1000 _MAX_RESULTS = 1000
IE_NAME = u'video.google:search' IE_NAME = 'video.google:search'
_SEARCH_KEY = 'gvsearch' _SEARCH_KEY = 'gvsearch'
def _get_n_results(self, query, n): def _get_n_results(self, query, n):
"""Get a specified number of results for a query""" """Get a specified number of results for a query"""
entries = []
res = { res = {
'_type': 'playlist', '_type': 'playlist',
'id': query, 'id': query,
'entries': [] 'title': query,
} }
for pagenum in itertools.count(1): for pagenum in itertools.count():
result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) result_url = (
webpage = self._download_webpage(result_url, u'gvsearch:' + query, 'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
note='Downloading result page ' + str(pagenum)) % (compat_urllib_parse.quote_plus(query), pagenum * 10))
for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage): webpage = self._download_webpage(
e = { result_url, 'gvsearch:' + query,
note='Downloading result page ' + str(pagenum + 1))
for hit_idx, mobj in enumerate(re.finditer(
r'<h3 class="r"><a href="([^"]+)"', webpage)):
# Skip playlists
if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
continue
entries.append({
'_type': 'url', '_type': 'url',
'url': mobj.group(1) 'url': mobj.group(1)
} })
res['entries'].append(e)
if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage): if (len(entries) >= n) or not re.search(r'class="pn" id="pnnext"', webpage):
res['entries'] = entries[:n]
return res return res

View File

@ -7,7 +7,7 @@ from .common import InfoExtractor
class InaIE(InfoExtractor): class InaIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-F0-9]+)/.*' _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
_TEST = { _TEST = {
'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
'md5': 'a667021bf2b41f8dc6049479d9bb38a3', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3',

View File

@ -0,0 +1,56 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class M6IE(InfoExtractor):
    """Extractor for videos hosted on m6.fr."""

    IE_NAME = 'm6'
    _VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html'

    _TEST = {
        'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html',
        'md5': '242994a87de2c316891428e0176bcb77',
        'info_dict': {
            'id': '11323908',
            'ext': 'mp4',
            # NOTE(review): "danniversaire" looks like a mis-encoded apostrophe
            # (the URL slug contains _8217_, i.e. &#8217;) — confirm against the site.
            'title': 'Emeline est la Reine du Shopping sur le thème « Ma fête danniversaire ! »',
            'description': 'md5:1212ae8fb4b7baa4dc3886c5676007c2',
            'duration': 100,
        }
    }

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # All metadata and format URLs come from a single RSS document.
        rss = self._download_xml(
            'http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
            'Downloading video RSS')
        item = rss.find('./channel/item')

        # One optional URL element per quality level, listed worst to best.
        formats = []
        for quality in ('lq', 'sd', 'hq', 'hd'):
            fmt_node = item.find('url_video_%s' % quality)
            if fmt_node is None:
                continue
            formats.append({
                'url': fmt_node.text,
                'format_id': quality,
            })

        return {
            'id': video_id,
            'title': item.find('title').text,
            'description': item.find('description').text,
            'thumbnail': item.find('visuel_clip_big').text,
            'duration': int(item.find('duration').text),
            'view_count': int(item.find('nombre_vues').text),
            'formats': formats,
        }

View File

@ -1,22 +1,23 @@
#coding: utf-8 #coding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import determine_ext
determine_ext,
)
class ThisAVIE(InfoExtractor): class ThisAVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
_TEST = { _TEST = {
u"url": u"http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html", 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html',
u"file": u"47734.flv", 'md5': '0480f1ef3932d901f0e0e719f188f19b',
u"md5": u"0480f1ef3932d901f0e0e719f188f19b", 'info_dict': {
u"info_dict": { 'id': '47734',
u"title": u"高樹マリア - Just fit", 'ext': 'flv',
u"uploader": u"dj7970", 'title': '高樹マリア - Just fit',
u"uploader_id": u"dj7970" 'uploader': 'dj7970',
'uploader_id': 'dj7970'
} }
} }
@ -25,19 +26,18 @@ class ThisAVIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, u'title') title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, 'title')
video_url = self._html_search_regex( video_url = self._html_search_regex(
r"addVariable\('file','([^']+)'\);", webpage, u'video url') r"addVariable\('file','([^']+)'\);", webpage, 'video url')
uploader = self._html_search_regex( uploader = self._html_search_regex(
r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>',
webpage, u'uploader name', fatal=False) webpage, 'uploader name', fatal=False)
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(
r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
webpage, u'uploader id', fatal=False) webpage, 'uploader id', fatal=False)
ext = determine_ext(video_url) ext = determine_ext(video_url)
return { return {
'_type': 'video',
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': uploader, 'uploader': uploader,

View File

@ -1,4 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -9,25 +11,25 @@ from ..utils import (
class TouTvIE(InfoExtractor): class TouTvIE(InfoExtractor):
IE_NAME = u'tou.tv' IE_NAME = 'tou.tv'
_VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))' _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
_TEST = { _TEST = {
u'url': u'http://www.tou.tv/30-vies/S04E41', 'url': 'http://www.tou.tv/30-vies/S04E41',
u'file': u'30-vies_S04E41.mp4', 'file': '30-vies_S04E41.mp4',
u'info_dict': { 'info_dict': {
u'title': u'30 vies Saison 4 / Épisode 41', 'title': '30 vies Saison 4 / Épisode 41',
u'description': u'md5:da363002db82ccbe4dafeb9cab039b09', 'description': 'md5:da363002db82ccbe4dafeb9cab039b09',
u'age_limit': 8, 'age_limit': 8,
u'uploader': u'Groupe des Nouveaux Médias', 'uploader': 'Groupe des Nouveaux Médias',
u'duration': 1296, 'duration': 1296,
u'upload_date': u'20131118', 'upload_date': '20131118',
u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg', 'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
}, },
u'params': { 'params': {
u'skip_download': True, # Requires rtmpdump 'skip_download': True, # Requires rtmpdump
}, },
u'skip': 'Only available in Canada' 'skip': 'Only available in Canada'
} }
def _real_extract(self, url): def _real_extract(self, url):
@ -36,25 +38,25 @@ class TouTvIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
mediaId = self._search_regex( mediaId = self._search_regex(
r'"idMedia":\s*"([^"]+)"', webpage, u'media ID') r'"idMedia":\s*"([^"]+)"', webpage, 'media ID')
streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId
streams_doc = self._download_xml( streams_doc = self._download_xml(
streams_url, video_id, note=u'Downloading stream list') streams_url, video_id, note='Downloading stream list')
video_url = next(n.text video_url = next(n.text
for n in streams_doc.findall('.//choice/url') for n in streams_doc.findall('.//choice/url')
if u'//ad.doubleclick' not in n.text) if '//ad.doubleclick' not in n.text)
if video_url.endswith('/Unavailable.flv'): if video_url.endswith('/Unavailable.flv'):
raise ExtractorError( raise ExtractorError(
u'Access to this video is blocked from outside of Canada', 'Access to this video is blocked from outside of Canada',
expected=True) expected=True)
duration_str = self._html_search_meta( duration_str = self._html_search_meta(
'video:duration', webpage, u'duration') 'video:duration', webpage, 'duration')
duration = int(duration_str) if duration_str else None duration = int(duration_str) if duration_str else None
upload_date_str = self._html_search_meta( upload_date_str = self._html_search_meta(
'video:release_date', webpage, u'upload date') 'video:release_date', webpage, 'upload date')
upload_date = unified_strdate(upload_date_str) if upload_date_str else None upload_date = unified_strdate(upload_date_str) if upload_date_str else None
return { return {

View File

@ -1,2 +1,2 @@
__version__ = '2014.02.04.1' __version__ = '2014.02.06'