Compare commits


9 Commits

10 changed files with 291 additions and 106 deletions

test/test_utils.py

@@ -127,6 +127,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_strdate('8/7/2009'), '20090708')
         self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
         self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
+        self.assertEqual(unified_strdate('1968-12-10'), '19681210')

     def test_find_xpath_attr(self):
         testxml = u'''<root>

youtube_dl/extractor/__init__.py

@@ -115,6 +115,7 @@ from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
 from .keek import KeekIE
+from .kontrtube import KontrTubeIE
 from .la7 import LA7IE
 from .lifenews import LifeNewsIE
 from .liveleak import LiveLeakIE

youtube_dl/extractor/bbc.py

@@ -2,29 +2,160 @@ from __future__ import unicode_literals

 import re

-from .common import InfoExtractor
+from .subtitles import SubtitlesInfoExtractor
 from ..utils import ExtractorError


-class BBCCoUkIE(InfoExtractor):
+class BBCCoUkIE(SubtitlesInfoExtractor):
     IE_NAME = 'bbc.co.uk'
-    IE_DESC = 'BBC - iPlayer Radio'
+    IE_DESC = 'BBC iPlayer'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})'

-    _TEST = {
-        'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
-        'info_dict': {
-            'id': 'p01q7wz4',
-            'ext': 'flv',
-            'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
-            'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
-            'duration': 1936,
-        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
-    }
+    _TESTS = [
+        {
+            'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
+            'info_dict': {
+                'id': 'p01q7wz4',
+                'ext': 'flv',
+                'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
+                'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
+                'duration': 1936,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
+            'info_dict': {
+                'id': 'b00yng1d',
+                'ext': 'flv',
+                'title': 'The Man in Black: Series 3: The Printed Name',
+                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
+                'duration': 1800,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
+            'info_dict': {
+                'id': 'b00yng1d',
+                'ext': 'flv',
+                'title': 'The Voice UK: Series 3: Blind Auditions 5',
+                'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
+                'duration': 5100,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+        }
+    ]
+
+    def _extract_asx_playlist(self, connection, programme_id):
+        asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
+        return [ref.get('href') for ref in asx.findall('./Entry/ref')]
+
+    def _extract_connection(self, connection, programme_id):
+        formats = []
+        protocol = connection.get('protocol')
+        supplier = connection.get('supplier')
+        if protocol == 'http':
+            href = connection.get('href')
+            # ASX playlist
+            if supplier == 'asx':
+                for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
+                    formats.append({
+                        'url': ref,
+                        'format_id': 'ref%s_%s' % (i, supplier),
+                    })
+            # Direct link
+            else:
+                formats.append({
+                    'url': href,
+                    'format_id': supplier,
+                })
+        elif protocol == 'rtmp':
+            application = connection.get('application', 'ondemand')
+            auth_string = connection.get('authString')
+            identifier = connection.get('identifier')
+            server = connection.get('server')
+            formats.append({
+                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
+                'play_path': identifier,
+                'app': '%s?%s' % (application, auth_string),
+                'page_url': 'http://www.bbc.co.uk',
+                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
+                'rtmp_live': False,
+                'ext': 'flv',
+                'format_id': supplier,
+            })
+        return formats
+
+    def _extract_items(self, playlist):
+        return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
+
+    def _extract_medias(self, media_selection):
+        return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
+
+    def _extract_connections(self, media):
+        return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
+
+    def _extract_video(self, media, programme_id):
+        formats = []
+        vbr = int(media.get('bitrate'))
+        vcodec = media.get('encoding')
+        service = media.get('service')
+        width = int(media.get('width'))
+        height = int(media.get('height'))
+        file_size = int(media.get('media_file_size'))
+        for connection in self._extract_connections(media):
+            conn_formats = self._extract_connection(connection, programme_id)
+            for format in conn_formats:
+                format.update({
+                    'format_id': '%s_%s' % (service, format['format_id']),
+                    'width': width,
+                    'height': height,
+                    'vbr': vbr,
+                    'vcodec': vcodec,
+                    'filesize': file_size,
+                })
+            formats.extend(conn_formats)
+        return formats
+
+    def _extract_audio(self, media, programme_id):
+        formats = []
+        abr = int(media.get('bitrate'))
+        acodec = media.get('encoding')
+        service = media.get('service')
+        for connection in self._extract_connections(media):
+            conn_formats = self._extract_connection(connection, programme_id)
+            for format in conn_formats:
+                format.update({
+                    'format_id': '%s_%s' % (service, format['format_id']),
+                    'abr': abr,
+                    'acodec': acodec,
+                })
+            formats.extend(conn_formats)
+        return formats
+
+    def _extract_captions(self, media, programme_id):
+        subtitles = {}
+        for connection in self._extract_connections(media):
+            captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
+            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
+            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
+            srt = ''
+            for pos, p in enumerate(ps):
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
+                                                          p.text.strip() if p.text is not None else '')
+            subtitles[lang] = srt
+        return subtitles

     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -33,84 +164,54 @@ class BBCCoUkIE(InfoExtractor):
         playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
                                       'Downloading playlist XML')

-        item = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}item')
-        if item is None:
-            no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
-            if no_items is not None:
-                reason = no_items.get('reason')
-                if reason == 'preAvailability':
-                    msg = 'Episode %s is not yet available' % group_id
-                elif reason == 'postAvailability':
-                    msg = 'Episode %s is no longer available' % group_id
-                else:
-                    msg = 'Episode %s is not available: %s' % (group_id, reason)
-                raise ExtractorError(msg, expected=True)
-            raise ExtractorError('Failed to extract media for episode %s' % group_id, expected=True)
-
-        title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
-        description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
-
-        radio_programme_id = item.get('identifier')
-        duration = int(item.get('duration'))
-
-        media_selection = self._download_xml(
-            'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % radio_programme_id,
-            radio_programme_id, 'Downloading media selection XML')
+        no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
+        if no_items is not None:
+            reason = no_items.get('reason')
+            if reason == 'preAvailability':
+                msg = 'Episode %s is not yet available' % group_id
+            elif reason == 'postAvailability':
+                msg = 'Episode %s is no longer available' % group_id
+            else:
+                msg = 'Episode %s is not available: %s' % (group_id, reason)
+            raise ExtractorError(msg, expected=True)

         formats = []
-        for media in media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media'):
-            bitrate = int(media.get('bitrate'))
-            encoding = media.get('encoding')
-            service = media.get('service')
-            connection = media.find('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
-            protocol = connection.get('protocol')
-            priority = connection.get('priority')
-            supplier = connection.get('supplier')
-            if protocol == 'http':
-                href = connection.get('href')
-                # ASX playlist
-                if supplier == 'asx':
-                    asx = self._download_xml(href, radio_programme_id, 'Downloading %s ASX playlist' % service)
-                    for i, ref in enumerate(asx.findall('./Entry/ref')):
-                        formats.append({
-                            'url': ref.get('href'),
-                            'format_id': '%s_ref%s' % (service, i),
-                            'abr': bitrate,
-                            'acodec': encoding,
-                            'preference': priority,
-                        })
-                    continue
-                # Direct link
-                formats.append({
-                    'url': href,
-                    'format_id': service,
-                    'abr': bitrate,
-                    'acodec': encoding,
-                    'preference': priority,
-                })
-            elif protocol == 'rtmp':
-                application = connection.get('application', 'ondemand')
-                auth_string = connection.get('authString')
-                identifier = connection.get('identifier')
-                server = connection.get('server')
-                formats.append({
-                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
-                    'play_path': identifier,
-                    'app': '%s?%s' % (application, auth_string),
-                    'rtmp_live': False,
-                    'ext': 'flv',
-                    'format_id': service,
-                    'abr': bitrate,
-                    'acodec': encoding,
-                    'preference': priority,
-                })
+        subtitles = None
+
+        for item in self._extract_items(playlist):
+            kind = item.get('kind')
+            if kind != 'programme' and kind != 'radioProgramme':
+                continue
+
+            title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
+            description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
+
+            programme_id = item.get('identifier')
+            duration = int(item.get('duration'))
+
+            media_selection = self._download_xml(
+                'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
+                programme_id, 'Downloading media selection XML')
+
+            for media in self._extract_medias(media_selection):
+                kind = media.get('kind')
+                if kind == 'audio':
+                    formats.extend(self._extract_audio(media, programme_id))
+                elif kind == 'video':
+                    formats.extend(self._extract_video(media, programme_id))
+                elif kind == 'captions':
+                    subtitles = self._extract_captions(media, programme_id)
+
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(programme_id, subtitles)
+            return

         self._sort_formats(formats)

         return {
-            'id': radio_programme_id,
+            'id': programme_id,
             'title': title,
             'description': description,
             'duration': duration,
             'formats': formats,
+            'subtitles': subtitles,
         }
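Note: the new `_extract_captions` converts the BBC's TTML (`ttaf1` namespace) captions document into SRT by hand. A minimal standalone sketch of the same conversion, using an inline sample document instead of a real mediaselector response (namespace and element layout are taken from the code above; the zero-based cue numbering and dotted timestamps mirror the extractor rather than strict SRT):

    import xml.etree.ElementTree as ET

    TTAF = '{http://www.w3.org/2006/10/ttaf1}'
    SAMPLE = (
        '<tt xmlns="http://www.w3.org/2006/10/ttaf1"><body><div>'
        '<p begin="00:00:01.000" end="00:00:02.500">Hello</p>'
        '<p begin="00:00:03.000" end="00:00:04.000">World</p>'
        '</div></body></tt>'
    )

    captions = ET.fromstring(SAMPLE)
    srt = ''
    for pos, p in enumerate(captions.findall('./{0}body/{0}div/{0}p'.format(TTAF))):
        # One SRT cue per <p>: index, "begin --> end" timing line, then the text
        srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (
            pos, p.get('begin'), p.get('end'),
            p.text.strip() if p.text is not None else '')
    print(srt)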

youtube_dl/extractor/bloomberg.py

@@ -24,5 +24,7 @@ class BloombergIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         name = mobj.group('name')
         webpage = self._download_webpage(url, name)
-        ooyala_url = self._twitter_search_player(webpage)
-        return self.url_result(ooyala_url, OoyalaIE.ie_key())
+        embed_code = self._search_regex(
+            r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
+            'embed code')
+        return OoyalaIE._build_url_result(embed_code)
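Note: instead of resolving the Ooyala URL via the Twitter player card, the extractor now reads the embed code straight out of a `<source src=...>` URL, capturing the third path segment after the host. A quick illustration against a made-up URL of that shape (the real Bloomberg page markup may differ):

    import re

    # Hypothetical <source> tag shaped the way the new regex expects
    webpage = '<source src="http://ooyalahd2-f.akamaihd.net/hds/ipad/RxZG5qWTrxd0SrEagg6QIM/manifest.f4m" />'
    embed_code = re.search(
        r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage).group(1)
    print(embed_code)  # RxZG5qWTrxd0SrEagg6QIM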

youtube_dl/extractor/common.py

@@ -271,8 +271,11 @@ class InfoExtractor(object):

     def _download_json(self, url_or_request, video_id,
                        note=u'Downloading JSON metadata',
-                       errnote=u'Unable to download JSON metadata'):
+                       errnote=u'Unable to download JSON metadata',
+                       transform_source=None):
         json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        if transform_source:
+            json_string = transform_source(json_string)
         try:
             return json.loads(json_string)
         except ValueError as ve:
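Note: `transform_source` gives callers a hook to repair a response body before it reaches `json.loads`; the YouTube channel change below passes `uppercase_escape` through it. A self-contained sketch of the pattern, with a stub `fetch` standing in for `_download_webpage`:

    import json

    def download_json(fetch, url, transform_source=None):
        json_string = fetch(url)  # fetch() stands in for _download_webpage()
        if transform_source:
            # Let the caller fix up the raw body before parsing
            json_string = transform_source(json_string)
        return json.loads(json_string)

    # '\U0001F600' is not a valid JSON escape, so parsing the raw body would
    # raise ValueError; a transform_source can rewrite it first.
    body = '{"smiley": "\\U0001F600"}'
    print(download_json(lambda url: body, 'http://example.com/api',
                        transform_source=lambda s: s.replace('\\U0001F600', ':D')))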

youtube_dl/extractor/kontrtube.py

@@ -0,0 +1,66 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class KontrTubeIE(InfoExtractor):
+    IE_NAME = 'kontrtube'
+    IE_DESC = 'KontrTube.ru - Труба зовёт'
+    _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+'
+
+    _TEST = {
+        'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
+        'md5': '975a991a4926c9a85f383a736a2e6b80',
+        'info_dict': {
+            'id': '2678',
+            'ext': 'mp4',
+            'title': 'Над олимпийской деревней в Сочи поднят российский флаг',
+            'description': 'md5:80edc4c613d5887ae8ccf1d59432be41',
+            'thumbnail': 'http://www.kontrtube.ru/contents/videos_screenshots/2000/2678/preview.mp4.jpg',
+            'duration': 270,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id, 'Downloading page')
+
+        video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
+        thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
+        title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
+            'video title')
+        description = self._html_search_meta('description', webpage, 'video description')
+
+        mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
+            webpage)
+        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+
+        view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
+            'view count', fatal=False)
+        view_count = int(view_count) if view_count is not None else None
+
+        comment_count = None
+        comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
+            fatal=False)
+        if comment_str.startswith('комментариев нет'):
+            comment_count = 0
+        else:
+            mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
+            if mobj:
+                comment_count = int(mobj.group('total'))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+        }
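Note: KontrTube publishes the duration as a Russian "Xм:Yс" (minutes:seconds) label, which the new extractor converts to seconds. The parsing step in isolation:

    import re

    # 'Длительность: 4м:30с' means 'Duration: 4m:30s'
    mobj = re.search(r'(?P<minutes>\d+)м:(?P<seconds>\d+)с', 'Длительность: 4м:30с')
    duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
    print(duration)  # 270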

youtube_dl/extractor/slideshare.py

@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 import json

@@ -12,11 +14,12 @@ class SlideshareIE(InfoExtractor):
     _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'

     _TEST = {
-        u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
-        u'file': u'25665706.mp4',
-        u'info_dict': {
-            u'title': u'Managing Scale and Complexity',
-            u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix',
+        'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
+        'info_dict': {
+            'id': '25665706',
+            'ext': 'mp4',
+            'title': 'Managing Scale and Complexity',
+            'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
         },
     }

@@ -26,15 +29,17 @@ class SlideshareIE(InfoExtractor):
         webpage = self._download_webpage(url, page_title)
         slideshare_obj = self._search_regex(
             r'var slideshare_object = ({.*?}); var user_info =',
-            webpage, u'slideshare object')
+            webpage, 'slideshare object')
         info = json.loads(slideshare_obj)
-        if info['slideshow']['type'] != u'video':
-            raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
+        if info['slideshow']['type'] != 'video':
+            raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)

         doc = info['doc']
         bucket = info['jsplayer']['video_bucket']
         ext = info['jsplayer']['video_extension']
         video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
+        description = self._html_search_regex(
+            r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description')

         return {
             '_type': 'video',
@@ -43,5 +48,5 @@ class SlideshareIE(InfoExtractor):
             'ext': ext,
             'url': video_url,
             'thumbnail': info['slideshow']['pin_image_url'],
-            'description': self._og_search_description(webpage),
+            'description': description,
         }

youtube_dl/extractor/youtube.py

@@ -34,6 +34,7 @@ from ..utils import (
     unified_strdate,
     orderedSet,
     write_json_file,
+    uppercase_escape,
 )

 class YoutubeBaseInfoExtractor(InfoExtractor):
@@ -136,7 +137,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                      (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                      (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                         (?:www\.)?deturl\.com/www\.youtube\.com/|
-                        (?:www\.)?pwnyoutube\.com|
+                        (?:www\.)?pwnyoutube\.com/|
                         tube\.majestyc\.net/|
                         youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                      (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
@@ -1590,10 +1591,9 @@ class YoutubeChannelIE(InfoExtractor):
         # Download all channel pages using the json-based channel_ajax query
         for pagenum in itertools.count(1):
             url = self._MORE_PAGES_URL % (pagenum, channel_id)
-            page = self._download_webpage(url, channel_id,
-                                          u'Downloading page #%s' % pagenum)
-
-            page = json.loads(page)
+            page = self._download_json(
+                url, channel_id, note=u'Downloading page #%s' % pagenum,
+                transform_source=uppercase_escape)

             ids_in_page = self.extract_videos_from_page(page['content_html'])
             video_ids.extend(ids_in_page)

youtube_dl/utils.py

@@ -756,9 +756,9 @@ def unified_strdate(date_str):
     """Return a string with the date in the format YYYYMMDD"""
     upload_date = None
     #Replace commas
-    date_str = date_str.replace(',',' ')
+    date_str = date_str.replace(',', ' ')
     # %z (UTC offset) is only supported in python>=3.2
-    date_str = re.sub(r' ?(\+|-)[0-9:]*$', '', date_str)
+    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
     format_expressions = [
         '%d %B %Y',
         '%B %d %Y',
@@ -1214,3 +1214,9 @@ class PagedList(object):
             if end == nextfirstid:
                 break
         return res
+
+
+def uppercase_escape(s):
+    return re.sub(
+        r'\\U([0-9a-fA-F]{8})',
+        lambda m: compat_chr(int(m.group(1), base=16)), s)
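Note: two behavioural points in the utils changes, shown in isolation. The tightened regex strips only a real UTC offset (" +0000" or " +00:00"), so ISO dates such as 1968-12-10 now survive, which is exactly what the new test_utils assertion checks; and `uppercase_escape` decodes the non-standard `\UXXXXXXXX` escapes that the YouTube channel AJAX response needs fixed (built-in `chr` stands in for youtube-dl's `compat_chr` here):

    import re

    def uppercase_escape(s):
        # chr stands in for compat_chr (the Python 2/3 shim used in utils.py)
        return re.sub(
            r'\\U([0-9a-fA-F]{8})',
            lambda m: chr(int(m.group(1), base=16)), s)

    print(uppercase_escape('\\U0001F600'))  # prints the grinning-face character

    # The old pattern r' ?(\+|-)[0-9:]*$' also swallowed the '-10' in '1968-12-10';
    # the new one matches only a plausible trailing offset.
    for date in ('2012/10/11 01:56:38 +0000', '1968-12-10'):
        print(re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date))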

youtube_dl/version.py

@@ -1,2 +1,2 @@

-__version__ = '2014.02.08.2'
+__version__ = '2014.02.10'