Compare commits: 2014.02.08 ... 2014.02.10

9 commits (SHA1):

2e20bba708
e70dc1d14b
026fcc0495
81c2f20b53
1afe753462
524c2c716a
b542d4bbd7
17968e444c
2e3fd9ec2f
test/test_util.py
@@ -127,6 +127,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_strdate('8/7/2009'), '20090708')
         self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
         self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
+        self.assertEqual(unified_strdate('1968-12-10'), '19681210')
 
     def test_find_xpath_attr(self):
         testxml = u'''<root>
youtube_dl/extractor/__init__.py
@@ -115,6 +115,7 @@ from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
 from .keek import KeekIE
+from .kontrtube import KontrTubeIE
 from .la7 import LA7IE
 from .lifenews import LifeNewsIE
 from .liveleak import LiveLeakIE
youtube_dl/extractor/bbccouk.py
@@ -2,29 +2,160 @@ from __future__ import unicode_literals
 
 import re
 
-from .common import InfoExtractor
+from .subtitles import SubtitlesInfoExtractor
 from ..utils import ExtractorError
 
 
-class BBCCoUkIE(InfoExtractor):
+class BBCCoUkIE(SubtitlesInfoExtractor):
     IE_NAME = 'bbc.co.uk'
-    IE_DESC = 'BBC - iPlayer Radio'
+    IE_DESC = 'BBC iPlayer'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})'
 
-    _TEST = {
-        'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
-        'info_dict': {
-            'id': 'p01q7wz4',
-            'ext': 'flv',
-            'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
-            'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
-            'duration': 1936,
-        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
-    }
+    _TESTS = [
+        {
+            'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
+            'info_dict': {
+                'id': 'p01q7wz4',
+                'ext': 'flv',
+                'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
+                'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
+                'duration': 1936,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
+            'info_dict': {
+                'id': 'b00yng1d',
+                'ext': 'flv',
+                'title': 'The Man in Black: Series 3: The Printed Name',
+                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
+                'duration': 1800,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
+            'info_dict': {
+                'id': 'b00yng1d',
+                'ext': 'flv',
+                'title': 'The Voice UK: Series 3: Blind Auditions 5',
+                'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
+                'duration': 5100,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+        }
+    ]
 
+    def _extract_asx_playlist(self, connection, programme_id):
+        asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
+        return [ref.get('href') for ref in asx.findall('./Entry/ref')]
+
+    def _extract_connection(self, connection, programme_id):
+        formats = []
+        protocol = connection.get('protocol')
+        supplier = connection.get('supplier')
+        if protocol == 'http':
+            href = connection.get('href')
+            # ASX playlist
+            if supplier == 'asx':
+                for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
+                    formats.append({
+                        'url': ref,
+                        'format_id': 'ref%s_%s' % (i, supplier),
+                    })
+            # Direct link
+            else:
+                formats.append({
+                    'url': href,
+                    'format_id': supplier,
+                })
+        elif protocol == 'rtmp':
+            application = connection.get('application', 'ondemand')
+            auth_string = connection.get('authString')
+            identifier = connection.get('identifier')
+            server = connection.get('server')
+            formats.append({
+                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
+                'play_path': identifier,
+                'app': '%s?%s' % (application, auth_string),
+                'page_url': 'http://www.bbc.co.uk',
+                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
+                'rtmp_live': False,
+                'ext': 'flv',
+                'format_id': supplier,
+            })
+        return formats
+
+    def _extract_items(self, playlist):
+        return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
+
+    def _extract_medias(self, media_selection):
+        return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
+
+    def _extract_connections(self, media):
+        return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
+
+    def _extract_video(self, media, programme_id):
+        formats = []
+        vbr = int(media.get('bitrate'))
+        vcodec = media.get('encoding')
+        service = media.get('service')
+        width = int(media.get('width'))
+        height = int(media.get('height'))
+        file_size = int(media.get('media_file_size'))
+        for connection in self._extract_connections(media):
+            conn_formats = self._extract_connection(connection, programme_id)
+            for format in conn_formats:
+                format.update({
+                    'format_id': '%s_%s' % (service, format['format_id']),
+                    'width': width,
+                    'height': height,
+                    'vbr': vbr,
+                    'vcodec': vcodec,
+                    'filesize': file_size,
+                })
+            formats.extend(conn_formats)
+        return formats
+
+    def _extract_audio(self, media, programme_id):
+        formats = []
+        abr = int(media.get('bitrate'))
+        acodec = media.get('encoding')
+        service = media.get('service')
+        for connection in self._extract_connections(media):
+            conn_formats = self._extract_connection(connection, programme_id)
+            for format in conn_formats:
+                format.update({
+                    'format_id': '%s_%s' % (service, format['format_id']),
+                    'abr': abr,
+                    'acodec': acodec,
+                })
+            formats.extend(conn_formats)
+        return formats
+
+    def _extract_captions(self, media, programme_id):
+        subtitles = {}
+        for connection in self._extract_connections(media):
+            captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
+            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
+            ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
+            srt = ''
+            for pos, p in enumerate(ps):
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
+                                                          p.text.strip() if p.text is not None else '')
+            subtitles[lang] = srt
+        return subtitles
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -33,84 +164,54 @@ class BBCCoUkIE(InfoExtractor):
         playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
                                       'Downloading playlist XML')
 
-        item = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}item')
-        if item is None:
-            no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
-            if no_items is not None:
-                reason = no_items.get('reason')
-                if reason == 'preAvailability':
-                    msg = 'Episode %s is not yet available' % group_id
-                elif reason == 'postAvailability':
-                    msg = 'Episode %s is no longer available' % group_id
-                else:
-                    msg = 'Episode %s is not available: %s' % (group_id, reason)
-                raise ExtractorError(msg, expected=True)
-            raise ExtractorError('Failed to extract media for episode %s' % group_id, expected=True)
-
-        title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
-        description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
-
-        radio_programme_id = item.get('identifier')
-        duration = int(item.get('duration'))
-
-        media_selection = self._download_xml(
-            'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % radio_programme_id,
-            radio_programme_id, 'Downloading media selection XML')
+        no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
+        if no_items is not None:
+            reason = no_items.get('reason')
+            if reason == 'preAvailability':
+                msg = 'Episode %s is not yet available' % group_id
+            elif reason == 'postAvailability':
+                msg = 'Episode %s is no longer available' % group_id
+            else:
+                msg = 'Episode %s is not available: %s' % (group_id, reason)
+            raise ExtractorError(msg, expected=True)
 
         formats = []
-        for media in media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media'):
-            bitrate = int(media.get('bitrate'))
-            encoding = media.get('encoding')
-            service = media.get('service')
-            connection = media.find('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
-            protocol = connection.get('protocol')
-            priority = connection.get('priority')
-            supplier = connection.get('supplier')
-            if protocol == 'http':
-                href = connection.get('href')
-                # ASX playlist
-                if supplier == 'asx':
-                    asx = self._download_xml(href, radio_programme_id, 'Downloading %s ASX playlist' % service)
-                    for i, ref in enumerate(asx.findall('./Entry/ref')):
-                        formats.append({
-                            'url': ref.get('href'),
-                            'format_id': '%s_ref%s' % (service, i),
-                            'abr': bitrate,
-                            'acodec': encoding,
-                            'preference': priority,
-                        })
-                    continue
-                # Direct link
-                formats.append({
-                    'url': href,
-                    'format_id': service,
-                    'abr': bitrate,
-                    'acodec': encoding,
-                    'preference': priority,
-                })
-            elif protocol == 'rtmp':
-                application = connection.get('application', 'ondemand')
-                auth_string = connection.get('authString')
-                identifier = connection.get('identifier')
-                server = connection.get('server')
-                formats.append({
-                    'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
-                    'play_path': identifier,
-                    'app': '%s?%s' % (application, auth_string),
-                    'rtmp_live': False,
-                    'ext': 'flv',
-                    'format_id': service,
-                    'abr': bitrate,
-                    'acodec': encoding,
-                    'preference': priority,
-                })
+        subtitles = None
+
+        for item in self._extract_items(playlist):
+            kind = item.get('kind')
+            if kind != 'programme' and kind != 'radioProgramme':
+                continue
+            title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
+            description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
+
+            programme_id = item.get('identifier')
+            duration = int(item.get('duration'))
+
+            media_selection = self._download_xml(
+                'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
+                programme_id, 'Downloading media selection XML')
+
+            for media in self._extract_medias(media_selection):
+                kind = media.get('kind')
+                if kind == 'audio':
+                    formats.extend(self._extract_audio(media, programme_id))
+                elif kind == 'video':
+                    formats.extend(self._extract_video(media, programme_id))
+                elif kind == 'captions':
+                    subtitles = self._extract_captions(media, programme_id)
+
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(programme_id, subtitles)
+            return
 
         self._sort_formats(formats)
 
         return {
-            'id': radio_programme_id,
+            'id': programme_id,
             'title': title,
             'description': description,
             'duration': duration,
             'formats': formats,
+            'subtitles': subtitles,
         }
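For reference, `_extract_captions` converts each TTML `<p>` element into an SRT-style cue via the format string shown above. A minimal sketch of the output shape; the cue text and timestamps here are made up for illustration:

```python
# Shape of one cue emitted by the '%s\r\n%s --> %s\r\n%s\r\n\r\n' format
# string in _extract_captions; begin/end values below are illustrative only.
pos, begin, end, text = 0, '00:00:01.520', '00:00:04.840', 'Hello'
cue = '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), begin, end, text)
print(repr(cue))  # '0\r\n00:00:01.520 --> 00:00:04.840\r\nHello\r\n\r\n'
```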
youtube_dl/extractor/bloomberg.py
@@ -24,5 +24,7 @@ class BloombergIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         name = mobj.group('name')
         webpage = self._download_webpage(url, name)
-        ooyala_url = self._twitter_search_player(webpage)
-        return self.url_result(ooyala_url, OoyalaIE.ie_key())
+        embed_code = self._search_regex(
+            r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
+            'embed code')
+        return OoyalaIE._build_url_result(embed_code)
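The new Bloomberg extraction takes the Ooyala embed code from a `<source>` tag: the regex skips the host and the first two path segments of the src URL and captures the third, stopping at the next slash. A standalone sketch with a made-up URL (the real page markup is not shown in this changeset):

```python
import re

# Hypothetical <source> tag; 'cdn.example.com', 'videos' and 'mp4' are
# skipped, and the capture stops at the '/' before 'master.m3u8'.
html = '<source src="http://cdn.example.com/videos/mp4/Abc123XyZ/master.m3u8">'
embed_code = re.search(
    r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', html).group(1)
print(embed_code)  # Abc123XyZ
```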
youtube_dl/extractor/common.py
@@ -271,8 +271,11 @@ class InfoExtractor(object):
 
     def _download_json(self, url_or_request, video_id,
                        note=u'Downloading JSON metadata',
-                       errnote=u'Unable to download JSON metadata'):
+                       errnote=u'Unable to download JSON metadata',
+                       transform_source=None):
         json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        if transform_source:
+            json_string = transform_source(json_string)
         try:
             return json.loads(json_string)
         except ValueError as ve:
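The new `transform_source` hook lets a caller rewrite the raw response before it reaches `json.loads`. A hedged sketch of the call pattern; the JSONP-stripping helper, URL, and query parameter are illustrative and not part of this changeset:

```python
import re

def strip_jsonp(raw):
    # Unwrap a hypothetical 'callback({...});' response into '{...}'.
    return re.sub(r'(?s)^\w+\((.*)\);?\s*$', r'\1', raw)

# Inside an extractor, one could then write something like:
# data = self._download_json(
#     'http://example.com/api?callback=cb', video_id,
#     transform_source=strip_jsonp)
```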
youtube_dl/extractor/kontrtube.py (new file, 66 lines)
@@ -0,0 +1,66 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class KontrTubeIE(InfoExtractor):
+    IE_NAME = 'kontrtube'
+    IE_DESC = 'KontrTube.ru - Труба зовёт'
+    _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+'
+
+    _TEST = {
+        'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
+        'md5': '975a991a4926c9a85f383a736a2e6b80',
+        'info_dict': {
+            'id': '2678',
+            'ext': 'mp4',
+            'title': 'Над олимпийской деревней в Сочи поднят российский флаг',
+            'description': 'md5:80edc4c613d5887ae8ccf1d59432be41',
+            'thumbnail': 'http://www.kontrtube.ru/contents/videos_screenshots/2000/2678/preview.mp4.jpg',
+            'duration': 270,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id, 'Downloading page')
+
+        video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
+        thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
+        title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
+            'video title')
+        description = self._html_search_meta('description', webpage, 'video description')
+
+        mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
+            webpage)
+        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+
+        view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
+            'view count', fatal=False)
+        view_count = int(view_count) if view_count is not None else None
+
+        comment_count = None
+        comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
+            fatal=False)
+        if comment_str.startswith('комментариев нет'):
+            comment_count = 0
+        else:
+            mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
+            if mobj:
+                comment_count = int(mobj.group('total'))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+        }
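The duration logic in the new extractor converts the page's minutes:seconds label into seconds. A standalone sketch; the `4м:30с` fragment below is inferred from the test's `'duration': 270` (4*60 + 30), not copied from a real page:

```python
# -*- coding: utf-8 -*-
import re

# Illustrative page fragment; 4*60 + 30 = 270 matches the _TEST duration.
html = '<div class="col_2">Длительность: <span>4м:30с</span></div>'
mobj = re.search(r'Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span>', html)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds'))
print(duration)  # 270
```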
youtube_dl/extractor/slideshare.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 import json
 
@@ -12,11 +14,12 @@ class SlideshareIE(InfoExtractor):
     _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'
 
     _TEST = {
-        u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
-        u'file': u'25665706.mp4',
-        u'info_dict': {
-            u'title': u'Managing Scale and Complexity',
-            u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix',
+        'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
+        'info_dict': {
+            'id': '25665706',
+            'ext': 'mp4',
+            'title': 'Managing Scale and Complexity',
+            'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
         },
     }
 
@@ -26,15 +29,17 @@ class SlideshareIE(InfoExtractor):
         webpage = self._download_webpage(url, page_title)
         slideshare_obj = self._search_regex(
             r'var slideshare_object = ({.*?}); var user_info =',
-            webpage, u'slideshare object')
+            webpage, 'slideshare object')
         info = json.loads(slideshare_obj)
-        if info['slideshow']['type'] != u'video':
-            raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
+        if info['slideshow']['type'] != 'video':
+            raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
 
         doc = info['doc']
         bucket = info['jsplayer']['video_bucket']
         ext = info['jsplayer']['video_extension']
         video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
+        description = self._html_search_regex(
+            r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description')
 
         return {
             '_type': 'video',
@@ -43,5 +48,5 @@ class SlideshareIE(InfoExtractor):
             'ext': ext,
             'url': video_url,
             'thumbnail': info['slideshow']['pin_image_url'],
-            'description': self._og_search_description(webpage),
+            'description': description,
         }
youtube_dl/extractor/youtube.py
@@ -34,6 +34,7 @@ from ..utils import (
     unified_strdate,
     orderedSet,
     write_json_file,
+    uppercase_escape,
 )
 
 class YoutubeBaseInfoExtractor(InfoExtractor):
@@ -136,7 +137,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                      (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                      (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                         (?:www\.)?deturl\.com/www\.youtube\.com/|
-                        (?:www\.)?pwnyoutube\.com|
+                        (?:www\.)?pwnyoutube\.com/|
                         tube\.majestyc\.net/|
                         youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                      (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
@@ -1590,10 +1591,9 @@ class YoutubeChannelIE(InfoExtractor):
         # Download all channel pages using the json-based channel_ajax query
         for pagenum in itertools.count(1):
             url = self._MORE_PAGES_URL % (pagenum, channel_id)
-            page = self._download_webpage(url, channel_id,
-                                          u'Downloading page #%s' % pagenum)
-
-            page = json.loads(page)
+            page = self._download_json(
+                url, channel_id, note=u'Downloading page #%s' % pagenum,
+                transform_source=uppercase_escape)
 
             ids_in_page = self.extract_videos_from_page(page['content_html'])
             video_ids.extend(ids_in_page)
youtube_dl/utils.py
@@ -756,9 +756,9 @@ def unified_strdate(date_str):
     """Return a string with the date in the format YYYYMMDD"""
     upload_date = None
     #Replace commas
-    date_str = date_str.replace(',',' ')
+    date_str = date_str.replace(',', ' ')
     # %z (UTC offset) is only supported in python>=3.2
-    date_str = re.sub(r' ?(\+|-)[0-9:]*$', '', date_str)
+    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
     format_expressions = [
         '%d %B %Y',
         '%B %d %Y',
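The offset-stripping change matters for ISO dates: the old open-ended pattern also matched the trailing day of a plain date, which is what the new `'1968-12-10'` case in test_util.py guards against. A quick before/after sketch:

```python
import re

date_str = '1968-12-10'
# Old pattern: '-10' looks like a UTC offset, so the day is eaten.
print(re.sub(r' ?(\+|-)[0-9:]*$', '', date_str))             # 1968-12
# New pattern: only a real offset of the form +HHMM / -HH:MM matches.
print(re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str))  # 1968-12-10
print(re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '',
             '2012/10/11 01:56:38 +0000'))                   # 2012/10/11 01:56:38
```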
@@ -1214,3 +1214,9 @@ class PagedList(object):
             if end == nextfirstid:
                 break
         return res
+
+
+def uppercase_escape(s):
+    return re.sub(
+        r'\\U([0-9a-fA-F]{8})',
+        lambda m: compat_chr(int(m.group(1), base=16)), s)
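This helper pairs with the new `transform_source` parameter above: YouTube's channel_ajax responses can contain Python-style `\UXXXXXXXX` escapes, which strict JSON (and hence `json.loads`) rejects, so `YoutubeChannelIE` now decodes them first. A self-contained sketch, using `chr` as a stand-in for youtube-dl's `compat_chr` (accurate on Python 3):

```python
import re

def uppercase_escape(s):
    # Replace each literal \UXXXXXXXX escape with the actual character;
    # chr stands in for youtube_dl's compat_chr here.
    return re.sub(
        r'\\U([0-9a-fA-F]{8})',
        lambda m: chr(int(m.group(1), base=16)), s)

print(uppercase_escape('smile \\U0001f600'))  # smile 😀
```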
youtube_dl/version.py
@@ -1,2 +1,2 @@
 
-__version__ = '2014.02.08.2'
+__version__ = '2014.02.10'