Compare commits


11 Commits

Author SHA1 Message Date
Philipp Hagemeister
a43ee88c6f release 2014.09.29 2014-09-29 01:51:53 +02:00
Philipp Hagemeister
e2dce53781 [youtube] Always request webpage in English (Fixes #3844) 2014-09-29 01:39:26 +02:00
Philipp Hagemeister
1770ed9e86 [thvideo] Simplify (#3848) 2014-09-29 00:38:37 +02:00
Philipp Hagemeister
457ac58cc7 Merge remote-tracking branch 'diffycat/thvideo-update' 2014-09-29 00:36:55 +02:00
Philipp Hagemeister
9c44d2429b [vimeo:likes] Support large like lists (Fixes #3847) 2014-09-29 00:36:06 +02:00
Philipp Hagemeister
d2e32f7df5 Do not use HTML characters in output
This messes up the format when people paste it outside of code tags.
2014-09-29 00:23:43 +02:00
Anton Larionov
67077b182b [thvideo] Add support for playlists 2014-09-28 23:36:55 +04:00
Naglis Jonaitis
5f4c318844 [nfl] Support team micro-sites (fixes #3831) 2014-09-28 21:48:26 +03:00
Naglis Jonaitis
dfee83234b [nfl] Prefer progressive downloads 2014-09-28 19:25:28 +03:00
Sergey M․
7f5c0c4a19 [README] Clarify test's md5 filesize (#3846) 2014-09-28 22:10:20 +07:00
Philipp Hagemeister
4bc77c8417 [README] Use _match_id helper function 2014-09-28 13:52:21 +02:00
10 changed files with 228 additions and 100 deletions

README.md

@@ -442,8 +442,6 @@ If you want to add support for a new site, you can follow this quick list (assum
 # coding: utf-8
 from __future__ import unicode_literals

-import re
-
 from .common import InfoExtractor
@@ -451,7 +449,7 @@ If you want to add support for a new site, you can follow this quick list (assum
     _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://yourextractor.com/watch/42',
-        'md5': 'TODO: md5 sum of the first 10KiB of the video file',
+        'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
         'info_dict': {
             'id': '42',
             'ext': 'mp4',
@@ -466,8 +464,7 @@ If you want to add support for a new site, you can follow this quick list (assum
         }

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)

         # TODO more code goes here, for example ...
         webpage = self._download_webpage(url, video_id)
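
Note: the md5 in _TEST is the checksum of the truncated download that --test produces (exactly the first 10241 bytes), not of the full video. A minimal sketch for computing it by hand, assuming video.mp4 was already fetched with youtube-dl --test (the filename is hypothetical):

import hashlib

# The _TEST 'md5' field expects the checksum of the first 10241 bytes,
# which is exactly what `youtube-dl --test` downloads.
with open('video.mp4', 'rb') as f:
    print(hashlib.md5(f.read(10241)).hexdigest())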

test/test_utils.py

@@ -22,7 +22,8 @@ from youtube_dl.utils import (
     fix_xml_ampersands,
     get_meta_content,
     orderedSet,
-    PagedList,
+    OnDemandPagedList,
+    InAdvancePagedList,
     parse_duration,
     read_batch_urls,
     sanitize_filename,
@@ -246,10 +247,14 @@ class TestUtil(unittest.TestCase):
             for i in range(firstid, upto):
                 yield i

-        pl = PagedList(get_page, pagesize)
+        pl = OnDemandPagedList(get_page, pagesize)
         got = pl.getslice(*sliceargs)
         self.assertEqual(got, expected)
+        iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize)
+        got = iapl.getslice(*sliceargs)
+        self.assertEqual(got, expected)

         testPL(5, 2, (), [0, 1, 2, 3, 4])
         testPL(5, 2, (1,), [1, 2, 3, 4])
         testPL(5, 2, (2,), [2, 3, 4])

youtube_dl/extractor/__init__.py

@@ -371,7 +371,10 @@ from .thisav import ThisAVIE
 from .tinypic import TinyPicIE
 from .tlc import TlcIE, TlcDeIE
 from .tnaflix import TNAFlixIE
-from .thvideo import THVideoIE
+from .thvideo import (
+    THVideoIE,
+    THVideoPlaylistIE
+)
 from .toutv import TouTvIE
 from .toypics import ToypicsUserIE, ToypicsIE
 from .traileraddict import TrailerAddictIE

youtube_dl/extractor/nfl.py

@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    compat_urllib_parse,
     int_or_none,
     remove_end,
 )
@@ -13,37 +14,92 @@ from ..utils import (

 class NFLIE(InfoExtractor):
     IE_NAME = 'nfl.com'
-    _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)'
-    _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json'
-    _TEST = {
+    _VALID_URL = r'''(?x)https?://
+        (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
+        (?:.+?/)*
+        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+    _TESTS = [
+        {
             'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
-            # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates
+            'md5': '394ef771ddcd1354f665b471d78ec4c6',
             'info_dict': {
                 'id': '0ap3000000398478',
                 'ext': 'mp4',
-                'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights',
+                'title': 'Week 3: Redskins vs. Eagles highlights',
                 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
                 'upload_date': '20140921',
                 'timestamp': 1411337580,
                 'thumbnail': 're:^https?://.*\.jpg$',
             }
-    }
+        },
+        {
+            'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+            'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+            'info_dict': {
+                'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+                'ext': 'mp4',
+                'title': 'LIVE: Post Game vs. Browns',
+                'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+                'upload_date': '20131229',
+                'timestamp': 1388354455,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        }
+    ]
+
+    @staticmethod
+    def prepend_host(host, url):
+        if not url.startswith('http'):
+            if not url.startswith('/'):
+                url = '/%s' % url
+            url = 'http://{0:}{1:}'.format(host, url)
+        return url
+
+    @staticmethod
+    def format_from_stream(stream, protocol, host, path_prefix='',
+                           preference=0, note=None):
+        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
+            protocol=protocol,
+            host=host,
+            prefix=path_prefix,
+            path=stream.get('path'),
+        )
+        return {
+            'url': url,
+            'vbr': int_or_none(stream.get('rate', 0), 1000),
+            'preference': preference,
+            'format_note': note,
+        }

     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id, host = mobj.group('id'), mobj.group('host')

-        config = self._download_json(self._PLAYER_CONFIG_URL, video_id,
+        webpage = self._download_webpage(url, video_id)
+
+        config_url = NFLIE.prepend_host(host, self._search_regex(
+            r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL'))
+        config = self._download_json(config_url, video_id,
                                      note='Downloading player config')
-        url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config)
-        video_data = self._download_json(url_template.format(id=video_id), video_id)
+        url_template = NFLIE.prepend_host(
+            host, '{contentURLTemplate:}'.format(**config))
+        video_data = self._download_json(
+            url_template.format(id=video_id), video_id)
+
+        formats = []
+        cdn_data = video_data.get('cdnData', {})
+        streams = cdn_data.get('bitrateInfo', [])
+        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
+            parts = compat_urllib_parse.urlparse(cdn_data.get('uri'))
+            protocol, host = parts.scheme, parts.netloc
+            for stream in streams:
+                formats.append(
+                    NFLIE.format_from_stream(stream, protocol, host))
+        else:
             cdns = config.get('cdns')
             if not cdns:
                 raise ExtractorError('Failed to get CDN data', expected=True)
-        formats = []
-        streams = video_data.get('cdnData', {}).get('bitrateInfo', [])
             for name, cdn in cdns.items():
                 # LimeLight streams don't seem to work
                 if cdn.get('name') == 'LIMELIGHT':
@@ -54,35 +110,20 @@ class NFLIE(InfoExtractor):
                 if not (protocol and host):
                     continue

-            path_prefix = cdn.get('pathprefix', '')
-            if path_prefix and not path_prefix.endswith('/'):
-                path_prefix = '%s/' % path_prefix
-
-            get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format(
-                protocol=protocol,
-                host=host,
-                prefix=path_prefix,
-                path=p,
-            )
-
-            preference = 0
+                prefix = cdn.get('pathprefix', '')
+                if prefix and not prefix.endswith('/'):
+                    prefix = '%s/' % prefix
+
                 if protocol == 'rtmp':
                     preference = -2
                 elif 'prog' in name.lower():
-                preference = -1
+                    preference = 1
+                else:
+                    preference = 0

                 for stream in streams:
-                path = stream.get('path')
-                if not path:
-                    continue
-                formats.append({
-                    'url': get_url(path),
-                    'vbr': int_or_none(stream.get('rate', 0), 1000),
-                    'preference': preference,
-                    'format_note': name,
-                })
+                    formats.append(
+                        NFLIE.format_from_stream(stream, protocol, host,
+                                                 prefix, preference, name))

         self._sort_formats(formats)
@@ -94,7 +135,7 @@ class NFLIE(InfoExtractor):
         return {
             'id': video_id,
-            'title': video_data.get('storyHeadline'),
+            'title': video_data.get('headline'),
             'formats': formats,
             'description': video_data.get('caption'),
             'duration': video_data.get('duration'),

youtube_dl/extractor/thvideo.py

@@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor):
     }

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)

         # extract download link from mobile player page
         webpage_player = self._download_webpage(
@@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor):
             'description': description,
             'upload_date': upload_date
         }
+
+
+class THVideoPlaylistIE(InfoExtractor):
+    _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://thvideo.tv/mylist2',
+        'info_dict': {
+            'id': '2',
+            'title': '幻想万華鏡',
+        },
+        'playlist_mincount': 23,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+        list_title = self._html_search_regex(
+            r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title',
+            fatal=False)
+
+        entries = [
+            self.url_result('http://thvideo.tv/v/th' + id, 'THVideo')
+            for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)]
+
+        return self.playlist_result(entries, playlist_id, list_title)

youtube_dl/extractor/vimeo.py

@@ -8,18 +8,19 @@ import itertools
 from .common import InfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
+    clean_html,
     compat_HTTPError,
     compat_urllib_parse,
     compat_urllib_request,
-    clean_html,
-    get_element_by_attribute,
+    compat_urlparse,
     ExtractorError,
+    get_element_by_attribute,
+    InAdvancePagedList,
+    int_or_none,
     RegexNotFoundError,
-    smuggle_url,
     std_headers,
     unsmuggle_url,
     urlencode_postdata,
-    int_or_none,
 )
@@ -533,32 +534,55 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):

 class VimeoLikesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])'
+    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
     IE_NAME = 'vimeo:likes'
     IE_DESC = 'Vimeo user likes'
     _TEST = {
-        'url': 'https://vimeo.com/user20132939/likes',
-        'playlist_mincount': 4,
-        'add_ies': ['Generic'],
+        'url': 'https://vimeo.com/user755559/likes/',
+        'playlist_mincount': 293,
         "info_dict": {
-            "description": "Videos Philipp Hagemeister likes on Vimeo.",
-            "title": "Vimeo / Philipp Hagemeister's likes",
+            "description": "See all the videos urza likes",
+            "title": 'Videos urza likes',
+        },
+        'params': {
+            'extract_flat': False,
         },
     }

     def _real_extract(self, url):
         user_id = self._match_id(url)
-        rss_url = '%s//vimeo.com/user%s/likes/rss' % (
-            self.http_scheme(), user_id)
-        surl = smuggle_url(rss_url, {
-            'force_videoid': '%s_likes' % user_id,
-            'to_generic': True,
-        })
+        webpage = self._download_webpage(url, user_id)
+        page_count = self._int(
+            self._search_regex(
+                r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)">
+                    .*?</a></li>\s*<li\s+class="pagination_next">
+                ''', webpage, 'page count'),
+            'page count', fatal=True)
+        PAGE_SIZE = 12
+        title = self._html_search_regex(
+            r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False)
+        description = self._html_search_meta('description', webpage)
+
+        def _get_page(idx):
+            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % (
+                self.http_scheme(), user_id, idx + 1)
+            webpage = self._download_webpage(
+                page_url, user_id,
+                note='Downloading page %d/%d' % (idx + 1, page_count))
+            video_list = self._search_regex(
+                r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>',
+                webpage, 'video content')
+            paths = re.findall(
+                r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list)
+            for path in paths:
+                yield {
+                    '_type': 'url',
+                    'url': compat_urlparse.urljoin(page_url, path),
+                }
+
+        pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)

         return {
-            '_type': 'url',
-            'url': surl,
+            '_type': 'playlist',
+            'id': 'user%s_likes' % user_id,
+            'title': title,
+            'description': description,
+            'entries': pl,
         }

youtube_dl/extractor/youtube.py

@@ -26,7 +26,7 @@ from ..utils import (
     get_element_by_attribute,
     ExtractorError,
     int_or_none,
-    PagedList,
+    OnDemandPagedList,
     unescapeHTML,
     unified_strdate,
     orderedSet,
@@ -655,7 +655,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         # Get video webpage
         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
-        video_webpage = self._download_webpage(url, video_id)
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'PREF=hl=en')
+        video_webpage = self._download_webpage(req, video_id)

         # Attempt to extract SWF player URL
         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
@@ -1341,7 +1343,7 @@ class YoutubeUserIE(InfoExtractor):
                 'id': video_id,
                 'title': title,
             }
-        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
+        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

         return self.playlist_result(url_results, playlist_title=username)
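
Note: the English-language fix works by pinning YouTube's PREF cookie before the watch page is fetched, so text-based extraction (upload dates, labels) is not affected by IP-based localization. A standalone sketch of the same request, assuming Python 2's urllib2 (what compat_urllib_request resolves to on 2.x); the video URL is only an example:

import urllib2

# Ask YouTube for the English UI regardless of geo-based defaults.
req = urllib2.Request('https://www.youtube.com/watch?v=BaW_jenozKc')
req.add_header('Cookie', 'PREF=hl=en')
video_webpage = urllib2.urlopen(req).read()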

youtube_dl/options.py

@@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None):
         for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
             try:
                 i = opts.index(private_opt)
-                opts[i+1] = '<PRIVATE>'
+                opts[i+1] = 'PRIVATE'
             except ValueError:
                 pass
         return opts

youtube_dl/utils.py

@@ -1384,14 +1384,16 @@ def check_executable(exe, args=[]):

 class PagedList(object):
-    def __init__(self, pagefunc, pagesize):
-        self._pagefunc = pagefunc
-        self._pagesize = pagesize
-
     def __len__(self):
         # This is only useful for tests
        return len(self.getslice())

+
+class OnDemandPagedList(PagedList):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
     def getslice(self, start=0, end=None):
         res = []
         for pagenum in itertools.count(start // self._pagesize):
@@ -1430,6 +1432,35 @@ class PagedList(object):
         return res

+
+class InAdvancePagedList(PagedList):
+    def __init__(self, pagefunc, pagecount, pagesize):
+        self._pagefunc = pagefunc
+        self._pagecount = pagecount
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        start_page = start // self._pagesize
+        end_page = (
+            self._pagecount if end is None else (end // self._pagesize + 1))
+        skip_elems = start - start_page * self._pagesize
+        only_more = None if end is None else end - start
+        for pagenum in range(start_page, end_page):
+            page = list(self._pagefunc(pagenum))
+            if skip_elems:
+                page = page[skip_elems:]
+                skip_elems = None
+            if only_more is not None:
+                if len(page) < only_more:
+                    only_more -= len(page)
+                else:
+                    page = page[:only_more]
+                    res.extend(page)
+                    break
+            res.extend(page)
+        return res
+

 def uppercase_escape(s):
     unicode_escape = codecs.getdecoder('unicode_escape')
     return re.sub(
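
Note: the split gives two paging strategies behind the same getslice() interface. OnDemandPagedList keeps the old PagedList behavior and probes pages lazily until the requested slice is filled; InAdvancePagedList is for sites where the page count is known up front (as in vimeo:likes) and fetches only the pages that overlap the slice. A minimal sketch with a hypothetical page function, assuming this changeset is installed:

from youtube_dl.utils import InAdvancePagedList, OnDemandPagedList

PAGE_SIZE = 10
TOTAL = 95  # hypothetical backend with 95 items

def get_page(pagenum):
    # Each "page" yields a window of integers, like a paginated API.
    first = pagenum * PAGE_SIZE
    for i in range(first, min(first + PAGE_SIZE, TOTAL)):
        yield i

ondemand = OnDemandPagedList(get_page, PAGE_SIZE)
inadvance = InAdvancePagedList(get_page, TOTAL // PAGE_SIZE + 1, PAGE_SIZE)

# Both return the same slice; only the fetch strategy differs.
assert ondemand.getslice(13, 20) == inadvance.getslice(13, 20) == list(range(13, 20))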

youtube_dl/version.py

@@ -1,2 +1,2 @@
-__version__ = '2014.09.28.1'
+__version__ = '2014.09.29'