Compare commits

...

17 Commits

Author SHA1 Message Date
5e0b652344 release 2014.02.22 2014-02-22 15:07:25 +01:00
0f8f097183 [release.sh] Do not run tests by default
We are at the point that testing takes way too long for a release cycle, and fails way too often.
Tests through travis are a better indicator than testing just before release.
2014-02-22 15:06:07 +01:00
491ed3dda2 [trutube] Support multiple formats (#2433) 2014-02-22 15:05:30 +01:00
af284c6d1b Merge remote-tracking branch 'JohnyMoSwag/master' 2014-02-22 14:38:42 +01:00
41d3ec5fba [savefrom] Add extractor (Fixes #2434) 2014-02-22 14:36:16 +01:00
0568c352f3 [canalc2] Modernize 2014-02-22 14:27:09 +01:00
2e7b4cb714 [spankwire] Fix uploader id regex 2014-02-22 16:50:08 +07:00
9767726b66 [spankwire] Improve and modernize 2014-02-22 16:45:03 +07:00
9ddfd84e41 added trutubeIE 2014-02-22 00:11:57 -08:00
1cf563d84b release 2014.02.21.1 2014-02-21 18:19:48 +01:00
f7300c5c90 [generic] Fix on python 2.6
`ParseError` is not available, it raises `xml.parsers.expat.ExpatError`.
The webpage needs to be encoded.
2014-02-21 16:59:10 +01:00
3489b7d26c [youtube] Simplify the decryption process for the manifest urls and add a test (closes #2422) 2014-02-21 15:15:58 +01:00
acd2bcc384 Merge branch 'youtube-dash' of github.com:m0vie/youtube-dl 2014-02-21 15:02:47 +01:00
43e77ca455 release 2014.02.21 2014-02-21 12:16:03 +01:00
da36297988 [wimp] Modernize and replace test 2014-02-21 17:57:19 +07:00
dbb94fb044 [youtube] Fix playlist extraction (Closes #2423, #2424, #2425) 2014-02-21 17:19:55 +07:00
d68f0cdb23 [youtube] decrypt signature when downloading dash manifest 2014-02-21 03:24:56 +01:00
13 changed files with 231 additions and 76 deletions

View File

@ -14,9 +14,9 @@
set -e set -e
skip_tests=false skip_tests=true
if [ "$1" = '--skip-test' ]; then if [ "$1" = '--run-tests' ]; then
skip_tests=true skip_tests=false
shift shift
fi fi

View File

@ -18,6 +18,7 @@ from test.helper import (
import hashlib import hashlib
import io import io
import json import json
import re
import socket import socket
import youtube_dl.YoutubeDL import youtube_dl.YoutubeDL
@ -137,6 +138,15 @@ def generator(test_case):
with io.open(info_json_fn, encoding='utf-8') as infof: with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof) info_dict = json.load(infof)
for (info_field, expected) in tc.get('info_dict', {}).items(): for (info_field, expected) in tc.get('info_dict', {}).items():
if isinstance(expected, compat_str) and expected.startswith('re:'):
got = info_dict.get(info_field)
match_str = expected[len('re:'):]
match_rex = re.compile(match_str)
self.assertTrue(
isinstance(got, compat_str) and match_rex.match(got),
u'field %s (value: %r) should match %r' % (info_field, got, match_str))
else:
if isinstance(expected, compat_str) and expected.startswith('md5:'): if isinstance(expected, compat_str) and expected.startswith('md5:'):
got = 'md5:' + md5(info_dict.get(info_field)) got = 'md5:' + md5(info_dict.get(info_field))
else: else:

View File

@ -186,6 +186,7 @@ from .rutube import (
RutubeMovieIE, RutubeMovieIE,
RutubePersonIE, RutubePersonIE,
) )
from .savefrom import SaveFromIE
from .servingsys import ServingSysIE from .servingsys import ServingSysIE
from .sina import SinaIE from .sina import SinaIE
from .slashdot import SlashdotIE from .slashdot import SlashdotIE
@ -224,6 +225,7 @@ from .tinypic import TinyPicIE
from .toutv import TouTvIE from .toutv import TouTvIE
from .traileraddict import TrailerAddictIE from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE from .trilulilu import TriluliluIE
from .trutube import TruTubeIE
from .tube8 import Tube8IE from .tube8 import Tube8IE
from .tudou import TudouIE from .tudou import TudouIE
from .tumblr import TumblrIE from .tumblr import TumblrIE

View File

@ -1,4 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -9,11 +11,12 @@ class Canalc2IE(InfoExtractor):
_VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)' _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
_TEST = { _TEST = {
u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
u'file': u'12163.mp4', 'md5': '060158428b650f896c542dfbb3d6487f',
u'md5': u'060158428b650f896c542dfbb3d6487f', 'info_dict': {
u'info_dict': { 'id': '12163',
u'title': u'Terrasses du Numérique' 'ext': 'mp4',
'title': 'Terrasses du Numérique'
} }
} }
@ -28,9 +31,10 @@ class Canalc2IE(InfoExtractor):
video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
title = self._html_search_regex( title = self._html_search_regex(
r'class="evenement8">(.*?)</a>', webpage, u'title') r'class="evenement8">(.*?)</a>', webpage, 'title')
return {'id': video_id, return {
'id': video_id,
'ext': 'mp4', 'ext': 'mp4',
'url': video_url, 'url': video_url,
'title': title, 'title': title,

View File

@ -13,6 +13,7 @@ from ..utils import (
compat_urllib_parse, compat_urllib_parse,
compat_urllib_request, compat_urllib_request,
compat_urlparse, compat_urlparse,
compat_xml_parse_error,
ExtractorError, ExtractorError,
HEADRequest, HEADRequest,
@ -241,10 +242,10 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed? # Is it an RSS feed?
try: try:
doc = xml.etree.ElementTree.fromstring(webpage) doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss': if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc) return self._extract_rss(url, video_id, doc)
except xml.etree.ElementTree.ParseError: except compat_xml_parse_error:
pass pass
# it's tempting to parse this further, but you would # it's tempting to parse this further, but you would

View File

@ -0,0 +1,37 @@
# coding: utf-8
from __future__ import unicode_literals
import os.path
import re
from .common import InfoExtractor
class SaveFromIE(InfoExtractor):
    """Extractor for savefrom.net links that carry another URL after ``#url=``."""

    IE_NAME = 'savefrom.net'
    # The named group ``url`` captures everything following "#url=".
    _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$'

    _TEST = {
        'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com',
        'info_dict': {
            'id': 'UlVRAPW2WJY',
            'ext': 'mp4',
            'title': 'About Team Radical MMA | MMA Fighting',
            'upload_date': '20120816',
            'uploader': 'Howcast',
            'uploader_id': 'Howcast',
            'description': 'md5:4f0aac94361a12e1ce57d74f85265175',
        },
        'params': {
            'skip_download': True
        }
    }

    def _real_extract(self, url):
        """Build a ``'_type': 'url'`` result dict for the wrapped address.

        ``id`` is the text after the last ``/`` of ``url`` with whatever
        ``os.path.splitext`` considers an extension removed; ``url`` is the
        ``url`` group captured by ``_VALID_URL``.
        """
        # Final path-like piece of the full address, then drop its extension.
        last_segment = url.rpartition('/')[2]
        video_id, _ext = os.path.splitext(last_segment)

        wrapped_url = re.match(self._VALID_URL, url).group('url')

        return {
            '_type': 'url',
            'id': video_id,
            'url': wrapped_url,
        }

View File

@ -1,6 +1,5 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import os
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -8,23 +7,27 @@ from ..utils import (
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_urllib_request, compat_urllib_request,
compat_urllib_parse, compat_urllib_parse,
unified_strdate,
str_to_int,
int_or_none,
) )
from ..aes import ( from ..aes import aes_decrypt_text
aes_decrypt_text
)
class SpankwireIE(InfoExtractor): class SpankwireIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)' _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
_TEST = { _TEST = {
'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
'file': '103545.mp4', 'md5': '8bbfde12b101204b39e4b9fe7eb67095',
'md5': '1b3f55e345500552dbc252a3e9c1af43',
'info_dict': { 'info_dict': {
"uploader": "oreusz", 'id': '103545',
"title": "Buckcherry`s X Rated Music Video Crazy Bitch", 'ext': 'mp4',
"description": "Crazy Bitch X rated music video.", 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch',
"age_limit": 18, 'description': 'Crazy Bitch X rated music video.',
'uploader': 'oreusz',
'uploader_id': '124697',
'upload_date': '20070508',
'age_limit': 18,
} }
} }
@ -37,13 +40,26 @@ class SpankwireIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1') req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id) webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title') title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(
r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
description = self._html_search_regex( description = self._html_search_regex(
r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False) r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
uploader = self._html_search_regex(
r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
uploader_id = self._html_search_regex(
r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False)
upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False)
if upload_date:
upload_date = unified_strdate(upload_date)
view_count = self._html_search_regex(
r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = int_or_none(self._html_search_regex(
r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False))
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
if webpage.find('flashvars\.encrypted = "true"') != -1: if webpage.find('flashvars\.encrypted = "true"') != -1:
@ -53,16 +69,13 @@ class SpankwireIE(InfoExtractor):
formats = [] formats = []
for video_url in video_urls: for video_url in video_urls:
path = compat_urllib_parse_urlparse(video_url).path path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[4].split('_')[:2] format = path.split('/')[4].split('_')[:2]
resolution, bitrate_str = format resolution, bitrate_str = format
format = "-".join(format) format = "-".join(format)
height = int(resolution.rstrip('P')) height = int(resolution.rstrip('Pp'))
tbr = int(bitrate_str.rstrip('K')) tbr = int(bitrate_str.rstrip('Kk'))
formats.append({ formats.append({
'url': video_url, 'url': video_url,
'ext': extension,
'resolution': resolution, 'resolution': resolution,
'format': format, 'format': format,
'tbr': tbr, 'tbr': tbr,
@ -75,10 +88,14 @@ class SpankwireIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'uploader': video_uploader, 'title': title,
'title': video_title,
'thumbnail': thumbnail,
'description': description, 'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
'view_count': view_count,
'comment_count': comment_count,
'formats': formats, 'formats': formats,
'age_limit': age_limit, 'age_limit': age_limit,
} }

View File

@ -0,0 +1,47 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class TruTubeIE(InfoExtractor):
    """Extractor for trutube.tv video pages."""

    # The numeric ``id`` group is the video identifier in the page path.
    _VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P<id>[0-9]+)/.*'
    _TEST = {
        'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-',
        'md5': 'c5b6e301b0a2040b074746cbeaa26ca1',
        'info_dict': {
            'id': '14880',
            'ext': 'flv',
            'title': 'Ramses II - Proven To Be A Red Headed Caucasoid',
            # Raw literal: the part after "re:" is a regular expression and
            # "\." is not a valid escape sequence in a plain string literal.
            # The resulting string value is unchanged.
            'thumbnail': r're:^http:.*\.jpg$',
        }
    }

    def _real_extract(self, url):
        """Extract title, thumbnail and formats from a trutube.tv page.

        Returns a dict with ``id``, ``title``, ``formats`` and ``thumbnail``.
        The thumbnail lookup is non-fatal (``fatal=False``).
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        video_title = self._og_search_title(webpage).strip()
        thumbnail = self._search_regex(
            r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False)

        # Each "var <key>_video_file = '<url>';" assignment in the page is
        # one format; <key> becomes the format id.
        all_formats = re.finditer(
            r"var (?P<key>[a-z]+)_video_file\s*=\s*'(?P<url>[^']+)';", webpage)
        # ``quality`` is the negated position, so assignments appearing
        # earlier in the page get a larger value before sorting.
        formats = [{
            'format_id': m.group('key'),
            'quality': -i,
            'url': m.group('url'),
        } for i, m in enumerate(all_formats)]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_title,
            'formats': formats,
            'thumbnail': thumbnail,
        }

View File

@ -6,14 +6,15 @@ from .common import InfoExtractor
class WimpIE(InfoExtractor): class WimpIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?wimp\.com/([^/]+)/' _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/'
_TEST = { _TEST = {
'url': 'http://www.wimp.com/deerfence/', 'url': 'http://www.wimp.com/maruexhausted/',
'file': 'deerfence.flv', 'md5': 'f1acced123ecb28d9bb79f2479f2b6a1',
'md5': '8b215e2e0168c6081a1cf84b2846a2b5',
'info_dict': { 'info_dict': {
"title": "Watch Till End: Herd of deer jump over a fence.", 'id': 'maruexhausted',
"description": "These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.", 'ext': 'flv',
'title': 'Maru is exhausted.',
'description': 'md5:57e099e857c0a4ea312542b684a869b8',
} }
} }

View File

@ -297,6 +297,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"format": "141", u"format": "141",
}, },
}, },
# DASH manifest with encrypted signature
{
u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
u'info_dict': {
u'id': u'IB3lcPjvWLA',
u'ext': u'm4a',
u'title': u'Afrojack - The Spark ft. Spree Wilson',
u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
u'uploader': u'AfrojackVEVO',
u'uploader_id': u'AfrojackVEVO',
u'upload_date': u'20131011',
},
u"params": {
u'youtube_include_dash_manifest': True,
u'format': '141',
},
},
] ]
@ -1272,8 +1289,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
if not mobj: if not mobj:
raise ValueError('Could not find vevo ID') raise ValueError('Could not find vevo ID')
info = json.loads(mobj.group(1)) ytplayer_config = json.loads(mobj.group(1))
args = info['args'] args = ytplayer_config['args']
# Easy way to know if the 's' value is in url_encoded_fmt_stream_map # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
# this signatures are encrypted # this signatures are encrypted
if 'url_encoded_fmt_stream_map' not in args: if 'url_encoded_fmt_stream_map' not in args:
@ -1366,12 +1383,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# Look for the DASH manifest # Look for the DASH manifest
dash_manifest_url_lst = video_info.get('dashmpd') if (self._downloader.params.get('youtube_include_dash_manifest', False)):
if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
self._downloader.params.get('youtube_include_dash_manifest', False)):
try: try:
# The DASH manifest used needs to be the one from the original video_webpage.
# The one found in get_video_info seems to be using different signatures.
# However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
# Luckily, it seems, this case uses some kind of default signature (len == 86), so the
# combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
if age_gate:
dash_manifest_url = video_info.get('dashmpd')[0]
else:
dash_manifest_url = ytplayer_config['args']['dashmpd']
def decrypt_sig(mobj):
s = mobj.group(1)
dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
return '/signature/%s' % dec_s
dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dash_doc = self._download_xml( dash_doc = self._download_xml(
dash_manifest_url_lst[0], video_id, dash_manifest_url, video_id,
note=u'Downloading DASH manifest', note=u'Downloading DASH manifest',
errnote=u'Could not download DASH manifest') errnote=u'Could not download DASH manifest')
for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
@ -1443,9 +1472,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
| |
((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
)""" )"""
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_MORE_PAGES_INDICATOR = r'data-link-type="next"' _MORE_PAGES_INDICATOR = r'data-link-type="next"'
_VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist' IE_NAME = u'youtube:playlist'
def _real_initialize(self): def _real_initialize(self):
@ -1493,29 +1522,31 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
raise ExtractorError(u'For downloading YouTube.com top lists, use ' raise ExtractorError(u'For downloading YouTube.com top lists, use '
u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
more_widget_html = content_html = page
# Extract the video ids from the playlist pages # Extract the video ids from the playlist pages
ids = [] ids = []
for page_num in itertools.count(1): for page_num in itertools.count(1):
url = self._TEMPLATE_URL % (playlist_id, page_num) matches = re.finditer(self._VIDEO_RE, content_html)
page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
matches = re.finditer(self._VIDEO_RE, page)
# We remove the duplicates and the link with index 0 # We remove the duplicates and the link with index 0
# (it's not the first video of the playlist) # (it's not the first video of the playlist)
new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
ids.extend(new_ids) ids.extend(new_ids)
if re.search(self._MORE_PAGES_INDICATOR, page) is None: mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
if not mobj:
break break
try: more = self._download_json(
playlist_title = self._og_search_title(page) 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
except RegexNotFoundError: content_html = more['content_html']
self.report_warning( more_widget_html = more['load_more_widget_html']
u'Playlist page is missing OpenGraph title, falling back ...',
playlist_id)
playlist_title = self._html_search_regex( playlist_title = self._html_search_regex(
r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title') r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')
url_results = self._ids_to_results(ids) url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, playlist_title) return self.playlist_result(url_results, playlist_id, playlist_title)

View File

@ -174,6 +174,11 @@ try:
except NameError: except NameError:
compat_chr = chr compat_chr = chr
try:
from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error
def compat_ord(c): def compat_ord(c):
if type(c) is int: return c if type(c) is int: return c
else: return ord(c) else: return ord(c)

View File

@ -1,2 +1,2 @@
__version__ = '2014.02.20' __version__ = '2014.02.22'