Compare commits

...

29 Commits

Author SHA1 Message Date
Philipp Hagemeister
dae313e725 release 2014.03.07.1 2014-03-07 15:59:10 +01:00
Jaime Marquínez Ferrándiz
b74fa8cd2c [facebook] Fix login process
It was broken and didn't work in python 3.
And use `_download_webpage` instead of `compat_urllib_request.urlopen`.
2014-03-07 15:25:33 +01:00
Philipp Hagemeister
94eae04c94 release 2014.03.07 2014-03-07 06:41:48 +01:00
Sergey M․
16ff7ebc77 [lynda] Fix successful login regex and fix formats extraction (Closes #2520) 2014-03-07 06:56:48 +07:00
Philipp Hagemeister
c361c505b0 release 2014.03.06 2014-03-06 23:57:00 +01:00
Sergey M․
d37c07c575 [vesti] Fix extraction and support more link formats (Closes #2517) 2014-03-07 02:27:39 +07:00
Sergey M․
9d6105c9f0 Do not resume live streams
No resuming or seeking in live streams is possible (c) man rtmpdump
2014-03-05 22:46:20 +07:00
Sergey M․
8dec03ecba Use unicode literals 2014-03-05 22:24:07 +07:00
Sergey M․
826547870b Report no connect as error 2014-03-05 22:21:19 +07:00
Sergey M․
52d6a9a61d Handle rtmpdump's no connection return value 2014-03-05 22:19:27 +07:00
Sergey M․
ad242b5fbc Remove superfluous whitespace 2014-03-05 22:16:50 +07:00
Sergey M․
3524175625 Use meaningful return value constants for rtmpdump 2014-03-05 22:12:02 +07:00
Jaime Marquínez Ferrándiz
7b9965ea93 [ted] Remove unused import and modernize test 2014-03-05 14:27:45 +01:00
Philipp Hagemeister
0a5bce566f [generic] Add all test attributes for embedly (#2447)
In the future, we may want to not only print something, but throw an error for untested properties.
2014-03-05 14:05:50 +01:00
Philipp Hagemeister
8012bd2424 [generic] Get a better ID 2014-03-05 14:02:14 +01:00
Philipp Hagemeister
f55a1f0a88 Merge remote-tracking branch 'rzhxeo/embedly'
Conflicts:
	youtube_dl/extractor/generic.py
2014-03-05 14:01:53 +01:00
Jaime Marquínez Ferrándiz
bacac173a9 [ted] Style fixes 2014-03-05 13:27:26 +01:00
Jaime Marquínez Ferrándiz
ca1fee34f2 [ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00
Sergey M․
6dadaa9930 [prosiebensat1] Replace test 2014-03-05 15:10:49 +07:00
Jaime Marquínez Ferrándiz
553f6e4633 [dailymotion] Convert width and height fields from strings to integers 2014-03-04 22:24:38 +01:00
Jaime Marquínez Ferrándiz
652bee05f0 [ted] Fix video extraction
The site has been redesigned
2014-03-04 21:47:01 +01:00
Philipp Hagemeister
d63516e9cd release 2014.03.04.2 2014-03-04 20:56:31 +01:00
Sergey M․
e477dcf649 [vesti] Fix width and height 2014-03-04 21:40:35 +07:00
Sergey M․
9d3f7781f3 [soundcloud:set] Fix _VALID_URL regex (Closes #2509) 2014-03-04 21:29:14 +07:00
Sergey M․
c7095dada3 [tvigle] Add support for another video link format 2014-03-04 19:22:48 +07:00
Sergey M․
607dbbad76 [xtube] Fix extraction add more metafields 2014-03-04 16:12:11 +07:00
Philipp Hagemeister
17b75c0de1 Document width, height, and resolution (#1445) 2014-03-04 03:49:33 +01:00
Philipp Hagemeister
ab24f4f3be [facebook] Use consistent quotes 2014-03-04 03:49:12 +01:00
rzhxeo
1b86cc41cf Add support for embed.ly 2014-02-27 08:14:28 +01:00
18 changed files with 337 additions and 149 deletions

View File

@@ -124,8 +124,12 @@ which means you can modify it, redistribute it or use it however you like.
video id, %(playlist)s for the playlist the
video is in, %(playlist_index)s for the
position in the playlist and %% for a
literal percent. Use - to output to stdout.
Can also be used to download to a different
literal percent. %(height)s and %(width)s
for the width and height of the video
format. %(resolution)s for a textual
description of the resolution of the video
format. Use - to output to stdout. Can also
be used to download to a different
directory, for example with -o '/my/downloa
ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
--autonumber-size NUMBER Specifies the number of digits in

View File

@@ -36,6 +36,7 @@ from youtube_dl.extractor import (
RutubeChannelIE,
GoogleSearchIE,
GenericIE,
TEDIE,
)
@@ -259,5 +260,14 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'Zero Punctuation')
self.assertTrue(len(result['entries']) > 10)
def test_ted_playlist(self):
dl = FakeYDL()
ie = TEDIE(dl)
result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], '10')
self.assertEqual(result['title'], 'Who are the hackers?')
self.assertTrue(len(result['entries']) >= 6)
if __name__ == '__main__':
unittest.main()

View File

@@ -33,6 +33,7 @@ from youtube_dl.utils import (
unified_strdate,
unsmuggle_url,
url_basename,
urlencode_postdata,
xpath_with_ns,
)
@@ -261,5 +262,9 @@ class TestUtil(unittest.TestCase):
bam''')
self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam'])
def test_urlencode_postdata(self):
data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})
self.assertTrue(isinstance(data, bytes))
if __name__ == '__main__':
unittest.main()

View File

@@ -409,6 +409,13 @@ class YoutubeDL(object):
template_dict['autonumber'] = autonumber_templ % self._num_downloads
if template_dict.get('playlist_index') is not None:
template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
if template_dict.get('resolution') is None:
if template_dict.get('width') and template_dict.get('height'):
template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
elif template_dict.get('height'):
res = '%sp' % template_dict['height']
elif template_dict.get('width'):
res = '?x%d' % template_dict['width']
sanitize = lambda k, v: sanitize_filename(
compat_str(v),

View File

@@ -430,6 +430,8 @@ def parseOpts(overrideArguments=None):
'%(extractor)s for the provider (youtube, metacafe, etc), '
'%(id)s for the video id, %(playlist)s for the playlist the video is in, '
'%(playlist_index)s for the position in the playlist and %% for a literal percent. '
'%(height)s and %(width)s for the width and height of the video format. '
'%(resolution)s for a textual description of the resolution of the video format. '
'Use - to output to stdout. Can also be used to download to a different directory, '
'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
filesystem.add_option('--autonumber-size',

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import os
import re
import subprocess
@@ -22,7 +24,7 @@ class RtmpFD(FileDownloader):
proc_stderr_closed = False
while not proc_stderr_closed:
# read line from stderr
line = u''
line = ''
while True:
char = proc.stderr.read(1)
if not char:
@@ -46,7 +48,7 @@ class RtmpFD(FileDownloader):
data_len = None
if percent > 0:
data_len = int(downloaded_data_len * 100 / percent)
data_len_str = u'~' + format_bytes(data_len)
data_len_str = '~' + format_bytes(data_len)
self.report_progress(percent, data_len_str, speed, eta)
cursor_in_new_line = False
self._hook_progress({
@@ -76,12 +78,12 @@ class RtmpFD(FileDownloader):
})
elif self.params.get('verbose', False):
if not cursor_in_new_line:
self.to_screen(u'')
self.to_screen('')
cursor_in_new_line = True
self.to_screen(u'[rtmpdump] '+line)
self.to_screen('[rtmpdump] '+line)
proc.wait()
if not cursor_in_new_line:
self.to_screen(u'')
self.to_screen('')
return proc.returncode
url = info_dict['url']
@@ -102,7 +104,7 @@ class RtmpFD(FileDownloader):
try:
subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
except (OSError, IOError):
self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
self.report_error('RTMP download detected but "rtmpdump" could not be run')
return False
# Download using rtmpdump. rtmpdump returns exit code 2 when
@@ -127,7 +129,7 @@ class RtmpFD(FileDownloader):
basic_args += ['--live']
if conn:
basic_args += ['--conn', conn]
args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
args = basic_args + [[], ['--resume', '--skip', '1']][not live and self.params.get('continuedl', False)]
if sys.platform == 'win32' and sys.version_info < (3, 0):
# Windows subprocess module does not actually support Unicode
@@ -150,26 +152,35 @@ class RtmpFD(FileDownloader):
shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
except ImportError:
shell_quote = repr
self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
self.to_screen('[debug] rtmpdump command line: ' + shell_quote(str_args))
RD_SUCCESS = 0
RD_FAILED = 1
RD_INCOMPLETE = 2
RD_NO_CONNECT = 3
retval = run_rtmpdump(args)
while (retval == 2 or retval == 1) and not test:
if retval == RD_NO_CONNECT:
self.report_error('[rtmpdump] Could not connect to RTMP server.')
return False
while (retval == RD_INCOMPLETE or retval == RD_FAILED) and not test and not live:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
self.to_screen('[rtmpdump] %s bytes' % prevsize)
time.sleep(5.0) # This seems to be needed
retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED])
cursize = os.path.getsize(encodeFilename(tmpfilename))
if prevsize == cursize and retval == 1:
if prevsize == cursize and retval == RD_FAILED:
break
# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
if prevsize == cursize and retval == 2 and cursize > 1024:
self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = 0
if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = RD_SUCCESS
break
if retval == 0 or (test and retval == 2):
if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE):
fsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'[rtmpdump] %s bytes' % fsize)
self.to_screen('[rtmpdump] %s bytes' % fsize)
self.try_rename(tmpfilename, filename)
self._hook_progress({
'downloaded_bytes': fsize,
@@ -179,6 +190,6 @@ class RtmpFD(FileDownloader):
})
return True
else:
self.to_stderr(u"\n")
self.report_error(u'rtmpdump exited with code %d' % retval)
self.to_stderr('\n')
self.report_error('rtmpdump exited with code %d' % retval)
return False

View File

@@ -12,6 +12,7 @@ from ..utils import (
get_element_by_id,
orderedSet,
str_to_int,
int_or_none,
ExtractorError,
)
@@ -124,7 +125,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
if video_url is not None:
m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
if m_size is not None:
width, height = m_size.group(1), m_size.group(2)
width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
else:
width, height = None, None
formats.append({

View File

@@ -11,6 +11,7 @@ from ..utils import (
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
urlencode_postdata,
ExtractorError,
)
@@ -35,8 +36,8 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '120708114770723',
'ext': 'mp4',
u"duration": 279,
u"title": u"PEOPLE ARE AWESOME 2013"
'duration': 279,
'title': 'PEOPLE ARE AWESOME 2013'
}
}
@@ -51,8 +52,8 @@ class FacebookIE(InfoExtractor):
login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
login_page_req.add_header('Cookie', 'locale=en_US')
self.report_login()
login_page = self._download_webpage(login_page_req, None, note=False,
login_page = self._download_webpage(login_page_req, None,
note='Downloading login page',
errnote='Unable to download login page')
lsd = self._search_regex(
r'<input type="hidden" name="lsd" value="([^"]*)"',
@@ -70,23 +71,25 @@ class FacebookIE(InfoExtractor):
'timezone': '-60',
'trynum': '1',
}
request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
request = compat_urllib_request.Request(self._LOGIN_URL, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
try:
login_results = compat_urllib_request.urlopen(request).read()
login_results = self._download_webpage(request, None,
note='Logging in', errnote='unable to fetch login page')
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
return
check_form = {
'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, 'fb_dtsg'),
'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'),
'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, 'nh'),
'name_action_selected': 'dont_save',
'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, 'continue'),
'submit[Continue]': self._search_regex(r'<button[^>]+value="(.*?)"[^>]+name="submit\[Continue\]"', login_results, 'continue'),
}
check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
check_response = compat_urllib_request.urlopen(check_req).read()
check_response = self._download_webpage(check_req, None,
note='Confirming login')
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

View File

@@ -116,7 +116,24 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': False,
}
}
},
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
'info_dict': {
'id': '9ODmcdjQcHQ',
'ext': 'mp4',
'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
'upload_date': '20140225',
'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
'uploader': 'Tested',
'uploader_id': 'testedcom',
},
# No need to test YoutubeIE here
'params': {
'skip_download': True,
},
},
]
def report_download_webpage(self, video_id):
@@ -211,7 +228,7 @@ class GenericIE(InfoExtractor):
else:
assert ':' in default_search
return self.url_result(default_search + url)
video_id = os.path.splitext(url.split('/')[-1])[0]
video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
self.to_screen('%s: Requesting header' % video_id)
@@ -407,6 +424,14 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'HuffPost')
# Look for embed.ly
mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'))
mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
if mobj is not None:
return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:

View File

@@ -8,7 +8,8 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
ExtractorError
ExtractorError,
int_or_none,
)
@@ -19,7 +20,7 @@ class LyndaIE(SubtitlesInfoExtractor):
_LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
_NETRC_MACHINE = 'lynda'
_SUCCESSFUL_LOGIN_REGEX = r'<a href="https://www.lynda.com/home/userAccount/ChangeContactInfo.aspx" data-qa="eyebrow_account_menu">My account'
_SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
_TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
@@ -55,13 +56,29 @@ class LyndaIE(SubtitlesInfoExtractor):
duration = video_json['DurationInSeconds']
title = video_json['Title']
formats = [{'url': fmt['Url'],
formats = []
fmts = video_json.get('Formats')
if fmts:
formats.extend([
{
'url': fmt['Url'],
'ext': fmt['Extension'],
'width': fmt['Width'],
'height': fmt['Height'],
'filesize': fmt['FileSize'],
'format_id': str(fmt['Resolution'])
} for fmt in video_json['Formats']]
} for fmt in fmts])
prioritized_streams = video_json.get('PrioritizedStreams')
if prioritized_streams:
formats.extend([
{
'url': video_url,
'width': int_or_none(format_id),
'format_id': format_id,
} for format_id, video_url in prioritized_streams['0'].items()
])
self._sort_formats(formats)
@@ -179,6 +196,9 @@ class LyndaCourseIE(InfoExtractor):
videos = []
(username, _) = self._get_login_info()
# Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
# by single video API anymore
for chapter in course_json['Chapters']:
for video in chapter['Videos']:
if username is None and video['HasAccess'] is False:

View File

@@ -51,14 +51,14 @@ class ProSiebenSat1IE(InfoExtractor):
'skip': 'Seems to be broken',
},
{
'url': 'http://www.prosiebenmaxx.de/yep/one-piece/video/148-folge-48-gold-rogers-heimat-ganze-folge',
'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
'info_dict': {
'id': '2437108',
'id': '2429369',
'ext': 'mp4',
'title': 'Folge 48: Gold Rogers Heimat',
'description': 'Ruffy erreicht die Insel, auf der der berühmte Gold Roger lebte und hingerichtet wurde.',
'upload_date': '20140226',
'duration': 1401.48,
'title': 'Countdown für die Autowerkstatt',
'description': 'md5:809fc051a457b5d8666013bc40698817',
'upload_date': '20140223',
'duration': 2595.04,
},
'params': {
# rtmp download

View File

@@ -217,7 +217,7 @@ class SoundcloudIE(InfoExtractor):
return self._extract_info_dict(info, full_title, secret_token=token)
class SoundcloudSetIE(SoundcloudIE):
_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
_VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
IE_NAME = 'soundcloud:set'
# it's in tests/test_playlists.py
_TESTS = []

View File

@@ -6,115 +6,111 @@ import re
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
RegexNotFoundError,
compat_str,
)
class TEDIE(SubtitlesInfoExtractor):
_VALID_URL=r'''http://www\.ted\.com/
(
((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
'''
_VALID_URL = r'''(?x)http://www\.ted\.com/
(
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
'''
_TEST = {
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'file': '102.mp4',
'md5': '4ea1dada91e4174b53dac2bb8ace429d',
'info_dict': {
"description": "md5:c6fa72e6eedbd938c9caf6b2702f5922",
"title": "Dan Dennett: The illusion of consciousness"
'id': '102',
'ext': 'mp4',
'title': 'The illusion of consciousness',
'description': ('Philosopher Dan Dennett makes a compelling '
'argument that not only don\'t we understand our own '
'consciousness, but that half the time our brains are '
'actively fooling us.'),
'uploader': 'Dan Dennett',
}
}
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
_FORMATS_PREFERENCE = {
'low': 1,
'medium': 2,
'high': 3,
}
def _extract_info(self, webpage):
info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
webpage, 'info json')
return json.loads(info_json)
def _real_extract(self, url):
m=re.match(self._VALID_URL, url, re.VERBOSE)
m = re.match(self._VALID_URL, url, re.VERBOSE)
name = m.group('name')
if m.group('type_talk'):
return self._talk_info(url)
else :
playlist_id=m.group('playlist_id')
name=m.group('name')
self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
return [self._playlist_videos_info(url,name,playlist_id)]
return self._talk_info(url, name)
else:
return self._playlist_videos_info(url, name)
def _playlist_videos_info(self, url, name, playlist_id):
def _playlist_videos_info(self, url, name):
'''Returns the videos of the playlist'''
webpage = self._download_webpage(
url, playlist_id, 'Downloading playlist webpage')
matches = re.finditer(
r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
webpage)
playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
webpage, 'playlist title')
webpage = self._download_webpage(url, name,
'Downloading playlist webpage')
info = self._extract_info(webpage)
playlist_info = info['playlist']
playlist_entries = [
self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
for m in matches
self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
for talk in info['talks']
]
return self.playlist_result(
playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
playlist_entries,
playlist_id=compat_str(playlist_info['id']),
playlist_title=playlist_info['title'])
def _talk_info(self, url, video_id=0):
"""Return the video for the talk in the url"""
m = re.match(self._VALID_URL, url,re.VERBOSE)
video_name = m.group('name')
webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name)
self.report_extraction(video_name)
# If the url includes the language we get the title translated
title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
webpage, 'title')
json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
webpage, 'json data')
info = json.loads(json_data)
desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
webpage, 'description', flags = re.DOTALL)
thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
webpage, 'thumbnail')
talk_info = self._extract_info(webpage)['talks'][0]
formats = [{
'ext': 'mp4',
'url': stream['file'],
'format': stream['id']
} for stream in info['htmlStreams']]
video_id = info['id']
'url': format_url,
'format_id': format_id,
'format': format_id,
'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
} for (format_id, format_url) in talk_info['nativeDownloads'].items()]
self._sort_formats(formats)
video_id = compat_str(talk_info['id'])
# subtitles
video_subtitles = self.extract_subtitles(video_id, webpage)
video_subtitles = self.extract_subtitles(video_id, talk_info)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, webpage)
self._list_available_subtitles(video_id, talk_info)
return
return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'description': desc,
'title': talk_info['title'],
'uploader': talk_info['speaker'],
'thumbnail': talk_info['thumb'],
'description': self._og_search_description(webpage),
'subtitles': video_subtitles,
'formats': formats,
}
def _get_available_subtitles(self, video_id, webpage):
try:
options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
languages = re.findall(r'(?:<option value=")(\S+)"', options)
if languages:
sub_lang_list = {}
for l in languages:
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
sub_lang_list[l] = url
return sub_lang_list
except RegexNotFoundError:
def _get_available_subtitles(self, video_id, talk_info):
languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
if languages:
sub_lang_list = {}
for l in languages:
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
sub_lang_list[l] = url
return sub_lang_list
else:
self._downloader.report_warning(u'video doesn\'t have subtitles')
return {}
return {}

View File

@@ -14,19 +14,32 @@ from ..utils import (
class TvigleIE(InfoExtractor):
IE_NAME = 'tvigle'
IE_DESC = 'Интернет-телевидение Tvigle.ru'
_VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?video=(?P<id>\d+)'
_VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)'
_TEST = {
'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
'md5': '09afba4616666249f087efc6dcf83cb3',
'info_dict': {
'id': '503081',
'ext': 'flv',
'title': 'Брат 2 ',
'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
'upload_date': '20110919',
}
}
_TESTS = [
{
'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
'md5': '09afba4616666249f087efc6dcf83cb3',
'info_dict': {
'id': '503081',
'ext': 'flv',
'title': 'Брат 2 ',
'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
'upload_date': '20110919',
},
},
{
'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433',
'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
'info_dict': {
'id': '676433',
'ext': 'flv',
'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
'description': 'md5:027f7dc872948f14c96d19b4178428a4',
'upload_date': '20121218',
},
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)

View File

@@ -13,7 +13,7 @@ from ..utils import (
class VestiIE(InfoExtractor):
IE_NAME = 'vesti'
IE_DESC = 'Вести.Ru'
_VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
_VALID_URL = r'http://(?:.+?\.)?(?:vesti\.ru|russia\.tv)/(?P<id>.+)'
_TESTS = [
{
@@ -30,6 +30,20 @@ class VestiIE(InfoExtractor):
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/doc.html?id=1349233',
'info_dict': {
'id': '773865',
'ext': 'mp4',
'title': 'Участники митинга штурмуют Донецкую областную администрацию',
'description': 'md5:1a160e98b3195379b4c849f2f4958009',
'duration': 210,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/only_video.html?vid=576180',
'info_dict': {
@@ -44,6 +58,48 @@ class VestiIE(InfoExtractor):
'skip_download': True,
},
},
{
'url': 'http://hitech.vesti.ru/news/view/id/4000',
'info_dict': {
'id': '766888',
'ext': 'mp4',
'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
'duration': 279,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://russia.tv/video/show/brand_id/5169/episode_id/970443/video_id/975648',
'info_dict': {
'id': '771852',
'ext': 'mp4',
'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
'duration': 3096,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://russia.tv/brand/show/brand_id/57638',
'info_dict': {
'id': '774016',
'ext': 'mp4',
'title': 'Чужой в семье Сталина',
'description': '',
'duration': 2539,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
'info_dict': {
@@ -81,16 +137,26 @@ class VestiIE(InfoExtractor):
page = self._download_webpage(url, video_id, 'Downloading page')
mobj = re.search(r'<meta property="og:video" content=".+?\.swf\?v?id=(?P<id>\d+).*?" />', page)
mobj = re.search(
r'<meta property="og:video" content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
page)
if mobj:
video_id = mobj.group('id')
page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
'Downloading video page')
mobj = re.search(
r'<meta property="og:video" content="http://player\.rutv\.ru/flash2v/container\.swf\?id=(?P<id>\d+)', page)
if mobj:
video_type = 'video'
video_id = mobj.group('id')
else:
mobj = re.search(
r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>', page)
r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>',
page)
if not mobj:
raise ExtractorError('No media found')
raise ExtractorError('No media found', expected=True)
video_type = mobj.group('type')
video_id = mobj.group('id')
@@ -113,8 +179,8 @@ class VestiIE(InfoExtractor):
priority_transport = playlist['priority_transport']
thumbnail = media['picture']
width = media['width']
height = media['height']
width = int_or_none(media['width'])
height = int_or_none(media['height'])
description = media['anons']
title = media['title']
duration = int_or_none(media.get('duration'))

View File

@@ -7,19 +7,24 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
parse_duration,
str_to_int,
)
class XTubeIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_TEST = {
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
'file': 'kVTUy_G222_.mp4',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
'info_dict': {
"title": "strange erotica",
"description": "surreal gay themed erotica...almost an ET kind of thing",
"uploader": "greenshowers",
"age_limit": 18,
'id': 'kVTUy_G222_',
'ext': 'mp4',
'title': 'strange erotica',
'description': 'surreal gay themed erotica...almost an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
'age_limit': 18,
}
}
@@ -32,10 +37,23 @@ class XTubeIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False)
video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(
r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
duration = parse_duration(self._html_search_regex(
r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
view_count = self._html_search_regex(
r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = self._html_search_regex(
r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
if comment_count:
comment_count = str_to_int(comment_count)
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
@@ -48,6 +66,9 @@ class XTubeIE(InfoExtractor):
'title': video_title,
'uploader': video_uploader,
'description': video_description,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
'url': video_url,
'ext': extension,
'format': format,

View File

@@ -1263,3 +1263,7 @@ def read_batch_urls(batch_fd):
with contextlib.closing(batch_fd) as fd:
return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')

View File

@@ -1,2 +1,2 @@
__version__ = '2014.03.04.1'
__version__ = '2014.03.07.1'