Compare commits

..

53 Commits

Author SHA1 Message Date
Philipp Hagemeister
9e0c5791c1 release 2014.04.04.4 2014-04-04 22:15:32 +02:00
Philipp Hagemeister
29a1ab2afc Add alternative --prefer-unsecure spelling (Closes #2697) 2014-04-04 22:15:21 +02:00
Jaime Marquínez Ferrándiz
659eb98a53 [breakcom] Fix YouTube videos extraction (fixes #2699) 2014-04-04 19:01:18 +02:00
Jaime Marquínez Ferrándiz
43df5a7e71 [keezmovies] Modernize 2014-04-04 18:52:43 +02:00
Jaime Marquínez Ferrándiz
88f1c6de7b [yahoo] Modernize 2014-04-04 18:52:43 +02:00
Sergey M․
65a40ab82b [pornhd] Update test checksum 2014-04-04 22:47:38 +07:00
Sergey M․
4b9cced103 [pornhd] Fix extraction (Closes #2693) 2014-04-04 22:45:39 +07:00
Sergey M․
6344fa04bb [rts] Add more formats and audio support (Closes #2689) 2014-04-04 20:42:06 +07:00
Jaime Marquínez Ferrándiz
e3ced9ed61 [downloader/common] Use compat_str with the error in try_rename (appeared in #2389)
Otherwise on python 2.x we get `UnicodeDecodeError` because it may contain non ascii characters.
2014-04-04 14:59:11 +02:00
Philipp Hagemeister
5075d598bc release 2014.04.04.2 2014-04-04 02:24:21 +02:00
Philipp Hagemeister
68eb8e90e6 [youtube:playlist] Fix playlists for logged-in users (Fixes #2690) 2014-04-04 02:23:36 +02:00
Philipp Hagemeister
d3a96346c4 release 2014.04.04.3 2014-04-04 02:09:16 +02:00
Philipp Hagemeister
0e518e2fea [cnet] Fall back to "videos" key 2014-04-04 02:09:04 +02:00
Philipp Hagemeister
1e0a235f39 [dailymotion] Fix playlist+user 2014-04-04 02:04:16 +02:00
Philipp Hagemeister
9ad400f75e [generic] Remove test case that has become a 404 2014-04-04 01:47:17 +02:00
Philipp Hagemeister
3537b93d8a [tests] Fix YoutubeDL tests
Since bec1fad, the id, title, and url (also in formats) keys are mandatory. Change the tests to reflect that.
2014-04-04 01:45:49 +02:00
Philipp Hagemeister
56eca2e956 release 2014.04.04.1 2014-04-04 00:25:43 +02:00
Philipp Hagemeister
2ad4d1ba07 [morningstar] Add new extractor (Fixes #2687) 2014-04-04 00:25:35 +02:00
Philipp Hagemeister
4853de808b release 2014.04.04 2014-04-04 00:06:06 +02:00
Philipp Hagemeister
6ff5f12218 [motorsport] Add extractor (Fixes #2688) 2014-04-04 00:05:43 +02:00
Philipp Hagemeister
52a180684f [README] Fix VALID_URL in extractor example 2014-04-03 23:25:23 +02:00
Philipp Hagemeister
b21e25702f Merge pull request #2681 from phihag/readme-dev-instructions
[README] Improve developer instructions
2014-04-03 23:06:15 +02:00
Jaime Marquínez Ferrándiz
983af2600f [wimp] Detect youtube videos (fixes #2686) 2014-04-03 20:44:51 +02:00
Philipp Hagemeister
f34e6a2cd6 [comedycentral:shows] Do no include 6-digit identifier in display ID 2014-04-03 18:39:00 +02:00
Philipp Hagemeister
a9f304031b release 2014.04.03.3 2014-04-03 16:21:54 +02:00
Philipp Hagemeister
9271bc8355 [cnet] Add new extractor (Fixes #2679) 2014-04-03 16:21:21 +02:00
Philipp Hagemeister
d1b3e3dd75 [README] Add md5 to code example 2014-04-03 15:59:04 +02:00
Philipp Hagemeister
968ed2a777 [comedycentral] Add test for #2677 2014-04-03 15:31:04 +02:00
Philipp Hagemeister
24de5d2556 release 2014.04.03.2 2014-04-03 15:28:56 +02:00
Philipp Hagemeister
d26e981df4 Correct check for empty dirname (Fixes #2683) 2014-04-03 15:28:41 +02:00
Jaime Marquínez Ferrándiz
e45d40b171 [youtube:subscriptions] Add space to the description 2014-04-03 15:13:52 +02:00
Sergey M․
4a419b8851 [c56] Modernize and add duration extraction 2014-04-03 19:53:11 +07:00
Philipp Hagemeister
5fbd672c38 [README] Improve developer instructions
Add a longer tutorial that should cover everything needed to start developing IEs.

Fixes #2676
2014-04-03 14:46:24 +02:00
Philipp Hagemeister
bec1fad223 [YouTubeDL] Throw an early error if the info_dict result is invalid 2014-04-03 14:38:16 +02:00
Philipp Hagemeister
177fed41bc [comedycentral:shows] Support guest/ URLs (Fixes #2677) 2014-04-03 14:38:16 +02:00
Jaime Marquínez Ferrándiz
b900e7cba4 [downloader/f4m] Close the final video 2014-04-03 13:35:07 +02:00
Jaime Marquínez Ferrándiz
14cb4979f0 MANIFEST.in: Only list the files from the docs folder that will be included (closes #2623)
Pruning the _build folder produced the message `no previously-included directories found matching 'docs/_build'` when installing from the source distribution.
2014-04-03 13:26:27 +02:00
Philipp Hagemeister
69e61e30fe release 2014.04.03.1 2014-04-03 08:55:59 +02:00
Philipp Hagemeister
cce929eaac [franceculture] Add extractor (Fixes #2669) 2014-04-03 08:55:38 +02:00
Philipp Hagemeister
b6cfde99b7 Only mention websense URL once 2014-04-03 08:12:53 +02:00
Philipp Hagemeister
1be99f052d release 2014.04.03 2014-04-03 06:09:45 +02:00
Philipp Hagemeister
2410c43d83 Detect Websense censorship (Fixes #2670) 2014-04-03 06:09:38 +02:00
Philipp Hagemeister
aea6e7fc3c [cspan] Support multiple segments (Fixes #2674) 2014-04-03 06:09:38 +02:00
Sergey M․
91a76c40c0 [musicplayon] Add support for musicplayon.com 2014-04-02 22:10:20 +07:00
Philipp Hagemeister
d2b194607c release 2014.04.02 2014-04-02 14:26:34 +02:00
Jaime Marquínez Ferrándiz
f6177462db [youtube] feeds: Also look for the html in the 'content_html' field (fixes #2671) 2014-04-02 14:13:08 +02:00
Jaime Marquínez Ferrándiz
9ddaf4ef8c [comedycentral] Change XPath .//guid to ./guid (fixes #2668)
It fails to find the element in python 2.6 and it's not required, the
element is a direct child of the item node.
2014-04-01 21:38:07 +02:00
Jaime Marquínez Ferrándiz
97b5573848 [comedycentral] Update test title for 34cbc7ee8d 2014-04-01 21:29:40 +02:00
Jaime Marquínez Ferrándiz
18c95c1ab0 [rutube] Use _download_json 2014-04-01 20:30:22 +02:00
Sergey M․
0479c625a4 [brightcove] Encode object_str with utf-8 2014-04-01 20:17:35 +07:00
Sergey M․
f659951e22 [vk] Support optional dash for oid in embedded links 2014-04-01 19:38:42 +07:00
Philipp Hagemeister
5853a7316e release 2014.04.01.3 2014-04-01 13:17:15 +02:00
Philipp Hagemeister
a612753db9 [utils] Correct decoding of large unicode codepoints in uppercase_escape (Fixes #2664) 2014-04-01 13:17:07 +02:00
33 changed files with 802 additions and 188 deletions

View File

@@ -3,5 +3,4 @@ include test/*.py
include test/*.json include test/*.json
include youtube-dl.bash-completion include youtube-dl.bash-completion
include youtube-dl.1 include youtube-dl.1
recursive-include docs * recursive-include docs Makefile conf.py *.rst
prune docs/_build

View File

@@ -371,7 +371,67 @@ If you want to create a build of youtube-dl yourself, you'll need
### Adding support for a new site ### Adding support for a new site
If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py TestDownload.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/). If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`):
1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class YourExtractorIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://yourextractor.com/watch/42',
'md5': 'TODO: md5 sum of the first 10KiB of the video file',
'info_dict': {
'id': '42',
'ext': 'mp4',
'title': 'Video title goes here',
# TODO more properties, either as:
# * A value
# * MD5 checksum; start the string with md5:
# * A regular expression; start the string with re:
# * Any Python type (for example int or float)
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
# TODO more code goes here, for example ...
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
return {
'id': video_id,
'title': title,
# TODO more properties (see youtube_dl/extractor/common.py)
}
5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done.
7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501).
9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this:
$ git add youtube_dl/extractor/__init__.py
$ git add youtube_dl/extractor/yourextractor.py
$ git commit -m '[yourextractor] Add new extractor'
$ git push origin yourextractor
10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
In any case, thank you very much for your contributions!
# BUGS # BUGS

View File

@@ -26,16 +26,27 @@ class YDL(FakeYDL):
self.msgs.append(msg) self.msgs.append(msg)
def _make_result(formats, **kwargs):
res = {
'formats': formats,
'id': 'testid',
'title': 'testttitle',
'extractor': 'testex',
}
res.update(**kwargs)
return res
class TestFormatSelection(unittest.TestCase): class TestFormatSelection(unittest.TestCase):
def test_prefer_free_formats(self): def test_prefer_free_formats(self):
# Same resolution => download webm # Same resolution => download webm
ydl = YDL() ydl = YDL()
ydl.params['prefer_free_formats'] = True ydl.params['prefer_free_formats'] = True
formats = [ formats = [
{'ext': 'webm', 'height': 460}, {'ext': 'webm', 'height': 460, 'url': 'x'},
{'ext': 'mp4', 'height': 460}, {'ext': 'mp4', 'height': 460, 'url': 'y'},
] ]
info_dict = {'formats': formats, 'extractor': 'test'} info_dict = _make_result(formats)
yie = YoutubeIE(ydl) yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats']) yie._sort_formats(info_dict['formats'])
ydl.process_ie_result(info_dict) ydl.process_ie_result(info_dict)
@@ -46,8 +57,8 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL() ydl = YDL()
ydl.params['prefer_free_formats'] = True ydl.params['prefer_free_formats'] = True
formats = [ formats = [
{'ext': 'webm', 'height': 720}, {'ext': 'webm', 'height': 720, 'url': 'a'},
{'ext': 'mp4', 'height': 1080}, {'ext': 'mp4', 'height': 1080, 'url': 'b'},
] ]
info_dict['formats'] = formats info_dict['formats'] = formats
yie = YoutubeIE(ydl) yie = YoutubeIE(ydl)
@@ -60,9 +71,9 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL() ydl = YDL()
ydl.params['prefer_free_formats'] = False ydl.params['prefer_free_formats'] = False
formats = [ formats = [
{'ext': 'webm', 'height': 720}, {'ext': 'webm', 'height': 720, 'url': '_'},
{'ext': 'mp4', 'height': 720}, {'ext': 'mp4', 'height': 720, 'url': '_'},
{'ext': 'flv', 'height': 720}, {'ext': 'flv', 'height': 720, 'url': '_'},
] ]
info_dict['formats'] = formats info_dict['formats'] = formats
yie = YoutubeIE(ydl) yie = YoutubeIE(ydl)
@@ -74,8 +85,8 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL() ydl = YDL()
ydl.params['prefer_free_formats'] = False ydl.params['prefer_free_formats'] = False
formats = [ formats = [
{'ext': 'flv', 'height': 720}, {'ext': 'flv', 'height': 720, 'url': '_'},
{'ext': 'webm', 'height': 720}, {'ext': 'webm', 'height': 720, 'url': '_'},
] ]
info_dict['formats'] = formats info_dict['formats'] = formats
yie = YoutubeIE(ydl) yie = YoutubeIE(ydl)
@@ -91,8 +102,7 @@ class TestFormatSelection(unittest.TestCase):
{'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3}, {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3},
{'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4}, {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4},
] ]
info_dict = { info_dict = _make_result(formats)
'formats': formats, 'extractor': 'test', 'id': 'testvid'}
ydl = YDL() ydl = YDL()
ydl.process_ie_result(info_dict) ydl.process_ie_result(info_dict)
@@ -120,12 +130,12 @@ class TestFormatSelection(unittest.TestCase):
def test_format_selection(self): def test_format_selection(self):
formats = [ formats = [
{'format_id': '35', 'ext': 'mp4', 'preference': 1}, {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'},
{'format_id': '45', 'ext': 'webm', 'preference': 2}, {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'},
{'format_id': '47', 'ext': 'webm', 'preference': 3}, {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'},
{'format_id': '2', 'ext': 'flv', 'preference': 4}, {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'},
] ]
info_dict = {'formats': formats, 'extractor': 'test'} info_dict = _make_result(formats)
ydl = YDL({'format': '20/47'}) ydl = YDL({'format': '20/47'})
ydl.process_ie_result(info_dict.copy()) ydl.process_ie_result(info_dict.copy())
@@ -154,12 +164,12 @@ class TestFormatSelection(unittest.TestCase):
def test_format_selection_audio(self): def test_format_selection_audio(self):
formats = [ formats = [
{'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none'}, {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'},
{'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none'}, {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'},
{'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none'}, {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'},
{'format_id': 'vid', 'ext': 'mp4', 'preference': 4}, {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'},
] ]
info_dict = {'formats': formats, 'extractor': 'test'} info_dict = _make_result(formats)
ydl = YDL({'format': 'bestaudio'}) ydl = YDL({'format': 'bestaudio'})
ydl.process_ie_result(info_dict.copy()) ydl.process_ie_result(info_dict.copy())
@@ -172,10 +182,10 @@ class TestFormatSelection(unittest.TestCase):
self.assertEqual(downloaded['format_id'], 'audio-low') self.assertEqual(downloaded['format_id'], 'audio-low')
formats = [ formats = [
{'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1}, {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'},
{'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2}, {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'},
] ]
info_dict = {'formats': formats, 'extractor': 'test'} info_dict = _make_result(formats)
ydl = YDL({'format': 'bestaudio/worstaudio/best'}) ydl = YDL({'format': 'bestaudio/worstaudio/best'})
ydl.process_ie_result(info_dict.copy()) ydl.process_ie_result(info_dict.copy())
@@ -184,11 +194,11 @@ class TestFormatSelection(unittest.TestCase):
def test_format_selection_video(self): def test_format_selection_video(self):
formats = [ formats = [
{'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none'}, {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'},
{'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none'}, {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'},
{'format_id': 'vid', 'ext': 'mp4', 'preference': 3}, {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'},
] ]
info_dict = {'formats': formats, 'extractor': 'test'} info_dict = _make_result(formats)
ydl = YDL({'format': 'bestvideo'}) ydl = YDL({'format': 'bestvideo'})
ydl.process_ie_result(info_dict.copy()) ydl.process_ie_result(info_dict.copy())
@@ -217,10 +227,12 @@ class TestFormatSelection(unittest.TestCase):
for f1id, f2id in zip(order, order[1:]): for f1id, f2id in zip(order, order[1:]):
f1 = YoutubeIE._formats[f1id].copy() f1 = YoutubeIE._formats[f1id].copy()
f1['format_id'] = f1id f1['format_id'] = f1id
f1['url'] = 'url:' + f1id
f2 = YoutubeIE._formats[f2id].copy() f2 = YoutubeIE._formats[f2id].copy()
f2['format_id'] = f2id f2['format_id'] = f2id
f2['url'] = 'url:' + f2id
info_dict = {'formats': [f1, f2], 'extractor': 'youtube'} info_dict = _make_result([f1, f2], extractor='youtube')
ydl = YDL() ydl = YDL()
yie = YoutubeIE(ydl) yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats']) yie._sort_formats(info_dict['formats'])
@@ -228,7 +240,7 @@ class TestFormatSelection(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0] downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1id) self.assertEqual(downloaded['format_id'], f1id)
info_dict = {'formats': [f2, f1], 'extractor': 'youtube'} info_dict = _make_result([f2, f1], extractor='youtube')
ydl = YDL() ydl = YDL()
yie = YoutubeIE(ydl) yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats']) yie._sort_formats(info_dict['formats'])

View File

@@ -153,6 +153,9 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch( self.assertMatch(
'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
['ComedyCentralShows']) ['ComedyCentralShows'])
self.assertMatch(
'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',
['ComedyCentralShows'])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@@ -42,6 +42,7 @@ from youtube_dl.extractor import (
ToypicsUserIE, ToypicsUserIE,
XTubeUserIE, XTubeUserIE,
InstagramUserIE, InstagramUserIE,
CSpanIE,
) )
@@ -314,6 +315,19 @@ class TestPlaylists(unittest.TestCase):
} }
expect_info_dict(self, EXPECTED, test_video) expect_info_dict(self, EXPECTED, test_video)
def test_CSpan_playlist(self):
dl = FakeYDL()
ie = CSpanIE(dl)
result = ie.extract(
'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], '342759')
self.assertEqual(
result['title'], 'General Motors Ignition Switch Recall')
self.assertEqual(len(result['entries']), 9)
whole_duration = sum(e['duration'] for e in result['entries'])
self.assertEqual(whole_duration, 14855)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@@ -702,6 +702,11 @@ class YoutubeDL(object):
def process_video_result(self, info_dict, download=True): def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video' assert info_dict.get('_type', 'video') == 'video'
if 'id' not in info_dict:
raise ExtractorError('Missing "id" field in extractor result')
if 'title' not in info_dict:
raise ExtractorError('Missing "title" field in extractor result')
if 'playlist' not in info_dict: if 'playlist' not in info_dict:
# It isn't part of a playlist # It isn't part of a playlist
info_dict['playlist'] = None info_dict['playlist'] = None
@@ -733,6 +738,9 @@ class YoutubeDL(object):
# We check that all the formats have the format and format_id fields # We check that all the formats have the format and format_id fields
for i, format in enumerate(formats): for i, format in enumerate(formats):
if 'url' not in format:
raise ExtractorError('Missing "url" key in result (index %d)' % i)
if format.get('format_id') is None: if format.get('format_id') is None:
format['format_id'] = compat_str(i) format['format_id'] = compat_str(i)
if format.get('format') is None: if format.get('format') is None:
@@ -743,7 +751,7 @@ class YoutubeDL(object):
) )
# Automatically determine file extension if missing # Automatically determine file extension if missing
if 'ext' not in format: if 'ext' not in format:
format['ext'] = determine_ext(format['url']) format['ext'] = determine_ext(format['url']).lower()
format_limit = self.params.get('format_limit', None) format_limit = self.params.get('format_limit', None)
if format_limit: if format_limit:
@@ -868,7 +876,7 @@ class YoutubeDL(object):
try: try:
dn = os.path.dirname(encodeFilename(filename)) dn = os.path.dirname(encodeFilename(filename))
if dn != '' and not os.path.exists(dn): if dn and not os.path.exists(dn):
os.makedirs(dn) os.makedirs(dn)
except (OSError, IOError) as err: except (OSError, IOError) as err:
self.report_error('unable to create directory ' + compat_str(err)) self.report_error('unable to create directory ' + compat_str(err))

View File

@@ -242,7 +242,7 @@ def parseOpts(overrideArguments=None):
help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
general.add_option( general.add_option(
'--prefer-insecure', action='store_true', dest='prefer_insecure', '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure',
help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)') help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')
general.add_option( general.add_option(
'--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',

View File

@@ -4,9 +4,10 @@ import sys
import time import time
from ..utils import ( from ..utils import (
compat_str,
encodeFilename, encodeFilename,
timeconvert,
format_bytes, format_bytes,
timeconvert,
) )
@@ -173,7 +174,7 @@ class FileDownloader(object):
return return
os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
except (IOError, OSError) as err: except (IOError, OSError) as err:
self.report_error(u'unable to rename file: %s' % str(err)) self.report_error(u'unable to rename file: %s' % compat_str(err))
def try_utime(self, filename, last_modified_hdr): def try_utime(self, filename, last_modified_hdr):
"""Try to set the last-modified time of the given file.""" """Try to set the last-modified time of the given file."""

View File

@@ -297,6 +297,7 @@ class F4mFD(FileDownloader):
break break
frags_filenames.append(frag_filename) frags_filenames.append(frag_filename)
dest_stream.close()
self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start) self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start)
self.try_rename(tmpfilename, filename) self.try_rename(tmpfilename, filename)

View File

@@ -40,6 +40,7 @@ from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE from .clipsyndicate import ClipsyndicateIE
from .cmt import CMTIE from .cmt import CMTIE
from .cnet import CNETIE
from .cnn import ( from .cnn import (
CNNIE, CNNIE,
CNNBlogsIE, CNNBlogsIE,
@@ -83,6 +84,7 @@ from .fktv import (
) )
from .flickr import FlickrIE from .flickr import FlickrIE
from .fourtube import FourTubeIE from .fourtube import FourTubeIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE from .franceinter import FranceInterIE
from .francetv import ( from .francetv import (
PluzzIE, PluzzIE,
@@ -152,10 +154,13 @@ from .mixcloud import MixcloudIE
from .mpora import MporaIE from .mpora import MporaIE
from .mofosex import MofosexIE from .mofosex import MofosexIE
from .mooshare import MooshareIE from .mooshare import MooshareIE
from .morningstar import MorningstarIE
from .motorsport import MotorsportIE
from .mtv import ( from .mtv import (
MTVIE, MTVIE,
MTVIggyIE, MTVIggyIE,
) )
from .musicplayon import MusicPlayOnIE
from .muzu import MuzuTVIE from .muzu import MuzuTVIE
from .myspace import MySpaceIE from .myspace import MySpaceIE
from .myspass import MySpassIE from .myspass import MySpassIE

View File

@@ -27,9 +27,10 @@ class BreakIE(InfoExtractor):
webpage, 'info json', flags=re.DOTALL) webpage, 'info json', flags=re.DOTALL)
info = json.loads(info_json) info = json.loads(info_json)
video_url = info['videoUri'] video_url = info['videoUri']
m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) youtube_id = info.get('youtubeId')
if m_youtube is not None: if youtube_id:
return self.url_result(m_youtube.group(1), 'Youtube') return self.url_result(youtube_id, 'Youtube')
final_url = video_url + '?' + info['AuthToken'] final_url = video_url + '?' + info['AuthToken']
return { return {
'id': video_id, 'id': video_id,

View File

@@ -87,7 +87,7 @@ class BrightcoveIE(InfoExtractor):
object_str = object_str.replace('<--', '<!--') object_str = object_str.replace('<--', '<!--')
object_str = fix_xml_ampersands(object_str) object_str = fix_xml_ampersands(object_str)
object_doc = xml.etree.ElementTree.fromstring(object_str) object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
if fv_el is not None: if fv_el is not None:

View File

@@ -2,39 +2,46 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor
class C56IE(InfoExtractor): class C56IE(InfoExtractor):
_VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)' _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
IE_NAME = '56.com' IE_NAME = '56.com'
_TEST = { _TEST = {
'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
'file': '93440716.flv',
'md5': 'e59995ac63d0457783ea05f93f12a866', 'md5': 'e59995ac63d0457783ea05f93f12a866',
'info_dict': { 'info_dict': {
'id': '93440716',
'ext': 'flv',
'title': '网事知多少 第32期车怒', 'title': '网事知多少 第32期车怒',
'duration': 283.813,
}, },
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
text_id = mobj.group('textid') text_id = mobj.group('textid')
info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id,
text_id, 'Downloading video info') page = self._download_json(
info = json.loads(info_page)['info'] 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
formats = [{
'format_id': f['type'], info = page['info']
'filesize': int(f['filesize']),
'url': f['url'] formats = [
} for f in info['rfiles']] {
'format_id': f['type'],
'filesize': int(f['filesize']),
'url': f['url']
} for f in info['rfiles']
]
self._sort_formats(formats) self._sort_formats(formats)
return { return {
'id': info['vid'], 'id': info['vid'],
'title': info['Subject'], 'title': info['Subject'],
'duration': int(info['duration']) / 1000.0,
'formats': formats, 'formats': formats,
'thumbnail': info.get('bimg') or info.get('img'), 'thumbnail': info.get('bimg') or info.get('img'),
} }

View File

@@ -0,0 +1,75 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
)
class CNETIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
'md5': '041233212a0d06b179c87cbcca1577b8',
'info_dict': {
'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
'ext': 'mp4',
'title': 'Hands-on with Microsoft Windows 8.1 Update',
'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
'thumbnail': 're:^http://.*/flmswindows8.jpg$',
'uploader_id': 'sarah.mitroff@cbsinteractive.com',
'uploader': 'Sarah Mitroff',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
data_json = self._html_search_regex(
r"<div class=\"cnetVideoPlayer\" data-cnet-video-options='([^']+)'",
webpage, 'data json')
data = json.loads(data_json)
vdata = data['video']
if not vdata:
vdata = data['videos'][0]
if not vdata:
raise ExtractorError('Cannot find video data')
video_id = vdata['id']
title = vdata['headline']
description = vdata.get('dek')
thumbnail = vdata.get('image', {}).get('path')
author = vdata.get('author')
if author:
uploader = '%s %s' % (author['firstName'], author['lastName'])
uploader_id = author.get('email')
else:
uploader = None
uploader_id = None
formats = [{
'format_id': '%s-%s-%s' % (
f['type'], f['format'],
int_or_none(f.get('bitrate'), 1000, default='')),
'url': f['uri'],
'tbr': int_or_none(f.get('bitrate'), 1000),
} for f in vdata['files']['data']]
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'formats': formats,
'description': description,
'uploader': uploader,
'uploader_id': uploader_id,
'thumbnail': thumbnail,
}

View File

@@ -41,9 +41,9 @@ class ComedyCentralShowsIE(InfoExtractor):
_VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport) _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
|https?://(:www\.)? |https?://(:www\.)?
(?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/ (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
(full-episodes/(?P<episode>.*)| (full-episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
(?P<clip> (?P<clip>
(?:videos/[^/]+/(?P<videotitle>[^/?#]+)) (?:(?:guests/[^/]+|videos)/[^/]+/(?P<videotitle>[^/?#]+))
|(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)) |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
)| )|
@@ -59,7 +59,7 @@ class ComedyCentralShowsIE(InfoExtractor):
'upload_date': '20121213', 'upload_date': '20121213',
'description': 'Kristen Stewart learns to let loose in "On the Road."', 'description': 'Kristen Stewart learns to let loose in "On the Road."',
'uploader': 'thedailyshow', 'uploader': 'thedailyshow',
'title': 'thedailyshow-kristen-stewart part 1', 'title': 'thedailyshow kristen-stewart part 1',
} }
} }
@@ -165,7 +165,7 @@ class ComedyCentralShowsIE(InfoExtractor):
content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') content = itemEl.find('.//{http://search.yahoo.com/mrss/}content')
duration = float_or_none(content.attrib.get('duration')) duration = float_or_none(content.attrib.get('duration'))
mediagen_url = content.attrib['url'] mediagen_url = content.attrib['url']
guid = itemEl.find('.//guid').text.rpartition(':')[-1] guid = itemEl.find('./guid').text.rpartition(':')[-1]
cdoc = self._download_xml( cdoc = self._download_xml(
mediagen_url, epTitle, mediagen_url, epTitle,

View File

@@ -252,6 +252,17 @@ class InfoExtractor(object):
outf.write(webpage_bytes) outf.write(webpage_bytes)
content = webpage_bytes.decode(encoding, 'replace') content = webpage_bytes.decode(encoding, 'replace')
if (u'<title>Access to this site is blocked</title>' in content and
u'Websense' in content[:512]):
msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
blocked_iframe = self._html_search_regex(
r'<iframe src="([^"]+)"', content,
u'Websense information URL', default=None)
if blocked_iframe:
msg += u' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True)
return (content, urlh) return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):

View File

@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
int_or_none,
unescapeHTML, unescapeHTML,
find_xpath_attr, find_xpath_attr,
) )
@@ -54,18 +55,29 @@ class CSpanIE(InfoExtractor):
info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
data = self._download_json(info_url, video_id) data = self._download_json(info_url, video_id)
url = unescapeHTML(data['video']['files'][0]['path']['#text']) doc = self._download_xml(
'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
doc = self._download_xml('http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
video_id) video_id)
def find_string(s): title = find_xpath_attr(doc, './/string', 'name', 'title').text
return find_xpath_attr(doc, './/string', 'name', s).text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
files = data['video']['files']
entries = [{
'id': '%s_%d' % (video_id, partnum + 1),
'title': (
title if len(files) == 1 else
'%s part %d' % (title, partnum + 1)),
'url': unescapeHTML(f['path']['#text']),
'description': description,
'thumbnail': thumbnail,
'duration': int_or_none(f.get('length', {}).get('#text')),
} for partnum, f in enumerate(files)]
return { return {
'_type': 'playlist',
'entries': entries,
'title': title,
'id': video_id, 'id': video_id,
'title': find_string('title'),
'url': url,
'description': description,
'thumbnail': find_string('poster'),
} }

View File

@@ -180,7 +180,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
IE_NAME = u'dailymotion:playlist' IE_NAME = u'dailymotion:playlist'
_VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/' _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
_MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>' _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
_PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
def _extract_entries(self, id): def _extract_entries(self, id):
@@ -190,10 +190,9 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
webpage = self._download_webpage(request, webpage = self._download_webpage(request,
id, u'Downloading page %s' % pagenum) id, u'Downloading page %s' % pagenum)
playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage) video_ids.extend(re.findall(r'data-id="(.+?)"', webpage))
video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
break break
return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
for video_id in orderedSet(video_ids)] for video_id in orderedSet(video_ids)]
@@ -212,8 +211,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
class DailymotionUserIE(DailymotionPlaylistIE): class DailymotionUserIE(DailymotionPlaylistIE):
IE_NAME = u'dailymotion:user' IE_NAME = u'dailymotion:user'
_VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)' _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
_MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/user/.+?".*?>.*?</a>.*?</div>'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
def _real_extract(self, url): def _real_extract(self, url):

View File

@@ -0,0 +1,77 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
compat_parse_qs,
compat_urlparse,
)
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'(?P<baseurl>http://(?:www\.)?franceculture\.fr/)player/reecouter\?play=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.franceculture.fr/player/reecouter?play=4795174',
'info_dict': {
'id': '4795174',
'ext': 'mp3',
'title': 'Rendez-vous au pays des geeks',
'vcodec': 'none',
'uploader': 'Colette Fellous',
'upload_date': '20140301',
'duration': 3601,
'thumbnail': r're:^http://www\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
'description': 'Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche des « geeks », une enquête menée aux Etats-Unis dans la S ...',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
baseurl = mobj.group('baseurl')
webpage = self._download_webpage(url, video_id)
params_code = self._search_regex(
r"<param name='movie' value='/sites/all/modules/rf/rf_player/swf/loader.swf\?([^']+)' />",
webpage, 'parameter code')
params = compat_parse_qs(params_code)
video_url = compat_urlparse.urljoin(baseurl, params['urlAOD'][0])
title = self._html_search_regex(
r'<h1 class="title[^"]+">(.+?)</h1>', webpage, 'title')
uploader = self._html_search_regex(
r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
webpage, 'uploader', fatal=False)
thumbnail_part = self._html_search_regex(
r'(?s)<div id="emission".*?<img src="([^"]+)"', webpage,
'thumbnail', fatal=False)
if thumbnail_part is None:
thumbnail = None
else:
thumbnail = compat_urlparse.urljoin(baseurl, thumbnail_part)
description = self._html_search_regex(
r'(?s)<p class="desc">(.*?)</p>', webpage, 'description')
info = json.loads(params['infoData'][0])[0]
duration = info.get('media_length')
upload_date_candidate = info.get('media_section5')
upload_date = (
upload_date_candidate
if (upload_date_candidate is not None and
re.match(r'[0-9]{8}$', upload_date_candidate))
else None)
return {
'id': video_id,
'url': video_url,
'vcodec': 'none' if video_url.lower().endswith('.mp3') else None,
'duration': duration,
'uploader': uploader,
'upload_date': upload_date,
'title': title,
'thumbnail': thumbnail,
'description': description,
}

View File

@@ -82,6 +82,17 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': ['Brightcove'], 'add_ie': ['Brightcove'],
}, },
{
'url': 'http://www.championat.com/video/football/v/87/87499.html',
'md5': 'fb973ecf6e4a78a67453647444222983',
'info_dict': {
'id': '3414141473001',
'ext': 'mp4',
'title': 'Видео. Удаление Дзагоева (ЦСКА)',
'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
'uploader': 'Championat',
},
},
# Direct link to a video # Direct link to a video
{ {
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
@@ -103,20 +114,6 @@ class GenericIE(InfoExtractor):
'title': '2cc213299525360.mov', # that's what we get 'title': '2cc213299525360.mov', # that's what we get
}, },
}, },
# second style of embedded ooyala videos
{
'url': 'http://www.smh.com.au/tv/business/show/financial-review-sunday/behind-the-scenes-financial-review-sunday--4350201.html',
'info_dict': {
'id': '13djJjYjptA1XpPx8r9kuzPyj3UZH0Uk',
'ext': 'mp4',
'title': 'Behind-the-scenes: Financial Review Sunday ',
'description': 'Step inside Channel Nine studios for an exclusive tour of its upcoming financial business show.',
},
'params': {
# m3u8 download
'skip_download': True,
},
},
# google redirect # google redirect
{ {
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import os import os
import re import re
@@ -11,22 +13,22 @@ from ..aes import (
aes_decrypt_text aes_decrypt_text
) )
class KeezMoviesIE(InfoExtractor): class KeezMoviesIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)'
_TEST = { _TEST = {
u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
u'file': u'1214711.mp4', 'file': '1214711.mp4',
u'md5': u'6e297b7e789329923fcf83abb67c9289', 'md5': '6e297b7e789329923fcf83abb67c9289',
u'info_dict': { 'info_dict': {
u"title": u"Petite Asian Lady Mai Playing In Bathtub", 'title': 'Petite Asian Lady Mai Playing In Bathtub',
u"age_limit": 18, 'age_limit': 18,
} }
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid') video_id = mobj.group('videoid')
url = 'http://www.' + mobj.group('url')
req = compat_urllib_request.Request(url) req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1') req.add_header('Cookie', 'age_verified=1')
@@ -38,10 +40,10 @@ class KeezMoviesIE(InfoExtractor):
embedded_url = mobj.group(1) embedded_url = mobj.group(1)
return self.url_result(embedded_url) return self.url_result(embedded_url)
video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title') video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, 'title')
video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url')) video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, 'video_url'))
if webpage.find('encrypted=true')!=-1: if 'encrypted=true' in webpage:
password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, u'password') password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, 'password')
video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
path = compat_urllib_parse_urlparse(video_url).path path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:] extension = os.path.splitext(path)[1][1:]

View File

@@ -0,0 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import json
import re
import time
from .common import InfoExtractor
from ..utils import (
compat_parse_qs,
compat_str,
int_or_none,
)
class MorningstarIE(InfoExtractor):
IE_DESC = 'morningstar.com'
_VALID_URL = r'https?://(?:www\.)?morningstar\.com/cover/videocenter\.aspx\?id=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869',
'md5': '6c0acface7a787aadc8391e4bbf7b0f5',
'info_dict': {
'id': '615869',
'ext': 'mp4',
'title': 'Get Ahead of the Curve on 2013 Taxes',
'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.",
'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$'
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title')
video_url = self._html_search_regex(
r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"',
webpage, 'video URL')
thumbnail = self._html_search_regex(
r'<input type="hidden" id="hidSnapshot" value="([^"]+)"',
webpage, 'thumbnail', fatal=False)
description = self._html_search_regex(
r'<div id="mstarDeck".*?>(.*?)</div>',
webpage, 'description', fatal=False)
return {
'id': video_id,
'title': title,
'url': video_url,
'thumbnail': thumbnail,
'description': description,
}

View File

@@ -0,0 +1,63 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import json
import re
import time
from .common import InfoExtractor
from ..utils import (
compat_parse_qs,
compat_str,
int_or_none,
)
class MotorsportIE(InfoExtractor):
IE_DESC = 'motorsport.com'
_VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])'
_TEST = {
'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
'md5': '5592cb7c5005d9b2c163df5ac3dc04e4',
'info_dict': {
'id': '7063',
'ext': 'mp4',
'title': 'Red Bull Racing: 2014 Rules Explained',
'duration': 207,
'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations which are arguably the most complex the sport has ever seen.',
'uploader': 'rainiere',
'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$'
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
flashvars_code = self._html_search_regex(
r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars')
flashvars = compat_parse_qs(flashvars_code)
params = json.loads(flashvars['parameters'][0])
e = compat_str(int(time.time()) + 24 * 60 * 60)
base_video_url = params['location'] + '?e=' + e
s = 'h3hg713fh32'
h = hashlib.md5(s + base_video_url).hexdigest()
video_url = base_video_url + '&h=' + h
uploader = self._html_search_regex(
r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage,
'uploader', fatal=False)
return {
'id': params['video_id'],
'display_id': display_id,
'title': params['title'],
'url': video_url,
'description': params.get('description'),
'thumbnail': params.get('main_thumb'),
'duration': int_or_none(params.get('duration')),
'uploader': uploader,
}

View File

@@ -0,0 +1,75 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import int_or_none
class MusicPlayOnIE(InfoExtractor):
_VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=100&play)=(?P<id>\d+)'
_TEST = {
'url': 'http://en.musicplayon.com/play?v=433377',
'info_dict': {
'id': '433377',
'ext': 'mp4',
'title': 'Rick Ross - Interview On Chelsea Lately (2014)',
'description': 'Rick Ross Interview On Chelsea Lately',
'duration': 342,
'uploader': 'ultrafish',
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id)
title = self._og_search_title(page)
description = self._og_search_description(page)
thumbnail = self._og_search_thumbnail(page)
duration = self._html_search_meta('video:duration', page, 'duration', fatal=False)
view_count = self._og_search_property('count', page, fatal=False)
uploader = self._html_search_regex(
r'<div>by&nbsp;<a href="[^"]+" class="purple">([^<]+)</a></div>', page, 'uploader', fatal=False)
formats = [
{
'url': 'http://media0-eu-nl.musicplayon.com/stream-mobile?id=%s&type=.mp4' % video_id,
'ext': 'mp4',
}
]
manifest = self._download_webpage(
'http://en.musicplayon.com/manifest.m3u8?v=%s' % video_id, video_id, 'Downloading manifest')
for entry in manifest.split('#')[1:]:
if entry.startswith('EXT-X-STREAM-INF:'):
meta, url, _ = entry.split('\n')
params = dict(param.split('=') for param in meta.split(',')[1:])
formats.append({
'url': url,
'ext': 'mp4',
'tbr': int(params['BANDWIDTH']),
'width': int(params['RESOLUTION'].split('x')[1]),
'height': int(params['RESOLUTION'].split('x')[-1]),
'format_note': params['NAME'].replace('"', '').strip(),
})
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'duration': int_or_none(duration),
'view_count': int_or_none(view_count),
'formats': formats,
}

View File

@@ -1,44 +1,81 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import compat_urllib_parse from ..utils import int_or_none
class PornHdIE(InfoExtractor): class PornHdIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)'
_TEST = { _TEST = {
'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
'file': '1962.flv', 'md5': '956b8ca569f7f4d8ec563e2c41598441',
'md5': '35272469887dca97abd30abecc6cdf75',
'info_dict': { 'info_dict': {
"title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video", 'id': '1962',
"age_limit": 18, 'ext': 'mp4',
'title': 'Sierra loves doing laundry',
'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
'age_limit': 18,
} }
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = mobj.group('video_id')
video_title = mobj.group('video_title')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
next_url = self._html_search_regex( title = self._og_search_title(webpage)
r'&hd=(http.+?)&', webpage, 'video URL') TITLE_SUFFIX = ' porn HD Video | PornHD.com '
next_url = compat_urllib_parse.unquote(next_url) if title.endswith(TITLE_SUFFIX):
title = title[:-len(TITLE_SUFFIX)]
video_url = self._download_webpage( description = self._html_search_regex(
next_url, video_id, note='Retrieving video URL', r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
errnote='Could not retrieve video URL') view_count = int_or_none(self._html_search_regex(
age_limit = 18 r'(\d+) views </span>', webpage, 'view count', fatal=False))
formats = [
{
'url': url,
'ext': format.lower(),
'format_id': '%s-%s' % (format.lower(), quality.lower()),
'quality': 1 if quality.lower() == 'high' else 0,
} for format, quality, url in re.findall(
r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
]
mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage)
if mobj:
flashvars = json.loads(mobj.group('flashvars'))
formats.extend([
{
'url': flashvars['hashlink'].replace('?noProxy=1', ''),
'ext': 'flv',
'format_id': 'flv-low',
'quality': 0,
},
{
'url': flashvars['hd'].replace('?noProxy=1', ''),
'ext': 'flv',
'format_id': 'flv-high',
'quality': 1,
}
])
thumbnail = flashvars['urlWallpaper']
else:
thumbnail = self._og_search_thumbnail(webpage)
self._sort_formats(formats)
return { return {
'id': video_id, 'id': video_id,
'url': video_url, 'title': title,
'ext': 'flv', 'description': description,
'title': video_title, 'thumbnail': thumbnail,
'age_limit': age_limit, 'view_count': view_count,
'formats': formats,
'age_limit': 18,
} }

View File

@@ -9,46 +9,133 @@ from ..utils import (
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
unescapeHTML, unescapeHTML,
compat_str,
) )
class RTSIE(InfoExtractor): class RTSIE(InfoExtractor):
IE_DESC = 'RTS.ch' IE_DESC = 'RTS.ch'
_VALID_URL = r'^https?://(?:www\.)?rts\.ch/archives/tv/[^/]+/(?P<id>[0-9]+)-.*?\.html' _VALID_URL = r'^https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-.*?\.html'
_TEST = { _TESTS = [
'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', {
'md5': '753b877968ad8afaeddccc374d4256a5', 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html',
'info_dict': { 'md5': '753b877968ad8afaeddccc374d4256a5',
'id': '3449373', 'info_dict': {
'ext': 'mp4', 'id': '3449373',
'duration': 1488, 'ext': 'mp4',
'title': 'Les Enfants Terribles', 'duration': 1488,
'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', 'title': 'Les Enfants Terribles',
'uploader': 'Divers', 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.',
'upload_date': '19680921', 'uploader': 'Divers',
'timestamp': -40280400, 'upload_date': '19680921',
'thumbnail': 're:^https?://.*\.image' 'timestamp': -40280400,
'thumbnail': 're:^https?://.*\.image'
},
}, },
} {
'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',
'md5': 'c197f0b2421995c63a64cc73d800f42e',
'info_dict': {
'id': '5738317',
'ext': 'mp4',
'duration': 55,
'title': 'Bande de lancement de Passe-moi les jumelles',
'description': '',
'uploader': 'Passe-moi les jumelles',
'upload_date': '20140404',
'timestamp': 1396635300,
'thumbnail': 're:^https?://.*\.image'
},
},
{
'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html',
'md5': 'b4326fecd3eb64a458ba73c73e91299d',
'info_dict': {
'id': '5745975',
'ext': 'mp4',
'duration': 48,
'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski',
'description': 'Hockey - Playoff',
'uploader': 'Hockey',
'upload_date': '20140403',
'timestamp': 1396556882,
'thumbnail': 're:^https?://.*\.image'
},
'skip': 'Blocked outside Switzerland',
},
{
'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html',
'md5': '9bb06503773c07ce83d3cbd793cebb91',
'info_dict': {
'id': '5745356',
'ext': 'mp4',
'duration': 33,
'title': 'Londres cachée par un épais smog',
'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.',
'uploader': 'Le Journal en continu',
'upload_date': '20140403',
'timestamp': 1396537322,
'thumbnail': 're:^https?://.*\.image'
},
},
{
'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html',
'md5': 'dd8ef6a22dff163d063e2a52bc8adcae',
'info_dict': {
'id': '5706148',
'ext': 'mp3',
'duration': 123,
'title': '"Urban Hippie", de Damien Krisl',
'description': 'Des Hippies super glam.',
'upload_date': '20140403',
'timestamp': 1396551600,
},
},
]
def _real_extract(self, url): def _real_extract(self, url):
m = re.match(self._VALID_URL, url) m = re.match(self._VALID_URL, url)
video_id = m.group('id') video_id = m.group('id')
all_info = self._download_json( def download_json(video_id):
'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id) return self._download_json(
info = all_info['video']['JSONinfo'] 'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id)
all_info = download_json(video_id)
# video_id extracted out of URL is not always a real id
if 'video' not in all_info and 'audio' not in all_info:
page = self._download_webpage(url, video_id)
video_id = self._html_search_regex(r'<(?:video|audio) data-id="(\d+)"', page, 'video id')
all_info = download_json(video_id)
info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']
upload_timestamp = parse_iso8601(info.get('broadcast_date')) upload_timestamp = parse_iso8601(info.get('broadcast_date'))
duration = parse_duration(info.get('duration')) duration = info.get('duration') or info.get('cutout') or info.get('cutduration')
if isinstance(duration, compat_str):
duration = parse_duration(duration)
view_count = info.get('plays')
thumbnail = unescapeHTML(info.get('preview_image_url')) thumbnail = unescapeHTML(info.get('preview_image_url'))
def extract_bitrate(url):
return int_or_none(self._search_regex(
r'-([0-9]+)k\.', url, 'bitrate', default=None))
formats = [{ formats = [{
'format_id': fid, 'format_id': fid,
'url': furl, 'url': furl,
'tbr': int_or_none(self._search_regex( 'tbr': extract_bitrate(furl),
r'-([0-9]+)k\.', furl, 'bitrate', default=None)),
} for fid, furl in info['streams'].items()] } for fid, furl in info['streams'].items()]
if 'media' in info:
formats.extend([{
'format_id': '%s-%sk' % (media['ext'], media['rate']),
'url': 'http://download-video.rts.ch/%s' % media['url'],
'tbr': media['rate'] or extract_bitrate(media['url']),
} for media in info['media'] if media.get('rate')])
self._sort_formats(formats) self._sort_formats(formats)
return { return {
@@ -57,6 +144,7 @@ class RTSIE(InfoExtractor):
'title': info['title'], 'title': info['title'],
'description': info.get('intro'), 'description': info.get('intro'),
'duration': duration, 'duration': duration,
'view_count': view_count,
'uploader': info.get('programName'), 'uploader': info.get('programName'),
'timestamp': upload_timestamp, 'timestamp': upload_timestamp,
'thumbnail': thumbnail, 'thumbnail': thumbnail,

View File

@@ -2,7 +2,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import json
import itertools import itertools
from .common import InfoExtractor from .common import InfoExtractor
@@ -39,17 +38,15 @@ class RutubeIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
api_response = self._download_webpage( video = self._download_json(
'http://rutube.ru/api/video/%s/?format=json' % video_id, 'http://rutube.ru/api/video/%s/?format=json' % video_id,
video_id, 'Downloading video JSON') video_id, 'Downloading video JSON')
video = json.loads(api_response)
trackinfo = self._download_json(
api_response = self._download_webpage(
'http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id, 'http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id,
video_id, 'Downloading trackinfo JSON') video_id, 'Downloading trackinfo JSON')
trackinfo = json.loads(api_response)
# Some videos don't have the author field # Some videos don't have the author field
author = trackinfo.get('author') or {} author = trackinfo.get('author') or {}
m3u8_url = trackinfo['video_balancer'].get('m3u8') m3u8_url = trackinfo['video_balancer'].get('m3u8')
@@ -82,10 +79,9 @@ class RutubeChannelIE(InfoExtractor):
def _extract_videos(self, channel_id, channel_title=None): def _extract_videos(self, channel_id, channel_title=None):
entries = [] entries = []
for pagenum in itertools.count(1): for pagenum in itertools.count(1):
api_response = self._download_webpage( page = self._download_json(
self._PAGE_TEMPLATE % (channel_id, pagenum), self._PAGE_TEMPLATE % (channel_id, pagenum),
channel_id, 'Downloading page %s' % pagenum) channel_id, 'Downloading page %s' % pagenum)
page = json.loads(api_response)
results = page['results'] results = page['results']
if not results: if not results:
break break
@@ -111,10 +107,9 @@ class RutubeMovieIE(RutubeChannelIE):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
movie_id = mobj.group('id') movie_id = mobj.group('id')
api_response = self._download_webpage( movie = self._download_json(
self._MOVIE_TEMPLATE % movie_id, movie_id, self._MOVIE_TEMPLATE % movie_id, movie_id,
'Downloading movie JSON') 'Downloading movie JSON')
movie = json.loads(api_response)
movie_name = movie['name'] movie_name = movie['name']
return self._extract_videos(movie_id, movie_name) return self._extract_videos(movie_id, movie_name)

View File

@@ -16,7 +16,7 @@ from ..utils import (
class VKIE(InfoExtractor): class VKIE(InfoExtractor):
IE_NAME = 'vk.com' IE_NAME = 'vk.com'
_VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))' _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk' _NETRC_MACHINE = 'vk'
_TESTS = [ _TESTS = [

View File

@@ -3,11 +3,12 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE
class WimpIE(InfoExtractor): class WimpIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/' _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/'
_TEST = { _TESTS = [{
'url': 'http://www.wimp.com/maruexhausted/', 'url': 'http://www.wimp.com/maruexhausted/',
'md5': 'f1acced123ecb28d9bb79f2479f2b6a1', 'md5': 'f1acced123ecb28d9bb79f2479f2b6a1',
'info_dict': { 'info_dict': {
@@ -16,7 +17,20 @@ class WimpIE(InfoExtractor):
'title': 'Maru is exhausted.', 'title': 'Maru is exhausted.',
'description': 'md5:57e099e857c0a4ea312542b684a869b8', 'description': 'md5:57e099e857c0a4ea312542b684a869b8',
} }
} }, {
# youtube video
'url': 'http://www.wimp.com/clowncar/',
'info_dict': {
'id': 'cG4CEr2aiSg',
'ext': 'mp4',
'title': 'Basset hound clown car...incredible!',
'description': 'md5:8d228485e0719898c017203f900b3a35',
'uploader': 'Gretchen Hoey',
'uploader_id': 'gretchenandjeff1',
'upload_date': '20140303',
},
'add_ie': ['Youtube'],
}]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
@@ -24,6 +38,13 @@ class WimpIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_url = self._search_regex( video_url = self._search_regex(
r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL') r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL')
if YoutubeIE.suitable(video_url):
self.to_screen('Found YouTube video')
return {
'_type': 'url',
'url': video_url,
'ie_key': YoutubeIE.ie_key(),
}
return { return {
'id': video_id, 'id': video_id,
@@ -31,4 +52,4 @@ class WimpIE(InfoExtractor):
'title': self._og_search_title(webpage), 'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage), 'description': self._og_search_description(webpage),
} }

View File

@@ -1,7 +1,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools import itertools
import json
import re import re
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
@@ -19,18 +18,20 @@ class YahooIE(InfoExtractor):
_TESTS = [ _TESTS = [
{ {
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
'file': '214727115.mp4',
'md5': '4962b075c08be8690a922ee026d05e69', 'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': { 'info_dict': {
'id': '214727115',
'ext': 'mp4',
'title': 'Julian Smith & Travis Legg Watch Julian Smith', 'title': 'Julian Smith & Travis Legg Watch Julian Smith',
'description': 'Julian and Travis watch Julian Smith', 'description': 'Julian and Travis watch Julian Smith',
}, },
}, },
{ {
'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
'file': '103000935.mp4',
'md5': 'd6e6fc6e1313c608f316ddad7b82b306', 'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
'info_dict': { 'info_dict': {
'id': '103000935',
'ext': 'mp4',
'title': 'Codefellas - The Cougar Lies with Spanish Moss', 'title': 'Codefellas - The Cougar Lies with Spanish Moss',
'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
}, },
@@ -60,10 +61,9 @@ class YahooIE(InfoExtractor):
'env': 'prod', 'env': 'prod',
'format': 'json', 'format': 'json',
}) })
query_result_json = self._download_webpage( query_result = self._download_json(
'http://video.query.yahoo.com/v1/public/yql?' + data, 'http://video.query.yahoo.com/v1/public/yql?' + data,
video_id, 'Downloading video info') video_id, 'Downloading video info')
query_result = json.loads(query_result_json)
info = query_result['query']['results']['mediaObj'][0] info = query_result['query']['results']['mediaObj'][0]
meta = info['meta'] meta = info['meta']
@@ -86,7 +86,6 @@ class YahooIE(InfoExtractor):
else: else:
format_url = compat_urlparse.urljoin(host, path) format_url = compat_urlparse.urljoin(host, path)
format_info['url'] = format_url format_info['url'] = format_url
formats.append(format_info) formats.append(format_info)
self._sort_formats(formats) self._sort_formats(formats)
@@ -134,27 +133,25 @@ class YahooSearchIE(SearchInfoExtractor):
def _get_n_results(self, query, n): def _get_n_results(self, query, n):
"""Get a specified number of results for a query""" """Get a specified number of results for a query"""
entries = []
res = { for pagenum in itertools.count(0):
'_type': 'playlist',
'id': query,
'entries': []
}
for pagenum in itertools.count(0):
result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
webpage = self._download_webpage(result_url, query, info = self._download_json(result_url, query,
note='Downloading results page '+str(pagenum+1)) note='Downloading results page '+str(pagenum+1))
info = json.loads(webpage)
m = info['m'] m = info['m']
results = info['results'] results = info['results']
for (i, r) in enumerate(results): for (i, r) in enumerate(results):
if (pagenum * 30) +i >= n: if (pagenum * 30) + i >= n:
break break
mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r) mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
e = self.url_result('http://' + mobj.group('url'), 'Yahoo') e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
res['entries'].append(e) entries.append(e)
if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)): if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
break break
return res return {
'_type': 'playlist',
'id': query,
'entries': entries,
}

View File

@@ -1453,7 +1453,8 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
more_widget_html = more['load_more_widget_html'] more_widget_html = more['load_more_widget_html']
playlist_title = self._html_search_regex( playlist_title = self._html_search_regex(
r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title') r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
page, u'title')
url_results = self._ids_to_results(ids) url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, playlist_title) return self.playlist_result(url_results, playlist_id, playlist_title)
@@ -1738,11 +1739,10 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
feed_entries = [] feed_entries = []
paging = 0 paging = 0
for i in itertools.count(1): for i in itertools.count(1):
info = self._download_webpage(self._FEED_TEMPLATE % paging, info = self._download_json(self._FEED_TEMPLATE % paging,
u'%s feed' % self._FEED_NAME, u'%s feed' % self._FEED_NAME,
u'Downloading page %s' % i) u'Downloading page %s' % i)
info = json.loads(info) feed_html = info.get('feed_html') or info.get('content_html')
feed_html = info['feed_html']
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids) ids = orderedSet(m.group(1) for m in m_ids)
feed_entries.extend( feed_entries.extend(
@@ -1754,7 +1754,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)' IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
_FEED_NAME = 'subscriptions' _FEED_NAME = 'subscriptions'
_PLAYLIST_TITLE = u'Youtube Subscriptions' _PLAYLIST_TITLE = u'Youtube Subscriptions'

View File

@@ -1176,12 +1176,12 @@ class HEADRequest(compat_urllib_request.Request):
return "HEAD" return "HEAD"
def int_or_none(v, scale=1): def int_or_none(v, scale=1, default=None):
return v if v is None else (int(v) // scale) return default if v is None else (int(v) // scale)
def float_or_none(v, scale=1): def float_or_none(v, scale=1, default=None):
return v if v is None else (float(v) / scale) return default if v is None else (float(v) / scale)
def parse_duration(s): def parse_duration(s):
@@ -1264,8 +1264,8 @@ class PagedList(object):
def uppercase_escape(s): def uppercase_escape(s):
return re.sub( return re.sub(
r'\\U([0-9a-fA-F]{8})', r'\\U[0-9a-fA-F]{8}',
lambda m: compat_chr(int(m.group(1), base=16)), s) lambda m: m.group(0).decode('unicode-escape'), s)
try: try:
struct.pack(u'!I', 0) struct.pack(u'!I', 0)

View File

@@ -1,2 +1,2 @@
__version__ = '2014.04.01.2' __version__ = '2014.04.04.4'