Compare commits

..

260 Commits

Author SHA1 Message Date
Philipp Hagemeister
b07d9c23c5 release 2013.06.23 2013-06-23 23:42:21 +02:00
Philipp Hagemeister
d71cae62cc allow skipping tests when releasing
(YouTube Subtitles are currently flaky in Germany, especially via IPv6)
2013-06-23 23:41:54 +02:00
Philipp Hagemeister
633a50cf4b Update Makefile to packaged paths 2013-06-23 23:27:28 +02:00
Philipp Hagemeister
825e0984e2 [break] adapt to new paths 2013-06-23 22:59:51 +02:00
Philipp Hagemeister
d1cade5ade Correct module name 2013-06-23 22:53:42 +02:00
Philipp Hagemeister
190717e31f [justin.tv] Clarify variable content 2013-06-23 22:52:43 +02:00
Philipp Hagemeister
0824c28c8b Remove mentions of old InfoExtractors module 2013-06-23 22:42:59 +02:00
Philipp Hagemeister
c59b4aaeef Fix imports and restrict available legacy imports 2013-06-23 22:38:59 +02:00
Philipp Hagemeister
f9c6cbf002 Move extractor imports and functions into extractor/__init__.py 2013-06-23 22:36:24 +02:00
Philipp Hagemeister
b8fe71ab86 Remove unused imports from InfoExtractor 2013-06-23 22:34:23 +02:00
Philipp Hagemeister
cb10cded2a [xhamster] Move into own file 2013-06-23 22:32:44 +02:00
Philipp Hagemeister
cd8b830292 [Teamcoco] Move into own file 2013-06-23 22:31:50 +02:00
Philipp Hagemeister
1ac4004f3a [flickr] Move into own file 2013-06-23 22:31:12 +02:00
Philipp Hagemeister
e17d368ae2 [howcast] Move into own file 2013-06-23 22:30:16 +02:00
Philipp Hagemeister
27110b0567 [hypem] Move into own file 2013-06-23 22:29:27 +02:00
Philipp Hagemeister
9fe4de3471 [ina] Move into own file 2013-06-23 22:28:19 +02:00
Philipp Hagemeister
d26d440e19 [redtube] Simplify 2013-06-23 22:27:34 +02:00
Philipp Hagemeister
9f5daf0006 [redtube] move into own file 2013-06-23 22:27:16 +02:00
Philipp Hagemeister
eb1634cbf8 [Vine] move into own file 2013-06-23 22:26:30 +02:00
Philipp Hagemeister
01c10ca26e [VBox7] move into own file 2013-06-23 22:25:46 +02:00
Philipp Hagemeister
45aef47281 [Bandcamp] move into own file 2013-06-23 22:24:58 +02:00
Philipp Hagemeister
ae287755b7 [Tumblr] move into own file 2013-06-23 22:24:07 +02:00
Philipp Hagemeister
a37f27ae99 [LiveLeak] move into own file 2013-06-23 22:23:19 +02:00
Philipp Hagemeister
49f5f315fd [Spiegel] move into own file 2013-06-23 22:22:08 +02:00
Philipp Hagemeister
97d2db017c [myspass] Move into own file and default to mp4 ext 2013-06-23 22:20:45 +02:00
Philipp Hagemeister
2c64df0399 [keek] move into own file 2013-06-23 22:16:41 +02:00
Philipp Hagemeister
828400422a [8tracks] Move into own file 2013-06-23 22:15:50 +02:00
Philipp Hagemeister
c3c77cec30 [youjizz] move into own file 2013-06-23 22:14:22 +02:00
Philipp Hagemeister
1183b85f50 [pornotube] move into own file 2013-06-23 22:13:32 +02:00
Philipp Hagemeister
0143dc029c [YouPorn] move into own file 2013-06-23 22:12:14 +02:00
Philipp Hagemeister
e10e576fed [RBMARadio] move into own file 2013-06-23 22:09:32 +02:00
Philipp Hagemeister
78af8eb1d1 [ustream] move into its own file 2013-06-23 22:08:28 +02:00
Philipp Hagemeister
79e93125d0 [justin.tv] move into own file 2013-06-23 22:07:27 +02:00
Philipp Hagemeister
48db0b1f4a [FunnyOrDie] Remove unused import 2013-06-23 22:07:17 +02:00
Philipp Hagemeister
8f0578f0fc Move FunnyOrDie into its own file 2013-06-23 22:05:23 +02:00
Philipp Hagemeister
250f557872 Move WorldStarHipHop into its own file 2013-06-23 22:04:08 +02:00
Philipp Hagemeister
462dc88b17 Move Steam IE into its own file 2013-06-23 22:02:56 +02:00
Philipp Hagemeister
570fa151fc Move XNXX into its own file 2013-06-23 22:01:57 +02:00
Philipp Hagemeister
9c286cfa00 Move Youku IE into its own file 2013-06-23 22:01:02 +02:00
Philipp Hagemeister
80cbb6ddbb Move MixCloud into its own file 2013-06-23 21:59:15 +02:00
Philipp Hagemeister
9fd5ce0cbe Move TED IE into its own file 2013-06-23 21:55:53 +02:00
Philipp Hagemeister
1736dec629 Mark MTV as broken for now (#913) 2013-06-23 21:52:41 +02:00
Philipp Hagemeister
b8a360837a Fix Statigram test 2013-06-23 21:34:40 +02:00
Philipp Hagemeister
fc28721960 Add MTV IE file (oops) 2013-06-23 21:34:03 +02:00
Philipp Hagemeister
51ce3a75c9 Improve error reporting for downloads 2013-06-23 21:33:11 +02:00
Philipp Hagemeister
335056663a Move MTV IE into its own file 2013-06-23 21:27:38 +02:00
Philipp Hagemeister
5b286728de Move NBA IE into its own file 2013-06-23 21:18:00 +02:00
Philipp Hagemeister
291a168bcc Move StanfordOC IE into its own file 2013-06-23 21:16:32 +02:00
Philipp Hagemeister
fda7d31aa0 Move infoq into its own file 2013-06-23 21:14:19 +02:00
Philipp Hagemeister
cbf46c737c Move XVideos IE into its own file (and simplify it a bit) 2013-06-23 21:11:47 +02:00
Philipp Hagemeister
7beb36a529 Move Collegehumor IE into its own file 2013-06-23 21:10:21 +02:00
Philipp Hagemeister
153697660d Move Escapist into its own file 2013-06-23 21:08:17 +02:00
Philipp Hagemeister
60a72e8d45 Simplify EscapistIE 2013-06-23 21:06:49 +02:00
Philipp Hagemeister
426ff04282 Move DepositFiles into its own IE 2013-06-23 21:06:20 +02:00
Philipp Hagemeister
a50e1b32e4 Add facebook import 2013-06-23 21:00:34 +02:00
Philipp Hagemeister
9eae41ddef Move Facebook into its own file 2013-06-23 20:59:45 +02:00
Philipp Hagemeister
aad0d6d5ba Move Soundcloud into its own file 2013-06-23 20:57:44 +02:00
Philipp Hagemeister
7aca14a1ec Move G+ IE into its own file, and move google search into a more descriptive module 2013-06-23 20:55:15 +02:00
Philipp Hagemeister
d1596ef439 Add import for google search 2013-06-23 20:51:42 +02:00
Philipp Hagemeister
ea63e4998b Move comedycentral into its own file 2013-06-23 20:51:04 +02:00
Philipp Hagemeister
a08dfd27a8 Move MyVideo into its own file 2013-06-23 20:48:32 +02:00
Philipp Hagemeister
f58848011e Move blip.tv extractors into their own file 2013-06-23 20:44:48 +02:00
Philipp Hagemeister
934858ad86 Move YahooSearchIE to youtube_dl.extractor.yahoo 2013-06-23 20:41:54 +02:00
Philipp Hagemeister
3c25b9abae Remove useless headers 2013-06-23 20:35:50 +02:00
Philipp Hagemeister
3fc03845a1 Move GoogleSearchIE into its own file 2013-06-23 20:32:49 +02:00
Philipp Hagemeister
9b122384e9 Move GenericIE into its own file 2013-06-23 20:31:45 +02:00
Philipp Hagemeister
9f4e6bbaeb Move gametrailers IE into its own file 2013-06-23 20:29:56 +02:00
Philipp Hagemeister
b05654f0e3 Move YoutubeSearchIE to the other youtube IEs 2013-06-23 20:28:15 +02:00
Philipp Hagemeister
9b3a760bbb [arte] Mark dead code as such 2013-06-23 20:26:35 +02:00
Philipp Hagemeister
d5822b96b0 Move ARD, Arte, ZDF into their own files 2013-06-23 20:24:07 +02:00
Philipp Hagemeister
b3d14cbfa7 Move Vimeo into its own file 2013-06-23 20:18:21 +02:00
Philipp Hagemeister
d6039175e5 Move yahoo into its own file 2013-06-23 20:13:52 +02:00
Philipp Hagemeister
97d6faaced Move Photobucket into its own file 2013-06-23 20:12:18 +02:00
Philipp Hagemeister
219b8130df Move DailyMotion into its own file 2013-06-23 20:12:03 +02:00
Philipp Hagemeister
38cbc40a64 Move Metacafe and Statigram into their own files, and remove absolute import 2013-06-23 20:07:51 +02:00
Philipp Hagemeister
93d3a642a9 [youtube] remove dead code 2013-06-23 19:59:40 +02:00
Philipp Hagemeister
c5e8d7af0e Move youtube extractors to youtube_dl.extractor.youtube 2013-06-23 19:58:33 +02:00
Philipp Hagemeister
d6983cb460 Fix generic class move (add all files) 2013-06-23 19:57:38 +02:00
Philipp Hagemeister
dd9829292e Improve vevo message 2013-06-23 19:45:42 +02:00
Philipp Hagemeister
89cb0eb0b6 Use new signature calculation method only if sig is not present 2013-06-23 19:43:18 +02:00
M.Yasoob Khalid
9b5fffb149 added an IE and test for break.com 2013-06-23 22:42:51 +05:00
Philipp Hagemeister
1f90438025 Merge remote-tracking branch 'jaimeMF/vevo_fix' 2013-06-23 19:42:27 +02:00
Philipp Hagemeister
a130adb25b [Statigr.am] Correct uploader id 2013-06-23 19:41:28 +02:00
Philipp Hagemeister
8756c5fe7a Merge remote-tracking branch 'origin/vimeo_passworded_videos' 2013-06-23 19:00:16 +02:00
Philipp Hagemeister
828dba2983 Improve error reporting 2013-06-23 18:59:01 +02:00
Philipp Hagemeister
6b3f5a329b Improve Statigr.am IE 2013-06-23 18:58:53 +02:00
Philipp Hagemeister
63ef586b05 Merge remote-tracking branch 'yasoob/master' 2013-06-23 18:45:50 +02:00
Philipp Hagemeister
383a6a61b1 Merge pull request #905 from rbrito/manpage-apropos
README: Add brief description for manpages/apropos.
2013-06-23 09:41:59 -07:00
M.Yasoob Khalid
4fdd4e6f6f added test for Statigr 2013-06-23 18:56:26 +05:00
M.Yasoob Khalid
01ba4b80a7 added StatigrIE 2013-06-23 18:02:55 +05:00
M.Yasoob Khalid
de66764e4e added StatigrIE 2013-06-23 17:46:14 +05:00
Jaime Marquínez Ferrándiz
1037d53988 GenericIE: look for Open Graph info
Only if there is a direct link to the file, don't try if it points to a Flash player
2013-06-23 13:26:49 +02:00
Jaime Marquínez Ferrándiz
c3ab8f866c Change metavar of "--sub-format" from LANG to FORMAT 2013-06-23 12:59:20 +02:00
Rogério Brito
94eb2dd1fe README: Add brief description for manpages/apropos.
Trying to mimic the manpage of (GNU) `ls`, we don't conjugate the verb as
"downloads" or something else.

Signed-off-by: Rogério Brito <rbrito@ime.usp.br>
2013-06-22 19:16:11 -03:00
Jaime Marquínez Ferrándiz
346b5ce8fd YoutubeIE: report warnings instead of errors if the subtitles are not found (related #901)
For example when downloading a playlist some videos may not have subtitles but the download shouldn't stop.
2013-06-22 14:15:33 +02:00
Jaime Marquínez Ferrándiz
b37fbb990b Move the decrypting function to a static method 2013-06-22 13:20:06 +02:00
Jaime Marquínez Ferrándiz
ef75f76f5c Detect more vevo videos 2013-06-22 13:13:40 +02:00
Jaime Marquínez Ferrándiz
e296100005 Merge pull request #888 from rg3/youtube_playlists_fix_886
YoutubePlaylistIE: try to extract the url of the entries from the media$group dictionary (closes #886)
2013-06-22 03:35:32 -07:00
Jaime Marquínez Ferrándiz
953dd93a48 YoutubePlaylistIE: don't look into entry['content']['src'], according to the docs this can return live stream urls 2013-06-22 12:32:27 +02:00
Jaime Marquínez Ferrándiz
e704f4d378 YoutubeIE: If no subtitles language is given, default to English for automatic captions (related #901) 2013-06-22 12:14:24 +02:00
Jaime Marquínez Ferrándiz
77d0f05f71 YoutubeIE: Detect new Vevo style videos
The url_encoded_fmt_stream_map can be found in the video page, but the signature must be decrypted. We get it from the webpage instead of the `get_video_info` pages because we have only discovered the algorithm for keys with both sub keys of size 43.
2013-06-21 21:51:10 +02:00
Philipp Hagemeister
50d2376769 Leave out sig if not present (#896) 2013-06-21 01:22:47 +02:00
Philipp Hagemeister
759d525301 release 2013.06.21 2013-06-21 00:33:44 +02:00
Philipp Hagemeister
fcfa188548 Show which IEs are slow during release 2013-06-21 00:29:31 +02:00
Jaime Marquínez Ferrándiz
f4c8bbcfc2 TEDIE: download the best quality video and use the new _search_regex functions
Also extracts the description.
2013-06-20 20:51:20 +02:00
Jaime Marquínez Ferrándiz
31eead52e7 YoutubePlaylistIE: try to extract the url of the entries from the media$group dictionary
Extracting it from content can return rtsp urls.
2013-06-20 17:23:27 +02:00
Jaime Marquínez Ferrándiz
038a3a1a61 RBMARadioIE: fix the extraction of the JSON data 2013-06-20 14:37:43 +02:00
Jaime Marquínez Ferrándiz
587c68b2cd DailymotionIE: fix the extraction of the video uploader and use _search_regex for getting it 2013-06-20 14:15:29 +02:00
Jaime Marquínez Ferrándiz
377fdf5dde Update the TumblrIE: the video is no longer available 2013-06-20 14:02:21 +02:00
Jaime Marquínez Ferrándiz
5c67601931 Revert "Fix GooglePlusIE: the video_page url has changed of place"
The old method is working again.

This reverts commit 449d5c910c.
2013-06-20 13:53:04 +02:00
Jaime Marquínez Ferrándiz
68f54207a3 SteamIE: only verify the age if needed
Also use the _html_search_regex function
2013-06-20 13:43:44 +02:00
Philipp Hagemeister
bb47437686 Ignore invalid dates (Fixes #894) 2013-06-19 22:13:16 +02:00
Jaime Marquínez Ferrándiz
213b715893 Merge pull request #887 from anisse/master
Fetch all entries that are in a youtube playlist

Also add a test.
2013-06-19 12:52:44 +02:00
Jaime Marquínez Ferrándiz
449d5c910c Fix GooglePlusIE: the video_page url has changed of place 2013-06-18 14:22:16 +02:00
Filippo Valsorda
0251f9c9c0 add _search_regex to the new IEs 2013-06-17 19:47:44 +02:00
Filippo Valsorda
8bc7c3d858 Merge branch 'search_regex' - PR #872 - closes #847 2013-06-17 19:28:18 +02:00
Filippo Valsorda
af44c94862 use _search_regex in GenericIE 2013-06-17 19:25:35 +02:00
Jaime Marquínez Ferrándiz
36ed7177f0 Fix HypemIE test: the song name has been changed 2013-06-16 20:42:28 +02:00
Jaime Marquínez Ferrándiz
32aa88bcae Add GametrailersIE 2013-06-16 20:34:45 +02:00
Jaime Marquínez Ferrándiz
51090d636b VimeoIE: allow to download password protected videos 2013-06-15 11:35:14 +02:00
Jaime Marquínez Ferrándiz
31513ea6b9 Update test_issue_673 in Youtube Lists
Some videos have been removed.
Delete the title check, it's not the purpose of that test.
2013-06-15 11:20:22 +02:00
Anisse Astier
88cebbd7b8 YoutubePlaylistIE: get *all* videos
For that, we add parameter safeSearch=none that asks youtube not filter
results before sending them to us.

Note: this parameter could be added to YoutubeSearchIE and YoutubeUserIE
as well, but I don't know what would be the impact in term of unwanted
results. Maybe expose that as a parameter? For a playlist it's different
since the user chose what she put in the playlist.
2013-06-13 23:45:32 +02:00
Jaime Marquínez Ferrándiz
fb8f7280bc GenericIE: try to find videos from twitter cards info 2013-06-13 08:26:39 +02:00
Jaime Marquínez Ferrándiz
f380401bbd YoutubeSearchIE: the query is a str, in python 3 it fails if decode is called 2013-06-11 19:15:07 +02:00
Jaime Marquínez Ferrándiz
9abc6c8b31 Update YahooIE test
The old test video is no longer available.
2013-06-10 19:42:02 +02:00
Philipp Hagemeister
8cd252f115 Use long rtmpdump options
Note that we accidentally called rtmpdump with -v (--live) instead of -V (--verbose) because we missed this.
2013-06-10 18:14:45 +02:00
Philipp Hagemeister
53f72b11e5 Allow unsetting the proxy with the --proxy option 2013-06-09 23:43:18 +02:00
Filippo Valsorda
ee55fcbe12 switch long info_dict fields checking to md5 2013-06-09 15:03:54 +02:00
Filippo Valsorda
78d3442b12 test: extend the reach of info_dict checking
* print the info_dict in a format suitable to easy adding to tests.json during tests if un-tested fields are detected
* make it possible to put the crc32 in tests.json if the field is too long
* complete the "info_dict" fields in existing tests
* fixed the bugs caught while doing this
2013-06-09 14:21:42 +02:00
Filippo Valsorda
979a9dd4c4 _html_search_regex with clean_html superpowers 2013-06-09 11:57:13 +02:00
Filippo Valsorda
d5979c5d55 do not ask the user to report network errors 2013-06-09 11:55:08 +02:00
Jaime Marquínez Ferrándiz
8027175600 Set the extractor key in playlists entries
If they were videos the extractor key wasn't being set anywhere else
Closes 877
2013-06-08 12:08:44 +02:00
Jaime Marquínez Ferrándiz
3054ff0cbe Merge pull request #853 from mc2avr/master
add ZDFIE
2013-06-08 11:44:01 +02:00
Jaime Marquínez Ferrándiz
cd453d38bb Merge pull request #878 from yasoob/master
Added Vbox7.com InfoExtractor and tests.
2013-06-08 10:54:47 +02:00
Filippo Valsorda
f5a290eed9 print "please report this issue on GitHub" on every ExtractorError 2013-06-08 09:56:34 +02:00
M.Yasoob Khalid
ecb3e676a5 Added Vbox7 Infoextractor 2013-06-08 12:44:38 +05:00
Filippo Valsorda
8b59a98610 XHamster: Can't see the description anywhere in the UI 2013-06-07 12:47:12 +02:00
Filippo Valsorda
8409501206 use search_regex in new IEs 2013-06-07 12:47:12 +02:00
Filippo Valsorda
be95cac157 raise exceptions on warnings during tests - and solve a couple of them 2013-06-07 12:46:23 +02:00
Filippo Valsorda
476203d025 print WARNINGs during test + minor fix to NBAIE 2013-06-06 15:07:05 +02:00
Filippo Valsorda
468e2e926b implement fallbacks and defaults in _search_regex 2013-06-06 14:35:08 +02:00
Anna Bernardi
ac3e9394e7 Implement search_regex from #847 2013-06-06 14:01:44 +02:00
Filippo Valsorda
868d62a509 style and error handling edits to HypemIE 2013-06-06 12:02:36 +02:00
M.Yasoob Khalid
157b864a01 added HypemIE
rebased, closes PR #871
2013-06-06 12:01:07 +02:00
Filippo Valsorda
951b9dfd94 Merge pull request #866 from yasoob/master
Added support for XHamster - closes #841
2013-06-04 10:39:31 -07:00
Filippo Valsorda
1142d31164 Merge pull request #863 from davidcl/master
Add some tests to match Justin.tv / Twitch.tv URLs
2013-06-04 10:36:36 -07:00
Jaime Marquínez Ferrándiz
9131bde941 SpiegelE: the page layout has changed a bit 2013-06-04 19:31:06 +02:00
Jaime Marquínez Ferrándiz
1132c10dc2 Merge pull request #864 from jacobian/vimeopro
Fixed an error downloading vimeo pro videos.
2013-06-04 10:15:12 -07:00
M.Yasoob Ullah Khalid
c978a96c02 Added test for XHamster.com 2013-06-04 17:33:02 +05:00
M.Yasoob Ullah Khalid
71e458d437 Added support for xhamster in infoextractors 2013-06-04 17:30:54 +05:00
Clément DAVID
57bde0d9c7 Fix the test_all_urls (Import issue) 2013-06-04 13:10:12 +02:00
Clément DAVID
50b4d25980 Merge within test_all_urls 2013-06-04 13:06:49 +02:00
Jaime Marquínez Ferrándiz
eda60e8251 VimeoIE: support videos from vimeopro.com 2013-06-04 12:04:54 +02:00
Jacob Kaplan-Moss
c794cbbb19 Fixed an error downloading vimeo pro videos. 2013-06-03 18:03:59 -05:00
Clément DAVID
4a76d1dbe5 Add tests for justin.tv and twitch.tv 2013-06-03 22:16:55 +02:00
Jaime Marquínez Ferrándiz
418f734a58 Merge pull request #854 from rg3/youtube_automatic_captions
YoutubeIE: fallback to automatic captions when subtitles aren't found
2013-06-01 14:18:27 -07:00
Jaime Marquínez Ferrándiz
dc1c355b72 YoutubeIE: fallback to automatic captions when subtitles aren't found (closes #843)
Also modify test_youtube_subtitles to support running the tests in any order.
2013-05-31 17:03:40 +02:00
Jaime Marquínez Ferrándiz
1b2b22ed9f BlipTV: accept urls in the format http://a.blip.tv/api.swf#{id} (closes #857)
Tweak the regex so that BlipTV can be before BlipTVUser.
2013-05-28 15:12:39 +02:00
mc2avr
f2cd958c0a add ZDFIE and _download_with_mplayer(mms://,rtsp://) 2013-05-23 21:42:03 +02:00
Philipp Hagemeister
57adeaea87 release 2013.05.23 2013-05-23 13:37:19 +02:00
Philipp Hagemeister
8f3f1aef05 Fix HowCast IE 2013-05-23 13:34:33 +02:00
Filippo Valsorda
51d2453c7a small tweaks 2013-05-21 16:07:27 +02:00
Jaime Marquínez Ferrándiz
45014296be Add TeamcocoIE (closes #212) 2013-05-21 14:37:32 +02:00
Anna Bernardi
afef36c950 add support for Flickr videos - closes #261 2013-05-20 23:19:38 +02:00
Filippo Valsorda
b31756c18e Python 2 compat fixes for MyVideo.de rtmpdump downloads 2013-05-20 11:57:10 +02:00
Filippo Valsorda
f008688520 make rtmpdump inherit the verbose option for debugging 2013-05-20 11:54:21 +02:00
Filippo Valsorda
5b68ea215b Merge pull request #842 - myvideo, rtmp support
@dersphere code, from dersphere/plugin.video.myvideo_de.git
rewritten by @mc2avr
released in the Public Domain by the author
ref: https://github.com/rg3/youtube-dl/pull/842
2013-05-20 09:49:58 +02:00
Jaime Marquínez Ferrándiz
b1d568f0bc HowcastIE: extract thumbnail 2013-05-20 08:39:41 +02:00
Jaime Marquínez Ferrándiz
17bd1b2f41 VineIE: extract more information and minor style changes 2013-05-20 08:31:03 +02:00
Anna Bernardi
5b0d3cc0cd Add support for Vine - closes #845 2013-05-20 00:33:14 +02:00
Filippo Valsorda
d4f76f1674 Add support for Howcast.com - closes #835 2013-05-18 19:17:19 +02:00
Jaime Marquínez Ferrándiz
340fa21198 UstreamIE: get thumbnail and uploader name 2013-05-18 11:54:18 +02:00
mc2avr
de5d66d431 MyVideoIE: add rtmp support 2013-05-15 23:38:44 +02:00
Jaime Marquínez Ferrándiz
7bdb17d4d5 Add extra_info argument to extract_info and process_ie_result
It allows to update the info_dicts with other values

(closes #840)
2013-05-14 14:40:40 +02:00
Philipp Hagemeister
419c64b107 Throw a better error if the protocol is invalid 2013-05-13 19:54:07 +02:00
Philipp Hagemeister
99a5ae3f8e Simplify generic search IE (Closes #839) 2013-05-13 19:53:52 +02:00
Philipp Hagemeister
c7563c528b Merge remote-tracking branch 'jaimeMF/SearchIE' 2013-05-13 19:43:35 +02:00
Jaime Marquínez Ferrándiz
e30e9318da Add base class SearchInfoExtractor for search queries IEs 2013-05-13 14:58:44 +02:00
Philipp Hagemeister
5c51028d38 release 2013.05.14 2013-05-13 13:50:05 +02:00
Philipp Hagemeister
c1d58e1c67 Merge pull request #834 from chocolateboy/install_prefix_fix
only install to /etc if PREFIX is /usr or /usr/local
2013-05-13 00:42:24 -07:00
Philipp Hagemeister
02030ff7fe release 2013.05.13 2013-05-13 09:38:27 +02:00
Philipp Hagemeister
f45c185fa9 Do not re-encode / to # if / is a platform separator, and correctly handle permission errors (Fixes #831) 2013-05-13 09:20:08 +02:00
Philipp Hagemeister
1bd96c3a60 Deprecate --only-sub 2013-05-13 09:06:18 +02:00
Jaime Marquínez Ferrándiz
929f85d851 Remove a print call used for debugging 2013-05-12 20:56:54 +02:00
Jaime Marquínez Ferrándiz
98d4a4e6bc YoutubeSearchIE: return a playlist (related #838) 2013-05-12 20:53:37 +02:00
Jaime Marquínez Ferrándiz
fb2f83360c FFmpegPostProcessor: decode stderr first and then get the last line (closes #837) 2013-05-12 19:08:32 +02:00
Jaime Marquínez Ferrándiz
3c5e7729e1 GoogleSearchIE: change query urls to http://www.google.com/search
The old one was giving HTTP 404 errors
2013-05-12 18:44:56 +02:00
Jaime Marquínez Ferrándiz
5a853e1423 Fix YahooSearchIE: (closes #300) 2013-05-12 17:49:35 +02:00
Jaime Marquínez Ferrándiz
2f58b12dad YahooIE: support more videos 2013-05-12 17:05:43 +02:00
Jaime Marquínez Ferrándiz
59f4fd4dc6 YahooIE: remove old code and accept screen.yahoo.com videos (#300)
Videos require rtmpdump
2013-05-12 14:05:14 +02:00
chocolateboy
5738240ee8 only install to /etc if PREFIX is /usr or /usr/local 2013-05-10 23:05:58 +01:00
Philipp Hagemeister
86fd453ea8 Merge remote-tracking branch 'origin/master' 2013-05-10 09:21:24 +02:00
Philipp Hagemeister
c83411b9ee Skip bandcamp tests for now - free limit has been exceeded 2013-05-10 09:10:34 +02:00
Jaime Marquínez Ferrándiz
057c9938a1 Import FileDownloader in test_youtube_subtitles
Fix last commit
2013-05-10 08:37:49 +02:00
Jaime Marquínez Ferrándiz
9259966132 test_youtube_subtitles: FakeDownloader inherits from FileDownloader 2013-05-10 08:31:30 +02:00
Philipp Hagemeister
b08980412e Merge pull request #826 from jakeogh/master
Added --get-id option to print video IDs
2013-05-09 16:52:54 -07:00
Philipp Hagemeister
532a1e0429 release 2013.05.10 2013-05-10 01:45:21 +02:00
Filippo Valsorda
2a36c352a0 Retry to disable YT ratelimit to unlock full bandwidth
This is the second attempt: a60b854d90
Sometimes the ratelimit=yes is already in the URL, and doubling it
leads to a 403. Now should work on all videos, at least works on all
I could test.

Closes #648
2013-05-09 00:39:10 +02:00
jakeogh
1a2adf3f49 added --get-id option to print video IDs 2013-05-05 22:30:07 -07:00
Jaime Marquínez Ferrándiz
43b62accbb GoogleSearchIE: rename _download_n_results to _get_n_results 2013-05-05 22:12:41 +02:00
Jaime Marquínez Ferrándiz
be74864ace Credit @JohnyMoSwag for WorldstarhiphopIE (#730) 2013-05-05 21:56:38 +02:00
Philipp Hagemeister
0ae456f08a Credit @julienfr112 for Ina IE (#823) 2013-05-05 21:35:50 +02:00
Philipp Hagemeister
0f75d25991 release 2013.05.07 2013-05-05 21:13:16 +02:00
Philipp Hagemeister
67129e4a15 release 2013.05.06 2013-05-05 21:01:46 +02:00
Philipp Hagemeister
dfb9323cf9 Clean up InaIE (Closes #823) 2013-05-05 20:57:19 +02:00
julien
7f5bd09baf Add support to www.ina.fr 2013-05-05 20:54:36 +02:00
Philipp Hagemeister
02d5eb935f Merge remote-tracking branch 'origin/master'
Conflicts:
	youtube_dl/InfoExtractors.py
2013-05-05 20:51:27 +02:00
Philipp Hagemeister
94ca71b7cc Fix GoogleSearchIE (Fixes #822) 2013-05-05 20:49:57 +02:00
Philipp Hagemeister
b338f1b154 FileDownloader: Simplify and document 2013-05-05 20:49:42 +02:00
Jaime Marquínez Ferrándiz
486f0c9476 More callbacks changed to raise ExtractorError 2013-05-05 13:59:25 +02:00
Jaime Marquínez Ferrándiz
d96680f58d PhotobucketIE: accept new format of urls and add a test 2013-05-05 13:07:00 +02:00
Jaime Marquínez Ferrándiz
f8602d3242 ArteTvIE: Fix format of upload date 2013-05-05 11:48:47 +02:00
Jaime Marquínez Ferrándiz
0c021ad171 More callbacks changed to raise ExtractorError 2013-05-04 14:23:16 +02:00
Philipp Hagemeister
086d7b4500 Merge pull request #802 from joeframbach/master
If path and new_path are the same, then dont delete the file
2013-05-04 03:35:19 -07:00
Philipp Hagemeister
891629c84a release 2013.05.05 2013-05-04 12:31:17 +02:00
Philipp Hagemeister
ea6d901e51 Add --no-check-certificate (#814) 2013-05-04 12:22:56 +02:00
Philipp Hagemeister
4539dd30e6 twitch.tv chapters (#810): print out start and end time 2013-05-04 12:02:18 +02:00
Philipp Hagemeister
c43e57242e twitch.tv chapters: Include uploader (#810) 2013-05-04 11:44:59 +02:00
Philipp Hagemeister
db8fd71ca9 twitch.tv chapters: Use API for title and other metadata 2013-05-04 11:42:44 +02:00
Philipp Hagemeister
f4f316881d Improve Twitch.tv chapter support (#810) 2013-05-04 11:27:39 +02:00
Philipp Hagemeister
0e16f09474 Work on twitch.tv chapters (#810) 2013-05-04 10:36:37 +02:00
Philipp Hagemeister
09dd418f53 Experimentally whitelist Escapist test 2013-05-04 09:11:38 +02:00
Philipp Hagemeister
decd1d1737 raise ExtractorError instead of calling back 2013-05-04 08:38:28 +02:00
Philipp Hagemeister
180e689f7e Simplify WorldStarHipHop 2013-05-04 08:06:56 +02:00
Johny Mo Swag
7da5556ac2 Better fix for getting source url's 2013-05-04 08:04:28 +02:00
Johny Mo Swag
f23a03a89b updated regular expression for possible future updates to source url 2013-05-04 07:59:33 +02:00
Philipp Hagemeister
84e4682f0e Always use HTTPS for youtube (Fixes #691) 2013-05-04 07:49:25 +02:00
Philipp Hagemeister
1f99511210 release 2013.05.04 2013-05-04 07:12:33 +02:00
Philipp Hagemeister
0d94f2474c Work around a Python bug on Windows with UTF-8 configuration (#820) 2013-05-04 07:09:50 +02:00
Philipp Hagemeister
480b6c1e8b Fix comedycentral: newest 2013-05-04 02:53:26 +02:00
Philipp Hagemeister
95464f14d1 Credit @yasoob for IE 2013-05-03 20:08:16 +02:00
Philipp Hagemeister
c34407d16c Simplify RedTube 2013-05-03 20:07:35 +02:00
M.Yasoob Ullah Khalid
5e34d2ebbf Moved redtube info extractor to the end 2013-05-03 23:57:16 +06:00
M.Yasoob Ullah Khalid
815dd2ffa8 Redtube test now works
I just did a little makeover by changing redtube tests. Now they are passed.
2013-05-03 23:51:27 +06:00
M.Yasoob Ullah Khalid
ecd5fb49c5 added redtube.com in InfoExtractors (2nd pull request with the required amendments)
added redtube.com in InfoExtractors (2nd pull request with the required amendments). Now this script can also download redtube.com videos
2013-05-03 22:44:34 +06:00
M.Yasoob Ullah Khalid
b86174e7a3 added test for redtube.com
I just added the test for redtube.com
2013-05-03 22:40:56 +06:00
Jaime Marquínez Ferrándiz
2e2038dc35 TEDIE: report the correct talk title when a link with the language code is given 2013-05-02 18:28:07 +02:00
Jaime Marquínez Ferrándiz
46bfb42258 InfoExtractors: use _download_webpage in more IEs
IEs without tests are intact.
2013-05-02 18:18:27 +02:00
Jaime Marquínez Ferrándiz
feecf22511 InfoExtractors: fix some regular expressions where dots weren't escaped 2013-05-02 13:39:56 +02:00
Jaime Marquínez Ferrándiz
4c4f15eb78 Merge pull request #815 from JohnyMoSwag/master
Update for new source links on worldstarhiphop.com
2013-05-02 13:23:32 +02:00
Jaime Marquínez Ferrándiz
104ccdb8b4 TumblrIE: fix title matching 2013-05-02 13:12:41 +02:00
Johny Mo Swag
6ccff79594 Small update for addition of new video source links 2013-05-01 20:30:14 -07:00
Jaime Marquínez Ferrándiz
aed523ecc1 Add BandcampIE (closes #568) 2013-05-01 15:55:46 +02:00
Philipp Hagemeister
d496a75d0a release 2013.05.01 2013-05-01 14:07:23 +02:00
Philipp Hagemeister
5c01dd1e73 Merge remote-tracking branch 'origin/master' 2013-05-01 14:05:02 +02:00
Philipp Hagemeister
11d9224e3b add --write-thumbnail option to download thumbnail (Suggested by `) 2013-05-01 14:04:33 +02:00
Jaime Marquínez Ferrándiz
34c29ba1d7 Add test for SoundcloudSet 2013-04-30 21:23:38 +02:00
Philipp Hagemeister
6cd657f9f2 release 2013.04.31 2013-04-30 19:50:20 +02:00
Philipp Hagemeister
4ae9e55822 Correctly clear the line before writing a new status line 2013-04-30 19:42:58 +02:00
Philipp Hagemeister
8749b71273 Fix FakeDownloaders 2013-04-30 19:42:13 +02:00
Philipp Hagemeister
dbc50fdf82 Fix help for --proxy 2013-04-30 18:27:54 +02:00
Philipp Hagemeister
b1d2ef9255 release 2013.04.30 2013-04-30 18:00:56 +02:00
Philipp Hagemeister
5fb16555af --proxy option 2013-04-30 17:57:13 +02:00
Jaime Marquínez Ferrándiz
ba7c775a04 Remove a commented line I forgot.
[ci skip]
2013-04-30 14:21:46 +02:00
Jaime Marquínez Ferrándiz
fe348844d9 SoundcloudSetIE: Use upload_date in the unified format (fixes #812) 2013-04-29 23:57:36 +02:00
Jaime Marquínez Ferrándiz
767e00277f Use report_warning when a non-working IE will be used 2013-04-28 17:12:07 +02:00
Philipp Hagemeister
6ce533a220 release 2013.04.28 2013-04-28 16:32:05 +02:00
Philipp Hagemeister
08b2ac745a Default to --title (Fixes #499) 2013-04-28 16:26:11 +02:00
Philipp Hagemeister
46a127eecb Fix print_notes 2013-04-28 16:21:29 +02:00
Joe Frambach
e74c504f91 Dont delete source file when source file and post-processed file are the same 2013-04-24 21:59:10 +00:00
75 changed files with 5941 additions and 4512 deletions

View File

@@ -9,9 +9,19 @@ cleanall: clean
PREFIX=/usr/local
BINDIR=$(PREFIX)/bin
MANDIR=$(PREFIX)/man
SYSCONFDIR=/etc
PYTHON=/usr/bin/env python
# set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local
ifeq ($(PREFIX),/usr)
SYSCONFDIR=/etc
else
ifeq ($(PREFIX),/usr/local)
SYSCONFDIR=/etc
else
SYSCONFDIR=$(PREFIX)/etc
endif
endif
install: youtube-dl youtube-dl.1 youtube-dl.bash-completion
install -d $(DESTDIR)$(BINDIR)
install -m 755 youtube-dl $(DESTDIR)$(BINDIR)
@@ -30,15 +40,15 @@ tar: youtube-dl.tar.gz
pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1
youtube-dl: youtube_dl/*.py
zip --quiet youtube-dl youtube_dl/*.py
youtube-dl: youtube_dl/*.py youtube_dl/*/*.py
zip --quiet youtube-dl youtube_dl/*.py youtube_dl/*/*.py
zip --quiet --junk-paths youtube-dl youtube_dl/__main__.py
echo '#!$(PYTHON)' > youtube-dl
cat youtube-dl.zip >> youtube-dl
rm youtube-dl.zip
chmod a+x youtube-dl
README.md: youtube_dl/*.py
README.md: youtube_dl/*.py youtube_dl/*/*.py
COLUMNS=80 python -m youtube_dl --help | python devscripts/make_readme.py
README.txt: README.md
@@ -47,7 +57,7 @@ README.txt: README.md
youtube-dl.1: README.md
pandoc -s -f markdown -t man README.md -o youtube-dl.1
youtube-dl.bash-completion: youtube_dl/*.py devscripts/bash-completion.in
youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in
python devscripts/bash-completion.py
bash-completion: youtube-dl.bash-completion

View File

@@ -1,7 +1,7 @@
% YOUTUBE-DL(1)
# NAME
youtube-dl
youtube-dl - download videos from youtube.com or other video platforms
# SYNOPSIS
**youtube-dl** [OPTIONS] URL [URL...]
@@ -31,6 +31,8 @@ which means you can modify it, redistribute it or use it however you like.
is restricted to one domain
--list-extractors List all supported extractors and the URLs they
would handle
--proxy URL Use the specified HTTP/HTTPS proxy
--no-check-certificate Suppress HTTPS certificate validation.
## Video Selection:
--playlist-start NUMBER playlist video to start at (default is 1)
@@ -49,8 +51,8 @@ which means you can modify it, redistribute it or use it however you like.
--dateafter DATE download only videos uploaded after this date
## Filesystem Options:
-t, --title use title in file name
--id use video ID in file name
-t, --title use title in file name (default)
--id use only video ID in file name
-l, --literal [deprecated] alias of --title
-A, --auto-number number downloaded files starting from 00000
-o, --output TEMPLATE output filename template. Use %(title)s to get
@@ -83,6 +85,7 @@ which means you can modify it, redistribute it or use it however you like.
file modification time
--write-description write video description to a .description file
--write-info-json write video metadata to a .info.json file
--write-thumbnail write thumbnail image to disk
## Verbosity / Simulation Options:
-q, --quiet activates quiet mode
@@ -91,6 +94,7 @@ which means you can modify it, redistribute it or use it however you like.
--skip-download do not download the video
-g, --get-url simulate, quiet but print URL
-e, --get-title simulate, quiet but print title
--get-id simulate, quiet but print id
--get-thumbnail simulate, quiet but print thumbnail URL
--get-description simulate, quiet but print video description
--get-filename simulate, quiet but print output filename
@@ -112,12 +116,12 @@ which means you can modify it, redistribute it or use it however you like.
-F, --list-formats list all available formats (currently youtube
only)
--write-sub write subtitle file (currently youtube only)
--only-sub downloads only the subtitles (no video)
--only-sub [deprecated] alias of --skip-download
--all-subs downloads all the available subtitles of the
video (currently youtube only)
--list-subs lists all available subtitles for the video
(currently youtube only)
--sub-format LANG subtitle format [srt/sbv] (default=srt)
--sub-format FORMAT subtitle format [srt/sbv] (default=srt)
(currently youtube only)
--sub-lang LANG language of the subtitles to download (optional)
use IETF language tags like 'en'

View File

@@ -14,6 +14,10 @@
set -e
skip_test=false
if [ "$2" == '--skip-test' ]; then
skip_test=true
fi
if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi
version="$1"
if [ ! -z "`git tag | grep "$version"`" ]; then echo 'ERROR: version already present'; exit 1; fi
@@ -22,7 +26,11 @@ if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit
/bin/echo -e "\n### First of all, testing..."
make cleanall
nosetests --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1
# Run the test suite unless the caller passed --skip-test as the second argument.
# The flag variable set earlier in this script is `skip_test`; it holds the
# command name `true` or `false`, so it can be executed directly as the condition.
# (Testing a misspelled, unset variable would expand to an empty command, which
# succeeds, and the tests would then be skipped unconditionally.)
if $skip_test; then
    echo 'SKIPPING TESTS'
else
    nosetests --verbose --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1
fi
/bin/echo -e "\n### Changing version in version.py..."
sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py

View File

@@ -7,7 +7,7 @@ import unittest
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.InfoExtractors import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE
from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE
class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
@@ -29,6 +29,22 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec'))
self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM/videos'))
def test_justin_tv_channelid_matching(self):
    """Channel URLs must be accepted by JustinTVIE.

    Covers both the justin.tv and twitch.tv hosts, with and without the
    scheme, the "www." prefix and a trailing slash.
    """
    channel_urls = (
        u"justin.tv/vanillatv",
        u"twitch.tv/vanillatv",
        u"www.justin.tv/vanillatv",
        u"www.twitch.tv/vanillatv",
        u"http://www.justin.tv/vanillatv",
        u"http://www.twitch.tv/vanillatv",
        u"http://www.justin.tv/vanillatv/",
        u"http://www.twitch.tv/vanillatv/",
    )
    # Checked in the same order as listed; the first rejected URL fails the test.
    for channel_url in channel_urls:
        self.assertTrue(JustinTVIE.suitable(channel_url))
# A twitch.tv video URL of the form "<channel>/b/<id>" must be accepted by JustinTVIE.
def test_justintv_videoid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483"))
# A twitch.tv chapter URL of the form "<channel>/c/<id>" must be accepted by JustinTVIE.
def test_justin_tv_chapterid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361"))
def test_youtube_extract(self):
self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')
self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc')

View File

@@ -7,14 +7,14 @@ import os
import json
import unittest
import sys
import hashlib
import socket
import binascii
# Allow direct execution
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import youtube_dl.FileDownloader
import youtube_dl.InfoExtractors
import youtube_dl.extractor
from youtube_dl.utils import *
DEF_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tests.json')
@@ -38,11 +38,16 @@ def _try_rm(filename):
if ose.errno != errno.ENOENT:
raise
def md5(s):
    """Return the hexadecimal MD5 digest of the UTF-8 encoding of the text *s*."""
    encoded = s.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()
class FileDownloader(youtube_dl.FileDownloader):
def __init__(self, *args, **kwargs):
self.to_stderr = self.to_screen
self.processed_info_dicts = []
return youtube_dl.FileDownloader.__init__(self, *args, **kwargs)
def report_warning(self, message):
# Don't accept warnings during tests
raise ExtractorError(message)
def process_info(self, info_dict):
self.processed_info_dicts.append(info_dict)
return youtube_dl.FileDownloader.process_info(self, info_dict)
@@ -67,7 +72,7 @@ class TestDownload(unittest.TestCase):
def generator(test_case):
def test_template(self):
ie = youtube_dl.InfoExtractors.get_info_extractor(test_case['name'])#getattr(youtube_dl.InfoExtractors, test_case['name'] + 'IE')
ie = youtube_dl.extractor.get_info_extractor(test_case['name'])
if not ie._WORKING:
print('Skipping: IE marked as not _WORKING')
return
@@ -82,7 +87,7 @@ def generator(test_case):
params.update(test_case.get('params', {}))
fd = FileDownloader(params)
for ie in youtube_dl.InfoExtractors.gen_extractors():
for ie in youtube_dl.extractor.gen_extractors():
fd.add_info_extractor(ie)
finished_hook_called = set()
def _hook(status):
@@ -120,8 +125,25 @@ def generator(test_case):
self.assertEqual(md5_for_file, tc['md5'])
with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof:
info_dict = json.load(infof)
for (info_field, value) in tc.get('info_dict', {}).items():
self.assertEqual(value, info_dict.get(info_field))
for (info_field, expected) in tc.get('info_dict', {}).items():
if isinstance(expected, compat_str) and expected.startswith('md5:'):
self.assertEqual(expected, 'md5:' + md5(info_dict.get(info_field)))
else:
got = info_dict.get(info_field)
self.assertEqual(
expected, got,
u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
# If checkable fields are missing from the test case, print the info_dict
test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
for key, value in info_dict.items()
if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location'))
if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()):
sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=2) + u'\n')
# Check for the presence of mandatory fields
for key in ('id', 'url', 'title', 'ext'):
self.assertTrue(key in info_dict.keys() and info_dict[key])
finally:
for tc in test_cases:
_try_rm(tc['file'])

View File

@@ -10,7 +10,7 @@ import unittest
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import youtube_dl.FileDownloader
import youtube_dl.InfoExtractors
import youtube_dl.extractor
from youtube_dl.utils import *
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
@@ -48,7 +48,7 @@ class TestInfoJSON(unittest.TestCase):
self.tearDown()
def test_info_json(self):
ie = youtube_dl.InfoExtractors.YoutubeIE()
ie = youtube_dl.extractor.YoutubeIE()
fd = FileDownloader(params)
fd.add_info_extractor(ie)
fd.download([TEST_ID])

View File

@@ -8,7 +8,7 @@ import json
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.InfoExtractors import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE
from youtube_dl.extractor import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE
from youtube_dl.utils import *
from youtube_dl.FileDownloader import FileDownloader
@@ -29,7 +29,7 @@ class FakeDownloader(FileDownloader):
self.params = parameters
def to_screen(self, s):
print(s)
def trouble(self, s):
def trouble(self, s, tb=None):
raise Exception(s)
def extract_info(self, url):
self.result.append(url)
@@ -53,8 +53,7 @@ class TestYoutubeLists(unittest.TestCase):
dl = FakeDownloader()
ie = YoutubePlaylistIE(dl)
result = ie.extract('PLBB231211A4F62143')[0]
self.assertEqual(result['title'], 'Team Fortress 2')
self.assertTrue(len(result['entries']) > 40)
self.assertTrue(len(result['entries']) > 25)
def test_youtube_playlist_long(self):
dl = FakeDownloader()
@@ -105,5 +104,11 @@ class TestYoutubeLists(unittest.TestCase):
result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
self.assertTrue(len(result['entries']) >= 320)
def test_youtube_safe_search(self):
    """The playlist PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl must yield exactly two entries."""
    downloader = FakeDownloader()
    playlist_ie = YoutubePlaylistIE(downloader)
    # extract() returns a list; the playlist result is its first element.
    playlist = playlist_ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0]
    entry_count = len(playlist['entries'])
    self.assertEqual(entry_count, 2)
if __name__ == '__main__':
unittest.main()

View File

@@ -10,8 +10,9 @@ import hashlib
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.InfoExtractors import YoutubeIE
from youtube_dl.extractor import YoutubeIE
from youtube_dl.utils import *
from youtube_dl import FileDownloader
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
@@ -24,13 +25,15 @@ proxy_handler = compat_urllib_request.ProxyHandler()
opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
compat_urllib_request.install_opener(opener)
class FakeDownloader(object):
class FakeDownloader(FileDownloader):
def __init__(self):
self.result = []
self.params = parameters
# Different instances of the downloader can't share the same dictionary
# because some tests set the "subtitleslang" parameter, which would break the md5 checks.
self.params = dict(parameters)
def to_screen(self, s):
print(s)
def trouble(self, s):
def trouble(self, s, tb=None):
raise Exception(s)
def download(self, x):
self.result.append(x)
@@ -95,6 +98,14 @@ class TestYoutubeSubtitles(unittest.TestCase):
IE = YoutubeIE(DL)
info_dict = IE.extract('QRS8MkLhQmM')
self.assertEqual(info_dict, None)
def test_youtube_automatic_captions(self):
    """Requesting Italian subtitles for video 8YoUxe5ncPo must produce subtitle data."""
    downloader = FakeDownloader()
    downloader.params['writesubtitles'] = True
    downloader.params['subtitleslang'] = 'it'
    youtube_ie = YoutubeIE(downloader)
    results = youtube_ie.extract('8YoUxe5ncPo')
    # First subtitle record of the first result; its third element must be set.
    first_subtitle = results[0]['subtitles'][0]
    self.assertTrue(first_subtitle[2] is not None)
if __name__ == '__main__':
unittest.main()

View File

@@ -15,43 +15,76 @@
"name": "Dailymotion",
"md5": "392c4b85a60a90dc4792da41ce3144eb",
"url": "http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech",
"file": "x33vw9.mp4"
"file": "x33vw9.mp4",
"info_dict": {
"uploader": "Alex and Van .",
"title": "Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
}
},
{
"name": "Metacafe",
"add_ie": ["Youtube"],
"url": "http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
"file": "_aUehQsCQtM.flv"
"file": "_aUehQsCQtM.flv",
"info_dict": {
"upload_date": "20090102",
"title": "The Electric Company | \"Short I\" | PBS KIDS GO!",
"description": "md5:2439a8ef6d5a70e380c22f5ad323e5a8",
"uploader": "PBS",
"uploader_id": "PBS"
}
},
{
"name": "BlipTV",
"md5": "b2d849efcf7ee18917e4b4d9ff37cafe",
"url": "http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352",
"file": "5779306.m4v"
"file": "5779306.m4v",
"info_dict": {
"upload_date": "20111205",
"description": "md5:9bc31f227219cde65e47eeec8d2dc596",
"uploader": "Comic Book Resources - CBR TV",
"title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3"
}
},
{
"name": "XVideos",
"md5": "1d0c835822f0a71a7bf011855db929d0",
"url": "http://www.xvideos.com/video939581/funny_porns_by_s_-1",
"file": "939581.flv"
"file": "939581.flv",
"info_dict": {
"title": "Funny Porns By >>>>S<<<<<< -1"
}
},
{
"name": "YouPorn",
"md5": "c37ddbaaa39058c76a7e86c6813423c1",
"url": "http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/",
"file": "505835.mp4"
"file": "505835.mp4",
"info_dict": {
"upload_date": "20101221",
"description": "Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
"uploader": "Ask Dan And Jennifer",
"title": "Sex Ed: Is It Safe To Masturbate Daily?"
}
},
{
"name": "Pornotube",
"md5": "374dd6dcedd24234453b295209aa69b6",
"url": "http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing",
"file": "1689755.flv"
"file": "1689755.flv",
"info_dict": {
"upload_date": "20090708",
"title": "Marilyn-Monroe-Bathing"
}
},
{
"name": "YouJizz",
"md5": "07e15fa469ba384c7693fd246905547c",
"url": "http://www.youjizz.com/videos/zeichentrick-1-2189178.html",
"file": "2189178.flv"
"file": "2189178.flv",
"info_dict": {
"title": "Zeichentrick 1"
}
},
{
"name": "Vimeo",
@@ -70,62 +103,103 @@
"name": "Soundcloud",
"md5": "ebef0a451b909710ed1d7787dddbf0d7",
"url": "http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy",
"file": "62986583.mp3"
"file": "62986583.mp3",
"info_dict": {
"upload_date": "20121011",
"description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
"uploader": "E.T. ExTerrestrial Music",
"title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
}
},
{
"name": "StanfordOpenClassroom",
"md5": "544a9468546059d4e80d76265b0443b8",
"url": "http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100",
"file": "PracticalUnix_intro-environment.mp4"
"file": "PracticalUnix_intro-environment.mp4",
"info_dict": {
"title": "Intro Environment"
}
},
{
"name": "XNXX",
"md5": "0831677e2b4761795f68d417e0b7b445",
"url": "http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_",
"file": "1135332.flv"
"file": "1135332.flv",
"info_dict": {
"title": "lida » Naked Funny Actress (5)"
}
},
{
"name": "Youku",
"url": "http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
"file": "XNDgyMDQ2NTQw_part00.flv",
"md5": "ffe3f2e435663dc2d1eea34faeff5b5b",
"params": { "test": false }
"params": { "test": false },
"info_dict": {
"title": "youtube-dl test video \"'/\\ä↭𝕐"
}
},
{
"name": "NBA",
"url": "http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html",
"file": "0021200253-okc-bkn-recap.nba.mp4",
"md5": "c0edcfc37607344e2ff8f13c378c88a4"
"md5": "c0edcfc37607344e2ff8f13c378c88a4",
"info_dict": {
"description": "Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.",
"title": "Thunder vs. Nets"
}
},
{
"name": "JustinTV",
"url": "http://www.twitch.tv/thegamedevhub/b/296128360",
"file": "296128360.flv",
"md5": "ecaa8a790c22a40770901460af191c9a"
"md5": "ecaa8a790c22a40770901460af191c9a",
"info_dict": {
"upload_date": "20110927",
"uploader_id": 25114803,
"uploader": "thegamedevhub",
"title": "Beginner Series - Scripting With Python Pt.1"
}
},
{
"name": "MyVideo",
"url": "http://www.myvideo.de/watch/8229274/bowling_fail_or_win",
"file": "8229274.flv",
"md5": "2d2753e8130479ba2cb7e0a37002053e"
"md5": "2d2753e8130479ba2cb7e0a37002053e",
"info_dict": {
"title": "bowling-fail-or-win"
}
},
{
"name": "Escapist",
"url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate",
"file": "6618-Breaking-Down-Baldurs-Gate.flv",
"file": "6618-Breaking-Down-Baldurs-Gate.mp4",
"md5": "c6793dbda81388f4264c1ba18684a74d",
"skip": "Fails with timeout on Travis"
"info_dict": {
"description": "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
"uploader": "the-escapist-presents",
"title": "Breaking Down Baldur's Gate"
}
},
{
"name": "GooglePlus",
"url": "https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH",
"file": "ZButuJc6CtH.flv"
"file": "ZButuJc6CtH.flv",
"info_dict": {
"upload_date": "20120613",
"uploader": "井上ヨシマサ",
"title": "嘆きの天使 降臨"
}
},
{
"name": "FunnyOrDie",
"url": "http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version",
"file": "0732f586d7.mp4",
"md5": "f647e9e90064b53b6e046e75d0241fbd"
"md5": "f647e9e90064b53b6e046e75d0241fbd",
"info_dict": {
"description": "Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. Scott.",
"title": "Heart-Shaped Box: Literal Video Version"
}
},
{
"name": "Steam",
@@ -153,7 +227,8 @@
"file": "20274954.flv",
"md5": "088f151799e8f572f84eb62f17d73e5c",
"info_dict": {
"title": "Young Americans for Liberty February 7, 2012 2:28 AM"
"title": "Young Americans for Liberty February 7, 2012 2:28 AM",
"uploader": "Young Americans for Liberty"
}
},
{
@@ -161,6 +236,7 @@
"url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things",
"file": "12-jan-pythonthings.mp4",
"info_dict": {
"description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.",
"title": "A Few of My Favorite [Python] Things"
},
"params": {
@@ -173,7 +249,10 @@
"file": "422212.mp4",
"md5": "4e2f5cb088a83cd8cdb7756132f9739d",
"info_dict": {
"title": "thedailyshow-kristen-stewart part 1"
"upload_date": "20121214",
"description": "Kristen Stewart",
"uploader": "thedailyshow",
"title": "thedailyshow-kristen-stewart part 1"
}
},
{
@@ -224,42 +303,48 @@
"file": "11885679.m4a",
"md5": "d30b5b5f74217410f4689605c35d1fd7",
"info_dict": {
"title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad"
"title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad",
"uploader_id": "ytdl"
}
},
{
"file": "11885680.m4a",
"md5": "4eb0a669317cd725f6bbd336a29f923a",
"info_dict": {
"title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad"
"title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad",
"uploader_id": "ytdl"
}
},
{
"file": "11885682.m4a",
"md5": "1893e872e263a2705558d1d319ad19e8",
"info_dict": {
"title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad"
"title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad",
"uploader_id": "ytdl"
}
},
{
"file": "11885683.m4a",
"md5": "b673c46f47a216ab1741ae8836af5899",
"info_dict": {
"title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad"
"title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad",
"uploader_id": "ytdl"
}
},
{
"file": "11885684.m4a",
"md5": "1d74534e95df54986da7f5abf7d842b7",
"info_dict": {
"title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad"
"title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad",
"uploader_id": "ytdl"
}
},
{
"file": "11885685.m4a",
"md5": "f081f47af8f6ae782ed131d38b9cd1c0",
"info_dict": {
"title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad"
"title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad",
"uploader_id": "ytdl"
}
}
]
@@ -270,18 +355,18 @@
"file": "NODfbab.mp4",
"md5": "9b0636f8c0f7614afa4ea5e4c6e57e83",
"info_dict": {
"uploader": "ytdl",
"title": "test chars: \"'/\\ä<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ."
}
},
{
"name": "TED",
"url": "http://www.ted.com/talks/dan_dennett_on_our_consciousness.html",
"file": "102.mp4",
"md5": "7bc087e71d16f18f9b8ab9fa62a8a031",
"md5": "8cd9dfa41ee000ce658fd48fb5d89a61",
"info_dict": {
"title": "Dan Dennett: The illusion of consciousness",
"thumbnail": "http://images.ted.com/images/ted/488_389x292.jpg"
"description": "md5:c6fa72e6eedbd938c9caf6b2702f5922"
}
},
{
@@ -290,14 +375,19 @@
"file": "11741.mp4",
"md5": "0b49f4844a068f8b33f4b7c88405862b",
"info_dict": {
"title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2"
"description": "Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?",
"title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2"
}
},
{
"name": "Generic",
"url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html",
"file": "13601338388002.mp4",
"md5": "85b90ccc9d73b4acd9138d3af4c27f89"
"md5": "85b90ccc9d73b4acd9138d3af4c27f89",
"info_dict": {
"uploader": "www.hodiho.fr",
"title": "Régis plante sa Jeep"
}
},
{
"name": "Spiegel",
@@ -325,7 +415,7 @@
"file": "wshh6a7q1ny0G34ZwuIO.mp4",
"md5": "9d04de741161603bf7071bbf4e883186",
"info_dict": {
"title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick! "
"title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
}
},
{
@@ -340,12 +430,224 @@
},
{
"name": "Tumblr",
"url": "http://birthdayproject2012.tumblr.com/post/17258355236/a-sample-video-from-leeann-if-you-need-an-idea",
"file": "17258355236.mp4",
"md5": "7c6a514d691b034ccf8567999e9e88a3",
"url": "http://resigno.tumblr.com/post/53364321212/e-de-extrema-importancia-que-esse-video-seja",
"file": "53364321212.mp4",
"md5": "0716d3dd51baf68a28b40fdf1251494e",
"info_dict": {
"title": "A sample video from LeeAnn. (If you need an idea..."
"title": "Rafael Lemos | Tumblr"
}
},
{
"name": "SoundcloudSet",
"url":"https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep",
"playlist":[
{
"file":"30510138.mp3",
"md5":"f9136bf103901728f29e419d2c70f55d",
"info_dict": {
"upload_date": "20111213",
"description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
"uploader": "The Royal Concept",
"title": "D-D-Dance"
}
},
{
"file":"47127625.mp3",
"md5":"09b6758a018470570f8fd423c9453dd8",
"info_dict": {
"upload_date": "20120521",
"description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
"uploader": "The Royal Concept",
"title": "The Royal Concept - Gimme Twice"
}
},
{
"file":"47127627.mp3",
"md5":"154abd4e418cea19c3b901f1e1306d9c",
"info_dict": {
"upload_date": "20120521",
"uploader": "The Royal Concept",
"title": "Goldrushed"
}
},
{
"file":"47127629.mp3",
"md5":"2f5471edc79ad3f33a683153e96a79c1",
"info_dict": {
"upload_date": "20120521",
"description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
"uploader": "The Royal Concept",
"title": "In the End"
}
},
{
"file":"47127631.mp3",
"md5":"f9ba87aa940af7213f98949254f1c6e2",
"info_dict": {
"upload_date": "20120521",
"description": "The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com",
"uploader": "The Royal Concept",
"title": "Knocked Up"
}
},
{
"file":"75206121.mp3",
"md5":"f9d1fe9406717e302980c30de4af9353",
"info_dict": {
"upload_date": "20130116",
"description": "The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central). \r\nAs a gift to our fans we would like to offer you a free download of the track! ",
"uploader": "The Royal Concept",
"title": "World On Fire"
}
}
]
},
{
"name":"Bandcamp",
"url":"http://youtube-dl.bandcamp.com/track/youtube-dl-test-song",
"file":"1812978515.mp3",
"md5":"cdeb30cdae1921719a3cbcab696ef53c",
"info_dict": {
"title":"youtube-dl test song \"'/\\ä↭"
},
"skip": "There is a limit of 200 free downloads / month for the test song"
},
{
"name": "RedTube",
"url": "http://www.redtube.com/66418",
"file": "66418.mp4",
"md5": "7b8c22b5e7098a3e1c09709df1126d2d",
"info_dict":{
"title":"Sucked on a toilet"
}
},
{
"name": "Photobucket",
"url": "http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0",
"file": "zpsc0c3b9fa.mp4",
"md5": "7dabfb92b0a31f6c16cebc0f8e60ff99",
"info_dict": {
"upload_date": "20130504",
"uploader": "rachaneronas",
"title": "Tired of Link Building? Try BacklinkMyDomain.com!"
}
},
{
"name": "Ina",
"url": "www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html",
"file": "I12055569.mp4",
"md5": "a667021bf2b41f8dc6049479d9bb38a3",
"info_dict":{
"title":"François Hollande \"Je crois que c'est clair\""
}
},
{
"name": "Yahoo",
"url": "http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html",
"file": "214727115.flv",
"md5": "2e717f169c1be93d84d3794a00d4a325",
"info_dict": {
"title": "Julian Smith & Travis Legg Watch Julian Smith"
},
"skip": "Requires rtmpdump"
},
{
"name": "Howcast",
"url": "http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly",
"file": "390161.mp4",
"md5": "1d7ba54e2c9d7dc6935ef39e00529138",
"info_dict":{
"title":"How to Tie a Square Knot Properly",
"description":"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot."
}
},
{
"name": "Vine",
"url": "https://vine.co/v/b9KOOWX7HUx",
"file": "b9KOOWX7HUx.mp4",
"md5": "2f36fed6235b16da96ce9b4dc890940d",
"info_dict":{
"title": "Chicken.",
"uploader": "Jack Dorsey"
}
},
{
"name": "Flickr",
"url": "http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/",
"file": "5645318632.mp4",
"md5": "6fdc01adbc89d72fc9c4f15b4a4ba87b",
"info_dict":{
"title": "Dark Hollow Waterfalls",
"uploader_id": "forestwander-nature-pictures",
"description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up."
}
},
{
"name": "Teamcoco",
"url": "http://teamcoco.com/video/louis-ck-interview-george-w-bush",
"file": "19705.mp4",
"md5": "27b6f7527da5acf534b15f21b032656e",
"info_dict":{
"title": "Louis C.K. Interview Pt. 1 11/3/11",
"description": "Louis C.K. got starstruck by George W. Bush, so what? Part one."
}
},
{
"name": "XHamster",
"url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html",
"file": "1509445.flv",
"md5": "9f48e0e8d58e3076bb236ff412ab62fa",
"info_dict": {
"upload_date": "20121014",
"uploader_id": "Ruseful2011",
"title": "FemaleAgent Shy beauty takes the bait"
}
},
{
"name": "Hypem",
"url": "http://hypem.com/track/1v6ga/BODYWORK+-+TAME",
"file": "1v6ga.mp3",
"md5": "b9cc91b5af8995e9f0c1cee04c575828",
"info_dict":{
"title":"Tame"
}
},
{
"name": "Vbox7",
"url": "http://vbox7.com/play:249bb972c2",
"file": "249bb972c2.flv",
"md5": "9c70d6d956f888bdc08c124acc120cfe",
"info_dict":{
"title":"Смях! Чудо - чист за секунди - Скрита камера"
}
},
{
"name": "Gametrailers",
"url": "http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer",
"file": "zbvr8i.flv",
"md5": "c3edbc995ab4081976e16779bd96a878",
"info_dict": {
"title": "E3 2013: Debut Trailer"
},
"skip": "Requires rtmpdump"
},
{
"name": "Statigram",
"url": "http://statigr.am/p/484091715184808010_284179915",
"file": "484091715184808010_284179915.mp4",
"md5": "deda4ff333abe2e118740321e992605b",
"info_dict": {
"uploader_id": "videoseconds",
"title": "Instagram photo by @videoseconds (Videos)"
}
},
{
"name": "Break",
"url": "http://www.break.com/video/when-girls-act-like-guys-2468056",
"file": "2468056.mp4",
"md5": "a3513fb1547fba4fb6cfac1bffc6c46b",
"info_dict": {
"title": "When Girls Act Like D-Bags"
}
}
]

View File

@@ -1,12 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import math
import io
import os
import re
import shutil
import socket
import subprocess
import sys
@@ -17,7 +13,7 @@ if os.name == 'nt':
import ctypes
from .utils import *
from .InfoExtractors import get_info_extractor
from .extractor import get_info_extractor
class FileDownloader(object):
@@ -53,6 +49,7 @@ class FileDownloader(object):
quiet: Do not print messages to stdout.
forceurl: Force printing final URL.
forcetitle: Force printing title.
forceid: Force printing ID.
forcethumbnail: Force printing thumbnail URL.
forcedescription: Force printing description.
forcefilename: Force printing final filename.
@@ -79,8 +76,8 @@ class FileDownloader(object):
updatetime: Use the Last-modified header to set output file timestamps.
writedescription: Write the video description to a .description file
writeinfojson: Write the video metadata to a .info.json file
writethumbnail: Write the thumbnail image to a file
writesubtitles: Write the video subtitles to a file
onlysubtitles: Downloads only the subtitles of the video
allsubtitles: Downloads all the subtitles of the video
listsubtitles: Lists all available subtitles for the video
subtitlesformat: Subtitle format [sbv/srt] (default=srt)
@@ -90,6 +87,7 @@ class FileDownloader(object):
min_filesize: Skip files smaller than this size
max_filesize: Skip files larger than this size
daterange: A DateRange object, download only if the upload_date is in the range.
skip_download: Skip the actual download of the video file
"""
params = None
@@ -319,6 +317,9 @@ class FileDownloader(object):
filetime = timeconvert(timestr)
if filetime is None:
return filetime
# Ignore obviously invalid dates
if filetime == 0:
return
try:
os.utime(filename, (time.time(), filetime))
except:
@@ -345,12 +346,13 @@ class FileDownloader(object):
"""Report download progress."""
if self.params.get('noprogress', False):
return
clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'')
if self.params.get('progress_with_newline', False):
self.to_screen(u'[download] %s of %s at %s ETA %s' %
(percent_str, data_len_str, speed_str, eta_str))
else:
self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' %
(clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
@@ -432,47 +434,45 @@ class FileDownloader(object):
return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
return None
def extract_info(self, url, download = True, ie_name = None):
def extract_info(self, url, download=True, ie_key=None, extra_info={}):
'''
Returns a list with a dictionary for each video we find.
If 'download', also downloads the videos.
extra_info is a dict containing the extra values to add to each result
'''
suitable_found = False
#We copy the original list
ies = list(self._ies)
if ie_name is not None:
#We put in the first place the given info extractor
first_ie = get_info_extractor(ie_name)()
first_ie.set_downloader(self)
ies.insert(0, first_ie)
if ie_key:
ie = get_info_extractor(ie_key)()
ie.set_downloader(self)
ies = [ie]
else:
ies = self._ies
for ie in ies:
# Go to next InfoExtractor if not suitable
if not ie.suitable(url):
continue
# Warn if the _WORKING attribute is False
if not ie.working():
self.to_stderr(u'WARNING: the program functionality for this site has been marked as broken, '
u'and will probably not work. If you want to go on, use the -i option.')
self.report_warning(u'The program functionality for this site has been marked as broken, '
u'and will probably not work.')
# Suitable InfoExtractor found
suitable_found = True
# Extract information from URL and process it
try:
ie_results = ie.extract(url)
if ie_results is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
ie_result = ie.extract(url)
if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
break
results = []
for ie_result in ie_results:
if not 'extractor' in ie_result:
#The extractor has already been set somewhere else
ie_result['extractor'] = ie.IE_NAME
results.append(self.process_ie_result(ie_result, download))
return results
if isinstance(ie_result, list):
# Backwards compatibility: old IE result format
for result in ie_result:
result.update(extra_info)
ie_result = {
'_type': 'compat_list',
'entries': ie_result,
}
else:
ie_result.update(extra_info)
if 'extractor' not in ie_result:
ie_result['extractor'] = ie.IE_NAME
return self.process_ie_result(ie_result, download=download)
except ExtractorError as de: # An error we somewhat expected
self.report_error(compat_str(de), de.format_traceback())
break
@@ -482,33 +482,36 @@ class FileDownloader(object):
break
else:
raise
if not suitable_found:
self.report_error(u'no suitable InfoExtractor: %s' % url)
else:
self.report_error(u'no suitable InfoExtractor: %s' % url)
def process_ie_result(self, ie_result, download = True):
def process_ie_result(self, ie_result, download=True, extra_info={}):
"""
Take the result of the ie and return a list of videos.
For url elements it will search the suitable ie and get the videos
For playlist elements it will process each of the elements of the 'entries' key
Take the result of the ie(may be modified) and resolve all unresolved
references (URLs, playlist items).
It will also download the videos if 'download'.
Returns the resolved ie_result.
"""
result_type = ie_result.get('_type', 'video') #If not given we suppose it's a video, support the dafault old system
result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
if result_type == 'video':
if 'playlist' not in ie_result:
#It isn't part of a playlist
# It isn't part of a playlist
ie_result['playlist'] = None
ie_result['playlist_index'] = None
if download:
#Do the download:
self.process_info(ie_result)
return ie_result
elif result_type == 'url':
#We get the video pointed by the url
result = self.extract_info(ie_result['url'], download, ie_name = ie_result['ie_key'])[0]
return result
# We have to add extra_info to the results because it may be
# contained in a playlist
return self.extract_info(ie_result['url'],
download,
ie_key=ie_result.get('ie_key'),
extra_info=extra_info)
elif result_type == 'playlist':
#We process each entry in the playlist
# We process each entry in the playlist
playlist = ie_result.get('title', None) or ie_result.get('id', None)
self.to_screen(u'[download] Downloading playlist: %s' % playlist)
@@ -530,23 +533,40 @@ class FileDownloader(object):
for i,entry in enumerate(entries,1):
self.to_screen(u'[download] Downloading video #%s of %s' %(i, n_entries))
entry_result = self.process_ie_result(entry, False)
entry_result['playlist'] = playlist
entry_result['playlist_index'] = i + playliststart
#We must do the download here to correctly set the 'playlist' key
if download:
self.process_info(entry_result)
extra = {
'playlist': playlist,
'playlist_index': i + playliststart,
}
if not 'extractor' in entry:
# We set the extractor, if it's an url it will be set then to
# the new extractor, but if it's already a video we must make
# sure it's present: see issue #877
entry['extractor'] = ie_result['extractor']
entry_result = self.process_ie_result(entry,
download=download,
extra_info=extra)
playlist_results.append(entry_result)
result = ie_result.copy()
result['entries'] = playlist_results
return result
ie_result['entries'] = playlist_results
return ie_result
elif result_type == 'compat_list':
def _fixup(r):
r.setdefault('extractor', ie_result['extractor'])
return r
ie_result['entries'] = [
self.process_ie_result(_fixup(r), download=download)
for r in ie_result['entries']
]
return ie_result
else:
raise Exception('Invalid result type: %s' % result_type)
def process_info(self, info_dict):
"""Process a single dictionary returned by an InfoExtractor."""
"""Process a single resolved IE result."""
assert info_dict.get('_type', 'video') == 'video'
# We increment the download count here to match the previous behaviour.
self.increment_downloads()
info_dict['fulltitle'] = info_dict['title']
if len(info_dict['title']) > 200:
info_dict['title'] = info_dict['title'][:197] + u'...'
@@ -572,6 +592,8 @@ class FileDownloader(object):
# Forced printings
if self.params.get('forcetitle', False):
compat_print(info_dict['title'])
if self.params.get('forceid', False):
compat_print(info_dict['id'])
if self.params.get('forceurl', False):
compat_print(info_dict['url'])
if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
@@ -592,7 +614,7 @@ class FileDownloader(object):
try:
dn = os.path.dirname(encodeFilename(filename))
if dn != '' and not os.path.exists(dn): # dn is already encoded
if dn != '' and not os.path.exists(dn):
os.makedirs(dn)
except (OSError, IOError) as err:
self.report_error(u'unable to create directory ' + compat_str(err))
@@ -625,8 +647,6 @@ class FileDownloader(object):
except (OSError, IOError):
self.report_error(u'Cannot write subtitles file ' + descfn)
return
if self.params.get('onlysubtitles', False):
return
if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
subtitles = info_dict['subtitles']
@@ -644,8 +664,6 @@ class FileDownloader(object):
except (OSError, IOError):
self.report_error(u'Cannot write subtitles file ' + descfn)
return
if self.params.get('onlysubtitles', False):
return
if self.params.get('writeinfojson', False):
infofn = filename + u'.info.json'
@@ -657,6 +675,20 @@ class FileDownloader(object):
self.report_error(u'Cannot write metadata to JSON file ' + infofn)
return
if self.params.get('writethumbnail', False):
if 'thumbnail' in info_dict:
thumb_format = info_dict['thumbnail'].rpartition(u'/')[2].rpartition(u'.')[2]
if not thumb_format:
thumb_format = 'jpg'
thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
(info_dict['extractor'], info_dict['id']))
uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
with open(thumb_filename, 'wb') as thumbf:
shutil.copyfileobj(uf, thumbf)
self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
(info_dict['extractor'], info_dict['id'], thumb_filename))
if not self.params.get('skip_download', False):
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
success = True
@@ -719,7 +751,7 @@ class FileDownloader(object):
except (IOError, OSError):
self.report_warning(u'Unable to remove downloaded video file')
def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path):
def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
@@ -729,18 +761,21 @@ class FileDownloader(object):
except (OSError, IOError):
self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
return False
verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet'
# Download using rtmpdump. rtmpdump returns exit code 2 when
# the connection was interrupted and resuming appears to be
# possible. This is part of rtmpdump's normal usage, AFAIK.
basic_args = ['rtmpdump', '-q', '-r', url, '-o', tmpfilename]
basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename]
if player_url is not None:
basic_args += ['-W', player_url]
basic_args += ['--swfVfy', player_url]
if page_url is not None:
basic_args += ['--pageUrl', page_url]
if play_path is not None:
basic_args += ['-y', play_path]
args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
basic_args += ['--playpath', play_path]
if tc_url is not None:
basic_args += ['--tcUrl', url]
args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
if self.params.get('verbose', False):
try:
import pipes
@@ -778,6 +813,37 @@ class FileDownloader(object):
self.report_error(u'rtmpdump exited with code %d' % retval)
return False
def _download_with_mplayer(self, filename, url):
    """Download a stream by dumping it to disk with mplayer.

    The stream is written to a temporary file (see ``temp_name``) and
    renamed to ``filename`` once mplayer exits with code 0; a 'finished'
    progress hook is fired afterwards.

    Returns True on success. Returns False (after reporting an error) when
    mplayer cannot be run or exits with a non-zero code.
    """
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', tmpfilename, url]
    # Check for mplayer first
    try:
        # Use a context manager so the handle on the null device is closed
        # as soon as the probe finishes instead of being leaked.
        with open(os.path.devnull, 'w') as devnull:
            subprocess.call(['mplayer', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
    except (OSError, IOError):
        self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0] )
        return False

    # Download using mplayer.
    retval = subprocess.call(args)
    if retval != 0:
        self.to_stderr(u"\n")
        self.report_error(u'mplayer exited with code %d' % retval)
        return False

    fsize = os.path.getsize(encodeFilename(tmpfilename))
    self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize))
    self.try_rename(tmpfilename, filename)
    self._hook_progress({
        'downloaded_bytes': fsize,
        'total_bytes': fsize,
        'filename': filename,
        'status': 'finished',
    })
    return True
def _do_download(self, filename, info_dict):
url = info_dict['url']
@@ -795,7 +861,12 @@ class FileDownloader(object):
return self._download_with_rtmpdump(filename, url,
info_dict.get('player_url', None),
info_dict.get('page_url', None),
info_dict.get('play_path', None))
info_dict.get('play_path', None),
info_dict.get('tc_url', None))
# Attempt to download using mplayer
if url.startswith('mms') or url.startswith('rtsp'):
return self._download_with_mplayer(filename, url)
tmpfilename = self.temp_name(filename)
stream = None

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import subprocess
import sys
@@ -85,8 +80,9 @@ class FFmpegPostProcessor(PostProcessor):
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout,stderr = p.communicate()
if p.returncode != 0:
stderr = stderr.decode('utf-8', 'replace')
msg = stderr.strip().split('\n')[-1]
raise FFmpegPostProcessorError(msg.decode('utf-8', 'replace'))
raise FFmpegPostProcessorError(msg)
def _ffmpeg_filename_argument(self, fn):
# ffmpeg broke --, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details
@@ -188,6 +184,11 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
new_path = prefix + sep + extension
# If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly.
if new_path == path:
self._nopostoverwrites = True
try:
if self._nopostoverwrites and os.path.exists(encodeFilename(new_path)):
self._downloader.to_screen(u'[youtube] Post-process file %s exists, skipping' % new_path)
@@ -210,7 +211,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
information['filepath'] = new_path
return False,information
return self._nopostoverwrites,information
class FFmpegVideoConvertor(FFmpegPostProcessor):
def __init__(self, downloader=None,preferedformat=None):

View File

@@ -1,9 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import with_statement
from __future__ import absolute_import
__authors__ = (
'Ricardo Garcia Gonzalez',
'Danny Colligan',
@@ -25,10 +22,14 @@ __authors__ = (
'Jeff Crouse',
'Osama Khalid',
'Michael Walter',
'M. Yasoob Ullah Khalid',
'Julien Fraichard',
'Johny Mo Swag',
)
__license__ = 'Public Domain'
import codecs
import getpass
import optparse
import os
@@ -44,7 +45,7 @@ from .utils import *
from .update import update_self
from .version import __version__
from .FileDownloader import *
from .InfoExtractors import gen_extractors
from .extractor import gen_extractors
from .PostProcessor import *
def parseOpts(overrideArguments=None):
@@ -146,6 +147,8 @@ def parseOpts(overrideArguments=None):
general.add_option('--list-extractors',
action='store_true', dest='list_extractors',
help='List all supported extractors and the URLs they would handle', default=False)
general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL')
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
general.add_option('--test', action='store_true', dest='test', default=False, help=optparse.SUPPRESS_HELP)
selection.add_option('--playlist-start',
@@ -185,8 +188,8 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='writesubtitles',
help='write subtitle file (currently youtube only)', default=False)
video_format.add_option('--only-sub',
action='store_true', dest='onlysubtitles',
help='downloads only the subtitles (no video)', default=False)
action='store_true', dest='skip_download',
help='[deprecated] alias of --skip-download', default=False)
video_format.add_option('--all-subs',
action='store_true', dest='allsubtitles',
help='downloads all the available subtitles of the video (currently youtube only)', default=False)
@@ -194,7 +197,7 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='listsubtitles',
help='lists all available subtitles for the video (currently youtube only)', default=False)
video_format.add_option('--sub-format',
action='store', dest='subtitlesformat', metavar='LANG',
action='store', dest='subtitlesformat', metavar='FORMAT',
help='subtitle format [srt/sbv] (default=srt) (currently youtube only)', default='srt')
video_format.add_option('--sub-lang', '--srt-lang',
action='store', dest='subtitleslang', metavar='LANG',
@@ -210,6 +213,8 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-id',
action='store_true', dest='getid', help='simulate, quiet but print id', default=False)
verbosity.add_option('--get-thumbnail',
action='store_true', dest='getthumbnail',
help='simulate, quiet but print thumbnail URL', default=False)
@@ -236,9 +241,9 @@ def parseOpts(overrideArguments=None):
help='print downloaded pages to debug problems(very verbose)')
filesystem.add_option('-t', '--title',
action='store_true', dest='usetitle', help='use title in file name', default=False)
action='store_true', dest='usetitle', help='use title in file name (default)', default=False)
filesystem.add_option('--id',
action='store_true', dest='useid', help='use video ID in file name', default=False)
action='store_true', dest='useid', help='use only video ID in file name', default=False)
filesystem.add_option('-l', '--literal',
action='store_true', dest='usetitle', help='[deprecated] alias of --title', default=False)
filesystem.add_option('-A', '--auto-number',
@@ -283,6 +288,9 @@ def parseOpts(overrideArguments=None):
filesystem.add_option('--write-info-json',
action='store_true', dest='writeinfojson',
help='write video metadata to a .info.json file', default=False)
filesystem.add_option('--write-thumbnail',
action='store_true', dest='writethumbnail',
help='write thumbnail image to disk', default=False)
postproc.add_option('-x', '--extract-audio', action='store_true', dest='extractaudio', default=False,
@@ -330,6 +338,11 @@ def parseOpts(overrideArguments=None):
return parser, opts, args
def _real_main(argv=None):
# Compatibility fixes for Windows
if sys.platform == 'win32':
# https://github.com/rg3/youtube-dl/issues/820
codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
parser, opts, args = parseOpts(argv)
# Open appropriate CookieJar
@@ -376,12 +389,19 @@ def _real_main(argv=None):
# General configuration
cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
proxies = compat_urllib_request.getproxies()
# Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
if 'http' in proxies and 'https' not in proxies:
proxies['https'] = proxies['http']
if opts.proxy is not None:
if opts.proxy == '':
proxies = {}
else:
proxies = {'http': opts.proxy, 'https': opts.proxy}
else:
proxies = compat_urllib_request.getproxies()
# Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
if 'http' in proxies and 'https' not in proxies:
proxies['https'] = proxies['http']
proxy_handler = compat_urllib_request.ProxyHandler(proxies)
opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
https_handler = make_HTTPS_handler(opts)
opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
compat_urllib_request.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
@@ -400,7 +420,7 @@ def _real_main(argv=None):
if opts.usenetrc and (opts.username is not None or opts.password is not None):
parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
parser.error(u'account username missing')
print(u'WARNING: account username missing')
if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid):
parser.error(u'using output template conflicts with using title, video ID or auto number')
if opts.usetitle and opts.useid:
@@ -470,22 +490,23 @@ def _real_main(argv=None):
or (opts.usetitle and u'%(title)s-%(id)s.%(ext)s')
or (opts.useid and u'%(id)s.%(ext)s')
or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
or u'%(id)s.%(ext)s')
or u'%(title)s-%(id)s.%(ext)s')
# File downloader
fd = FileDownloader({
'usenetrc': opts.usenetrc,
'username': opts.username,
'password': opts.password,
'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forceid': opts.getid,
'forcethumbnail': opts.getthumbnail,
'forcedescription': opts.getdescription,
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
'simulate': opts.simulate,
'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
'format': opts.format,
'format_limit': opts.format_limit,
'listformats': opts.listformats,
@@ -509,8 +530,8 @@ def _real_main(argv=None):
'updatetime': opts.updatetime,
'writedescription': opts.writedescription,
'writeinfojson': opts.writeinfojson,
'writethumbnail': opts.writethumbnail,
'writesubtitles': opts.writesubtitles,
'onlysubtitles': opts.onlysubtitles,
'allsubtitles': opts.allsubtitles,
'listsubtitles': opts.listsubtitles,
'subtitlesformat': opts.subtitlesformat,
@@ -525,7 +546,7 @@ def _real_main(argv=None):
'keepvideo': opts.keepvideo,
'min_filesize': opts.min_filesize,
'max_filesize': opts.max_filesize,
'daterange': date
'daterange': date,
})
if opts.verbose:

View File

@@ -0,0 +1,133 @@
from .ard import ARDIE
from .arte import ArteTvIE
from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .breakcom import BreakIE
from .comedycentral import ComedyCentralIE
from .collegehumor import CollegeHumorIE
from .dailymotion import DailymotionIE
from .depositfiles import DepositFilesIE
from .eighttracks import EightTracksIE
from .escapist import EscapistIE
from .facebook import FacebookIE
from .flickr import FlickrIE
from .funnyordie import FunnyOrDieIE
from .gametrailers import GametrailersIE
from .generic import GenericIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .howcast import HowcastIE
from .hypem import HypemIE
from .ina import InaIE
from .infoq import InfoQIE
from .justintv import JustinTVIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
from .metacafe import MetacafeIE
from .mixcloud import MixcloudIE
from .mtv import MTVIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .nba import NBAIE
from .statigram import StatigramIE
from .photobucket import PhotobucketIE
from .pornotube import PornotubeIE
from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE
from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .teamcoco import TeamcocoIE
from .ted import TEDIE
from .tumblr import TumblrIE
from .ustream import UstreamIE
from .vbox7 import Vbox7IE
from .vimeo import VimeoIE
from .vine import VineIE
from .worldstarhiphop import WorldStarHipHopIE
from .xnxx import XNXXIE
from .xhamster import XHamsterIE
from .xvideos import XVideosIE
from .yahoo import YahooIE, YahooSearchIE
from .youjizz import YouJizzIE
from .youku import YoukuIE
from .youporn import YouPornIE
from .youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .zdf import ZDFIE
def gen_extractors():
    """Return a list with one fresh instance of every supported extractor.

    The order does matter; the first extractor matched is the one handling
    the URL.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        BreakIE,
        GenericIE,
    )
    # Instantiate in declaration order so matching priority is preserved.
    return [extractor_class() for extractor_class in extractor_classes]
def get_info_extractor(ie_name):
    """Return the info extractor class with the given ie_name.

    The class is looked up in this module's global namespace under the name
    ``ie_name + 'IE'``; a KeyError is raised when no such name exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]

View File

@@ -0,0 +1,45 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class ARDIE(InfoExtractor):
    """Information extractor for the ARD media library pages matched by _VALID_URL."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract a single video from ``url``.

        Returns a one-element list with the info dictionary ('id', 'title',
        'ext', 'url' and, for RTMP streams, 'play_path').

        Raises ExtractorError when the page has no title, no media streams,
        or no stream of the default media type.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # A numeric documentId in the URL takes precedence over the path part
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_mobj = re.search(self._TITLE, html)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        title = title_mobj.group('title')

        # Use a dedicated loop variable so the URL match ``m`` is not shadowed
        streams = [stream_mobj.groupdict()
                   for stream_mobj in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # Sanity check on the page format before blaming the time restriction
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            # max() would raise a bare ValueError on an empty sequence
            raise ExtractorError(u'No stream of the default media type found')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]

View File

@@ -0,0 +1,136 @@
import re
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
unified_strdate,
)
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download ``url`` and return the response body as read from the handle.

        URL/HTTP/socket errors and a ValueError raised while fetching are
        re-raised as ExtractorError.
        """
        # NOTE(review): read() presumably yields bytes on Python 3 while the
        # patterns used by grep_webpage are text - confirm Python 3 behaviour.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            return compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch ``url``, search it with ``regex`` and collect named groups.

        ``matchTuples`` is a sequence of (group index, result key, error
        message) triples. Returns a dict mapping each key to its group.
        Raises ExtractorError if the pattern does not match at all or if any
        requested group is None.
        """
        match = re.search(regex, self.fetch_webpage(url), regexFlags)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        extracted = {}
        for group_index, key, error_message in matchTuples:
            value = match.group(group_index)
            if value is None:
                raise ExtractorError(error_message)
            extracted[key] = value
        return extracted

    # TODO implement Live Stream
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path',   u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url',    u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the reference chain of a video page and build its info dict.

        Three pages are fetched in turn: the video page (for the videoref
        file URL), the videoref file (for the language-specific <video>
        reference) and finally the document holding id, name, date and the
        "hd" quality URL.
        """
        video_lang = url.split('/')[-3]

        # Step 1: the page itself points at the videoref file
        videoref = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [(1, 'url', u'Invalid URL: %s' % url)],
        )
        videoref_url = compat_urllib_parse.unquote(videoref.get('url'))

        # Step 2: the videoref file lists one <video> reference per language
        lang_ref = self.grep_webpage(
            videoref_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [(1, 'url', u'Could not find <video> tag: %s' % url)],
        )
        details_url = compat_urllib_parse.unquote(lang_ref.get('url'))

        # Step 3: the referenced document carries the actual metadata
        details_regex = (
            r'<video id="(.*?)".*?>.*?'
            '<name>(.*?)</name>.*?'
            '<dateVideo>(.*?)</dateVideo>.*?'
            '<url quality="hd">(.*?)</url>'
        )
        info = self.grep_webpage(
            details_url,
            details_regex,
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url),
            ],
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Extract the video behind ``url``; live streams are rejected."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

        return [self.extractPlus7Stream(url)]

View File

@@ -0,0 +1,54 @@
import json
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class BandcampIE(InfoExtractor):
    """Information extractor for bandcamp.com track pages with a free download."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve the final mp3-320 download URL of a free track.

        Returns a one-element list with the track info dictionary.
        Raises ExtractorError when the track offers no free download or when
        one of the expected pieces of page data cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        # ``track_id`` rather than ``id`` so the builtin is not shadowed
        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unexpected download url format')
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }

        return [track_info]

View File

@@ -0,0 +1,177 @@
import datetime
import json
import os
import re
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_parse_qs,
compat_str,
compat_urllib_error,
compat_urllib_parse_urlparse,
compat_urllib_request,
ExtractorError,
unescapeHTML,
)
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that a direct download was detected."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract a single blip.tv video.

        api.swf and /play/ URLs are first resolved to their canonical
        ``/a/a-<file id>`` form. The JSON skin of the page is then requested;
        if the server answers with a video content type instead, the URL is
        treated as a direct download.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError on invalid URLs, network failures or
        unparsable metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # The real file id is only visible in the fragment of the
            # URL we get redirected to
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        cchar = '&' if '?' in url else '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # Text strings have no decode() on Python 3; only byte
                # strings need to be decoded.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload may be wrapped in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Turns a blip.tv user page into a playlist whose entries are URL
    results handled by the 'BlipTV' extractor.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a one-element list containing the user's playlist.

        Raises ExtractorError when the URL does not match _VALID_URL or
        when the users id cannot be found in the user page.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Without the users id the episode list cannot be queried;
            # fail with a readable error instead of an AttributeError.
            raise ExtractorError(u'Unable to extract users id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            page_url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(page_url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape *before* the membership test: the list holds
                # unescaped ids, so comparing the raw match against it
                # would never detect duplicates containing HTML entities.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        url_entries = [self.url_result(u'http://blip.tv/%s' % video_id, 'BlipTV')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, playlist_title=username)]

View File

@@ -0,0 +1,25 @@
import re
from .common import InfoExtractor
class BreakIE(InfoExtractor):
    """Information extractor for break.com videos."""

    _VALID_URL = r'(?:http://)?(?:www\.)?break\.com/video/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Mandatory values (video path, key, title) raise ExtractorError via
        _search_regex when they are missing from the page; the thumbnail
        is optional and only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)
        # The id is the last dash-separated component of the URL slug.
        video_id = mobj.group(1).split("-")[-1]
        webpage = self._download_webpage(url, video_id)

        # Use _search_regex instead of re.search(...).group(1) so that a
        # changed page layout yields a named extraction error rather than
        # an AttributeError on None.
        video_url = self._search_regex(r"videoPath: '(.+?)',", webpage, u'video URL')
        key = self._search_regex(r"icon: '(.+?)',", webpage, u'key')
        # Plain concatenation keeps the values as text; str() would fail
        # on non-ASCII characters under Python 2.
        final_url = video_url + u'?' + key
        thumbnail_url = self._search_regex(r"thumbnailURL: '(.+?)'", webpage,
                                           u'thumbnail', fatal=False)
        title = self._search_regex(r"sVidTitle: '(.+)',", webpage, u'title')
        ext = video_url.split('.')[-1]
        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]

View File

@@ -0,0 +1,74 @@
import re
import socket
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse_urlparse,
compat_urllib_request,
ExtractorError,
)
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com videos.

    Marked as not working (_WORKING = False).
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'

    def report_manifest(self, video_id):
        """Report the download of the XML manifest."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Two XML documents are fetched: the metadata document, which
        provides description, title, thumbnail and the manifest URL, and
        the f4m manifest, from which the media URL is assembled.

        Raises ExtractorError on an invalid URL, on download failures and
        when either document lacks an expected node or attribute.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download XML manifest: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # Id as given by the manifest; only used to build the media
            # URL, the reported 'id' stays the one taken from the page URL.
            manifest_video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # KeyError: the <media> node exists but has no 'url' attribute.
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = (url_pr.scheme + '://' + url_pr.netloc + '/z' +
               manifest_video_id[:-2] + '/' + node_id + 'Seg1-Frag1')

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]

View File

@@ -0,0 +1,179 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
ExtractorError,
unified_strdate,
)
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for full episodes (<show>.com/full-episodes/...)
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # The pattern spans several lines and must be matched with re.VERBOSE.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print one line per format: bitrate key, extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Return a list with one info dictionary per part of the episode/clip.

        Returns None (after printing the format list) when the downloader
        parameter 'listformats' is set.

        Raises ExtractorError for invalid or insufficiently specific URLs,
        when no media reference is found in the page, or when the RTMP URL
        cannot be converted.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand the abbreviation into the show's full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A full-episodes URL without an episode part means "newest".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The episode is identified by the URL we were redirected to.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            # 'episode' is '' for a bare full-episodes URL and None when the
            # redirect target is not a full-episodes URL at all; neither
            # yields a usable episode title.
            if not mobj.group('episode'):
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if not mMovieParams:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if not altMovieParams:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, source URL) pairs in document order.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if not turls:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (assumes the renditions are listed in ascending bitrate
            # order - TODO confirm)
            video_format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    video_format, rtmp_video_url = f, v
                    break

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results

View File

@@ -0,0 +1,264 @@
import base64
import os
import re
import socket
import sys
from ..utils import (
compat_http_client,
compat_urllib_error,
compat_urllib_request,
compat_str,
clean_html,
compiled_regex_type,
ExtractorError,
)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: the class name without its last two characters
        # (the "IE" suffix of extractor classes).
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle

        note: message printed before the request; None prints the default
              download message, False prints nothing.
        errnote: prefix of the ExtractorError raised on network failure.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed instead of a Request.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server announced a charset Python does not know; fall
            # back to the default instead of aborting the extraction.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        # 'id' and 'title' are only set when a truthy value was given.
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Start from "no match" so that an empty pattern sequence is
            # reported as a failed extraction instead of a NameError.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name on terminals (not on Windows).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            # None (non-fatal failure) or an empty match: pass through.
            return res
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|[1-9][0-9]*|all):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search pseudo-URLs."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if it is a search query for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and delegate to _get_n_results().

        An empty prefix requests one result, 'all' requests _MAX_RESULTS
        and a number requests that many results, capped at _MAX_RESULTS
        with a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

View File

@@ -0,0 +1,77 @@
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
compat_urllib_parse,
ExtractorError,
unescapeHTML,
)
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when the URL is invalid or when the media
        URL or the title cannot be found. A missing uploader or upload
        date is not fatal.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Keys are tried in this order; the first one present is used.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # The uploader is an optional field: warn instead of aborting the
        # whole extraction when it cannot be found.
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader', fatal=False)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Reorder the three captured number groups into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]

View File

@@ -0,0 +1,60 @@
import re
import os
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the file.

        Raises ExtractorError when the page cannot be retrieved, when the
        site reports a restriction ("Attention ..." message) or when the
        download URL or the title cannot be found.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        # POST bodies must be bytes on Python 3; the urlencoded form is ASCII.
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        # _download_webpage returns decoded text, so the regexes below work
        # on Python 2 and 3 alike and the results need no further decoding.
        webpage = self._download_webpage(request, file_id,
                                         errnote=u'Unable to retrieve file webpage')

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # The id comes from the caller's URL, which may be a byte string
        # on Python 2; everything taken from the page is already text.
        if isinstance(file_id, bytes):
            file_id = file_id.decode('utf-8')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]

View File

@@ -0,0 +1,51 @@
import itertools
import json
import random
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks mixes; returns one entry per track."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the mix through the play/next API and collect every track."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is embedded in the page as "PAGE.mix = {...};"
        mix_json = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(mix_json)

        # Random play-session identifier sent with every API request.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']

        tracks = []
        # The first request starts playback, later ones ask for the track
        # following the one just received.
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        for track_number in itertools.count(1):
            api_json = self._download_webpage(
                api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                return tracks
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])

View File

@@ -0,0 +1,68 @@
import json
import re
from .common import InfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
ExtractorError,
)
class EscapistIE(InfoExtractor):
    """Information extractor for videos on escapistmagazine.com."""

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError on an invalid URL, when the player URL, the
        title or the config URL cannot be found, or when the player
        configuration is unusable. Description and thumbnail are optional.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # Only the part after the last ' : ' separator is used as title.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        try:
            # The video URL is taken from the second playlist entry.
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError):
            raise ExtractorError(u'Unable to find video URL in configuration file')

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]

View File

@@ -0,0 +1,111 @@
import json
import netrc
import re
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
)
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in if credentials are available.

        Credentials come from the downloader parameters 'username' and
        'password' or, with 'usenetrc', from the .netrc entry for
        _NETRC_MACHINE. Every failure only produces a warning.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        # POST bodies must be bytes on Python 3; the urlencoded form is ASCII.
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            # Decode so that the text regex below also works on Python 3,
            # where read() returns bytes.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # Getting the login form back means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError on an invalid URL, when the embedded player
        data cannot be located or when it contains no video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON literal sitting between these two
        # JavaScript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]

View File

@@ -0,0 +1,57 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
)
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""

    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The stream location is resolved in three requests: the photo page
        yields a secret, the first XML document a node id and the second
        XML document the stream APP/FULLPATH pair.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        video_uploader_id = url_match.group('uploader_id')

        # Step 1: canonical photo page -> secret
        webpage = self._download_webpage(
            'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id,
            video_id)
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Step 2: first data document -> node id
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Step 3: playlist document -> stream location
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        app, fullpath = stream.group(1), stream.group(2)
        video_url = app + unescapeHTML(fullpath)

        # Metadata comes from the OpenGraph tags of the photo page; only
        # the title is mandatory.
        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }
        return [info]

View File

@@ -0,0 +1,31 @@
import re
from .common import InfoExtractor
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, video_id)

        # The media URL is the src of the second <source> inside <video>.
        source_pattern = r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"'
        video_url = self._html_search_regex(
            source_pattern, webpage, u'video URL', flags=re.DOTALL)

        # Prefer the page heading, fall back to the document <title>.
        title_patterns = (
            r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>',
        )
        title = self._html_search_regex(
            title_patterns, webpage, 'title', flags=re.DOTALL)

        # The description is optional.
        description_pattern = r'<meta property="og:description" content="(?P<desc>.*?)"'
        video_description = self._html_search_regex(
            description_pattern, webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]

View File

@@ -0,0 +1,59 @@
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
)
class GametrailersIE(InfoExtractor):
    """Information extractor for gametrailers.com videos, reviews and full episodes."""

    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve the page's mgid, query both feeds and return one info dict."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id, video_type = url_match.group('id'), url_match.group('type')

        webpage = self._download_webpage(url, video_id)

        # Full episodes carry the mgid in a different attribute than the
        # other page types.
        mgid_re = (r'data-video="(?P<mgid>mgid:.*?)"'
                   if video_type == 'full-episodes'
                   else r'data-contentId=\'(?P<mgid>mgid:.*?)\'')
        mgid = self._search_regex(mgid_re, webpage, u'mgid')

        # Both feeds take the same query string.
        query = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
        info_page = self._download_webpage(
            'http://www.gametrailers.com/feeds/mrss?' + query,
            video_id, u'Downloading video info')
        links_webpage = self._download_webpage(
            'http://www.gametrailers.com/feeds/mediagen/?' + query,
            video_id, u'Downloading video urls info')

        self.report_extraction(video_id)

        # Verbose-mode pattern: the line breaks below are not part of the match.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
<description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
<image>.*
<url>(?P<thumb>.*?)</url>.*
</image>'''
        feed_match = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if feed_match is None:
            raise ExtractorError(u'Unable to extract video info')

        sources = re.findall(r'<src>(?P<url>.*)</src>', links_webpage)
        if not sources:
            raise ExtractorError(u'Unable to extract video url')

        return {
            # They are sorted from worst to best quality
            'url': sources[-1],
            'id': video_id,
            'title': feed_match.group('title'),
            # Videos are actually flv not mp4
            'ext': 'flv',
            'thumbnail': feed_match.group('thumb'),
            'description': feed_match.group('description'),
        }

View File

@@ -0,0 +1,151 @@
import os
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Accepts every URL (``_VALID_URL`` is ``.*``).  It first follows HTTP
    redirects with a HEAD request, then scans the downloaded page with a
    series of increasingly broad patterns for a direct media URL.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download, warning first that the generic fallback is in use."""
        # The warning is suppressed when the downloader's 'test' param is set.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns False when the final URL equals the requested one, otherwise
        the final URL.  Raises ExtractorError when the opener returns no
        response.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass whose HTTP method is always HEAD.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Spaces in the target are percent-encoded before reuse.
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers are not carried over to the new request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Consume and close the error response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # A plain Request (no get_method override) is issued this time.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Extract a media URL from an arbitrary page.

        When the URL redirects, the result is a single url_result for the
        target.  Otherwise the page is searched for a media URL and a
        one-element list with the info dictionary is returned.  Raises
        ExtractorError when nothing usable is found.
        """
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        # Provisional id for progress messages; replaced below once the
        # media URL is known.
        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # (the file extension without its leading dot)
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]

View File

@@ -0,0 +1,82 @@
import datetime
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the video attached to a Google+ post.

        Downloads the post page for metadata, follows the link to the
        video page and picks the stream with the numerically largest
        resolution value.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError for an invalid URL or when no video link
        can be found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract upload date
        upload_date = self._html_search_regex(r'title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Simulate clicking the image box to launch video
        video_page = self._search_regex(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes from the video page.  Every match
        # is a (resolution, url) pair of strings.
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            raise ExtractorError(u'Unable to extract video links')

        # Pick the highest resolution.  The value must be compared as a
        # number: as a string, "720" would rank above "1080".
        best_link = max(links, key=lambda link: int(link[0]))

        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = best_link[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]

View File

@@ -0,0 +1,39 @@
import itertools
import re
from .common import SearchInfoExtractor
from ..utils import (
compat_urllib_parse,
)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""

    # Marker looked for in a result page to decide whether another page exists.
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are requested ten entries at a time, beginning at
        offset 0, until at least n entries were collected or the page no
        longer advertises a following page.

        Returns a playlist dictionary whose 'entries' list holds at most n
        url-type results.
        """
        entries = []

        # Count from 0 so that the first request asks for start=0; starting
        # at 1 would skip the first ten results.
        for pagenum in itertools.count():
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                entries.append({
                    '_type': 'url',
                    'url': mobj.group(1)
                })

            # Stop on the number of entries actually gathered, and never
            # hand back more than the caller asked for.
            if len(entries) >= n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return {
                    '_type': 'playlist',
                    'id': query,
                    'entries': entries[:n]
                }

View File

@@ -0,0 +1,37 @@
import re
from .common import InfoExtractor
class HowcastIE(InfoExtractor):
    """Information extractor for howcast.com videos."""

    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Fetch the video page and return a one-element list of info dicts."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is requested by id only, not through the URL as given.
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        # Description and thumbnail are optional.
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }
        return [info]

View File

@@ -0,0 +1,63 @@
import json
import re
import time
from .common import InfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
)
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""

    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract a single hypem track.

        The track page embeds a JSON track list; its first entry supplies
        the key and id needed to ask the serve endpoint for the final URL.
        The cookie set by the first response is sent along with the second
        request.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError for an invalid URL or unparsable JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on track_id is the id reported by the site, not the URL part.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The empty body makes this a POST.  It has to be a bytes object:
        # Python 3 refuses text strings as request data with a TypeError.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]

View File

@@ -0,0 +1,31 @@
import re
from .common import InfoExtractor
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""

    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Read the MRSS notice of the video and return a one-element list of info dicts."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # All metadata comes from the player's MRSS document, not the page itself.
        notice = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')

        # The title sits inside a CDATA section.
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [info]

View File

@@ -0,0 +1,50 @@
import base64
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
)
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract an InfoQ video.

        The page stores a base64 encoded, URL-quoted media path in
        ``jsclassref``; it is decoded and appended to the RTMPE base URL.
        Video id and extension are taken from the file name of that path.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when the media path is missing or the file
        name has no extension.
        """
        # No id is known before the page is parsed, so the URL stands in for it.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        # Split on the last dot only, so names containing further dots still
        # yield exactly an id and an extension.
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to extract video extension')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]

View File

@@ -0,0 +1,144 @@
import json
import os
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
ExtractorError,
formatSeconds,
)
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv

    Handles three URL shapes: a channel (all archived videos, fetched page
    by page), a single broadcast ("/b/<id>") and a chapter ("/c/<id>").
    """

    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Verbose-mode pattern; exactly one of the three named groups is set.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
(?:
(?P<channelid>[^/]+)|
(?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
(?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
)
/?(?:\#.*)?$
"""
    # Number of items requested per API page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dictionaries.

        Returns a tuple (number of clips in the response, list of info
        dictionaries).  Clips without a video URL are counted but not
        converted.  Raises ExtractorError when the response is not a list.
        """
        info_json = self._download_webpage(url, video_id,
                                           u'Downloading video info JSON',
                                           u'unable to download video info JSON')

        response = json.loads(info_json)
        # A non-list response is treated as an API error object.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # First ten characters of start_time with the dashes removed.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract the video(s) behind a channel, broadcast or chapter URL.

        Returns a list of info dictionaries.  Raises ExtractorError for an
        invalid URL or when the chapter's archive cannot be located.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        # Only channel listings are fetched in several pages.
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the archive element whose id matches the page's archive id;
            # the else branch runs only when the loop ends without a break.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            # Text after the last dot, or 'flv' when that text is empty.
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            # Chapters are complete at this point; the paging loop is skipped.
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # Stop after one request for unpaged lookups, or as soon as a
            # page comes back with a count different from the limit.
            if not paged or page_count != limit:
                break
            offset += limit
        return info

View File

@@ -0,0 +1,32 @@
import re
from .common import InfoExtractor
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the media URLs from the id and read title/uploader from the page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Video and thumbnail addresses are derived from the id alone.
        cdn_base = u'http://cdn.keek.com/keek'
        video_url = cdn_base + u'/video/%s' % video_id
        thumbnail = cdn_base + u'/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        # The uploader is optional.
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]

View File

@@ -0,0 +1,44 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # "https?" accepts both http and https; the earlier "http?" only made
    # the final "p" optional and so never matched https URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract a LiveLeak video.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError for an invalid URL or when the video URL or
        title cannot be found on the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The site name prefix is removed from the Open Graph title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]

View File

@@ -0,0 +1,110 @@
import re
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_parse_qs,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_str,
ExtractorError,
)
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and submit the age confirmation form.

        Raises ExtractorError when either request fails.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # Request data has to be bytes on Python 3; the URL-encoded form is
        # plain ASCII, so the encoding is lossless on Python 2 as well.
        post_data = compat_urllib_parse.urlencode(disclaimer_form).encode('ascii')
        request = compat_urllib_request.Request(self._FILTER_POST, post_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract a Metacafe video.

        Ids starting with "yt-" are handed over to the Youtube extractor.
        Otherwise the media URL is taken from ``mediaURL`` in the page, or
        from the ``mediaData`` entry of the flashvars as a fallback.

        Returns a one-element list containing the info dictionary (or the
        url_result).  Raises ExtractorError for an invalid URL or when the
        media URL, title or uploader cannot be found.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # The extension is taken to be the last three characters of the URL.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # Undo the JSON escaping of slashes.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # The values are used as they are: calling .decode('utf-8') on text
        # strings fails on Python 3 and, for non-ASCII text, on Python 2.
        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]

View File

@@ -0,0 +1,115 @@
import json
import re
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_request,
ExtractorError,
)
class MixcloudIE(InfoExtractor):
    """Information extractor for mixcloud.com (flagged as not working)."""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        # file_id is accepted but not part of the message.
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        Returns jsonData[fmt][bitrate], using the largest key when bitrate
        is None, 'best' or not present.  When that lookup raises TypeError,
        jsonData[fmt] itself is returned.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list

        A URL counts as active when opening it raises none of the handled
        network errors.  Returns None when no URL can be opened.
        """
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                url = None

        return None

    def _print_formats(self, formats):
        """Print one line per format and bitrate with the file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    # Without bitrate info a single line per format is enough.
                    break

    def _real_extract(self, url):
        """Extract a Mixcloud track through the cloudcast JSON API.

        Returns a one-element list containing the info dictionary, or None
        after printing the formats when the 'listformats' param is set.
        Raises ExtractorError for an invalid URL, a failed download or an
        unavailable requested format.
        """
        # NOTE(review): the .decode('utf-8') calls in this method presumably
        # expect Python 2 byte strings - confirm before re-enabling.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        # (the joined part is the third- and second-to-last '/' segments of
        # the URL)
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Take the first format that has a reachable URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): file_url is None here when check_urls found no
        # reachable URL; the attribute access below would then fail.
        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]

View File

@@ -0,0 +1,72 @@
import re
import socket
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_request,
ExtractorError,
)
class MTVIE(InfoExtractor):
    """Information extractor for mtv.com videos (flagged as not working)."""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    _WORKING = False

    def _real_extract(self, url):
        """Extract an MTV video.

        The page supplies the title, the mtvn uri and the content id; the
        latter two are used to request the mediaGen XML, whose last
        rendition is chosen.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError for an invalid URL, missing page data, a
        failed metadata download or unusable renditions.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        #song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
        #    webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # Both values below are required to build the mediaGen URL, so a
        # failed lookup has to stop here with an ExtractorError; continuing
        # with None would end in a TypeError on concatenation.
        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri')

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to extract renditions')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # The part of the type attribute after the slash is used as extension.
            _,_,ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]

View File

@@ -0,0 +1,64 @@
import os.path
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
ExtractorError,
)
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Look up the metadata XML for the video id and return a one-element list of info dicts."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # The id is the last element of the URL path; with a trailing slash
        # that element is empty, so the one before it is taken instead.
        parent_path, video_id = os.path.split(compat_urllib_parse_urlparse(url).path)
        if not video_id:
            video_id = os.path.split(parent_path)[1]

        # Fetch and parse the metadata document.
        metadata_text = self._download_webpage(
            META_DATA_URL_TEMPLATE % video_id, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # Download URL and title are mandatory.
        flv_node = metadata.find('url_flv')
        if flv_node is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = flv_node.text
        extension = os.path.splitext(video_url)[1][1:]

        title_node = metadata.find('title')
        if title_node is None:
            raise ExtractorError(u'Unable to extract title')

        # The remaining fields are optional and fall back to a default.
        format_node = metadata.find('format_id')
        video_format = 'mp4' if format_node is None else format_node.text

        description_node = metadata.find('description')
        description = None if description_node is None else description_node.text

        preview_node = metadata.find('imagePreview')
        thumbnail = None if preview_node is None else preview_node.text

        return [{
            'id': video_id,
            'url': video_url,
            'title': title_node.text,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }]

View File

@@ -0,0 +1,164 @@
import binascii
import base64
import hashlib
import re
from .common import InfoExtractor
from ..utils import (
compat_ord,
compat_urllib_parse,
ExtractorError,
)
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """Apply the RC4 stream cipher with `key` to `data`; return a text string."""
        # Key scheduling: permute the 256-entry box according to the key.
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Keystream generation, XORed onto the input one element at a time.
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hexadecimal MD5 digest of `s`, encoded to bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at `url`.

        A plain <source> tag in the page is used directly when present.
        Otherwise the encrypted player XML referenced by the flashvars is
        downloaded and decrypted to find the stream URL.

        Raises ExtractorError when the URL is invalid or nothing usable is found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded secret that is combined with the video id
        # below to derive the decryption key.
        GK = (
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            # Take what follows the LAST dot. A search for '[.](.+?)$' would
            # start at the first dot of the URL (inside the host name) and
            # return host and path instead of the extension.
            video_ext = video_url.rpartition('.')[2]

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': video_ext,
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Every flashvar except _encxml becomes a query parameter of the
        # XML request; _encxml itself is the (quoted) base URL.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # key = md5(decoded secret + md5(video id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # Play path is "<extension>:<name>"
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            video_hls_playlist = (
                video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]

View File

@@ -0,0 +1,40 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        The path captured by _VALID_URL is inserted into the CDN media URL;
        its last component is reported as the id and is the fallback title.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_path = match.group(1)
        page = self._download_webpage(url, video_path)

        short_id = video_path.rpartition('/')[2]
        og_title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id)

        # No upload date is extracted: it isn't there in the HTML that is
        # returned to us.
        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': og_title.replace('NBA.com: ', ''),
            'description': description,
        }]

View File

@@ -0,0 +1,66 @@
import datetime
import json
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        The JSON blob embedded in the page's javascript is preferred; when it
        is absent, the video URL, title and uploader are scraped from the
        HTML instead.

        Raises ExtractorError when the URL is invalid or the title is missing.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id': video_id,
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')

        # The values matched above are used as they are: calling
        # .decode('utf-8') on them fails on Python 3, where text strings
        # have no decode() method.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]

View File

@@ -0,0 +1,41 @@
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
unified_strdate,
)
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        Id and title are read from the URL itself; the media URL and the
        optional upload date are scraped from the page.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('videoid')
        title = url_match.group('title')

        page = self._download_webpage(url, video_id)

        # The media URL is unquoted after being located in the page.
        media_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url'))

        # The upload date is optional; it is only normalised when found.
        added_on = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        upload_date = unified_strdate(added_on) if added_on else added_on

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
        }]

View File

@@ -0,0 +1,44 @@
import json
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
ExtractorError,
)
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the show at `url`.

        All metadata comes from the `gon.show` JSON object embedded in the page.

        Raises ExtractorError when that object is not valid JSON.
        """
        show_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, show_id)

        raw_json = self._search_regex(
            r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)
        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        media_url = show['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot of the URL path.
        media_path = compat_urllib_parse_urlparse(media_url).path
        extension = media_path.rpartition('.')[2]

        return [{
            'id': show_id,
            'url': media_url,
            'ext': extension,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]

View File

@@ -0,0 +1,29 @@
import re
from .common import InfoExtractor
class RedTubeIE(InfoExtractor):
    """Information extractor for redtube.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        The mp4 source URL and the title are both scraped from the page.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        source_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': source_url,
            'ext': 'mp4',
            'title': title,
        }]

View File

@@ -0,0 +1,129 @@
import json
import re
from .common import InfoExtractor
from ..utils import (
compat_str,
ExtractorError,
unified_strdate,
)
class SoundcloudIE(InfoExtractor):
    """Information extractor for single tracks on soundcloud.com.

    The track page URL is resolved to a JSON description through the
    resolve endpoint; the numeric id found there is then used to request
    the stream definitions, from which the 128 kbit/s HTTP MP3 URL is taken.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Tell the user that `video_id` is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at `url`.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and title slug are both part of the URL; together they
        # serve as the display name until the real id is known.
        uploader, slug_title = match.group(1), match.group(2)
        display_name = '%s/%s' % (uploader, slug_title)

        self.report_resolve(display_name)

        # Rebuild a canonical page URL and resolve it to the track JSON.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolve_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        track = json.loads(
            self._download_webpage(resolve_url, display_name, u'Downloading info JSON'))
        track_id = track['id']

        self.report_extraction(display_name)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams = json.loads(self._download_webpage(
            streams_url, display_name,
            u'Downloading stream definitions',
            u'unable to download stream definitions'))

        media_url = streams['http_mp3_128_url']
        upload_date = unified_strdate(track['created_at'])

        return [{
            'id': track['id'],
            'url': media_url,
            'uploader': track['user']['username'],
            'upload_date': upload_date,
            'title': track['title'],
            'ext': u'mp3',
            'description': track['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for sets on soundcloud.com.

    The set page URL is resolved to a JSON description listing its tracks;
    for every track the stream definitions are requested and the
    128 kbit/s HTTP MP3 URL is taken from them.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Tell the user that `video_id` is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a list with one info dict per track of the set at `url`.

        Returns None, after reporting each error, when the resolve response
        contains an 'errors' entry.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and set slug are both part of the URL.
        uploader, slug_title = match.group(1), match.group(2)
        display_name = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(display_name)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolve_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        set_info = json.loads(self._download_webpage(resolve_url, display_name))

        if 'errors' in set_info:
            for err in set_info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(display_name)

        entries = []
        for track in set_info['tracks']:
            track_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            raw_streams = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            media_url = json.loads(raw_streams)['http_mp3_128_url']

            entries.append({
                'id': track_id,
                'url': media_url,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return entries

View File

@@ -0,0 +1,37 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
class SpiegelIE(InfoExtractor):
    """Information extractor for videos on spiegel.de."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        The title is scraped from the page; file name and duration are read
        from the last child of the per-video XML document.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>',
            page, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_text = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        doc = xml.etree.ElementTree.fromstring(xml_text)

        # Only the last entry of the document is used.
        variant = doc[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]

View File

@@ -0,0 +1,112 @@
import re
import socket
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_request,
ExtractorError,
orderedSet,
unescapeHTML,
)
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, a whole course, or every course.

        * URL with course and video: one info dict, built from the video's
          metadata XML.
        * URL with a course only: every video page linked from the course
          page is extracted and the results are concatenated.
        * Any other matching URL (root page): every course page linked from
          the home page is extracted the same way.

        Raises ExtractorError on an invalid URL, a failed metadata download
        or a metadata file lacking title or video file.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # Raw bytes are fine here: they go straight to the XML parser.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # Fetch the page through _download_webpage, as the course branch
            # does, so that it is text: reading it with urlopen().read()
            # gives bytes, which the text pattern below cannot search on
            # Python 3.
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote='Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

View File

@@ -0,0 +1,33 @@
import re
from .common import InfoExtractor
class StatigramIE(InfoExtractor):
    """Information extractor for statigr.am posts."""

    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        Video and thumbnail URLs come from the page's Open Graph meta tags;
        the title is the page title up to its last ' | Statigram', and the
        uploader id is the first @-mention found in that title.
        """
        post_id = re.match(self._VALID_URL, url).group(1)
        page = self._download_webpage(url, post_id)

        media_url = self._html_search_regex(
            r'<meta property="og:video:secure_url" content="(.+?)">',
            page, u'video URL')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            page, u'thumbnail URL', fatal=False)
        page_title = self._html_search_regex(
            r'<title>(.+?)</title>',
            page, u'title')

        title, _, _ = page_title.rpartition(u' | Statigram')
        uploader_id = self._html_search_regex(
            r'@([^ ]+)', title, u'uploader name', fatal=False)

        return [{
            'id': post_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader_id': uploader_id,
        }]

View File

@@ -0,0 +1,63 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
)
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a game on store.steampowered.com."""

    # Verbose pattern: it must always be compiled/matched with re.VERBOSE.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Return True when `url` matches the verbose _VALID_URL pattern."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding a playlist of the game's videos.

        The game's video page is downloaded (through the age-check URL when
        the birth date form is shown); movies, titles and thumbnails are then
        located separately and combined by position.

        Raises ExtractorError when a movie entry has no URL.
        """
        game_id = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % game_id
        page = self._download_webpage(page_url, game_id)

        # Request the page again through the age-check URL when the birth
        # date form is shown instead of the videos.
        if re.search('<h2>Please enter your birth date to continue:</h2>', page) is not None:
            page_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            page = self._download_webpage(page_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(
            r'<h2 class="pageheader">(.*?)</h2>',
            page, 'game title')

        movie_matches = re.finditer(
            r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},",
            page)
        title_matches = re.finditer(
            r'<span class="title">(?P<videoName>.+?)</span>', page)
        thumb_matches = re.finditer(
            r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', page)

        entries = []
        # The three sequences are paired by position; the shortest one
        # determines how many entries are produced.
        for movie, name, thumb in zip(movie_matches, title_matches, thumb_matches):
            movie_id = movie.group('videoID')
            movie_title = name.group('videoName')
            movie_url = movie.group('videoURL')
            movie_thumb = thumb.group('thumbnail')
            if not movie_url:
                raise ExtractorError(u'Cannot find video url for %s' % movie_id)
            entries.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',
                'title': unescapeHTML(movie_title),
                'thumbnail': movie_thumb,
            })
        return [self.playlist_result(entries, game_id, game_title)]

View File

@@ -0,0 +1,46 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""

    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        The numeric video id is scraped from the page and then used to fetch
        an XML document from which the "high" quality file URL is taken.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the numeric id is known, the URL slug identifies the video.
        slug = match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data_page = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]

View File

@@ -0,0 +1,79 @@
import json
import re
from .common import InfoExtractor
class TEDIE(InfoExtractor):
    """Information extractor for talks and playlists on ted.com."""

    # Verbose pattern: it must always be matched with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                    |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Return True when `url` matches the verbose _VALID_URL pattern."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list: a talk's info dict or a playlist result."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if not match.group('type_talk'):
            playlist_id = match.group('playlist_id')
            name = match.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url, name, playlist_id)]
        return [self._talk_info(url)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Return a playlist result referencing every talk of the playlist.'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'

        page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        item_matches = re.finditer(video_RE, page, re.VERBOSE)
        name_matches = re.finditer(video_name_RE, page)

        title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            page, 'playlist title')

        # Items and names are paired by position; only the talk link of the
        # name match is used, but the item matches still limit the count.
        entries = [
            self.url_result('http://www.ted.com%s' % name_match.group('talk_url'), 'TED')
            for _, name_match in zip(item_matches, name_matches)]
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = title)

    def _talk_info(self, url, video_id=0):
        """Return the info dict for the talk at `url`."""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')

        page = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        self.report_extraction(talk_name)

        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>',
            page, 'title')
        details = json.loads(self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>',
            page, 'json data'))
        description = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            page, 'description', flags = re.DOTALL)
        thumbnail = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            page, 'thumbnail')

        return {
            'id': details['id'],
            # The last entry of htmlStreams is the one that is used.
            'url': details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }

View File

@@ -0,0 +1,41 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on tumblr.com blogs."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        The post page is requested through its canonical /post/ URL and the
        video file URL is located in the escaped markup it contains.

        Raises ExtractorError when no video is found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        post_id = url_match.group('id')
        blog_name = url_match.group('blog_name')

        page = self._download_webpage(
            'http://%s.tumblr.com/post/%s/' % (blog_name, post_id), post_id)

        # Quotes appear as \x22 escapes in the markup that is searched.
        video_re = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog_name, post_id)
        video_match = re.search(video_re, page)
        if video_match is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            page, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            page, u'title', flags=re.DOTALL)

        return [{
            'id': post_id,
            'url': video_match.group('video_url'),
            'title': title,
            'thumbnail': thumbnail,
            'ext': video_match.group('ext'),
        }]

View File

@@ -0,0 +1,36 @@
import re
from .common import InfoExtractor
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return the info dict (not wrapped in a list) for `url`.

        The media URL is built from the id alone; title, uploader and
        thumbnail are scraped from the page.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'data-title="(?P<title>.+)"',
            page, u'title')
        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }

View File

@@ -0,0 +1,46 @@
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
)
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""

    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for `url`.

        The play page answers with a javascript redirect, which is followed
        to read the title. The media and thumbnail URLs are obtained by
        POSTing the video id to the info endpoint, whose response is parsed
        as two '&'-separated key=value pairs.

        Raises ExtractorError when the URL is invalid or the info response
        does not have the expected shape.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The redirect target is appended to the URL that was actually reached.
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # POST bodies have to be bytes on Python 3, so the form data is encoded.
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}).encode('utf-8')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')

        # Split every pair on its FIRST '=' only, so that values which
        # themselves contain '=' are kept whole, and report an unexpected
        # response shape as an extraction error.
        try:
            final_url, thumbnail_url = [
                pair.split('=', 1)[1] for pair in info_response.split('&')]
        except (IndexError, ValueError):
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]

View File

@@ -0,0 +1,138 @@
import json
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
clean_html,
get_element_by_attribute,
ExtractorError,
std_headers,
)
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _verify_video_password(self, url, video_id, webpage):
        """Submit the user supplied password for a protected video.

        Reads the 'password' downloader parameter and the `xsrft` token from
        the webpage, then posts both to `<url>/password`.
        Raises ExtractorError if no password was supplied or the token cannot
        be found in the page.
        """
        password = self._downloader.params.get('password', None)
        if password is None:
            raise ExtractorError(u'This video is protected by a password, use the --password option')

        token_match = re.search(r'xsrft: \'(.*?)\'', webpage)
        if token_match is None:
            # Report a proper extraction error instead of an AttributeError
            # from calling .group() on None.
            raise ExtractorError(u'Unable to extract the password form token')
        token = token_match.group(1)

        data = compat_urllib_parse.urlencode({'password': password,
                                              'token': token})
        # I didn't manage to use the password with https
        if url.startswith('https'):
            # Only rewrite the scheme, never a later 'https' inside the URL.
            pass_url = 'http' + url[len('https'):]
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url+'/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        self._download_webpage(password_request, video_id,
                               u'Verifying the password',
                               u'Wrong password')

    def _real_extract(self, url, new_video=True):
        """Extract the information of a single Vimeo video.

        `new_video` is accepted for backward compatibility and is not used.
        Returns a single-element list holding the info dictionary.
        Raises ExtractorError on an invalid URL, a restricted or password
        protected video without credentials, an unparsable config section or
        when none of the known codecs is offered.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is missing from the page;
            # ValueError: the extracted text is not valid JSON.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

            if re.search('If so please provide the correct password.', webpage):
                self._verify_video_password(url, video_id, webpage)
                return self._real_extract(url)
            else:
                raise ExtractorError(u'Unable to extract info section')

        video_info = config["video"]

        # Extract title
        video_title = video_info["title"]

        # Extract uploader and uploader_id
        owner = video_info["owner"]
        video_uploader = owner["name"]
        video_uploader_id = owner["url"].split('/')[-1] if owner["url"] else None

        # Extract video thumbnail
        video_thumbnail = video_info["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        available = video_info["files"]
        for codec_name, codec_extension in codecs:
            if codec_name in available:
                if 'hd' in available[codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in available[codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, available[codec_name][0]))

        # Take the first candidate of the best quality bucket that is not empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_codec, video_extension, video_quality = files[quality][0]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]

View File

@@ -0,0 +1,37 @@
import re
from .common import InfoExtractor
class VineIE(InfoExtractor):
    """Information extractor for vine.co video pages."""

    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list describing the Vine video at `url`.

        The video URL and title are looked up first; the thumbnail and
        uploader lookups are passed fatal=False.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https page, whatever form `url` had.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        image = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        author = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        entry = {
            'id':        video_id,
            'url':       stream_url,
            'ext':       'mp4',
            'title':     title,
            'thumbnail': image,
            'uploader':  author,
        }
        return [entry]

View File

@@ -0,0 +1,44 @@
import re
from .common import InfoExtractor
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        media_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            page, u'video URL')

        # The container is guessed from the media URL itself.
        extension = 'mp4' if 'mp4' in media_url else 'flv'

        title = self._html_search_regex(r"<title>(.*)</title>",
            page, u'title')

        # Pages without a thumbnail are treated as WSHH candy videos, whose
        # title is taken from the "candytitles" element when it is present.
        thumb = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)
        if not thumb:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : media_url,
            'title' : title,
            'thumbnail' : thumb,
            'ext' : extension,
        }]

View File

@@ -0,0 +1,61 @@
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
)
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""

    # The dot after "www" is escaped so that only a real "www." prefix is
    # accepted (an unescaped dot would match any character).
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL and metadata of an xHamster movie page.

        Returns a single-element list holding the info dictionary.
        Raises ExtractorError when the media URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: 'file' is itself the (URL-quoted) media URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI, so it is not extracted.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            # A missing upload date is only worth a warning, not a failure.
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]

View File

@@ -0,0 +1,45 @@
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
)
class XNXXIE(InfoExtractor):
    """Information extractor for video.xnxx.com."""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # Patterns applied to the downloaded video page.
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        # Get webpage content
        page = self._download_webpage(url, video_id)

        # The media URL is embedded URL-quoted in the page.
        quoted_url = self._search_regex(self.VIDEO_URL_RE,
            page, u'video URL')
        media_url = compat_urllib_parse.unquote(quoted_url)

        title = self._html_search_regex(self.VIDEO_TITLE_RE,
            page, u'title')

        thumb = self._search_regex(self.VIDEO_THUMB_RE,
            page, u'thumbnail', fatal=False)

        entry = {
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }
        return [entry]

View File

@@ -0,0 +1,43 @@
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
)
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'

    def _real_extract(self, url):
        """Extract the media URL, title and thumbnail of an xvideos page.

        Returns a single-element list holding the info dictionary.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail.
        # The whole URL is wrapped in the capture group: with only the file
        # name captured, the 'thumbnail' field would hold just the file name
        # instead of a usable URL.
        video_thumbnail = self._search_regex(r'(http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]

View File

@@ -0,0 +1,113 @@
import datetime
import itertools
import json
import re
from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
)
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract stream location and metadata of a screen.yahoo.com video.

        Two methods are used, depending on whether the page defines a
        Media CONTENT_ID: without one the "cosmos" MRSS service is queried
        with the id from the URL, otherwise the YQL streams service is
        queried with the CONTENT_ID.

        Returns the info dictionary (not wrapped in a list).
        Raises ExtractorError on an invalid URL or when the expected
        information cannot be found in a response.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Compiled with re.VERBOSE: unescaped whitespace in the pattern
            # is ignored, literal spaces are written as '\ '.
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Check the match before touching its groups; otherwise a miss
            # ends in an AttributeError and this error is never raised.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP: strip the callback wrapper to get JSON.
            m_json = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage)
            if m_json is None:
                raise ExtractorError(u'Unable to extract video info')
            info = json.loads(m_json.group(1))
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are requested at offsets that are multiples of 30
        until `n` entries were collected, a page comes back without results,
        or the response metadata marks the last page.
        Returns a playlist dictionary whose 'entries' are url results
        handled by the 'Yahoo' extractor.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        entries = res['entries']
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            if not results:
                # Nothing more to collect; also prevents looping forever on
                # empty pages that are not flagged as the last one.
                break

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Result without a screen.yahoo.com link: nothing the
                    # 'Yahoo' extractor could be pointed at, so skip it.
                    continue
                entries.append(self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # Count collected entries rather than relying on the inner loop
            # index, which is undefined when a page has no results.
            if len(entries) >= n or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res

View File

@@ -0,0 +1,45 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        The title is read from the video page; the media URL is read from
        the embed page that the video page links to. The returned id is the
        numeric id found in the embed link.
        Raises ExtractorError when no embed link is found.
        """
        page_id = re.match(self._VALID_URL, url).group('videoid')

        # Get webpage content
        page = self._download_webpage(url, page_id)

        # Get the video title
        title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            page, u'title').strip()

        # Get the embed page
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_url = embed_match.group(0).strip()
        embed_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_url, embed_id)

        # Get the video URL
        media_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': embed_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]

View File

@@ -0,0 +1,104 @@
import json
import math
import random
import re
import time
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a session id from the current time in ms and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet as a list, shuffled deterministically by `seed`.

        On every step the seed is advanced with a linear congruential
        formula and the character at the resulting position is moved from
        the remaining alphabet to the output.
        """
        mixed = []
        # "\\" is a single backslash; the value is unchanged from the former
        # "\:" spelling, which is an invalid escape sequence.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for _ in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  int(math.floor(seed / 65536 * len(source)))
            # All characters are distinct, so popping by index is the same
            # as removing the character by value.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # Empty fields (e.g. from a trailing '*') are ignored.
        return ''.join(mixed[int(ch)] for ch in ids if ch)

    def _real_extract(self, url):
        """Extract every segment of a Youku video.

        Returns a list with one info dictionary per segment.
        Raises ExtractorError on an invalid URL or when the playlist
        information cannot be read.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            video_data = config['data'][0]

            video_title =  video_data['title']
            seed = video_data['seed']

            req_format = self._downloader.params.get('format', None)
            supported_format = list(video_data['streamfileids'].keys())

            if req_format is None or req_format == 'best':
                if 'hd2' in supported_format:
                    stream_format = 'hd2'
                else:
                    stream_format = 'flv'
                ext = u'flv'
            elif req_format == 'worst':
                stream_format = 'mp4'
                ext = u'mp4'
            else:
                stream_format = 'flv'
                ext = u'flv'

            fileid = video_data['streamfileids'][stream_format]
            keys = [s['k'] for s in video_data['segs'][stream_format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError):
            # IndexError covers an empty 'data' list in the response.
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # The two characters at indices 8 and 9 of fileid hold the segment
        # number as two hex digits; they are replaced for each segment.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info

View File

@@ -0,0 +1,117 @@
import json
import os
import re
import sys
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
ExtractorError,
unescapeHTML,
unified_strdate,
)
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals `req_format`, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract the downloadable formats of a YouPorn video.

        Depending on the 'format' downloader parameter, returns a list with
        the first format (default / 'best'), the last one ('worst'), all of
        them ('-1' / 'all') or the one with the requested name. When the
        'listformats' parameter is set, the formats are printed and None is
        returned.
        Raises ExtractorError on invalid or incomplete page data, when no
        download link is found or when the requested format is unavailable.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as err:
            # Format the exception into the message; concatenating it to a
            # string raises TypeError and hides the real problem.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Fifth path component is "<size>_<bitrate>_<id>"; the format
            # name is "<size>-<bitrate>".
            format_name = "-".join( path.split('/')[4].split('_')[:2] )

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific( req_format, formats )
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]

View File

@@ -0,0 +1,795 @@
# coding: utf-8
import json
import netrc
import re
import socket
from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
compat_http_client,
compat_parse_qs,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_str,
clean_html,
get_element_by_id,
ExtractorError,
unescapeHTML,
unified_strdate,
)
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
_VALID_URL = r"""^
(
(?:https?://)? # http(s):// (optional)
(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/) # v/ or embed/ or e/
|(?: # or the v= param in all its forms
(?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
(?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
)? # optional -> youtube.com/xxxx is OK
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]+) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow
$"""
_LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
_video_extensions = {
'13': '3gp',
'17': 'mp4',
'18': 'mp4',
'22': 'mp4',
'37': 'mp4',
'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
'43': 'webm',
'44': 'webm',
'45': 'webm',
'46': 'webm',
}
_video_dimensions = {
'5': '240x400',
'6': '???',
'13': '???',
'17': '144x176',
'18': '360x640',
'22': '720x1280',
'34': '360x640',
'35': '480x854',
'37': '1080x1920',
'38': '3072x4096',
'43': '360x640',
'44': '480x854',
'45': '720x1280',
'46': '1080x1920',
}
IE_NAME = u'youtube'
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
if YoutubePlaylistIE.suitable(url): return False
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
"""Report attempt to set language."""
self.to_screen(u'Setting language')
def report_login(self):
"""Report attempt to log in."""
self.to_screen(u'Logging in')
def report_video_webpage_download(self, video_id):
"""Report attempt to download video webpage."""
self.to_screen(u'%s: Downloading video webpage' % video_id)
def report_video_info_webpage_download(self, video_id):
"""Report attempt to download video info webpage."""
self.to_screen(u'%s: Downloading video info webpage' % video_id)
def report_video_subtitles_download(self, video_id):
"""Report attempt to download video info webpage."""
self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
"""Report attempt to download video info webpage."""
self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
"""Report available subtitles."""
sub_lang = ",".join(list(sub_lang_list.keys()))
self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
self.to_screen(u'%s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
"""Report extracted video URL."""
self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
"""Indicate the download will use the RTMP protocol."""
self.to_screen(u'RTMP download detected')
@staticmethod
def _decrypt_signature(s):
"""Decrypt the key the two subkeys must have a length of 43"""
(a,b) = s.split('.')
if len(a) != 43 or len(b) != 43:
raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
a = a[-40:]
s_dec = '.'.join((a,b))[::-1]
return s_dec
def _get_available_subtitles(self, video_id):
self.report_video_subtitles_download(video_id)
request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
try:
sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
return (u'unable to download video subtitles: %s' % compat_str(err), None)
sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
if not sub_lang_list:
return (u'video doesn\'t have subtitles', None)
return sub_lang_list
def _list_available_subtitles(self, video_id):
sub_lang_list = self._get_available_subtitles(video_id)
self.report_video_subtitles_available(video_id, sub_lang_list)
def _request_subtitle(self, sub_lang, sub_name, video_id, format):
"""
Return tuple:
(error_message, sub_lang, sub)
"""
self.report_video_subtitles_request(video_id, sub_lang, format)
params = compat_urllib_parse.urlencode({
'lang': sub_lang,
'name': sub_name,
'v': video_id,
'fmt': format,
})
url = 'http://www.youtube.com/api/timedtext?' + params
try:
sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
if not sub:
return (u'Did not fetch video subtitles', None, None)
return (None, sub_lang, sub)
def _request_automatic_caption(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
sub_lang = self._downloader.params.get('subtitleslang') or 'en'
sub_format = self._downloader.params.get('subtitlesformat')
self.to_screen(u'%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
if mobj is None:
return [(err_msg, None, None)]
player_config = json.loads(mobj.group(1))
try:
args = player_config[u'args']
caption_url = args[u'ttsurl']
timestamp = args[u'timestamp']
params = compat_urllib_parse.urlencode({
'lang': 'en',
'tlang': sub_lang,
'fmt': sub_format,
'ts': timestamp,
'kind': 'asr',
})
subtitles_url = caption_url + '&' + params
sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
return [(None, sub_lang, sub)]
except KeyError:
return [(err_msg, None, None)]
def _extract_subtitle(self, video_id):
"""
Return a list with a tuple:
[(error_message, sub_lang, sub)]
"""
sub_lang_list = self._get_available_subtitles(video_id)
sub_format = self._downloader.params.get('subtitlesformat')
if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
return [(sub_lang_list[0], None, None)]
if self._downloader.params.get('subtitleslang', False):
sub_lang = self._downloader.params.get('subtitleslang')
elif 'en' in sub_lang_list:
sub_lang = 'en'
else:
sub_lang = list(sub_lang_list.keys())[0]
if not sub_lang in sub_lang_list:
return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
return [subtitle]
def _extract_all_subtitles(self, video_id):
sub_lang_list = self._get_available_subtitles(video_id)
sub_format = self._downloader.params.get('subtitlesformat')
if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
return [(sub_lang_list[0], None, None)]
subtitles = []
for sub_lang in sub_lang_list:
subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
subtitles.append(subtitle)
return subtitles
def _print_formats(self, formats):
print('Available formats:')
for x in formats:
print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
def _real_initialize(self):
if self._downloader is None:
return
username = None
password = None
downloader_params = self._downloader.params
# Attempt to use provided username and password or .netrc data
if downloader_params.get('username', None) is not None:
username = downloader_params['username']
password = downloader_params['password']
elif downloader_params.get('usenetrc', False):
try:
info = netrc.netrc().authenticators(self._NETRC_MACHINE)
if info is not None:
username = info[0]
password = info[2]
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError) as err:
self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
return
# Set language
request = compat_urllib_request.Request(self._LANG_URL)
try:
self.report_lang()
compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
return
# No authentication to be performed
if username is None:
return
request = compat_urllib_request.Request(self._LOGIN_URL)
try:
login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
return
galx = None
dsh = None
match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
if match:
galx = match.group(1)
match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
if match:
dsh = match.group(1)
# Log in
login_form_strs = {
u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
u'Email': username,
u'GALX': galx,
u'Passwd': password,
u'PersistentCookie': u'yes',
u'_utf8': u'',
u'bgresponse': u'js_disabled',
u'checkConnection': u'',
u'checkedDomains': u'youtube',
u'dnConn': u'',
u'dsh': dsh,
u'pstMsg': u'0',
u'rmShown': u'1',
u'secTok': u'',
u'signIn': u'Sign in',
u'timeStmp': u'',
u'service': u'youtube',
u'uilel': u'3',
u'hl': u'en_US',
}
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
try:
self.report_login()
login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
self._downloader.report_warning(u'unable to log in: bad username or password')
return
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
return
# Confirm age
age_form = {
'next_url': '/',
'action_confirm': 'Confirm',
}
request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
try:
self.report_age_confirmation()
compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
def _extract_id(self, url):
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(2)
return video_id
def _real_extract(self, url):
    """Extract the metadata and the media URL(s) of a single YouTube video.

    Returns a list with one info dictionary per selected format, or None
    when the 'listsubtitles' or 'listformats' option only asked for a
    listing.  Raises ExtractorError when mandatory data (video page, token,
    uploader, title, a usable format) cannot be obtained.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # Drop the backslashes that escape characters in the embedded URL
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info; try the different 'el' variants until one yields a token
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                          % (video_id, el_type))
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                    note=False,
                                                    errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        if 'token' in video_info:
            break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        raise ExtractorError(u'Unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        # don't panic if we can't find it
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else:
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        # Turn the separators into single spaces before parsing the date
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = None

    if self._downloader.params.get('writesubtitles', False):
        video_subtitles = self._extract_subtitle(video_id)
        if video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitles[0]
            if sub_error:
                # We try with the automatic captions
                video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                (_, sub_lang, sub) = video_subtitles[0]
                if sub is None:
                    # We report the original error
                    self._downloader.report_warning(sub_error)

    if self._downloader.params.get('allsubtitles', False):
        video_subtitles = self._extract_all_subtitles(video_id)
        for video_subtitle in video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitle
            if sub_error:
                self._downloader.report_warning(sub_error)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    # The watch page embeds the player configuration as JSON.  For Vevo
    # style videos its stream map replaces the one from get_video_info.
    mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
    if mobj is not None:
        try:
            info = json.loads(mobj.group(1))
        except ValueError:
            info = {}
        args = info.get('args', {})
        # NOTE: this used to read "... or 'dashmpd'", which is always true
        # because a non-empty string literal is truthy.
        is_vevo_style = args.get('ptk', '') == 'vevo' or 'dashmpd' in args
        if is_vevo_style and 'url_encoded_fmt_stream_map' in args:
            # Vevo videos with encrypted signatures
            self.to_screen(u'%s: Vevo video detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        # Map every advertised itag to its (signed) download URL
        url_map = {}
        for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    signature = self._decrypt_signature(url_data['s'][0])
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url

        format_limit = self._downloader.params.get('format_limit', None)
        if self._downloader.params.get('prefer_free_formats', False):
            available_formats = self._available_formats_prefer_free
        else:
            available_formats = self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
        else:
            # Specific formats. We pick the first in a slash-delimited sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'))

        results.append({
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
        })
    return results
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Always matched with re.VERBOSE, so the layout of the pattern below is
    # not significant.  Group 1 is the id found in a playlist-like URL,
    # group 2 a bare playlist id.
    _VALID_URL = r"""(?:
(?:https?://)?
(?:\w+\.)?
youtube\.com/
(?:
(?:course|view_play_list|my_playlists|artist|playlist|watch)
\? (?:.*?&)*? (?:p|a|list)=
| p/
)
((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
.*
|
((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
)"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Page through the playlist feed and return a one-element list
        holding the playlist result, entries ordered by playlist position."""
        # Extract playlist id
        id_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # Download playlist videos from API
        positioned_urls = []
        page_num = 1
        while True:
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                position = entry['yt$position']['$t']
                if 'media$group' in entry and 'media$player' in entry['media$group']:
                    positioned_urls.append((position, entry['media$group']['media$player']['url']))

            # A short page is the last one
            if len(feed['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        ordered_urls = [pair[1] for pair in sorted(positioned_urls)]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, without duplicates and in
        order of first appearance."""
        found = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = link.group(1)
            if video_id in found:
                continue
            found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect the video ids of all channel pages and return a
        one-element list holding the playlist result."""
        # Extract channel id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = id_match.group(1)

        # Download the first (HTML) channel page
        pagenum = 1
        first_page = self._download_webpage(
            self._TEMPLATE_URL % (channel_id, pagenum), channel_id,
            u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        video_ids = list(self.extract_videos_from_page(first_page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        more_pages = self._MORE_PAGES_INDICATOR in first_page
        while more_pages:
            pagenum += 1
            raw_page = self._download_webpage(
                self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                u'Downloading page #%s' % pagenum)
            ajax_page = json.loads(raw_page)

            video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))
            more_pages = self._MORE_PAGES_INDICATOR in ajax_page['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect the ids of all uploads of a user and return a
        one-element list holding the playlist result."""
        # Extract username
        user_match = re.match(self._VALID_URL, url)
        if user_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = user_match.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []
        pagenum = 0
        while True:
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            note = u'Downloading video ids from %d to %d' % (start_index, start_index + page_size)
            page = self._download_webpage(gdata_url, username, note)

            # Extract video identifiers, dropping duplicates within the page
            ids_in_page = []
            for hit in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = hit.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < page_size:
                break
            pagenum += 1

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""

    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
        self._downloader.to_screen(message)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        limit = n
        pagenum = 0

        # Each API page carries up to 50 results
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend([video['id'] for video in api_response['items']])

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have delivered more ids than requested
        video_ids = video_ids[:n]
        videos = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return self.playlist_result(videos, query)

View File

@@ -0,0 +1,65 @@
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
)
class ZDFIE(InfoExtractor):
    """Information Extractor for videos of the ZDF Mediathek."""

    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when the URL is invalid or when no stream,
        title, media URL or file extension can be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # The comprehension always yields a list, so test for emptiness
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        stream_ = None
        for s in streams:  # find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:  # find veryhigh - dsl2000mbit
            # 'hstreaming' - rtsp is not working
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer the mms:// link and fall back to rtsp://
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # The extension is whatever follows the last dot of the media URL
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]

View File

@@ -78,7 +78,7 @@ def update_self(to_screen, verbose, filename):
to_screen(u'Updating to version ' + versions_info['latest'] + '...')
version = versions_info['versions'][versions_info['latest']]
print_notes(versions_info['versions'])
print_notes(to_screen, versions_info['versions'])
if not os.access(filename, os.W_OK):
to_screen(u'ERROR: no write permissions on %s' % filename)
@@ -157,11 +157,15 @@ del "%s"
to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
def print_notes(versions, fromVersion=__version__):
def get_notes(versions, fromVersion):
    """Return the notes of all versions newer than fromVersion.

    versions maps a version string to a dictionary that may carry a list
    under 'notes'.  Versions are visited in sorted order and compared to
    fromVersion as plain values.
    """
    return [
        note
        for version, vdata in sorted(versions.items())
        if version > fromVersion
        for note in vdata.get('notes', [])
    ]
def print_notes(to_screen, versions, fromVersion=__version__):
notes = get_notes(versions, fromVersion)
if notes:
to_screen(u'PLEASE NOTE:')
for note in notes:

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import errno
import gzip
import io
import json
@@ -11,7 +12,7 @@ import sys
import traceback
import zlib
import email.utils
import json
import socket
import datetime
try:
@@ -149,6 +150,13 @@ try:
except NameError:
compat_chr = chr
def compat_ord(c):
    """Return c unchanged when it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
# Type of compiled regular expression objects.  This is not clearly defined
# otherwise, so it is derived from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))
std_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
@@ -334,12 +342,20 @@ def sanitize_open(filename, open_mode):
stream = open(encodeFilename(filename), open_mode)
return (stream, filename)
except (IOError, OSError) as err:
# In case of error, try to remove win32 forbidden chars
filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
if err.errno in (errno.EACCES,):
raise
# An exception here should be caught in the caller
stream = open(encodeFilename(filename), open_mode)
return (stream, filename)
# In case of error, try to remove win32 forbidden chars
alt_filename = os.path.join(
re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
for path_part in os.path.split(filename)
)
if alt_filename == filename:
raise
else:
# An exception here should be caught in the caller
stream = open(encodeFilename(filename), open_mode)
return (stream, alt_filename)
def timeconvert(timestr):
@@ -430,11 +446,37 @@ def decodeOption(optval):
assert isinstance(optval, compat_str)
return optval
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: exactly one hour is '1:00:00'
    and exactly one minute is '1:00'.
    """
    # Inclusive bounds: with '>' 3600 came out as '60:00' and 60 as '60'
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(opts):
    """Return an HTTPS handler for the running Python version.

    From Python 3.2 on the handler gets an SSL context whose certificate
    verification is disabled when opts.no_check_certificate is set and
    required otherwise.  Older versions get a default handler.
    """
    if sys.version_info >= (3, 2):
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()
        if opts.no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        # Exception types currently being handled for which no bug report
        # is requested
        expected_errors = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] not in expected_errors:
            msg = msg + u'; please report this issue on GitHub.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception

View File

@@ -1,2 +1,2 @@
__version__ = '2013.04.27'
__version__ = '2013.06.23'