Compare commits

..

295 Commits

Author SHA1 Message Date
c361c505b0 release 2014.03.06 2014-03-06 23:57:00 +01:00
d37c07c575 [vesti] Fix extraction and support more link formats (Closes #2517) 2014-03-07 02:27:39 +07:00
9d6105c9f0 Do not resume live streams
No resuming or seeking in live streams is possible (c) man rtmpdump
2014-03-05 22:46:20 +07:00
8dec03ecba Use unicode literals 2014-03-05 22:24:07 +07:00
826547870b Report no connect as error 2014-03-05 22:21:19 +07:00
52d6a9a61d Handle rtmpdump's no connection return value 2014-03-05 22:19:27 +07:00
ad242b5fbc Remove superfluous whitespace 2014-03-05 22:16:50 +07:00
3524175625 Use meaningful return value constants for rtmpdump 2014-03-05 22:12:02 +07:00
7b9965ea93 [ted] Remove unused import and modernize test 2014-03-05 14:27:45 +01:00
0a5bce566f [generic] Add all test attributes for embedly (#2447)
In the future, we may want to not only print something, but throw an error for untested properties.
2014-03-05 14:05:50 +01:00
8012bd2424 [generic] Get a better ID 2014-03-05 14:02:14 +01:00
f55a1f0a88 Merge remote-tracking branch 'rzhxeo/embedly'
Conflicts:
	youtube_dl/extractor/generic.py
2014-03-05 14:01:53 +01:00
bacac173a9 [ted] Style fixes 2014-03-05 13:27:26 +01:00
ca1fee34f2 [ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00
6dadaa9930 [prosiebensat1] Replace test 2014-03-05 15:10:49 +07:00
553f6e4633 [dailymotion] Convert width and height fields from strings to integers 2014-03-04 22:24:38 +01:00
652bee05f0 [ted] Fix video extraction
The site has been redesigned
2014-03-04 21:47:01 +01:00
d63516e9cd release 2014.03.04.2 2014-03-04 20:56:31 +01:00
e477dcf649 [vesti] Fix width and height 2014-03-04 21:40:35 +07:00
9d3f7781f3 [soundcloud:set] Fix _VALID_URL regex (Closes #2509) 2014-03-04 21:29:14 +07:00
c7095dada3 [tvigle] Add support for another video link format 2014-03-04 19:22:48 +07:00
607dbbad76 [xtube] Fix extraction add more metafields 2014-03-04 16:12:11 +07:00
17b75c0de1 Document width, height, and resolution (#1445) 2014-03-04 03:49:33 +01:00
ab24f4f3be [facebook] Use consistent quotes 2014-03-04 03:49:12 +01:00
e1a52d9e10 release 2014.03.04.1 2014-03-04 03:40:00 +01:00
d0ff838433 [facebook] Correct regexp 2014-03-04 03:39:45 +01:00
b37b94501c [facebook] Fix login detection (#2505) 2014-03-04 03:39:04 +01:00
cb3bb2cfef [facebook] Modernize 2014-03-04 03:36:54 +01:00
e2cc7983e9 release 2014.03.04 2014-03-04 03:32:54 +01:00
c9ae7b9565 [youtube] Add support for search result URLs (Fixes #2495) 2014-03-04 03:32:28 +01:00
86fb4347f7 release 2014.03.03 2014-03-03 13:51:25 +01:00
2fcec131f5 Credit @juancri for canal13cl (#2498) 2014-03-03 12:54:01 +01:00
9f62eaf4ef [canal13cl] Add test and improve extraction (#2498) 2014-03-03 12:53:11 +01:00
f92259c026 Merge remote-tracking branch 'origin/master' 2014-03-03 12:34:34 +01:00
0afef30b23 Add display_id field 2014-03-03 12:06:28 +01:00
dcdfd1c711 Merge remote-tracking branch 'origin/master' 2014-03-03 12:05:59 +01:00
2acc1f8f50 [orf] Fix segments extraction (Closes #2501) 2014-03-03 18:05:46 +07:00
2c39b0c695 [tinypic] Fix import 2014-03-03 17:40:12 +07:00
e77c5b4f63 [4tube] Fix import 2014-03-03 17:39:49 +07:00
409a16cb72 Allowing URLs for 13.cl without the /programas prefix 2014-03-02 23:41:13 -03:00
94d5e90b4f FIX: Typo in the extractor's name 2014-03-02 23:40:35 -03:00
2d73b45805 Adding support for 13.cl 2014-03-02 23:15:12 -03:00
271a2dbfa2 [tvigle] Add age limit 2014-03-02 22:07:18 +07:00
bf4adcac66 [tvigle] Fix like count 2014-03-02 20:56:36 +07:00
fb8b8fdd62 [tvigle] Add support for tvigle.ru 2014-03-02 19:59:34 +07:00
5a0b26252e [ceskatelevize] Simplify 2014-03-01 23:05:33 +07:00
7d78f0cc48 [ceskatelevize] Fix video availability check and add geo unrestricted test 2014-03-01 22:54:37 +07:00
f00fc78674 Merge branch '_ceskatelevize' of https://github.com/pulpe/youtube-dl into pulpe-_ceskatelevize 2014-03-01 22:26:18 +07:00
392017874c [CeskaTelevize] raise ExtractorError if you are outside of CR 2014-03-01 16:17:29 +01:00
c3cb92d1ab [CeskaTelevize] fix python3 support @dstftw 2014-03-01 16:02:51 +01:00
aa5590fa07 skip test 2014-03-01 12:34:01 +01:00
8cfb5bbf92 [CeskaTelevize] Add initial support for ceskatelevize.cz 2014-03-01 11:47:52 +01:00
69bb54ebf9 [mailru] Add support for mail.ru video 2014-03-01 16:34:38 +07:00
ca97a56e4b [vk] Add support for embedded videos (Closes #2473) 2014-02-28 23:51:54 +07:00
fc26f3b4c2 [lifenews] Add support for multiple videos on the same page (#2482) 2014-02-28 22:52:06 +07:00
f604c93c64 [gdcvault] Formatting / Remove unused variables 2014-02-28 15:50:19 +01:00
dc3727b65c Credit @mnem dor GDCVault 2014-02-28 15:14:25 +01:00
aba3231de1 Merge remote-tracking branch 'mnem/gdc-vault' 2014-02-28 12:52:11 +01:00
9193bab91d release 2014.02.28 2014-02-28 12:31:37 +01:00
fbcf3e416d Merge pull request #2463 from rzhxeo/resume
Set resume_len to 0 if download is restarted
2014-02-28 12:30:34 +01:00
c0e5d85631 [vimeo] Improve thumbnail extraction 2014-02-28 18:00:12 +07:00
ca7fa3dcb3 [vimeo] Fix thumbs extraction (Closes #2480) 2014-02-28 17:43:54 +07:00
4ccfba28d9 [collegehumor] Fix test's uploader field 2014-02-27 19:10:30 +01:00
abb82f1ddc [mixcloud] Unquote the track id (#2462) 2014-02-27 18:58:09 +01:00
cda008cff1 release 2014.02.27.1 2014-02-27 16:09:58 +01:00
1877a14049 [lifenews] Switch to non-mobile webpage version (Fixes #2476) 2014-02-27 21:45:34 +07:00
546582ec3e Removing MD5 check for ethereal file. 2014-02-27 14:28:55 +00:00
4534485586 Fix test, remove unused, tidy quotes and brackets 2014-02-27 12:50:48 +00:00
a9ab8855e4 [prosiebensat1] Fix typo 2014-02-27 17:53:09 +07:00
8a44ef6868 [prosiebensat1] Add rtmpe support 2014-02-27 17:52:52 +07:00
0c7214c404 [prosiebensat1] Add support for ProSiebenSat.1 Digital sites (Closes
#2346 #2469)
2014-02-27 17:44:29 +07:00
4cf9654693 Add one more format to unified_strdate 2014-02-27 17:44:05 +07:00
50a138d95c Add support for authenticated videos 2014-02-27 10:32:31 +00:00
1b86cc41cf Add support for embed.ly 2014-02-27 08:14:28 +01:00
91346358b0 release 2014.02.27 2014-02-27 07:22:34 +01:00
f3783d4b77 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-27 07:22:22 +01:00
89ef304bed [generic] Add support for <meta redirect>
Fixes #413
2014-02-27 07:22:02 +01:00
83cebb8b7a Add support for FLV videos with speaker decks 2014-02-27 00:20:34 +00:00
9e68f9fdf1 Extractor for non-password protected GDC Vault videos 2014-02-26 22:33:33 +00:00
2acea5c03d [mit] Fix MITIE test 2014-02-26 18:09:43 +07:00
978177527e [rtlnow] Remove unused import 2014-02-26 18:02:17 +07:00
2648c436f3 Merge pull request #2464 from rzhxeo/xhamster
[XHamsterIE] Make hd video search more robust
2014-02-26 02:53:54 -08:00
33f1f2c455 [rtlnow] Fix duration extraction 2014-02-26 17:49:49 +07:00
995befe0e9 [rtlnow] Replace n-tvnow.de test 2014-02-26 17:43:56 +07:00
1bb92aff55 [rtlnow] Modernize and add f4m support 2014-02-26 17:36:16 +07:00
b8e1471d3a [XHamsterIE] Make hd video search more robust 2014-02-26 10:01:44 +01:00
60daf7f0bb Set resume_len to 0 if download is restarted 2014-02-26 02:47:27 +01:00
a83a3139d1 [mit] Add import 2014-02-26 00:41:13 +01:00
fdb7ca3b8d release 2014.02.26 2014-02-26 00:32:22 +01:00
0d7caf5cdf Merge remote-tracking branch 'ruuk/master' 2014-02-26 00:31:08 +01:00
a339d7ba91 Credit @amlweems for ocw.mit (#2460) 2014-02-26 00:30:47 +01:00
7216de55d6 [mit] Fix ocw tests 2014-02-26 00:29:45 +01:00
2437fbca64 [tests] Raise an exception if test definition is invalid (Found in #2460) 2014-02-26 00:12:02 +01:00
7d75d06b78 Merge branch 'ocw-mit-edu' of https://github.com/amlweems/youtube-dl 2014-02-26 00:09:42 +01:00
13ef5648c4 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-26 00:07:45 +01:00
5b2478e2ba [mit] Modernize 2014-02-26 00:06:31 +01:00
8b286571c3 [mixcloud] Fix _VALID_RE (fixes #2462)
Accept any character except `/` for uploader and the name, caused problems with non ASCII characters
2014-02-26 00:04:03 +01:00
f3ac523794 Merge pull request #2461 from niebles/master
Update __init__.py

`io` wasn't imported.
2014-02-26 00:00:57 +01:00
020cf5ebfd [nbc] Add an extractor for the main nbc.com site
Some of the videos are encrypted, the f4m downloader doesn’t support them.
2014-02-25 23:57:54 +01:00
54ab193970 Extract thumbnail with _og_search_thumbnail 2014-02-25 14:41:36 -08:00
8f563f32ab Update __init__.py 2014-02-25 17:31:16 -05:00
151bae3566 Add support for ocw.mit.edu video lectures 2014-02-25 14:44:34 -06:00
76df418cba Add thumbnail for metacafe 2014-02-25 12:04:44 -08:00
d0a72674c6 [crunchyroll] Use enumerate 2014-02-25 20:51:51 +01:00
1d430674c7 [crunchyroll] Handle error message 2014-02-25 20:30:17 +07:00
70cb73922b [crunchyroll] Fix subtitle lang code extraction 2014-02-25 20:29:53 +07:00
344400951c [crunchyroll] Tidy and modernize 2014-02-25 20:29:53 +07:00
ea5a0be811 Skip youtube toptracks test
All the playlists return 500 errors.
2014-02-25 14:11:01 +01:00
3c7fd0bdb2 release 2014.02.25.1 2014-02-25 11:15:55 +01:00
6cadf8c858 [vevo] Add age_limit support 2014-02-25 11:15:34 +01:00
27579b9e4c [vevo] Add suppot for v3 SMIL URLs (Fixes #2409) 2014-02-25 11:06:47 +01:00
4d756a9cc0 [testurl] Fix case when only one IE matches 2014-02-25 10:43:34 +01:00
3e668e05be Merge pull request #2456 from AGSPhoenix/master
[YT] Fix incorrect format code descriptions
2014-02-25 10:24:02 +01:00
60d3a2e0f8 Fix incorrect format codes
Corrects the descriptions for the DASH video format codes 264 and 138
(1440p and 2160p, respectively).
2014-02-24 21:29:37 -05:00
cc3a3b6b47 release 2014.02.25 2014-02-25 01:45:10 +01:00
eda1d49a62 Merge remote-tracking branch 'origin/master' 2014-02-25 01:45:00 +01:00
62e609ab77 Ignore BOM in batch files (Fixes #2450) 2014-02-25 01:43:17 +01:00
2bfe4ead4b [veoh] Allow to download videos with age protection (fixes #2455) 2014-02-24 22:01:34 +01:00
b1c6c32f78 [generic] Add support for nowvideo embedded videos 2014-02-24 23:37:42 +07:00
f6acbdecf4 [podomatic] Use unicode_literals 2014-02-24 17:31:09 +01:00
f1c9dfcc01 [nowvideo] Rewrite based on novamov extractor 2014-02-24 23:30:58 +07:00
ce78943ae1 [novamov] Generalize extractor 2014-02-24 23:30:09 +07:00
d6f0d86649 [novamov] Improve _VALID_URL 2014-02-24 22:01:19 +07:00
5bb67dbfea [cinemassacre] Modernize 2014-02-24 14:44:29 +01:00
47610c4d3e [cinemassacre] Fix extraction
Now we download over http, we don't need rtmpdump.
2014-02-24 14:35:26 +01:00
b732f3581f [academicearth] Remove debug print 2014-02-24 14:20:17 +01:00
9e57ce716f [academicearth] Fix extraction
The courses seems to be no longer available, changed the test to a playlist.
2014-02-24 14:18:12 +01:00
cd7ee7aa44 [nbc] Modernize 2014-02-24 14:00:31 +01:00
3cfe791473 [iprima] Add missing ) 2014-02-24 13:50:53 +01:00
973f2532f5 [iprima] Add support for -WEB URLs (Closes #2449) 2014-02-24 10:12:36 +01:00
bc3be21d59 [iprima] Clean up a little bit 2014-02-24 09:53:48 +01:00
0bf5cf9886 release 2014.02.24 2014-02-24 09:44:22 +01:00
919052d094 [zdf] Fix podcast extraction and use unicode literals (Closes #2446) 2014-02-24 13:47:47 +07:00
a2dafe2887 [youtube] Fix mix video regex
Attributes' order in <li> is arbitrary and changes every time playlist
page is fetched, so we can't rely on `data-index` to be before
`data-video-username`.
2014-02-24 12:52:02 +07:00
92661c994b [normalboots] Modernize and simplify 2014-02-23 18:28:22 +01:00
ffe8fe356a [normalboots] Fix video url extraction 2014-02-23 18:06:51 +01:00
bc2f773b4f [youtube:playlist] Fix mixes extraction (fixes #2444) 2014-02-23 17:17:36 +01:00
f919201ecc [vine] Extract more metadata and support low format 2014-02-23 19:02:31 +07:00
7ff5d5c2e2 Add one more format to unified_strdate 2014-02-23 19:00:51 +07:00
9b77f951c7 [breakcom] Fix error when calling _search_regex
I passed `’webpage’` instead of the variable `webpage`.
2014-02-23 12:28:44 +01:00
a25f2f990a [breakcom] Fix info json extraction 2014-02-23 12:20:58 +01:00
78b373975d [vine] Fix uploader extraction 2014-02-23 12:08:30 +01:00
2fcc873c4c release 2014.02.22.1 2014-02-22 23:17:56 +01:00
23c2baadb3 [videobam] Set age_limit to 18
From [their ToS](http://videobam.com/terms): "User must be eighteen 18[sic] years of age or older to use or access this web site."
2014-02-22 23:15:41 +01:00
521ee82334 Fix imports 2014-02-22 23:03:12 +01:00
1df96e59ce [f4m] Clean up 2014-02-22 23:03:00 +01:00
3e123c1e28 [videobam] Add support for videobam.com (Closes #2411) 2014-02-23 04:50:05 +07:00
f38da66731 Credit @soult for br 2014-02-22 20:19:41 +01:00
06aabfc422 [br] Simplify 2014-02-22 20:17:26 +01:00
1052d2bfec Merge remote-tracking branch 'soult/br' 2014-02-22 17:14:47 +01:00
5e0b652344 release 2014.02.22 2014-02-22 15:07:25 +01:00
0f8f097183 [release.sh] Do not run tests by default
We are at the point that testing takes waay too long for a release cycle, and fails way too often.
Tests through travis are a better indicator than testing just before release.
2014-02-22 15:06:07 +01:00
491ed3dda2 [trutube] Support multiple formats (#2433) 2014-02-22 15:05:30 +01:00
af284c6d1b Merge remote-tracking branch 'JohnyMoSwag/master' 2014-02-22 14:38:42 +01:00
41d3ec5fba [savefrom] Add extractor (Fixes #2434) 2014-02-22 14:36:16 +01:00
0568c352f3 [canalc2] Modernize 2014-02-22 14:27:09 +01:00
2e7b4cb714 [spankwire] Fix uploader id regex 2014-02-22 16:50:08 +07:00
9767726b66 [spankwire] Improve and modernize 2014-02-22 16:45:03 +07:00
9ddfd84e41 added trutubeIE 2014-02-22 00:11:57 -08:00
1cf563d84b release 2014.02.21.1 2014-02-21 18:19:48 +01:00
7928024f57 [BR] Add basic test 2014-02-21 18:00:05 +01:00
3eb38acb43 [BR] Add "BR" extractor
Extractor for videos from the Bayerischer Rundfunk Mediathek[1]. Currently only
supports videos. Audio and podcasts do not work yet with this extractor.

1: http://br.de/mediathek
2014-02-21 17:58:52 +01:00
f7300c5c90 [generic] Fix on python 2.6
`ParseError` is not available, it raises `xml.parsers.expat.ExpatError`.
The webpage needs to be encoded.
2014-02-21 16:59:10 +01:00
3489b7d26c [youtube] Simplify the decryption process for the manifest urls and add a test (closes #2422) 2014-02-21 15:15:58 +01:00
acd2bcc384 Merge branch 'youtube-dash' of github.com:m0vie/youtube-dl 2014-02-21 15:02:47 +01:00
43e77ca455 release 2014.02.21 2014-02-21 12:16:03 +01:00
da36297988 [wimp] Modernize and replace test 2014-02-21 17:57:19 +07:00
dbb94fb044 [youtube] Fix playlist extraction (Closes #2423, #2424, #2425) 2014-02-21 17:19:55 +07:00
d68f0cdb23 [youtube] decrypt signature when downloading dash manifest 2014-02-21 03:24:56 +01:00
eae16eb67b release 2014.02.20 2014-02-20 13:14:21 +01:00
4fc946b546 [generic] Add support for RSS feeds (Fixes #667) 2014-02-20 13:14:09 +01:00
280bc5dad6 [bbccouk] Add friendly contry filter error message (#2184) 2014-02-20 18:50:34 +07:00
f43770d8c9 Merge pull request #2413 from bentley/optypo
Fix minor typo: “to to” → “to”.
2014-02-20 08:02:54 +01:00
98c4b8fa1b Fix minor typo: “to to” → “to”. 2014-02-19 20:02:29 -07:00
ccb079ee67 [xhamster] Fix and improve 2014-02-20 02:37:44 +07:00
2ea237472c Merge pull request #2408 from pulpe/_readme
[README.md] correct the test command
2014-02-19 16:45:14 +01:00
0d4b4865cc [README.md] correct the test command 2014-02-19 16:13:45 +01:00
fe52f9f956 Document prefered config location (#2407) 2014-02-19 11:35:35 +01:00
882907a818 release 2014.02.19.1 2014-02-19 01:27:22 +01:00
572a89cc4e [liveleak] Add support for prochan embeds (Fixes #2406) 2014-02-19 01:27:12 +01:00
c377110539 release 2014.02.19 2014-02-19 01:08:16 +01:00
a9c7198a0b [testurl] Add extractor
This is a pseudo extractor that can be used to quickly look up test URLs, or test without the test harness.
2014-02-19 01:06:16 +01:00
f6f01ea17b [space] modernize 2014-02-19 01:04:24 +01:00
f2d0fc6823 [bbccouk] Replace test
This older episode is from 1994 and hopefully won't get deleted.
2014-02-19 06:46:14 +07:00
f7000f3a1b [youtube] Add support for yourepeat.com URLs (Closes #2397) 2014-02-19 02:00:54 +07:00
c7f0177fa7 [bbccouk] Skip test 2014-02-18 00:26:12 +07:00
09c4d50944 Fix indenting in README 2014-02-17 14:58:39 +01:00
2eb5d315d4 [youtube] Match more truncated URLs (Closes #2402) 2014-02-17 14:56:21 +01:00
ad5976b4d9 [vimeo] Modernize test definition 2014-02-17 11:44:24 +01:00
a0dfcdce5e release 2014.02.17 2014-02-17 11:33:13 +01:00
96d1637082 Credit @Nikerabbit for helsinki 2014-02-17 11:33:01 +01:00
960f317171 [helsinki] Simplify 2014-02-17 11:32:30 +01:00
4412ca751d Merge remote-tracking branch 'Nikerabbit/hki' 2014-02-17 11:26:09 +01:00
cbffec0c95 Credit @patheticpat for 4tube.com (#2398) 2014-02-17 09:08:38 +07:00
0cea52cc18 Credit @pulpe for play.iprima.cz and stream.cz 2014-02-17 09:07:36 +07:00
6d784e87f4 Credit @prutz1311 for normalboots.com (#2279) 2014-02-17 09:03:28 +07:00
ae6cae78f1 [4tube] Minor changes and extract more metadata 2014-02-17 03:51:03 +07:00
0f99566c01 Add one more format in unified_strdate 2014-02-17 03:47:03 +07:00
2db806b4aa Improve parse_duration 2014-02-17 03:46:26 +07:00
3f32c0ba4c Merge branch '4tube' of https://github.com/patheticpat/youtube-dl into patheticpat-4tube 2014-02-17 02:21:45 +07:00
541cb26c0d [smotri] Add entry for netrc authentication 2014-02-17 02:19:55 +07:00
5544e038ab [vk] Add entry for netrc authentication 2014-02-17 02:17:10 +07:00
9032dc28a6 [vk] Add login feature (Closes #2206) 2014-02-17 02:05:15 +07:00
03635e2a71 Add support for 4tube.com. 2014-02-16 18:10:39 +01:00
00cf938aa5 [nfb] Add rtmp app field to format 2014-02-16 06:11:38 +07:00
a5f707c495 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-15 20:45:12 +01:00
1824b48169 [f4m] Download only the first fragment with the --test option 2014-02-15 17:53:23 +01:00
07ad22b8af [youtube:search] Mark "no results found" error as expected 2014-02-15 16:30:11 +01:00
b53466e168 Fix f4m downloading on Python 2.6 2014-02-15 16:24:43 +01:00
6a7a389679 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-15 15:34:17 +01:00
4edff78531 Merge remote-tracking branch 'jaimeMF/f4m'
Conflicts:
	youtube_dl/extractor/__init__.py
2014-02-15 15:32:13 +01:00
99043c2ea5 Replace test for dailymotion users 2014-02-15 13:17:31 +01:00
e68abba910 [sohu] Skip test
Only available from China
2014-02-15 13:12:41 +01:00
3165dc4d9f [france2.fr:generation-quoi] Skip test
The videos seem to not be available outside France
2014-02-15 13:04:31 +01:00
66c43a53e4 Add support for video.helsinki.fi archives 2014-02-14 18:14:28 +02:00
463b334616 [ndr] Replace 404 test 2014-02-14 23:12:15 +07:00
b71dbc57c4 [vesti] Fix player regex (Closes #2382) 2014-02-14 22:26:13 +07:00
72ca1d7f45 [vesti] Skip test 2 due to geo restrictions
At least that's how I interpret the error message "Просмотр вид��о ограничен в вашем регионе."
2014-02-13 22:19:59 +01:00
76e461f395 release 2014.02.13 2014-02-13 19:13:05 +01:00
1074982e6e [vesti] Add support for vesti.ru videos and live streams (Closes #2376) 2014-02-13 23:23:48 +07:00
29b2aaf035 [jadorecettepub] Remove unused import 2014-02-13 16:33:12 +01:00
6f90d098c5 [ecapist] modernize and fix id property 2014-02-13 16:32:42 +01:00
0715161450 Merge pull request #2373 from pulpe/_description_fixes
[collegehumor, chilloutzone] changed description in tests
2014-02-12 06:22:03 -08:00
896583517f [collegehumor, chilloutzone] changed description in tests 2014-02-12 15:11:57 +01:00
713d31fac8 [gametrailers] Fix gametrailers test 2014-02-12 01:50:53 +07:00
96cb10a5f5 [mtv] Improve title extraction 2014-02-12 01:07:30 +07:00
c207c1044e Merge pull request #2372 from pulpe/dropbox_fix
[dropbox] replace not working test
2014-02-11 09:34:49 -08:00
79629ec717 [dropbox] replace not working test 2014-02-11 17:27:36 +01:00
008fda0f08 [ndr] Replace 404 video test 2014-02-11 21:21:05 +07:00
0ae6b01937 [cnn] Add an extractor for blogs (closes #2361) 2014-02-11 14:38:17 +01:00
def630e523 [xtube] Fix uploader extraction 2014-02-11 14:20:41 +01:00
c5ba203e23 [xtube] use unicode_literals 2014-02-11 13:51:37 +01:00
2317e6b2b3 [yahoo] use unicode_literals 2014-02-11 13:51:23 +01:00
cb38928974 [firsttv] Skip test 2014-02-11 10:26:52 +07:00
fa78f13302 [streamcz] Minor changes 2014-02-11 10:19:02 +07:00
18395217c4 Merge branch '_stream' of https://github.com/pulpe/youtube-dl into pulpe-_stream 2014-02-11 09:18:46 +07:00
34bd987811 [freesound] Modernize 2014-02-10 21:03:14 +01:00
af6ba6a1c4 [exfm] Modernize 2014-02-10 21:00:37 +01:00
85409a0c69 [dotsub] Modernize 2014-02-10 20:52:53 +01:00
ebfe352b62 [breakcom] Modernize 2014-02-10 20:48:46 +01:00
fde56d2f17 [howcast] Modernize 2014-02-10 20:45:17 +01:00
3501423dfe [googleplus] Modernize and simplify 2014-02-10 20:36:11 +01:00
0de668af51 [instagram] Modernize 2014-02-10 20:24:12 +01:00
2a584ea90a [firsttv] Fix video URL regex 2014-02-11 00:49:37 +07:00
0f6ed94a15 [firsttv] Add support for 1tv.ru videoarchive 2014-02-11 00:20:41 +07:00
bcb891e82b [lifenews] Minor improvements 2014-02-10 21:07:41 +07:00
ac6e4ca1ed [brightcove] Unescape html entities from the 'og:video' url property (fixes #2360) 2014-02-10 07:50:10 +01:00
2e20bba708 release 2014.02.10 2014-02-10 02:01:11 +01:00
e70dc1d14b [youtube] Correct a minor regex typo 2014-02-10 01:30:47 +01:00
0793a7b3c7 [StreamCZ] Add support for stream.cz 2014-02-09 18:37:12 +01:00
026fcc0495 Fix #2355 (date parsing with dashes) 2014-02-09 18:09:57 +01:00
81c2f20b53 [youtube] Correct invalid JSON (Fixes #2353) 2014-02-09 17:56:10 +01:00
1afe753462 [slideshare] Fix description extraction and modernize
The ‘og:description’  property doesn’t contain the full description
2014-02-09 14:23:19 +01:00
524c2c716a [bloomberg] Fix extraction of ooyala embed code 2014-02-09 14:11:45 +01:00
b542d4bbd7 [kontrtube] Add support for kontrtube.ru (Closes #2354) 2014-02-09 19:53:11 +07:00
cf1eb45153 Add a downloader for f4m manifests 2014-02-09 12:24:54 +01:00
a97bcd80ba Add an extractor for syfy.com
It uses theplatfrom.com, which has been updated to work with f4m manifests
2014-02-08 22:30:00 +01:00
17968e444c [bbc.co.uk] Fix TV episode test 2014-02-09 04:04:21 +07:00
2e3fd9ec2f [bbc.co.uk] Improve overall extractor structure, add subtitles support
(#2184)

Everything from http://www.bbc.co.uk/iplayer/ should be downloadable
now.
2014-02-09 04:00:49 +07:00
d6a283b025 release 2014.02.08.2 2014-02-08 19:20:35 +01:00
9766538124 [jadorecettepub] Add extractor (Fixes #2148) 2014-02-08 19:20:23 +01:00
98dbee8681 [jeuxvideo] Modernize 2014-02-08 18:43:12 +01:00
e421491b3b release 2014.02.08.1 2014-02-08 18:38:05 +01:00
6828d37c41 Merge branch 'master' of github.com:rg3/youtube-dl 2014-02-08 18:37:53 +01:00
bf5f610099 [pbs] Add support for viralplayer links (Fixes #2350) 2014-02-08 18:37:33 +01:00
8b7f73404a [bbc.co.uk] Fix regex 2014-02-08 22:55:43 +07:00
85cacb2f51 [bbc.co.uk] Add one more link format 2014-02-08 22:54:05 +07:00
b3fa3917e2 release 2014.02.08 2014-02-08 16:25:03 +01:00
082c6c867a [bbc.co.uk] Add support for bbc.co.uk radio programmes (Closes #2184) 2014-02-08 21:55:28 +07:00
03fcf1ab57 Merge pull request #2342 from MikeCol/tube8
[Tube8] Extended valid urls schema
2014-02-08 04:00:50 +01:00
3b00dea5eb Extended valid urls schema 2014-02-08 00:09:26 +01:00
8bc6c8e3c0 [chilloutzone] Add additional tests (#2340) 2014-02-07 15:42:31 +01:00
79bc27b53a [channel9] Simplify 2014-02-07 19:41:18 +07:00
84dd703199 [ivi] Simplify 2014-02-07 19:36:50 +07:00
c6fdba23a6 [nfb] Add workaround for python2.6 2014-02-07 19:23:53 +07:00
b19fe521a9 Merge pull request #2340 from Fnordlab/master
[chilloutzone] Fixes refactoring bug
2014-02-07 12:46:56 +01:00
c1e672d121 [chilloutzone] fixes bug with youtube extraction
the id used for extracting the video from youtube is stored in
native_video_id not video_id. This id is only used on chilloutzone.net
2014-02-07 12:29:58 +01:00
f4371f4784 Merge remote-tracking branch 'upstream/master' 2014-02-07 12:20:58 +01:00
d914d9d187 [chilloutzone] Add import 2014-02-07 12:03:19 +01:00
845d14d377 credit @Fnordlab for chilloutzone 2014-02-07 12:00:58 +01:00
4a9540b6d2 [chilloutzone] Simplify (#2338) 2014-02-07 12:00:25 +01:00
9f31be7000 Merge remote-tracking branch 'Fnordlab/chilloutzone' 2014-02-07 11:50:26 +01:00
41fa1b627d release 2014.02.06.3 2014-02-07 01:41:01 +01:00
c0c4e66b29 Merge branch 'chilloutzone' 2014-02-06 21:33:16 +01:00
cd8662de22 [chilloutzone] Bug fix, runs against tests
Fixes a bug with python3.3 and made the extractor run successfully
against tox
2014-02-06 21:31:04 +01:00
3587159614 [nfb] Add encode POST data 2014-02-07 02:13:04 +07:00
d67cc9fa7c [youtube:playlist] Recognize ‘top tracks’ urls (closes #2332)
The list parameter starts with ‘MC’ and can have more characters after it, including dots
2014-02-06 19:46:26 +01:00
bf3a2fe923 [elpais] Fix typo 2014-02-07 00:38:29 +07:00
e9ea0bf123 [ndr] Add support for ndr.de (Closes #2325) 2014-02-07 00:35:26 +07:00
63424b6233 release 2014.02.06.2 2014-02-06 15:45:47 +01:00
0bf35c5cf5 [nfb] Add support for onf.ca URLs 2014-02-06 21:41:31 +07:00
95c29381eb [mooshare] Fix bogus video page URL 2014-02-06 21:26:12 +07:00
94c4abce7f [nfb] Add support for nfb.ca (Closes #2069) 2014-02-06 21:19:13 +07:00
f2dffe55f8 Merge branch 'chilloutzone' 2014-02-06 11:49:38 +01:00
46a073bfac [chilloutzone] Added support for chilloutzone.net
Added support for chilloutzone.net videos including embedded youtube
and vimeo movies. In case you find a not working movie, drop me an
email.
2014-02-06 11:44:44 +01:00
104 changed files with 4503 additions and 1128 deletions

View File

@ -20,7 +20,7 @@ which means you can modify it, redistribute it or use it however you like.
sure that you have sufficient permissions
(run with sudo if needed)
-i, --ignore-errors continue on download errors, for example to
to skip unavailable videos in a playlist
skip unavailable videos in a playlist
--abort-on-error Abort downloading of further videos (in the
playlist or the command line) if an error
occurs
@ -124,8 +124,12 @@ which means you can modify it, redistribute it or use it however you like.
video id, %(playlist)s for the playlist the
video is in, %(playlist_index)s for the
position in the playlist and %% for a
literal percent. Use - to output to stdout.
Can also be used to download to a different
literal percent. %(height)s and %(width)s
for the width and height of the video
format. %(resolution)s for a textual
description of the resolution of the video
format. Use - to output to stdout. Can also
be used to download to a different
directory, for example with -o '/my/downloa
ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
--autonumber-size NUMBER Specifies the number of digits in
@ -246,7 +250,7 @@ which means you can modify it, redistribute it or use it however you like.
# CONFIGURATION
You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
# OUTPUT TEMPLATE
@ -281,12 +285,14 @@ Videos can be filtered by their upload date using the options `--date`, `--dateb
Examples:
$ # Download only the videos uploaded in the last 6 months
$ youtube-dl --dateafter now-6months
$ # Download only the videos uploaded on January 1, 1970
$ youtube-dl --date 19700101
$ # will only download the videos uploaded in the 200x decade
$ youtube-dl --dateafter 20000101 --datebefore 20091231
# Download only the videos uploaded in the last 6 months
$ youtube-dl --dateafter now-6months
# Download only the videos uploaded on January 1, 1970
$ youtube-dl --date 19700101
$ # will only download the videos uploaded in the 200x decade
$ youtube-dl --dateafter 20000101 --datebefore 20091231
# FAQ
@ -355,7 +361,7 @@ If you want to create a build of youtube-dl yourself, you'll need
### Adding support for a new site
If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py Test_Download.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py TestDownload.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
# BUGS

View File

@ -14,9 +14,9 @@
set -e
skip_tests=false
if [ "$1" = '--skip-test' ]; then
skip_tests=true
skip_tests=true
if [ "$1" = '--run-tests' ]; then
skip_tests=false
shift
fi

View File

@ -1,5 +1,7 @@
#!/usr/bin/env python
from __future__ import unicode_literals
# Allow direct execution
import os
import sys
@ -13,6 +15,7 @@ from youtube_dl.extractor import (
FacebookIE,
gen_extractors,
JustinTVIE,
PBSIE,
YoutubeIE,
)
@ -29,18 +32,20 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
assertPlaylist(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist(u'UUBABnxM4Ar9ten8Mdjj1j0Q') #585
assertPlaylist(u'PL63F0C78739B09958')
assertPlaylist(u'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist(u'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M'))
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') #585
assertPlaylist('PL63F0C78739B09958')
assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))
self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
@ -63,6 +68,13 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_show_matching(self):
self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
def test_youtube_truncated(self):
self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url'])
def test_youtube_search_matching(self):
self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
def test_justin_tv_channelid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
@ -80,7 +92,7 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361"))
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE()._extract_id(url), id)
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc')
assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc')
assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc')
@ -89,7 +101,7 @@ class TestAllURLsMatching(unittest.TestCase):
assertExtractId('BaW_jenozKc', 'BaW_jenozKc')
def test_facebook_matching(self):
self.assertTrue(FacebookIE.suitable(u'https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268'))
self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268'))
def test_no_duplicates(self):
ies = gen_extractors()
@ -124,5 +136,9 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', ['Tumblr'])
self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430', ['Tumblr'])
def test_pbs(self):
# https://github.com/rg3/youtube-dl/issues/2350
self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS'])
if __name__ == '__main__':
unittest.main()

View File

@ -18,6 +18,7 @@ from test.helper import (
import hashlib
import io
import json
import re
import socket
import youtube_dl.YoutubeDL
@ -72,9 +73,7 @@ def generator(test_case):
if 'playlist' not in test_case:
info_dict = test_case.get('info_dict', {})
if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
print_skipping('The output file cannot be know, the "file" '
'key is missing or the info_dict is incomplete')
return
raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?')
if 'skip' in test_case:
print_skipping(test_case['skip'])
return
@ -137,12 +136,21 @@ def generator(test_case):
with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
for (info_field, expected) in tc.get('info_dict', {}).items():
if isinstance(expected, compat_str) and expected.startswith('md5:'):
got = 'md5:' + md5(info_dict.get(info_field))
else:
if isinstance(expected, compat_str) and expected.startswith('re:'):
got = info_dict.get(info_field)
self.assertEqual(expected, got,
u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
match_str = expected[len('re:'):]
match_rex = re.compile(match_str)
self.assertTrue(
isinstance(got, compat_str) and match_rex.match(got),
u'field %s (value: %r) should match %r' % (info_field, got, match_str))
else:
if isinstance(expected, compat_str) and expected.startswith('md5:'):
got = 'md5:' + md5(info_dict.get(info_field))
else:
got = info_dict.get(info_field)
self.assertEqual(expected, got,
u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
# If checkable fields are missing from the test case, print the info_dict
test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))

View File

@ -36,6 +36,7 @@ from youtube_dl.extractor import (
RutubeChannelIE,
GoogleSearchIE,
GenericIE,
TEDIE,
)
@ -55,10 +56,10 @@ class TestPlaylists(unittest.TestCase):
def test_dailymotion_user(self):
dl = FakeYDL()
ie = DailymotionUserIE(dl)
result = ie.extract('http://www.dailymotion.com/user/generation-quoi/')
result = ie.extract('https://www.dailymotion.com/user/nqtv')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], 'Génération Quoi')
self.assertTrue(len(result['entries']) >= 26)
self.assertEqual(result['title'], 'Rémi Gaillard')
self.assertTrue(len(result['entries']) >= 100)
def test_vimeo_channel(self):
dl = FakeYDL()
@ -170,12 +171,12 @@ class TestPlaylists(unittest.TestCase):
def test_AcademicEarthCourse(self):
dl = FakeYDL()
ie = AcademicEarthCourseIE(dl)
result = ie.extract('http://academicearth.org/courses/building-dynamic-websites/')
result = ie.extract('http://academicearth.org/playlists/laws-of-nature/')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'building-dynamic-websites')
self.assertEqual(result['title'], 'Building Dynamic Websites')
self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
self.assertEqual(len(result['entries']), 10)
self.assertEqual(result['id'], 'laws-of-nature')
self.assertEqual(result['title'], 'Laws of Nature')
self.assertEqual(result['description'],u'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.')# u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
self.assertEqual(len(result['entries']), 4)
def test_ivi_compilation(self):
dl = FakeYDL()
@ -250,5 +251,23 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'python language')
self.assertTrue(len(result['entries']) == 15)
def test_generic_rss_feed(self):
dl = FakeYDL()
ie = GenericIE(dl)
result = ie.extract('http://www.escapistmagazine.com/rss/videos/list/1.xml')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'http://www.escapistmagazine.com/rss/videos/list/1.xml')
self.assertEqual(result['title'], 'Zero Punctuation')
self.assertTrue(len(result['entries']) > 10)
def test_ted_playlist(self):
dl = FakeYDL()
ie = TEDIE(dl)
result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], '10')
self.assertEqual(result['title'], 'Who are the hackers?')
self.assertTrue(len(result['entries']) >= 6)
if __name__ == '__main__':
unittest.main()

View File

@ -9,6 +9,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Various small unit tests
import io
import xml.etree.ElementTree
#from youtube_dl.utils import htmlentity_transform
@ -21,10 +22,12 @@ from youtube_dl.utils import (
orderedSet,
PagedList,
parse_duration,
read_batch_urls,
sanitize_filename,
shell_quote,
smuggle_url,
str_to_int,
struct_unpack,
timeconvert,
unescapeHTML,
unified_strdate,
@ -127,6 +130,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('8/7/2009'), '20090708')
self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
self.assertEqual(unified_strdate('1968-12-10'), '19681210')
def test_find_xpath_attr(self):
testxml = u'''<root>
@ -200,7 +204,16 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('1'), 1)
self.assertEqual(parse_duration('1337:12'), 80232)
self.assertEqual(parse_duration('9:12:43'), 33163)
self.assertEqual(parse_duration('12:00'), 720)
self.assertEqual(parse_duration('00:01:01'), 61)
self.assertEqual(parse_duration('x:y'), None)
self.assertEqual(parse_duration('3h11m53s'), 11513)
self.assertEqual(parse_duration('62m45s'), 3765)
self.assertEqual(parse_duration('6m59s'), 419)
self.assertEqual(parse_duration('49s'), 49)
self.assertEqual(parse_duration('0h0m0s'), 0)
self.assertEqual(parse_duration('0m0s'), 0)
self.assertEqual(parse_duration('0s'), 0)
def test_fix_xml_ampersands(self):
self.assertEqual(
@ -236,5 +249,17 @@ class TestUtil(unittest.TestCase):
testPL(5, 2, (2, 99), [2, 3, 4])
testPL(5, 2, (20, 99), [])
def test_struct_unpack(self):
self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,))
def test_read_batch_urls(self):
f = io.StringIO(u'''\xef\xbb\xbf foo
bar\r
baz
# More after this line\r
; or after this
bam''')
self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam'])
if __name__ == '__main__':
unittest.main()

View File

@ -16,6 +16,7 @@ from youtube_dl.extractor import (
YoutubeChannelIE,
YoutubeShowIE,
YoutubeTopListIE,
YoutubeSearchURLIE,
)
@ -30,7 +31,7 @@ class TestYoutubeLists(unittest.TestCase):
result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], 'ytdl test PL')
ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
ytie_results = [YoutubeIE().extract_id(url['url']) for url in result['entries']]
self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE'])
def test_youtube_playlist_noplaylist(self):
@ -39,7 +40,7 @@ class TestYoutubeLists(unittest.TestCase):
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
self.assertEqual(result['_type'], 'url')
self.assertEqual(YoutubeIE()._extract_id(result['url']), 'FXxLjLQi3Fg')
self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg')
def test_issue_673(self):
dl = FakeYDL()
@ -59,7 +60,7 @@ class TestYoutubeLists(unittest.TestCase):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
ytie_results = [YoutubeIE().extract_id(url['url']) for url in result['entries']]
self.assertFalse('pElCt5oNDuI' in ytie_results)
self.assertFalse('KdPEApIVdWM' in ytie_results)
@ -76,9 +77,9 @@ class TestYoutubeLists(unittest.TestCase):
# TODO find a > 100 (paginating?) videos course
result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
entries = result['entries']
self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs')
self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs')
self.assertEqual(len(entries), 25)
self.assertEqual(YoutubeIE()._extract_id(entries[-1]['url']), 'rYefUsYuEp0')
self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0')
def test_youtube_channel(self):
dl = FakeYDL()
@ -117,6 +118,15 @@ class TestYoutubeLists(unittest.TestCase):
original_video = entries[0]
self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
def test_youtube_toptracks(self):
print('Skipping: The playlist page gives error 500')
return
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
entries = result['entries']
self.assertEqual(len(entries), 100)
def test_youtube_toplist(self):
dl = FakeYDL()
ie = YoutubeTopListIE(dl)
@ -124,5 +134,14 @@ class TestYoutubeLists(unittest.TestCase):
entries = result['entries']
self.assertTrue(len(entries) >= 5)
def test_youtube_search_url(self):
dl = FakeYDL()
ie = YoutubeSearchURLIE(dl)
result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video')
entries = result['entries']
self.assertIsPlaylist(result)
self.assertEqual(result['title'], 'youtube-dl test video')
self.assertTrue(len(entries) >= 5)
if __name__ == '__main__':
unittest.main()

View File

@ -409,6 +409,13 @@ class YoutubeDL(object):
template_dict['autonumber'] = autonumber_templ % self._num_downloads
if template_dict.get('playlist_index') is not None:
template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
if template_dict.get('resolution') is None:
if template_dict.get('width') and template_dict.get('height'):
template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
elif template_dict.get('height'):
res = '%sp' % template_dict['height']
elif template_dict.get('width'):
res = '?x%d' % template_dict['width']
sanitize = lambda k, v: sanitize_filename(
compat_str(v),
@ -675,6 +682,9 @@ class YoutubeDL(object):
info_dict['playlist'] = None
info_dict['playlist_index'] = None
if 'display_id' not in info_dict and 'id' in info_dict:
info_dict['display_id'] = info_dict['id']
# This extractors handle format selection themselves
if info_dict['extractor'] in ['Youku']:
if download:

View File

@ -41,12 +41,22 @@ __authors__ = (
'Chris Gahan',
'Saimadhav Heblikar',
'Mike Col',
'Oleg Prutz',
'pulpe',
'Andreas Schmitz',
'Michael Kaiser',
'Niklas Laxström',
'David Triendl',
'Anthony Weems',
'David Wagner',
'Juan C. Olivares',
)
__license__ = 'Public Domain'
import codecs
import getpass
import io
import locale
import optparse
import os
@ -65,6 +75,7 @@ from .utils import (
get_cachedir,
MaxDownloadsReached,
preferredencoding,
read_batch_urls,
SameFileError,
setproctitle,
std_headers,
@ -203,7 +214,7 @@ def parseOpts(overrideArguments=None):
general.add_option('-U', '--update',
action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
general.add_option('-i', '--ignore-errors',
action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False)
action='store_true', dest='ignoreerrors', help='continue on download errors, for example to skip unavailable videos in a playlist', default=False)
general.add_option('--abort-on-error',
action='store_false', dest='ignoreerrors',
help='Abort downloading of further videos (in the playlist or the command line) if an error occurs')
@ -419,6 +430,8 @@ def parseOpts(overrideArguments=None):
'%(extractor)s for the provider (youtube, metacafe, etc), '
'%(id)s for the video id, %(playlist)s for the playlist the video is in, '
'%(playlist_index)s for the position in the playlist and %% for a literal percent. '
'%(height)s and %(width)s for the width and height of the video format. '
'%(resolution)s for a textual description of the resolution of the video format. '
'Use - to output to stdout. Can also be used to download to a different directory, '
'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
filesystem.add_option('--autonumber-size',
@ -546,21 +559,19 @@ def _real_main(argv=None):
sys.exit(0)
# Batch file verification
batchurls = []
batch_urls = []
if opts.batchfile is not None:
try:
if opts.batchfile == '-':
batchfd = sys.stdin
else:
batchfd = open(opts.batchfile, 'r')
batchurls = batchfd.readlines()
batchurls = [x.strip() for x in batchurls]
batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
batch_urls = read_batch_urls(batchfd)
if opts.verbose:
write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')
write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n')
except IOError:
sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
all_urls = batch_urls + args
all_urls = [url.strip() for url in all_urls]
_enc = preferredencoding()
all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]

View File

@ -5,6 +5,7 @@ from .hls import HlsFD
from .http import HttpFD
from .mplayer import MplayerFD
from .rtmp import RtmpFD
from .f4m import F4mFD
from ..utils import (
determine_ext,
@ -22,5 +23,7 @@ def get_suitable_downloader(info_dict):
return HlsFD
if url.startswith('mms') or url.startswith('rtsp'):
return MplayerFD
if determine_ext(url) == 'f4m':
return F4mFD
else:
return HttpFD

View File

@ -0,0 +1,314 @@
from __future__ import unicode_literals
import base64
import io
import itertools
import os
import time
import xml.etree.ElementTree as etree
from .common import FileDownloader
from .http import HttpFD
from ..utils import (
struct_pack,
struct_unpack,
compat_urlparse,
format_bytes,
encodeFilename,
sanitize_open,
)
class FlvReader(io.BytesIO):
"""
Reader for Flv files
The file format is documented in https://www.adobe.com/devnet/f4v.html
"""
# Utility functions for reading numbers and strings
def read_unsigned_long_long(self):
return struct_unpack('!Q', self.read(8))[0]
def read_unsigned_int(self):
return struct_unpack('!I', self.read(4))[0]
def read_unsigned_char(self):
return struct_unpack('!B', self.read(1))[0]
def read_string(self):
res = b''
while True:
char = self.read(1)
if char == b'\x00':
break
res += char
return res
def read_box_info(self):
"""
Read a box and return the info as a tuple: (box_size, box_type, box_data)
"""
real_size = size = self.read_unsigned_int()
box_type = self.read(4)
header_end = 8
if size == 1:
real_size = self.read_unsigned_long_long()
header_end = 16
return real_size, box_type, self.read(real_size-header_end)
def read_asrt(self):
# version
self.read_unsigned_char()
# flags
self.read(3)
quality_entry_count = self.read_unsigned_char()
# QualityEntryCount
for i in range(quality_entry_count):
self.read_string()
segment_run_count = self.read_unsigned_int()
segments = []
for i in range(segment_run_count):
first_segment = self.read_unsigned_int()
fragments_per_segment = self.read_unsigned_int()
segments.append((first_segment, fragments_per_segment))
return {
'segment_run': segments,
}
def read_afrt(self):
# version
self.read_unsigned_char()
# flags
self.read(3)
# time scale
self.read_unsigned_int()
quality_entry_count = self.read_unsigned_char()
# QualitySegmentUrlModifiers
for i in range(quality_entry_count):
self.read_string()
fragments_count = self.read_unsigned_int()
fragments = []
for i in range(fragments_count):
first = self.read_unsigned_int()
first_ts = self.read_unsigned_long_long()
duration = self.read_unsigned_int()
if duration == 0:
discontinuity_indicator = self.read_unsigned_char()
else:
discontinuity_indicator = None
fragments.append({
'first': first,
'ts': first_ts,
'duration': duration,
'discontinuity_indicator': discontinuity_indicator,
})
return {
'fragments': fragments,
}
def read_abst(self):
# version
self.read_unsigned_char()
# flags
self.read(3)
self.read_unsigned_int() # BootstrapinfoVersion
# Profile,Live,Update,Reserved
self.read(1)
# time scale
self.read_unsigned_int()
# CurrentMediaTime
self.read_unsigned_long_long()
# SmpteTimeCodeOffset
self.read_unsigned_long_long()
self.read_string() # MovieIdentifier
server_count = self.read_unsigned_char()
# ServerEntryTable
for i in range(server_count):
self.read_string()
quality_count = self.read_unsigned_char()
# QualityEntryTable
for i in range(quality_count):
self.read_string()
# DrmData
self.read_string()
# MetaData
self.read_string()
segments_count = self.read_unsigned_char()
segments = []
for i in range(segments_count):
box_size, box_type, box_data = self.read_box_info()
assert box_type == b'asrt'
segment = FlvReader(box_data).read_asrt()
segments.append(segment)
fragments_run_count = self.read_unsigned_char()
fragments = []
for i in range(fragments_run_count):
box_size, box_type, box_data = self.read_box_info()
assert box_type == b'afrt'
fragments.append(FlvReader(box_data).read_afrt())
return {
'segments': segments,
'fragments': fragments,
}
def read_bootstrap_info(self):
total_size, box_type, box_data = self.read_box_info()
assert box_type == b'abst'
return FlvReader(box_data).read_abst()
def read_bootstrap_info(bootstrap_bytes):
return FlvReader(bootstrap_bytes).read_bootstrap_info()
def build_fragments_list(boot_info):
""" Return a list of (segment, fragment) for each fragment in the video """
res = []
segment_run_table = boot_info['segments'][0]
# I've only found videos with one segment
segment_run_entry = segment_run_table['segment_run'][0]
n_frags = segment_run_entry[1]
fragment_run_entry_table = boot_info['fragments'][0]['fragments']
first_frag_number = fragment_run_entry_table[0]['first']
for (i, frag_number) in zip(range(1, n_frags+1), itertools.count(first_frag_number)):
res.append((1, frag_number))
return res
def write_flv_header(stream, metadata):
"""Writes the FLV header and the metadata to stream"""
# FLV header
stream.write(b'FLV\x01')
stream.write(b'\x05')
stream.write(b'\x00\x00\x00\x09')
# FLV File body
stream.write(b'\x00\x00\x00\x00')
# FLVTAG
# Script data
stream.write(b'\x12')
# Size of the metadata with 3 bytes
stream.write(struct_pack('!L', len(metadata))[1:])
stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
stream.write(metadata)
# Magic numbers extracted from the output files produced by AdobeHDS.php
#(https://github.com/K-S-V/Scripts)
stream.write(b'\x00\x00\x01\x73')
def _add_ns(prop):
return '{http://ns.adobe.com/f4m/1.0}%s' % prop
class HttpQuietDownloader(HttpFD):
def to_screen(self, *args, **kargs):
pass
class F4mFD(FileDownloader):
"""
A downloader for f4m manifests or AdobeHDS.
"""
def real_download(self, filename, info_dict):
man_url = info_dict['url']
self.to_screen('[download] Downloading f4m manifest')
manifest = self.ydl.urlopen(man_url).read()
self.report_destination(filename)
http_dl = HttpQuietDownloader(self.ydl,
{
'continuedl': True,
'quiet': True,
'noprogress': True,
'test': self.params.get('test', False),
})
doc = etree.fromstring(manifest)
formats = [(int(f.attrib.get('bitrate', -1)), f) for f in doc.findall(_add_ns('media'))]
formats = sorted(formats, key=lambda f: f[0])
rate, media = formats[-1]
base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])
bootstrap = base64.b64decode(doc.find(_add_ns('bootstrapInfo')).text)
metadata = base64.b64decode(media.find(_add_ns('metadata')).text)
boot_info = read_bootstrap_info(bootstrap)
fragments_list = build_fragments_list(boot_info)
if self.params.get('test', False):
# We only download the first fragment
fragments_list = fragments_list[:1]
total_frags = len(fragments_list)
tmpfilename = self.temp_name(filename)
(dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb')
write_flv_header(dest_stream, metadata)
# This dict stores the download progress, it's updated by the progress
# hook
state = {
'downloaded_bytes': 0,
'frag_counter': 0,
}
start = time.time()
def frag_progress_hook(status):
frag_total_bytes = status.get('total_bytes', 0)
estimated_size = (state['downloaded_bytes'] +
(total_frags - state['frag_counter']) * frag_total_bytes)
if status['status'] == 'finished':
state['downloaded_bytes'] += frag_total_bytes
state['frag_counter'] += 1
progress = self.calc_percent(state['frag_counter'], total_frags)
byte_counter = state['downloaded_bytes']
else:
frag_downloaded_bytes = status['downloaded_bytes']
byte_counter = state['downloaded_bytes'] + frag_downloaded_bytes
frag_progress = self.calc_percent(frag_downloaded_bytes,
frag_total_bytes)
progress = self.calc_percent(state['frag_counter'], total_frags)
progress += frag_progress / float(total_frags)
eta = self.calc_eta(start, time.time(), estimated_size, byte_counter)
self.report_progress(progress, format_bytes(estimated_size),
status.get('speed'), eta)
http_dl.add_progress_hook(frag_progress_hook)
frags_filenames = []
for (seg_i, frag_i) in fragments_list:
name = 'Seg%d-Frag%d' % (seg_i, frag_i)
url = base_url + name
frag_filename = '%s-%s' % (tmpfilename, name)
success = http_dl.download(frag_filename, {'url': url})
if not success:
return False
with open(frag_filename, 'rb') as down:
down_data = down.read()
reader = FlvReader(down_data)
while True:
_, box_type, box_data = reader.read_box_info()
if box_type == b'mdat':
dest_stream.write(box_data)
break
frags_filenames.append(frag_filename)
self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start)
self.try_rename(tmpfilename, filename)
for frag_file in frags_filenames:
os.remove(frag_file)
fsize = os.path.getsize(encodeFilename(filename))
self._hook_progress({
'downloaded_bytes': fsize,
'total_bytes': fsize,
'filename': filename,
'status': 'finished',
})
return True

View File

@ -85,6 +85,7 @@ class HttpFD(FileDownloader):
else:
# The length does not match, we start the download over
self.report_unable_to_resume()
resume_len = 0
open_mode = 'wb'
break
# Retry

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import os
import re
import subprocess
@ -22,7 +24,7 @@ class RtmpFD(FileDownloader):
proc_stderr_closed = False
while not proc_stderr_closed:
# read line from stderr
line = u''
line = ''
while True:
char = proc.stderr.read(1)
if not char:
@ -46,7 +48,7 @@ class RtmpFD(FileDownloader):
data_len = None
if percent > 0:
data_len = int(downloaded_data_len * 100 / percent)
data_len_str = u'~' + format_bytes(data_len)
data_len_str = '~' + format_bytes(data_len)
self.report_progress(percent, data_len_str, speed, eta)
cursor_in_new_line = False
self._hook_progress({
@ -76,19 +78,21 @@ class RtmpFD(FileDownloader):
})
elif self.params.get('verbose', False):
if not cursor_in_new_line:
self.to_screen(u'')
self.to_screen('')
cursor_in_new_line = True
self.to_screen(u'[rtmpdump] '+line)
self.to_screen('[rtmpdump] '+line)
proc.wait()
if not cursor_in_new_line:
self.to_screen(u'')
self.to_screen('')
return proc.returncode
url = info_dict['url']
player_url = info_dict.get('player_url', None)
page_url = info_dict.get('page_url', None)
app = info_dict.get('app', None)
play_path = info_dict.get('play_path', None)
tc_url = info_dict.get('tc_url', None)
flash_version = info_dict.get('flash_version', None)
live = info_dict.get('rtmp_live', False)
conn = info_dict.get('rtmp_conn', None)
@ -100,7 +104,7 @@ class RtmpFD(FileDownloader):
try:
subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
except (OSError, IOError):
self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
self.report_error('RTMP download detected but "rtmpdump" could not be run')
return False
# Download using rtmpdump. rtmpdump returns exit code 2 when
@ -111,17 +115,21 @@ class RtmpFD(FileDownloader):
basic_args += ['--swfVfy', player_url]
if page_url is not None:
basic_args += ['--pageUrl', page_url]
if app is not None:
basic_args += ['--app', app]
if play_path is not None:
basic_args += ['--playpath', play_path]
if tc_url is not None:
basic_args += ['--tcUrl', url]
if test:
basic_args += ['--stop', '1']
if flash_version is not None:
basic_args += ['--flashVer', flash_version]
if live:
basic_args += ['--live']
if conn:
basic_args += ['--conn', conn]
args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
args = basic_args + [[], ['--resume', '--skip', '1']][not live and self.params.get('continuedl', False)]
if sys.platform == 'win32' and sys.version_info < (3, 0):
# Windows subprocess module does not actually support Unicode
@ -144,26 +152,35 @@ class RtmpFD(FileDownloader):
shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
except ImportError:
shell_quote = repr
self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
self.to_screen('[debug] rtmpdump command line: ' + shell_quote(str_args))
RD_SUCCESS = 0
RD_FAILED = 1
RD_INCOMPLETE = 2
RD_NO_CONNECT = 3
retval = run_rtmpdump(args)
while (retval == 2 or retval == 1) and not test:
if retval == RD_NO_CONNECT:
self.report_error('[rtmpdump] Could not connect to RTMP server.')
return False
while (retval == RD_INCOMPLETE or retval == RD_FAILED) and not test and not live:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
self.to_screen('[rtmpdump] %s bytes' % prevsize)
time.sleep(5.0) # This seems to be needed
retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED])
cursize = os.path.getsize(encodeFilename(tmpfilename))
if prevsize == cursize and retval == 1:
if prevsize == cursize and retval == RD_FAILED:
break
# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
if prevsize == cursize and retval == 2 and cursize > 1024:
self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = 0
if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = RD_SUCCESS
break
if retval == 0 or (test and retval == 2):
if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE):
fsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'[rtmpdump] %s bytes' % fsize)
self.to_screen('[rtmpdump] %s bytes' % fsize)
self.try_rename(tmpfilename, filename)
self._hook_progress({
'downloaded_bytes': fsize,
@ -173,6 +190,6 @@ class RtmpFD(FileDownloader):
})
return True
else:
self.to_stderr(u"\n")
self.report_error(u'rtmpdump exited with code %d' % retval)
self.to_stderr('\n')
self.report_error('rtmpdump exited with code %d' % retval)
return False

View File

@ -15,22 +15,30 @@ from .arte import (
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbccouk import BBCCoUkIE
from .blinkx import BlinkxIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
from .br import BRIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
from .c56 import C56IE
from .canal13cl import Canal13clIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
from .cmt import CMTIE
from .cnn import CNNIE
from .cnn import (
CNNIE,
CNNBlogsIE,
)
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
from .condenast import CondeNastIE
@ -62,11 +70,13 @@ from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fktv import (
FKTVIE,
FKTVPosteckeIE,
)
from .flickr import FlickrIE
from .fourtube import FourTubeIE
from .franceinter import FranceInterIE
from .francetv import (
PluzzIE,
@ -81,10 +91,12 @@ from .funnyordie import FunnyOrDieIE
from .gamekings import GamekingsIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .hark import HarkIE
from .helsinki import HelsinkiIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .huffpost import HuffPostIE
@ -103,6 +115,7 @@ from .ivi import (
IviIE,
IviCompilationIE
)
from .jadorecettepub import JadoreCettePubIE
from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
@ -112,6 +125,7 @@ from .keezmovies import KeezMoviesIE
from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
from .keek import KeekIE
from .kontrtube import KontrTubeIE
from .la7 import LA7IE
from .lifenews import LifeNewsIE
from .liveleak import LiveLeakIE
@ -122,11 +136,12 @@ from .lynda import (
)
from .m6 import M6IE
from .macgamestore import MacGameStoreIE
from .mailru import MailRuIE
from .malemotion import MalemotionIE
from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mit import TechTVMITIE, MITIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mixcloud import MixcloudIE
from .mpora import MporaIE
from .mofosex import MofosexIE
@ -141,14 +156,19 @@ from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import NBCNewsIE
from .nbc import (
NBCIE,
NBCNewsIE,
)
from .ndr import NDRIE
from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE
from .nfb import NFBIE
from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE
from .ninegag import NineGagIE
from .normalboots import NormalbootsIE
from .novamov import NovamovIE
from .novamov import NovaMovIE
from .nowness import NownessIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
@ -159,6 +179,7 @@ from .podomatic import PodomaticIE
from .pornhd import PornHdIE
from .pornhub import PornHubIE
from .pornotube import PornotubeIE
from .prosiebensat1 import ProSiebenSat1IE
from .pyvideo import PyvideoIE
from .radiofrance import RadioFranceIE
from .rbmaradio import RBMARadioIE
@ -174,6 +195,7 @@ from .rutube import (
RutubeMovieIE,
RutubePersonIE,
)
from .savefrom import SaveFromIE
from .servingsys import ServingSysIE
from .sina import SinaIE
from .slashdot import SlashdotIE
@ -198,10 +220,13 @@ from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE
from .steam import SteamIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .testurl import TestURLIE
from .tf1 import TF1IE
from .theplatform import ThePlatformIE
from .thisav import ThisAVIE
@ -209,19 +234,23 @@ from .tinypic import TinyPicIE
from .toutv import TouTvIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .trutube import TruTubeIE
from .tube8 import Tube8IE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tutv import TutvIE
from .tvigle import TvigleIE
from .tvp import TvpIE
from .unistra import UnistraIE
from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import VevoIE
from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
@ -256,19 +285,20 @@ from .youku import YoukuIE
from .youporn import YouPornIE
from .youtube import (
YoutubeIE,
YoutubePlaylistIE,
YoutubeSearchIE,
YoutubeSearchDateIE,
YoutubeUserIE,
YoutubeChannelIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
YoutubeRecommendedIE,
YoutubeTruncatedURLIE,
YoutubeWatchLaterIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubePlaylistIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
YoutubeSearchURLIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
YoutubeTopListIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
YoutubeWatchLaterIE,
)
from .zdf import ZDFIE

View File

@ -5,7 +5,7 @@ from .common import InfoExtractor
class AcademicEarthCourseIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)'
_VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
IE_NAME = 'AcademicEarth:Course'
def _real_extract(self, url):
@ -14,12 +14,12 @@ class AcademicEarthCourseIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
title = self._html_search_regex(
r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title')
r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, u'title')
description = self._html_search_regex(
r'<p class="excerpt">(.*?)</p>',
r'<p class="excerpt"[^>]*?>(.*?)</p>',
webpage, u'description', fatal=False)
urls = re.findall(
r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">',
r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
webpage)
entries = [self.url_result(u) for u in urls]

View File

@ -0,0 +1,223 @@
from __future__ import unicode_literals
import re
from .subtitles import SubtitlesInfoExtractor
from ..utils import ExtractorError
class BBCCoUkIE(SubtitlesInfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})'
_TESTS = [
{
'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
'info_dict': {
'id': 'b039d07m',
'ext': 'flv',
'title': 'Kaleidoscope: Leonard Cohen',
'description': 'md5:db4755d7a665ae72343779f7dacb402c',
'duration': 1740,
},
'params': {
# rtmp download
'skip_download': True,
}
},
{
'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
'info_dict': {
'id': 'b00yng1d',
'ext': 'flv',
'title': 'The Man in Black: Series 3: The Printed Name',
'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
'duration': 1800,
},
'params': {
# rtmp download
'skip_download': True,
},
'skip': 'Episode is no longer available on BBC iPlayer Radio',
},
{
'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
'info_dict': {
'id': 'b00yng1d',
'ext': 'flv',
'title': 'The Voice UK: Series 3: Blind Auditions 5',
'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
'duration': 5100,
},
'params': {
# rtmp download
'skip_download': True,
},
'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
}
]
def _extract_asx_playlist(self, connection, programme_id):
asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
return [ref.get('href') for ref in asx.findall('./Entry/ref')]
def _extract_connection(self, connection, programme_id):
formats = []
protocol = connection.get('protocol')
supplier = connection.get('supplier')
if protocol == 'http':
href = connection.get('href')
# ASX playlist
if supplier == 'asx':
for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
formats.append({
'url': ref,
'format_id': 'ref%s_%s' % (i, supplier),
})
# Direct link
else:
formats.append({
'url': href,
'format_id': supplier,
})
elif protocol == 'rtmp':
application = connection.get('application', 'ondemand')
auth_string = connection.get('authString')
identifier = connection.get('identifier')
server = connection.get('server')
formats.append({
'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
'play_path': identifier,
'app': '%s?%s' % (application, auth_string),
'page_url': 'http://www.bbc.co.uk',
'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
'rtmp_live': False,
'ext': 'flv',
'format_id': supplier,
})
return formats
def _extract_items(self, playlist):
return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
def _extract_medias(self, media_selection):
return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
def _extract_connections(self, media):
return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
def _extract_video(self, media, programme_id):
formats = []
vbr = int(media.get('bitrate'))
vcodec = media.get('encoding')
service = media.get('service')
width = int(media.get('width'))
height = int(media.get('height'))
file_size = int(media.get('media_file_size'))
for connection in self._extract_connections(media):
conn_formats = self._extract_connection(connection, programme_id)
for format in conn_formats:
format.update({
'format_id': '%s_%s' % (service, format['format_id']),
'width': width,
'height': height,
'vbr': vbr,
'vcodec': vcodec,
'filesize': file_size,
})
formats.extend(conn_formats)
return formats
def _extract_audio(self, media, programme_id):
formats = []
abr = int(media.get('bitrate'))
acodec = media.get('encoding')
service = media.get('service')
for connection in self._extract_connections(media):
conn_formats = self._extract_connection(connection, programme_id)
for format in conn_formats:
format.update({
'format_id': '%s_%s' % (service, format['format_id']),
'abr': abr,
'acodec': acodec,
})
formats.extend(conn_formats)
return formats
def _extract_captions(self, media, programme_id):
subtitles = {}
for connection in self._extract_connections(media):
captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
srt = ''
for pos, p in enumerate(ps):
srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
p.text.strip() if p.text is not None else '')
subtitles[lang] = srt
return subtitles
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
group_id = mobj.group('id')
webpage = self._download_webpage(url, group_id, 'Downloading video page')
if re.search(r'id="emp-error" class="notinuk">', webpage):
raise ExtractorError('Currently BBC iPlayer TV programmes are available to play in the UK only',
expected=True)
playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
'Downloading playlist XML')
no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
if no_items is not None:
reason = no_items.get('reason')
if reason == 'preAvailability':
msg = 'Episode %s is not yet available' % group_id
elif reason == 'postAvailability':
msg = 'Episode %s is no longer available' % group_id
else:
msg = 'Episode %s is not available: %s' % (group_id, reason)
raise ExtractorError(msg, expected=True)
formats = []
subtitles = None
for item in self._extract_items(playlist):
kind = item.get('kind')
if kind != 'programme' and kind != 'radioProgramme':
continue
title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
programme_id = item.get('identifier')
duration = int(item.get('duration'))
media_selection = self._download_xml(
'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
programme_id, 'Downloading media selection XML')
for media in self._extract_medias(media_selection):
kind = media.get('kind')
if kind == 'audio':
formats.extend(self._extract_audio(media, programme_id))
elif kind == 'video':
formats.extend(self._extract_video(media, programme_id))
elif kind == 'captions':
subtitles = self._extract_captions(media, programme_id)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(programme_id, subtitles)
return
self._sort_formats(formats)
return {
'id': programme_id,
'title': title,
'description': description,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
}

View File

@ -24,5 +24,7 @@ class BloombergIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
webpage = self._download_webpage(url, name)
ooyala_url = self._twitter_search_player(webpage)
return self.url_result(ooyala_url, OoyalaIE.ie_key())
embed_code = self._search_regex(
r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
'embed code')
return OoyalaIE._build_url_result(embed_code)

View File

@ -0,0 +1,80 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class BRIE(InfoExtractor):
IE_DESC = "Bayerischer Rundfunk Mediathek"
_VALID_URL = r"^https?://(?:www\.)?br\.de/mediathek/video/(?:sendungen/)?(?P<id>[a-z0-9\-]+)\.html$"
_BASE_URL = "http://www.br.de"
_TEST = {
"url": "http://www.br.de/mediathek/video/anselm-gruen-114.html",
"md5": "c4f83cf0f023ba5875aba0bf46860df2",
"info_dict": {
"id": "2c8d81c5-6fb7-4a74-88d4-e768e5856532",
"ext": "mp4",
"title": "Feiern und Verzichten",
"description": "Anselm Grün: Feiern und Verzichten",
"uploader": "BR/Birgit Baier",
"upload_date": "20140301"
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
page = self._download_webpage(url, display_id)
xml_url = self._search_regex(
r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL")
xml = self._download_xml(self._BASE_URL + xml_url, None)
videos = [{
"id": xml_video.get("externalId"),
"title": xml_video.find("title").text,
"formats": self._extract_formats(xml_video.find("assets")),
"thumbnails": self._extract_thumbnails(xml_video.find("teaserImage/variants")),
"description": " ".join(xml_video.find("shareTitle").text.splitlines()),
"uploader": xml_video.find("author").text,
"upload_date": "".join(reversed(xml_video.find("broadcastDate").text.split("."))),
"webpage_url": xml_video.find("permalink").text,
} for xml_video in xml.findall("video")]
if len(videos) > 1:
self._downloader.report_warning(
'found multiple videos; please '
'report this with the video URL to http://yt-dl.org/bug')
if not videos:
raise ExtractorError('No video entries found')
return videos[0]
def _extract_formats(self, assets):
formats = [{
"url": asset.find("downloadUrl").text,
"ext": asset.find("mediaType").text,
"format_id": asset.get("type"),
"width": int(asset.find("frameWidth").text),
"height": int(asset.find("frameHeight").text),
"tbr": int(asset.find("bitrateVideo").text),
"abr": int(asset.find("bitrateAudio").text),
"vcodec": asset.find("codecVideo").text,
"container": asset.find("mediaType").text,
"filesize": int(asset.find("size").text),
} for asset in assets.findall("asset")
if asset.find("downloadUrl") is not None]
self._sort_formats(formats)
return formats
def _extract_thumbnails(self, variants):
thumbnails = [{
"url": self._BASE_URL + variant.find("url").text,
"width": int(variant.find("width").text),
"height": int(variant.find("height").text),
} for variant in variants.findall("variant")]
thumbnails.sort(key=lambda x: x["width"] * x["height"], reverse=True)
return thumbnails

View File

@ -1,18 +1,20 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import determine_ext
class BreakIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?break\.com/video/([^/]+)'
_VALID_URL = r'http://(?:www\.)?break\.com/video/([^/]+)'
_TEST = {
u'url': u'http://www.break.com/video/when-girls-act-like-guys-2468056',
u'file': u'2468056.mp4',
u'md5': u'a3513fb1547fba4fb6cfac1bffc6c46b',
u'info_dict': {
u"title": u"When Girls Act Like D-Bags"
'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056',
'md5': 'a3513fb1547fba4fb6cfac1bffc6c46b',
'info_dict': {
'id': '2468056',
'ext': 'mp4',
'title': 'When Girls Act Like D-Bags',
}
}
@ -21,18 +23,17 @@ class BreakIE(InfoExtractor):
video_id = mobj.group(1).split("-")[-1]
embed_url = 'http://www.break.com/embed/%s' % video_id
webpage = self._download_webpage(embed_url, video_id)
info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
u'info json', flags=re.DOTALL)
info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>',
webpage, 'info json', flags=re.DOTALL)
info = json.loads(info_json)
video_url = info['videoUri']
m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
if m_youtube is not None:
return self.url_result(m_youtube.group(1), 'Youtube')
final_url = video_url + '?' + info['AuthToken']
return [{
'id': video_id,
'url': final_url,
'ext': determine_ext(final_url),
'title': info['contentName'],
return {
'id': video_id,
'url': final_url,
'title': info['contentName'],
'thumbnail': info['thumbUri'],
}]
}

View File

@ -17,6 +17,7 @@ from ..utils import (
ExtractorError,
unsmuggle_url,
unescapeHTML,
)
@ -139,7 +140,7 @@ class BrightcoveIE(InfoExtractor):
url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
if url_m:
return [url_m.group(1)]
return [unescapeHTML(url_m.group(1))]
matches = re.findall(
r'''(?sx)<object

View File

@ -0,0 +1,48 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class Canal13clIE(InfoExtractor):
_VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
_TEST = {
'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
'md5': '4cb1fa38adcad8fea88487a078831755',
'info_dict': {
'id': '1403022125',
'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
'ext': 'mp4',
'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda',
'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
title = self._html_search_meta(
'twitter:title', webpage, 'title', fatal=True)
description = self._html_search_meta(
'twitter:description', webpage, 'description')
url = self._html_search_regex(
r'articuloVideo = \"(.*?)\"', webpage, 'url')
real_id = self._search_regex(
r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id)
thumbnail = self._html_search_regex(
r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail')
return {
'id': real_id,
'display_id': display_id,
'url': url,
'title': title,
'description': description,
'ext': 'mp4',
'thumbnail': thumbnail,
}

View File

@ -1,4 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@ -9,11 +11,12 @@ class Canalc2IE(InfoExtractor):
_VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
_TEST = {
u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
u'file': u'12163.mp4',
u'md5': u'060158428b650f896c542dfbb3d6487f',
u'info_dict': {
u'title': u'Terrasses du Numérique'
'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
'ext': 'mp4',
'title': 'Terrasses du Numérique'
}
}
@ -28,10 +31,11 @@ class Canalc2IE(InfoExtractor):
video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
title = self._html_search_regex(
r'class="evenement8">(.*?)</a>', webpage, u'title')
return {'id': video_id,
'ext': 'mp4',
'url': video_url,
'title': title,
}
r'class="evenement8">(.*?)</a>', webpage, 'title')
return {
'id': video_id,
'ext': 'mp4',
'url': video_url,
'title': title,
}

View File

@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
compat_urllib_parse,
compat_urllib_parse_urlparse,
ExtractorError,
)
class CeskaTelevizeIE(InfoExtractor):
_VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
_TESTS = [
{
'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka',
'info_dict': {
'id': '213512120230004',
'ext': 'flv',
'title': 'První republika: Španělská chřipka',
'duration': 3107.4,
},
'params': {
'skip_download': True, # requires rtmpdump
},
'skip': 'Works only from Czech Republic.',
},
{
'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt',
'info_dict': {
'id': '20138143440',
'ext': 'flv',
'title': 'Tsatsiki, maminka a policajt',
'duration': 6754.1,
},
'params': {
'skip_download': True, # requires rtmpdump
},
'skip': 'Works only from Czech Republic.',
},
{
'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
'info_dict': {
'id': '14716',
'ext': 'flv',
'title': 'První republika: Zpěvačka z Dupárny Bobina',
'duration': 90,
},
'params': {
'skip_download': True, # requires rtmpdump
},
},
]
def _real_extract(self, url):
url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
data = {
'playlist[0][type]': typ,
'playlist[0][id]': episode_id,
'requestUrl': compat_urllib_parse_urlparse(url).path,
'requestSource': 'iVysilani',
}
req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url',
data=compat_urllib_parse.urlencode(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
req.add_header('x-addr', '127.0.0.1')
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', url)
playlistpage = self._download_json(req, video_id)
req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url']))
req.add_header('Referer', url)
playlist = self._download_xml(req, video_id)
formats = []
for i in playlist.find('smilRoot/body'):
if 'AD' not in i.attrib['id']:
base_url = i.attrib['base']
parsedurl = compat_urllib_parse_urlparse(base_url)
duration = i.attrib['duration']
for video in i.findall('video'):
if video.attrib['label'] != 'AD':
format_id = video.attrib['label']
play_path = video.attrib['src']
vbr = int(video.attrib['system-bitrate'])
formats.append({
'format_id': format_id,
'url': base_url,
'vbr': vbr,
'play_path': play_path,
'app': parsedurl.path[1:] + '?' + parsedurl.query,
'rtmp_live': True,
'ext': 'flv',
})
self._sort_formats(formats)
return {
'id': episode_id,
'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'),
'duration': float(duration),
'formats': formats,
}

View File

@ -15,14 +15,15 @@ class Channel9IE(InfoExtractor):
'''
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
_VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
_VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
_TESTS = [
{
'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
'file': 'Events_TechEd_Australia_2013_KOS002.mp4',
'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
'info_dict': {
'id': 'Events/TechEd/Australia/2013/KOS002',
'ext': 'mp4',
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'duration': 4576,
@ -35,9 +36,10 @@ class Channel9IE(InfoExtractor):
},
{
'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
'file': 'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
'info_dict': {
'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
'ext': 'mp4',
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'duration': 1540,

View File

@ -0,0 +1,97 @@
from __future__ import unicode_literals
import re
import base64
import json
from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError
)
class ChilloutzoneIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html'
_TESTS = [{
'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
'md5': 'a76f3457e813ea0037e5244f509e66d1',
'info_dict': {
'id': 'enemene-meck-alle-katzen-weg',
'ext': 'mp4',
'title': 'Enemene Meck - Alle Katzen weg',
'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
},
}, {
'note': 'Video hosted at YouTube',
'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html',
'info_dict': {
'id': '1YVQaAgHyRU',
'ext': 'mp4',
'title': '16 Photos Taken 1 Second Before Disaster',
'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814',
'uploader': 'BuzzFeedVideo',
'uploader_id': 'BuzzFeedVideo',
'upload_date': '20131105',
},
}, {
'note': 'Video hosted at Vimeo',
'url': 'http://www.chilloutzone.net/video/icon-blending.html',
'md5': '2645c678b8dc4fefcc0e1b60db18dac1',
'info_dict': {
'id': '85523671',
'ext': 'mp4',
'title': 'The Sunday Times - Icons',
'description': 'md5:3e1c0dc6047498d6728dcdaad0891762',
'uploader': 'Us',
'uploader_id': 'usfilms',
'upload_date': '20140131'
},
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
base64_video_info = self._html_search_regex(
r'var cozVidData = "(.+?)";', webpage, 'video data')
decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
video_info_dict = json.loads(decoded_video_info)
# get video information from dict
video_url = video_info_dict['mediaUrl']
description = clean_html(video_info_dict.get('description'))
title = video_info_dict['title']
native_platform = video_info_dict['nativePlatform']
native_video_id = video_info_dict['nativeVideoId']
source_priority = video_info_dict['sourcePriority']
# If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
if native_platform is None:
youtube_url = self._html_search_regex(
r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
webpage, 'fallback video URL', default=None)
if youtube_url is not None:
return self.url_result(youtube_url, ie='Youtube')
# Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
# the own CDN
if source_priority == 'native':
if native_platform == 'youtube':
return self.url_result(native_video_id, ie='Youtube')
if native_platform == 'vimeo':
return self.url_result(
'http://vimeo.com/' + native_video_id, ie='Vimeo')
if not video_url:
raise ExtractorError('No video found')
return {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
'description': description,
}

View File

@ -1,4 +1,5 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@ -8,73 +9,63 @@ from ..utils import (
class CinemassacreIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?'
_TESTS = [{
u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
u'file': u'19911.flv',
u'info_dict': {
u'upload_date': u'20121110',
u'title': u'“Angry Video Game Nerd: The Movie” Trailer',
u'description': u'md5:fb87405fcb42a331742a0dce2708560b',
_VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?'
_TESTS = [
{
'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
'file': '19911.mp4',
'md5': 'fde81fbafaee331785f58cd6c0d46190',
'info_dict': {
'upload_date': '20121110',
'title': '“Angry Video Game Nerd: The Movie” Trailer',
'description': 'md5:fb87405fcb42a331742a0dce2708560b',
},
},
u'params': {
# rtmp download
u'skip_download': True,
},
},
{
u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
u'file': u'521be8ef82b16.flv',
u'info_dict': {
u'upload_date': u'20131002',
u'title': u'The Mummys Hand (1940)',
},
u'params': {
# rtmp download
u'skip_download': True,
},
}]
{
'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
'file': '521be8ef82b16.mp4',
'md5': 'd72f10cd39eac4215048f62ab477a511',
'info_dict': {
'upload_date': '20131002',
'title': 'The Mummys Hand (1940)',
},
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
webpage_url = u'http://' + mobj.group('url')
webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
webpage = self._download_webpage(url, None) # Don't know video id yet
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
if not mobj:
raise ExtractorError(u'Can\'t extract embed url and video id')
playerdata_url = mobj.group(u'embed_url')
video_id = mobj.group(u'video_id')
raise ExtractorError('Can\'t extract embed url and video id')
playerdata_url = mobj.group('embed_url')
video_id = mobj.group('video_id')
video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
webpage, u'title')
webpage, 'title')
video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
webpage, u'description', flags=re.DOTALL, fatal=False)
webpage, 'description', flags=re.DOTALL, fatal=False)
if len(video_description) == 0:
video_description = None
playerdata = self._download_webpage(playerdata_url, video_id)
url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url')
sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file')
hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file')
video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False)
sd_url = self._html_search_regex(r'file: \'(?P<sd_file>[^\']+)\', label: \'SD\'', playerdata, 'sd_file')
hd_url = self._html_search_regex(r'file: \'(?P<hd_file>[^\']+)\', label: \'HD\'', playerdata, 'hd_file')
video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
formats = [
{
'url': url,
'play_path': 'mp4:' + sd_file,
'rtmp_live': True, # workaround
'ext': 'flv',
'url': sd_url,
'ext': 'mp4',
'format': 'sd',
'format_id': 'sd',
},
{
'url': url,
'play_path': 'mp4:' + hd_file,
'rtmp_live': True, # workaround
'ext': 'flv',
'url': hd_url,
'ext': 'mp4',
'format': 'hd',
'format_id': 'hd',
},

View File

@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
url_basename,
)
@ -98,3 +99,28 @@ class CNNIE(InfoExtractor):
'duration': duration,
'upload_date': upload_date,
}
class CNNBlogsIE(InfoExtractor):
_VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
_TEST = {
'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
'info_dict': {
'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
'ext': 'mp4',
'title': 'Criminalizing journalism?',
'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
'upload_date': '20140209',
},
'add_ie': ['CNN'],
}
def _real_extract(self, url):
webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
return {
'_type': 'url',
'url': cnn_url,
'ie_key': CNNIE.ie_key(),
}

View File

@ -40,9 +40,9 @@ class CollegeHumorIE(InfoExtractor):
'id': 'W5gMp3ZjYg4',
'ext': 'mp4',
'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
'uploader': 'Funnyplox TV',
'uploader': 'FunnyPlox TV',
'uploader_id': 'funnyploxtv',
'description': 'md5:11812366244110c3523968aa74f02521',
'description': 'md5:7ded37421526d54afdf005e25bc2b7a3',
'upload_date': '20140128',
},
'params': {

View File

@ -88,6 +88,10 @@ class InfoExtractor(object):
The following fields are optional:
display_id An alternative identifier for the video, not necessarily
unique, but available before title. Typically, id is
something like "4234987", title "Dancing naked mole rats",
and display_id "dancing-naked-mole-rats"
thumbnails: A list of dictionaries (with the entries "resolution" and
"url") for the varying thumbnails
thumbnail: Full URL to a video thumbnail image.
@ -271,8 +275,11 @@ class InfoExtractor(object):
def _download_json(self, url_or_request, video_id,
note=u'Downloading JSON metadata',
errnote=u'Unable to download JSON metadata'):
errnote=u'Unable to download JSON metadata',
transform_source=None):
json_string = self._download_webpage(url_or_request, video_id, note, errnote)
if transform_source:
json_string = transform_source(json_string)
try:
return json.loads(json_string)
except ValueError as ve:
@ -429,14 +436,14 @@ class InfoExtractor(object):
if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
def _html_search_meta(self, name, html, display_name=None):
def _html_search_meta(self, name, html, display_name=None, fatal=False):
if display_name is None:
display_name = name
return self._html_search_regex(
r'''(?ix)<meta
(?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
html, display_name, fatal=False)
html, display_name, fatal=fatal)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')

View File

@ -1,7 +1,11 @@
# encoding: utf-8
from __future__ import unicode_literals
import re, base64, zlib
import re
import json
import base64
import zlib
from hashlib import sha1
from math import pow, sqrt, floor
from .common import InfoExtractor
@ -19,13 +23,15 @@ from ..aes import (
inc,
)
class CrunchyrollIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_TEST = {
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
'file': '645513.flv',
#'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
'info_dict': {
'id': '645513',
'ext': 'flv',
'title': 'Wanna be the Strongest in the World Episode 1 An Idol-Wrestler is Born!',
'description': 'md5:2d17137920c64f2f49981a7797d275ef',
'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
@ -36,7 +42,7 @@ class CrunchyrollIE(InfoExtractor):
# rtmp
'skip_download': True,
},
}]
}
_FORMAT_IDS = {
'360': ('60', '106'),
@ -68,7 +74,7 @@ class CrunchyrollIE(InfoExtractor):
shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
# Extend 160 Bit hash to 256 Bit
return shaHash + [0] * 12
key = obfuscate_key(id)
class Counter:
__value = iv
@ -80,9 +86,8 @@ class CrunchyrollIE(InfoExtractor):
return zlib.decompress(decrypted_data)
def _convert_subtitles_to_srt(self, subtitles):
i=1
output = ''
for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles):
for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
start = start.replace('.', ',')
end = end.replace('.', ',')
text = clean_html(text)
@ -90,7 +95,6 @@ class CrunchyrollIE(InfoExtractor):
if not text:
continue
output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
i+=1
return output
def _real_extract(self,url):
@ -108,6 +112,12 @@ class CrunchyrollIE(InfoExtractor):
if note_m:
raise ExtractorError(note_m)
mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
if mobj:
msg = json.loads(mobj.group('msg'))
if msg.get('type') == 'error':
raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
video_title = re.sub(r' {2,}', ' ', video_title)
video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
@ -123,7 +133,7 @@ class CrunchyrollIE(InfoExtractor):
playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
@ -161,7 +171,7 @@ class CrunchyrollIE(InfoExtractor):
data = base64.b64decode(data)
subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, 'subtitle_lang_code', fatal=False)
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)

View File

@ -12,6 +12,7 @@ from ..utils import (
get_element_by_id,
orderedSet,
str_to_int,
int_or_none,
ExtractorError,
)
@ -124,7 +125,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
if video_url is not None:
m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
if m_size is not None:
width, height = m_size.group(1), m_size.group(2)
width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
else:
width, height = None, None
formats.append({

View File

@ -1,41 +1,42 @@
from __future__ import unicode_literals
import re
import json
import time
from .common import InfoExtractor
class DotsubIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?dotsub\.com/view/([^/]+)'
_VALID_URL = r'http://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)'
_TEST = {
u'url': u'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
u'file': u'aed3b8b2-1889-4df5-ae63-ad85f5572f27.flv',
u'md5': u'0914d4d69605090f623b7ac329fea66e',
u'info_dict': {
u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
u"uploader": u"4v4l0n42",
u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
u'upload_date': u'20101213',
'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
'md5': '0914d4d69605090f623b7ac329fea66e',
'info_dict': {
'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27',
'ext': 'flv',
'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary',
'uploader': '4v4l0n42',
'description': 'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
'thumbnail': 'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
'upload_date': '20101213',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
info_url = "https://dotsub.com/api/media/%s/metadata" %(video_id)
webpage = self._download_webpage(info_url, video_id)
info = json.loads(webpage)
video_id = mobj.group('id')
info_url = "https://dotsub.com/api/media/%s/metadata" % video_id
info = self._download_json(info_url, video_id)
date = time.gmtime(info['dateCreated']/1000) # The timestamp is in miliseconds
return [{
'id': video_id,
'url': info['mediaURI'],
'ext': 'flv',
'title': info['title'],
'thumbnail': info['screenshotURI'],
return {
'id': video_id,
'url': info['mediaURI'],
'ext': 'flv',
'title': info['title'],
'thumbnail': info['screenshotURI'],
'description': info['description'],
'uploader': info['user'],
'view_count': info['numberOfViews'],
'upload_date': u'%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
}]
'uploader': info['user'],
'view_count': info['numberOfViews'],
'upload_date': '%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
}

View File

@ -10,11 +10,12 @@ from .common import InfoExtractor
class DropboxIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)'
_TEST = {
'url': 'https://www.dropbox.com/s/mcnzehi9wo55th4/20131219_085616.mp4',
'file': 'mcnzehi9wo55th4.mp4',
'md5': 'f6d65b1b326e82fd7ab7720bea3dacae',
'url': 'https://www.dropbox.com/s/0qr9sai2veej4f8/THE_DOCTOR_GAMES.mp4',
'md5': '8ae17c51172fb7f93bdd6a214cc8c896',
'info_dict': {
'title': '20131219_085616'
'id': '0qr9sai2veej4f8',
'ext': 'mp4',
'title': 'THE_DOCTOR_GAMES'
}
}

View File

@ -9,7 +9,7 @@ from ..utils import unified_strdate
class ElPaisIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
IE_DESCR = 'El País'
IE_DESC = 'El País'
_TEST = {
'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',

View File

@ -1,9 +1,9 @@
import json
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
ExtractorError,
@ -11,70 +11,68 @@ from ..utils import (
class EscapistIE(InfoExtractor):
_VALID_URL = r'^https?://?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
_VALID_URL = r'^https?://?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<id>[0-9]+)-'
_TEST = {
u'url': u'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
u'file': u'6618-Breaking-Down-Baldurs-Gate.mp4',
u'md5': u'ab3a706c681efca53f0a35f1415cf0d1',
u'info_dict': {
u"description": u"Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
u"uploader": u"the-escapist-presents",
u"title": u"Breaking Down Baldur's Gate"
'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
'md5': 'ab3a706c681efca53f0a35f1415cf0d1',
'info_dict': {
'id': '6618',
'ext': 'mp4',
'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
'uploader': 'the-escapist-presents',
'title': "Breaking Down Baldur's Gate",
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
showName = mobj.group('showname')
videoId = mobj.group('episode')
video_id = mobj.group('id')
self.report_extraction(videoId)
webpage = self._download_webpage(url, videoId)
self.report_extraction(video_id)
webpage = self._download_webpage(url, video_id)
videoDesc = self._html_search_regex(
r'<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
webpage, 'description', fatal=False)
playerUrl = self._og_search_video_url(webpage, name=u'player URL')
title = self._html_search_regex(
r'<meta name="title" content="([^"]*)"',
webpage, u'title').split(' : ')[-1]
webpage, 'title').split(' : ')[-1]
configUrl = self._search_regex('config=(.*)$', playerUrl, u'config URL')
configUrl = self._search_regex('config=(.*)$', playerUrl, 'config URL')
configUrl = compat_urllib_parse.unquote(configUrl)
formats = []
def _add_format(name, cfgurl):
configJSON = self._download_webpage(
cfgurl, videoId,
u'Downloading ' + name + ' configuration',
u'Unable to download ' + name + ' configuration')
def _add_format(name, cfgurl, quality):
config = self._download_json(
cfgurl, video_id,
'Downloading ' + name + ' configuration',
'Unable to download ' + name + ' configuration',
transform_source=lambda s: s.replace("'", '"'))
# Technically, it's JavaScript, not JSON
configJSON = configJSON.replace("'", '"')
try:
config = json.loads(configJSON)
except (ValueError,) as err:
raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
playlist = config['playlist']
formats.append({
'url': playlist[1]['url'],
'format_id': name,
'quality': quality,
})
_add_format(u'normal', configUrl)
_add_format('normal', configUrl, quality=0)
hq_url = (configUrl +
('&hq=1' if '?' in configUrl else configUrl + '?hq=1'))
try:
_add_format(u'hq', hq_url)
_add_format('hq', hq_url, quality=1)
except ExtractorError:
pass # That's fine, we'll just use normal quality
self._sort_formats(formats)
return {
'id': videoId,
'id': video_id,
'formats': formats,
'uploader': showName,
'title': title,

View File

@ -1,56 +1,58 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
class ExfmIE(InfoExtractor):
IE_NAME = u'exfm'
IE_DESC = u'ex.fm'
_VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)'
_SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
IE_NAME = 'exfm'
IE_DESC = 'ex.fm'
_VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)'
_SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
_TESTS = [
{
u'url': u'http://ex.fm/song/eh359',
u'file': u'44216187.mp3',
u'md5': u'e45513df5631e6d760970b14cc0c11e7',
u'info_dict': {
u"title": u"Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive",
u"uploader": u"deadjournalist",
u'upload_date': u'20120424',
u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
'url': 'http://ex.fm/song/eh359',
'md5': 'e45513df5631e6d760970b14cc0c11e7',
'info_dict': {
'id': '44216187',
'ext': 'mp3',
'title': 'Test House "Love Is Not Enough" (Extended Mix) DeadJournalist Exclusive',
'uploader': 'deadjournalist',
'upload_date': '20120424',
'description': 'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
},
u'note': u'Soundcloud song',
u'skip': u'The site is down too often',
'note': 'Soundcloud song',
'skip': 'The site is down too often',
},
{
u'url': u'http://ex.fm/song/wddt8',
u'file': u'wddt8.mp3',
u'md5': u'966bd70741ac5b8570d8e45bfaed3643',
u'info_dict': {
u'title': u'Safe and Sound',
u'uploader': u'Capital Cities',
'url': 'http://ex.fm/song/wddt8',
'md5': '966bd70741ac5b8570d8e45bfaed3643',
'info_dict': {
'id': 'wddt8',
'ext': 'mp3',
'title': 'Safe and Sound',
'uploader': 'Capital Cities',
},
u'skip': u'The site is down too often',
'skip': 'The site is down too often',
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
song_id = mobj.group(1)
info_url = "http://ex.fm/api/v3/song/%s" %(song_id)
webpage = self._download_webpage(info_url, song_id)
info = json.loads(webpage)
song_url = info['song']['url']
song_id = mobj.group('id')
info_url = "http://ex.fm/api/v3/song/%s" % song_id
info = self._download_json(info_url, song_id)['song']
song_url = info['url']
if re.match(self._SOUNDCLOUD_URL, song_url) is not None:
self.to_screen('Soundcloud song detected')
return self.url_result(song_url.replace('/stream',''), 'Soundcloud')
return [{
'id': song_id,
'url': song_url,
'ext': 'mp3',
'title': info['song']['title'],
'thumbnail': info['song']['image']['large'],
'uploader': info['song']['artist'],
'view_count': info['song']['loved_count'],
}]
return self.url_result(song_url.replace('/stream', ''), 'Soundcloud')
return {
'id': song_id,
'url': song_url,
'ext': 'mp3',
'title': info['title'],
'thumbnail': info['image']['large'],
'uploader': info['artist'],
'view_count': info['loved_count'],
}

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import json
import re
import socket
@ -26,20 +28,21 @@ class FacebookIE(InfoExtractor):
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
IE_NAME = u'facebook'
IE_NAME = 'facebook'
_TEST = {
u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
u'file': u'120708114770723.mp4',
u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
u'info_dict': {
u"duration": 279,
u"title": u"PEOPLE ARE AWESOME 2013"
'url': 'https://www.facebook.com/photo.php?v=120708114770723',
'md5': '48975a41ccc4b7a581abd68651c1a5a8',
'info_dict': {
'id': '120708114770723',
'ext': 'mp4',
'duration': 279,
'title': 'PEOPLE ARE AWESOME 2013'
}
}
def report_login(self):
"""Report attempt to log in."""
self.to_screen(u'Logging in')
self.to_screen('Logging in')
def _login(self):
(useremail, password) = self._get_login_info()
@ -50,9 +53,11 @@ class FacebookIE(InfoExtractor):
login_page_req.add_header('Cookie', 'locale=en_US')
self.report_login()
login_page = self._download_webpage(login_page_req, None, note=False,
errnote=u'Unable to download login page')
lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
errnote='Unable to download login page')
lsd = self._search_regex(
r'<input type="hidden" name="lsd" value="([^"]*)"',
login_page, 'lsd')
lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
login_form = {
'email': useremail,
@ -70,22 +75,22 @@ class FacebookIE(InfoExtractor):
try:
login_results = compat_urllib_request.urlopen(request).read()
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
return
check_form = {
'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, 'fb_dtsg'),
'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, 'nh'),
'name_action_selected': 'dont_save',
'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, 'continue'),
}
check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
check_response = compat_urllib_request.urlopen(check_req).read()
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
self._downloader.report_warning('unable to log in: %s' % compat_str(err))
return
def _real_initialize(self):
@ -94,7 +99,7 @@ class FacebookIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group('id')
url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
@ -107,10 +112,10 @@ class FacebookIE(InfoExtractor):
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None:
raise ExtractorError(
u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
'The video is not available, Facebook said: "%s"' % m_msg.group(1),
expected=True)
else:
raise ExtractorError(u'Cannot parse data')
raise ExtractorError('Cannot parse data')
data = dict(json.loads(m.group(1)))
params_raw = compat_urllib_parse.unquote(data['params'])
params = json.loads(params_raw)
@ -119,12 +124,12 @@ class FacebookIE(InfoExtractor):
if not video_url:
video_url = video_data['sd_src']
if not video_url:
raise ExtractorError(u'Cannot find video URL')
raise ExtractorError('Cannot find video URL')
video_duration = int(video_data['video_duration'])
thumbnail = video_data['thumbnail_src']
video_title = self._html_search_regex(
r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title')
info = {
'id': video_id,

View File

@ -0,0 +1,60 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import int_or_none
class FirstTVIE(InfoExtractor):
IE_NAME = 'firsttv'
IE_DESC = 'Видеоархив - Первый канал'
_VALID_URL = r'http://(?:www\.)?1tv\.ru/videoarchive/(?P<id>\d+)'
_TEST = {
'url': 'http://www.1tv.ru/videoarchive/73390',
'md5': '3de6390cf0cca4a5eae1d1d83895e5ad',
'info_dict': {
'id': '73390',
'ext': 'mp4',
'title': 'Олимпийские канатные дороги',
'description': 'md5:cc730d2bf4215463e37fff6a1e277b13',
'thumbnail': 'http://img1.1tv.ru/imgsize640x360/PR20140210114657.JPG',
'duration': 149,
},
'skip': 'Only works from Russia',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id, 'Downloading page')
video_url = self._html_search_regex(
r'''(?s)jwplayer\('flashvideoportal_1'\)\.setup\({.*?'file': '([^']+)'.*?}\);''', webpage, 'video URL')
title = self._html_search_regex(
r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', webpage, 'title')
description = self._html_search_regex(
r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>', webpage, 'description', fatal=False)
thumbnail = self._og_search_thumbnail(webpage)
duration = self._og_search_property('video:duration', webpage, 'video duration', fatal=False)
like_count = self._html_search_regex(r'title="Понравилось".*?/></label> \[(\d+)\]',
webpage, 'like count', fatal=False)
dislike_count = self._html_search_regex(r'title="Не понравилось".*?/></label> \[(\d+)\]',
webpage, 'dislike count', fatal=False)
return {
'id': video_id,
'url': video_url,
'thumbnail': thumbnail,
'title': title,
'description': description,
'duration': int_or_none(duration),
'like_count': int_or_none(like_count),
'dislike_count': int_or_none(dislike_count),
}

View File

@ -0,0 +1,95 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
unified_strdate,
str_to_int,
parse_duration,
clean_html,
)
class FourTubeIE(InfoExtractor):
IE_NAME = '4tube'
_VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)'
_TEST = {
'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
'md5': '6516c8ac63b03de06bc8eac14362db4f',
'info_dict': {
'id': '209733',
'ext': 'mp4',
'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
'uploader': 'WCP Club',
'uploader_id': 'wcp-club',
'upload_date': '20131031',
'duration': 583,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'http://www.4tube.com/videos/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist')
media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id')
sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',')
title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title')
thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False)
uploader_str = self._search_regex(r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader', fatal=False)
mobj = re.search(r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>', uploader_str)
(uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None)
upload_date = None
view_count = None
duration = None
description = self._html_search_meta('description', webpage, 'description')
if description:
upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date',
fatal=False)
if upload_date:
upload_date = unified_strdate(upload_date)
view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False))
token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources))
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
b'Origin': b'http://www.4tube.com',
}
token_req = compat_urllib_request.Request(token_url, b'{}', headers)
tokens = self._download_json(token_req, video_id)
formats = [{
'url': tokens[format]['token'],
'format_id': format + 'p',
'resolution': format + 'p',
'quality': int(format),
} for format in sources]
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail_url,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
'view_count': view_count,
'duration': duration,
'age_limit': 18,
'webpage_url': webpage_url,
}

View File

@ -184,6 +184,7 @@ class GenerationQuoiIE(InfoExtractor):
# It uses Dailymotion
'skip_download': True,
},
'skip': 'Only available from France',
}
def _real_extract(self, url):

View File

@ -1,18 +1,21 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import determine_ext
class FreesoundIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)'
_VALID_URL = r'https?://(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)'
_TEST = {
u'url': u'http://www.freesound.org/people/miklovan/sounds/194503/',
u'file': u'194503.mp3',
u'md5': u'12280ceb42c81f19a515c745eae07650',
u'info_dict': {
u"title": u"gulls in the city.wav",
u"uploader" : u"miklovan",
u'description': u'the sounds of seagulls in the city',
'url': 'http://www.freesound.org/people/miklovan/sounds/194503/',
'md5': '12280ceb42c81f19a515c745eae07650',
'info_dict': {
'id': '194503',
'ext': 'mp3',
'title': 'gulls in the city.wav',
'uploader': 'miklovan',
'description': 'the sounds of seagulls in the city',
}
}
@ -20,17 +23,17 @@ class FreesoundIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
music_id = mobj.group('id')
webpage = self._download_webpage(url, music_id)
title = self._html_search_regex(r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>',
webpage, 'music title', flags=re.DOTALL)
music_url = self._og_search_property('audio', webpage, 'music url')
description = self._html_search_regex(r'<div id="sound_description">(.*?)</div>',
webpage, 'description', fatal=False, flags=re.DOTALL)
title = self._html_search_regex(
r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>',
webpage, 'music title', flags=re.DOTALL)
description = self._html_search_regex(
r'<div id="sound_description">(.*?)</div>', webpage, 'description',
fatal=False, flags=re.DOTALL)
return [{
'id': music_id,
'title': title,
'url': music_url,
return {
'id': music_id,
'title': title,
'url': self._og_search_property('audio', webpage, 'music url'),
'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'),
'ext': determine_ext(music_url),
'description': description,
}]
}

View File

@ -7,10 +7,11 @@ class GametrailersIE(MTVServicesInfoExtractor):
_VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
'file': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7',
'info_dict': {
'title': 'Mirror\'s Edge 2|E3 2013: Debut Trailer',
'id': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d',
'ext': 'mp4',
'title': 'E3 2013: Debut Trailer',
'description': 'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
},
}

View File

@ -0,0 +1,134 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
)
class GDCVaultIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
_TESTS = [
{
'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
'md5': '7ce8388f544c88b7ac11c7ab1b593704',
'info_dict': {
'id': '1019721',
'ext': 'mp4',
'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
}
},
{
'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
'info_dict': {
'id': '1015683',
'ext': 'flv',
'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
},
'params': {
'skip_download': True, # Requires rtmpdump
}
},
]
def _parse_mp4(self, xml_description):
video_formats = []
mp4_video = xml_description.find('./metadata/mp4video')
if mp4_video is None:
return None
mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
video_root = mobj.group('root')
formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
for format in formats:
mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
url = video_root + mobj.group('path')
vbr = format.find('bitrate').text
video_formats.append({
'url': url,
'vbr': int(vbr),
})
return video_formats
def _parse_flv(self, xml_description):
video_formats = []
akami_url = xml_description.find('./metadata/akamaiHost').text
slide_video_path = xml_description.find('./metadata/slideVideo').text
video_formats.append({
'url': 'rtmp://' + akami_url + '/' + slide_video_path,
'format_note': 'slide deck video',
'quality': -2,
'preference': -2,
'format_id': 'slides',
})
speaker_video_path = xml_description.find('./metadata/speakerVideo').text
video_formats.append({
'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
'format_note': 'speaker video',
'quality': -1,
'preference': -1,
'format_id': 'speaker',
})
return video_formats
def _login(self, webpage_url, video_id):
(username, password) = self._get_login_info()
if username is None or password is None:
self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
return None
mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url)
login_url = mobj.group('root_url') + 'api/login.php'
logout_url = mobj.group('root_url') + 'logout'
login_form = {
'email': username,
'password': password,
}
request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self._download_webpage(request, video_id, 'Logging in')
start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page')
self._download_webpage(logout_url, video_id, 'Logging out')
return start_page
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'http://www.gdcvault.com/play/' + video_id
start_page = self._download_webpage(webpage_url, video_id)
xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root', None, False)
if xml_root is None:
# Probably need to authenticate
start_page = self._login(webpage_url, video_id)
if start_page is None:
self.report_warning('Could not login.')
else:
# Grab the url from the authenticated page
xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root')
xml_name = self._html_search_regex(r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename', None, False)
if xml_name is None:
# Fallback to the older format
xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
xml_decription_url = xml_root + 'xml/' + xml_name
xml_description = self._download_xml(xml_decription_url, video_id)
video_title = xml_description.find('./metadata/title').text
video_formats = self._parse_mp4(xml_description)
if video_formats is None:
video_formats = self._parse_flv(xml_description)
return {
'id': video_id,
'title': video_title,
'formats': video_formats,
}

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import os
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from .youtube import YoutubeIE
@ -12,6 +13,7 @@ from ..utils import (
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
compat_xml_parse_error,
ExtractorError,
HEADRequest,
@ -81,10 +83,10 @@ class GenericIE(InfoExtractor):
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
'file': 'trailer.mp4',
'md5': '67d406c2bcb6af27fa886f31aa934bbe',
'info_dict': {
'id': 'trailer',
'ext': 'mp4',
'title': 'trailer',
'upload_date': '20100513',
}
@ -92,7 +94,6 @@ class GenericIE(InfoExtractor):
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
@ -100,6 +101,39 @@ class GenericIE(InfoExtractor):
'title': '2cc213299525360.mov', # that's what we get
},
},
# google redirect
{
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
'info_dict': {
'id': 'cmQHVoWB5FY',
'ext': 'mp4',
'upload_date': '20130224',
'uploader_id': 'TheVerge',
'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
'uploader': 'The Verge',
'title': 'First Firefox OS phones side-by-side',
},
'params': {
'skip_download': False,
}
},
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
'info_dict': {
'id': '9ODmcdjQcHQ',
'ext': 'mp4',
'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
'upload_date': '20140225',
'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
'uploader': 'Tested',
'uploader_id': 'testedcom',
},
# No need to test YoutubeIE here
'params': {
'skip_download': True,
},
},
]
def report_download_webpage(self, video_id):
@ -159,6 +193,25 @@ class GenericIE(InfoExtractor):
raise ExtractorError('Invalid URL protocol')
return response
def _extract_rss(self, url, video_id, doc):
playlist_title = doc.find('./channel/title').text
playlist_desc_el = doc.find('./channel/description')
playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
entries = [{
'_type': 'url',
'url': e.find('link').text,
'title': e.find('title').text,
} for e in doc.findall('./channel/item')]
return {
'_type': 'playlist',
'id': url,
'title': playlist_title,
'description': playlist_desc,
'entries': entries,
}
def _real_extract(self, url):
parsed_url = compat_urlparse.urlparse(url)
if not parsed_url.scheme:
@ -175,7 +228,7 @@ class GenericIE(InfoExtractor):
else:
assert ':' in default_search
return self.url_result(default_search + url)
video_id = os.path.splitext(url.split('/')[-1])[0]
video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
self.to_screen('%s: Requesting header' % video_id)
@ -219,6 +272,14 @@ class GenericIE(InfoExtractor):
self.report_extraction(video_id)
# Is it an RSS feed?
try:
doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
except compat_xml_parse_error:
pass
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
@ -334,11 +395,17 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group(1), 'Mpora')
# Look for embedded Novamov player
# Look for embedded NovaMov player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Novamov')
return self.url_result(mobj.group('url'), 'NovaMov')
# Look for embedded NowVideo player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?nowvideo\.(?:ch|sx|eu)/embed\.php.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'NowVideo')
# Look for embedded Facebook player
mobj = re.search(
@ -346,12 +413,25 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'Facebook')
# Look for embedded VK player
mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'VK')
# Look for embedded Huffington Post player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'HuffPost')
# Look for embed.ly
mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'))
mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
if mobj is not None:
return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
@ -376,6 +456,18 @@ class GenericIE(InfoExtractor):
if mobj is None:
# HTML5 video
mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
if mobj is None:
mobj = re.search(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
webpage)
if mobj:
new_url = mobj.group(1)
self.report_following_redirect(new_url)
return {
'_type': 'url',
'url': new_url,
}
if mobj is None:
raise ExtractorError('Unsupported URL: %s' % url)

View File

@ -1,4 +1,5 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
import re
@ -10,32 +11,28 @@ from ..utils import (
class GooglePlusIE(InfoExtractor):
IE_DESC = u'Google Plus'
_VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
IE_NAME = u'plus.google'
IE_DESC = 'Google Plus'
_VALID_URL = r'https://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)'
IE_NAME = 'plus.google'
_TEST = {
u"url": u"https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH",
u"file": u"ZButuJc6CtH.flv",
u"info_dict": {
u"upload_date": u"20120613",
u"uploader": u"井上ヨシマサ",
u"title": u"嘆きの天使 降臨"
'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH',
'info_dict': {
'id': 'ZButuJc6CtH',
'ext': 'flv',
'upload_date': '20120613',
'uploader': '井上ヨシマサ',
'title': '嘆きの天使 降臨',
}
}
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
post_url = mobj.group(0)
video_id = mobj.group(1)
video_extension = 'flv'
video_id = mobj.group('id')
# Step 1, Retrieve post webpage to extract further information
webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
self.report_extraction(video_id)
@ -43,7 +40,7 @@ class GooglePlusIE(InfoExtractor):
upload_date = self._html_search_regex(
r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
webpage, u'upload date', fatal=False, flags=re.VERBOSE)
webpage, 'upload date', fatal=False, flags=re.VERBOSE)
if upload_date:
# Convert timestring to a format suitable for filename
upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
@ -51,28 +48,27 @@ class GooglePlusIE(InfoExtractor):
# Extract uploader
uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
webpage, u'uploader', fatal=False)
webpage, 'uploader', fatal=False)
# Extract title
# Get the first line for title
video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
webpage, 'title', default=u'NA')
webpage, 'title', default='NA')
# Step 2, Simulate clicking the image box to launch video
DOMAIN = 'https://plus.google.com/'
video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
webpage, u'video page URL')
webpage, 'video page URL')
if not video_page.startswith(DOMAIN):
video_page = DOMAIN + video_page
webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
# Extract video links on video page
"""Extract video links of all sizes"""
# Extract video links all sizes
pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
mobj = re.findall(pattern, webpage)
if len(mobj) == 0:
raise ExtractorError(u'Unable to extract video links')
raise ExtractorError('Unable to extract video links')
# Sort in resolution
links = sorted(mobj)
@ -87,12 +83,11 @@ class GooglePlusIE(InfoExtractor):
except AttributeError: # Python 3
video_url = bytes(video_url, 'ascii').decode('unicode-escape')
return [{
'id': video_id,
'url': video_url,
return {
'id': video_id,
'url': video_url,
'uploader': uploader,
'upload_date': upload_date,
'title': video_title,
'ext': video_extension,
}]
'upload_date': upload_date,
'title': video_title,
'ext': 'flv',
}

View File

@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class HelsinkiIE(InfoExtractor):
IE_DESC = 'helsinki.fi'
_VALID_URL = r'https?://video\.helsinki\.fi/Arkisto/flash\.php\?id=(?P<id>\d+)'
_TEST = {
'url': 'http://video.helsinki.fi/Arkisto/flash.php?id=20258',
'info_dict': {
'id': '20258',
'ext': 'mp4',
'title': 'Tietotekniikkafoorumi-iltapäivä',
'description': 'md5:f5c904224d43c133225130fe156a5ee0',
},
'params': {
'skip_download': True, # RTMP
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
formats = []
mobj = re.search(r'file=((\w+):[^&]+)', webpage)
if mobj:
formats.append({
'ext': mobj.group(2),
'play_path': mobj.group(1),
'url': 'rtmp://flashvideo.it.helsinki.fi/vod/',
'player_url': 'http://video.helsinki.fi/player.swf',
'format_note': 'sd',
'quality': 0,
})
mobj = re.search(r'hd\.file=((\w+):[^&]+)', webpage)
if mobj:
formats.append({
'ext': mobj.group(2),
'play_path': mobj.group(1),
'url': 'rtmp://flashvideo.it.helsinki.fi/vod/',
'player_url': 'http://video.helsinki.fi/player.swf',
'format_note': 'hd',
'quality': 1,
})
self._sort_formats(formats)
return {
'id': video_id,
'title': self._og_search_title(webpage).replace('Video: ', ''),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
}

View File

@ -1,17 +1,20 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class HowcastIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
u'file': u'390161.mp4',
u'md5': u'8b743df908c42f60cf6496586c7f12c3',
u'info_dict': {
u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.",
u"title": u"How to Tie a Square Knot Properly"
'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
'md5': '8b743df908c42f60cf6496586c7f12c3',
'info_dict': {
'id': '390161',
'ext': 'mp4',
'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
'title': 'How to Tie a Square Knot Properly',
}
}
@ -24,22 +27,15 @@ class HowcastIE(InfoExtractor):
self.report_extraction(video_id)
video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
webpage, u'video URL')
video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
webpage, u'title')
webpage, 'video URL')
video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
webpage, u'description', fatal=False)
webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': video_title,
return {
'id': video_id,
'url': video_url,
'title': self._og_search_title(webpage),
'description': video_description,
'thumbnail': thumbnail,
}]
'thumbnail': self._og_search_thumbnail(webpage),
}

View File

@ -1,35 +1,39 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class InstagramIE(InfoExtractor):
_VALID_URL = r'(?:http://)?instagram\.com/p/(.*?)/'
_VALID_URL = r'http://instagram\.com/p/(?P<id>.*?)/'
_TEST = {
u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
u'file': u'aye83DjauH.mp4',
u'md5': u'0d2da106a9d2631273e192b372806516',
u'info_dict': {
u"uploader_id": u"naomipq",
u"title": u"Video by naomipq",
u'description': u'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
'info_dict': {
'id': 'aye83DjauH',
'ext': 'mp4',
'uploader_id': 'naomipq',
'title': 'Video by naomipq',
'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
webpage, u'uploader id', fatal=False)
desc = self._search_regex(r'"caption":"(.*?)"', webpage, u'description',
webpage, 'uploader id', fatal=False)
desc = self._search_regex(r'"caption":"(.*?)"', webpage, 'description',
fatal=False)
return [{
'id': video_id,
'url': self._og_search_video_url(webpage, secure=False),
'ext': 'mp4',
'title': u'Video by %s' % uploader_id,
return {
'id': video_id,
'url': self._og_search_video_url(webpage, secure=False),
'ext': 'mp4',
'title': 'Video by %s' % uploader_id,
'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id' : uploader_id,
'uploader_id': uploader_id,
'description': desc,
}]
}

View File

@ -10,7 +10,7 @@ from ..utils import compat_urllib_request
class IPrimaIE(InfoExtractor):
_VALID_URL = r'https?://play\.iprima\.cz/(?P<videogroup>.+)/(?P<videoid>.+)'
_VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://play.iprima.cz/particka/particka-92',
@ -22,20 +22,32 @@ class IPrimaIE(InfoExtractor):
'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
},
'params': {
'skip_download': True,
'skip_download': True, # requires rtmpdump
},
},
]
}, {
'url': 'http://play.iprima.cz/particka/tchibo-particka-jarni-moda',
'info_dict': {
'id': '9718337',
'ext': 'flv',
'title': 'Tchibo Partička - Jarní móda',
'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
'thumbnail': 're:^http:.*\.jpg$',
},
'params': {
'skip_download': True, # requires rtmpdump
},
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
player_url = 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % (
floor(random()*1073741824),
floor(random()*1073741824))
player_url = (
'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' %
(floor(random()*1073741824), floor(random()*1073741824))
)
req = compat_urllib_request.Request(player_url)
req.add_header('Referer', url)
@ -44,18 +56,20 @@ class IPrimaIE(InfoExtractor):
base_url = ''.join(re.findall(r"embed\['stream'\] = '(.+?)'.+'(\?auth=)'.+'(.+?)';", playerpage)[1])
zoneGEO = self._html_search_regex(r'"zoneGEO":(.+?),', webpage, 'zoneGEO')
if zoneGEO != '0':
base_url = base_url.replace('token', 'token_'+zoneGEO)
base_url = base_url.replace('token', 'token_' + zoneGEO)
formats = []
for format_id in ['lq', 'hq', 'hd']:
filename = self._html_search_regex(r'"%s_id":(.+?),' % format_id, webpage, 'filename')
filename = self._html_search_regex(
r'"%s_id":(.+?),' % format_id, webpage, 'filename')
if filename == 'null':
continue
real_id = self._search_regex(r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id')
real_id = self._search_regex(
r'Prima-(?:[0-9]{10}|WEB)-([0-9]+)[-_]',
filename, 'real video id')
if format_id == 'lq':
quality = 0
@ -63,13 +77,13 @@ class IPrimaIE(InfoExtractor):
quality = 1
elif format_id == 'hd':
quality = 2
filename = 'hq/'+filename
filename = 'hq/' + filename
formats.append({
'format_id': format_id,
'url': base_url,
'quality': quality,
'play_path': 'mp4:'+filename.replace('"', '')[:-4],
'play_path': 'mp4:' + filename.replace('"', '')[:-4],
'rtmp_live': True,
'ext': 'flv',
})

View File

@ -14,15 +14,16 @@ from ..utils import (
class IviIE(InfoExtractor):
IE_DESC = 'ivi.ru'
IE_NAME = 'ivi'
_VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)'
_VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)'
_TESTS = [
# Single movie
{
'url': 'http://www.ivi.ru/watch/53141',
'file': '53141.mp4',
'md5': '6ff5be2254e796ed346251d117196cf4',
'info_dict': {
'id': '53141',
'ext': 'mp4',
'title': 'Иван Васильевич меняет профессию',
'description': 'md5:b924063ea1677c8fe343d8a72ac2195f',
'duration': 5498,
@ -33,9 +34,10 @@ class IviIE(InfoExtractor):
# Serial's serie
{
'url': 'http://www.ivi.ru/watch/dezhurnyi_angel/74791',
'file': '74791.mp4',
'md5': '3e6cc9a848c1d2ebcc6476444967baa9',
'info_dict': {
'id': '74791',
'ext': 'mp4',
'title': 'Дежурный ангел - 1 серия',
'duration': 2490,
'thumbnail': 'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg',
@ -124,7 +126,7 @@ class IviIE(InfoExtractor):
class IviCompilationIE(InfoExtractor):
IE_DESC = 'ivi.ru compilations'
IE_NAME = 'ivi:compilation'
_VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'
_VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'
def _extract_entries(self, html, compilation_id):
return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi')

View File

@ -0,0 +1,48 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .youtube import YoutubeIE
class JadoreCettePubIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html'
_TEST = {
'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html',
'md5': '401286a06067c70b44076044b66515de',
'info_dict': {
'id': 'jLMja3tr7a4',
'ext': 'mp4',
'title': 'La pire utilisation de Star Wars',
'description': "Jadorecettepub.com vous a gratifié de plusieurs pubs géniales utilisant Star Wars et Dark Vador plus particulièrement... Mais l'heure est venue de vous proposer une version totalement massacrée, venue du Japon. Quand les Japonais détruisent l'image de Star Wars pour vendre du thon en boite, ça promet...",
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
title = self._html_search_regex(
r'<span style="font-size: x-large;"><b>(.*?)</b></span>',
webpage, 'title')
description = self._html_search_regex(
r'(?s)<div id="fb-root">(.*?)<script>', webpage, 'description',
fatal=False)
real_url = self._search_regex(
r'\[/postlink\](.*)endofvid', webpage, 'video URL')
video_id = YoutubeIE.extract_id(real_url)
return {
'_type': 'url_transparent',
'url': real_url,
'id': video_id,
'title': title,
'description': description,
}

View File

@ -1,5 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
@ -10,12 +12,13 @@ class JeuxVideoIE(InfoExtractor):
_VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm'
_TEST = {
u'url': u'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
u'file': u'5182.mp4',
u'md5': u'046e491afb32a8aaac1f44dd4ddd54ee',
u'info_dict': {
u'title': u'GC 2013 : Tearaway nous présente ses papiers d\'identité',
u'description': u'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n',
'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
'md5': '046e491afb32a8aaac1f44dd4ddd54ee',
'info_dict': {
'id': '5182',
'ext': 'mp4',
'title': 'GC 2013 : Tearaway nous présente ses papiers d\'identité',
'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n',
},
}
@ -25,14 +28,14 @@ class JeuxVideoIE(InfoExtractor):
webpage = self._download_webpage(url, title)
xml_link = self._html_search_regex(
r'<param name="flashvars" value="config=(.*?)" />',
webpage, u'config URL')
webpage, 'config URL')
video_id = self._search_regex(
r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml',
xml_link, u'video ID')
xml_link, 'video ID')
config = self._download_xml(
xml_link, title, u'Downloading XML config')
xml_link, title, 'Downloading XML config')
info_json = config.find('format.json').text
info = json.loads(info_json)['versions'][0]

View File

@ -0,0 +1,66 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class KontrTubeIE(InfoExtractor):
IE_NAME = 'kontrtube'
IE_DESC = 'KontrTube.ru - Труба зовёт'
_VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+'
_TEST = {
'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
'md5': '975a991a4926c9a85f383a736a2e6b80',
'info_dict': {
'id': '2678',
'ext': 'mp4',
'title': 'Над олимпийской деревней в Сочи поднят российский флаг',
'description': 'md5:80edc4c613d5887ae8ccf1d59432be41',
'thumbnail': 'http://www.kontrtube.ru/contents/videos_screenshots/2000/2678/preview.mp4.jpg',
'duration': 270,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id, 'Downloading page')
video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
'video title')
description = self._html_search_meta('description', webpage, 'video description')
mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
webpage)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
'view count', fatal=False)
view_count = int(view_count) if view_count is not None else None
comment_count = None
comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
fatal=False)
if comment_str.startswith('комментариев нет'):
comment_count = 0
else:
mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
if mobj:
comment_count = int(mobj.group('total'))
return {
'id': video_id,
'url': video_url,
'thumbnail': thumbnail,
'title': title,
'description': description,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
}

View File

@ -4,19 +4,24 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import unified_strdate
from ..utils import (
int_or_none,
unified_strdate,
ExtractorError,
)
class LifeNewsIE(InfoExtractor):
IE_NAME = 'lifenews'
IE_DESC = 'LIFE | NEWS'
_VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)'
_TEST = {
'url': 'http://lifenews.ru/news/126342',
'file': '126342.mp4',
'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
'info_dict': {
'id': '126342',
'ext': 'mp4',
'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',
'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.',
'thumbnail': 'http://lifenews.ru/static/posts/2014/1/126342/.video.jpg',
@ -28,13 +33,11 @@ class LifeNewsIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page')
webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
video_url = self._html_search_regex(
r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL')
thumbnail = self._html_search_regex(
r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail')
videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
if not videos:
raise ExtractorError('No media links available for %s' % video_id)
title = self._og_search_title(webpage)
TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
@ -44,20 +47,28 @@ class LifeNewsIE(InfoExtractor):
description = self._og_search_description(webpage)
view_count = self._html_search_regex(
r'<div class=\'views\'>(\d+)</div>', webpage, 'view count')
r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
comment_count = self._html_search_regex(
r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count')
r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
upload_date = self._html_search_regex(
r'<time datetime=\'([^\']+)\'>', webpage, 'upload date')
r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',fatal=False)
if upload_date is not None:
upload_date = unified_strdate(upload_date)
return {
'id': video_id,
'url': video_url,
'thumbnail': thumbnail,
'title': title,
'description': description,
'view_count': view_count,
'comment_count': comment_count,
'upload_date': unified_strdate(upload_date),
}
def make_entry(video_id, media, video_number=None):
return {
'id': video_id,
'url': media[1],
'thumbnail': media[0],
'title': title if video_number is None else '%s-video%s' % (title, video_number),
'description': description,
'view_count': int_or_none(view_count),
'comment_count': int_or_none(comment_count),
'upload_date': upload_date,
}
if len(videos) == 1:
return make_entry(video_id, videos[0])
else:
return [make_entry(video_id, media, video_number+1) for video_number, media in enumerate(videos)]

View File

@ -4,15 +4,17 @@ import json
import re
from .common import InfoExtractor
from ..utils import int_or_none
class LiveLeakIE(InfoExtractor):
_VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
_TESTS = [{
'url': 'http://www.liveleak.com/view?i=757_1364311680',
'file': '757_1364311680.mp4',
'md5': '0813c2430bea7a46bf13acf3406992f4',
'info_dict': {
'id': '757_1364311680',
'ext': 'mp4',
'description': 'extremely bad day for this guy..!',
'uploader': 'ljfriel2',
'title': 'Most unlucky car accident'
@ -20,25 +22,62 @@ class LiveLeakIE(InfoExtractor):
},
{
'url': 'http://www.liveleak.com/view?i=f93_1390833151',
'file': 'f93_1390833151.mp4',
'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
'info_dict': {
'id': 'f93_1390833151',
'ext': 'mp4',
'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
'uploader': 'ARD_Stinkt',
'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
}
},
{
'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
'md5': '42c6d97d54f1db107958760788c5f48f',
'info_dict': {
'id': '4f7_1392687779',
'ext': 'mp4',
'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.",
'uploader': 'CapObveus',
'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
'age_limit': 18,
}
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
webpage = self._download_webpage(url, video_id)
video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
video_description = self._og_search_description(webpage)
video_uploader = self._html_search_regex(
r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
age_limit = int_or_none(self._search_regex(
r'you confirm that you are ([0-9]+) years and over.',
webpage, 'age limit', default=None))
sources_raw = self._search_regex(
r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
if sources_raw is None:
sources_raw = '[{ %s}]' % (
self._search_regex(r'(file: ".*?"),', webpage, 'video URL'))
alt_source = self._search_regex(
r'(file: ".*?"),', webpage, 'video URL', default=None)
if alt_source:
sources_raw = '[{ %s}]' % alt_source
else:
# Maybe an embed?
embed_url = self._search_regex(
r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"',
webpage, 'embed URL')
return {
'_type': 'url_transparent',
'url': embed_url,
'id': video_id,
'title': video_title,
'description': video_description,
'uploader': video_uploader,
'age_limit': age_limit,
}
sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
sources = json.loads(sources_json)
@ -49,15 +88,11 @@ class LiveLeakIE(InfoExtractor):
} for s in sources]
self._sort_formats(formats)
video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
video_description = self._og_search_description(webpage)
video_uploader = self._html_search_regex(
r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
return {
'id': video_id,
'title': video_title,
'description': video_description,
'uploader': video_uploader,
'formats': formats,
'age_limit': age_limit,
}

View File

@ -0,0 +1,66 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
import datetime
from .common import InfoExtractor
class MailRuIE(InfoExtractor):
IE_NAME = 'mailru'
IE_DESC = 'Видео@Mail.Ru'
_VALID_URL = r'http://(?:www\.)?my\.mail\.ru/video/.*#video=/?(?P<id>[^/]+/[^/]+/[^/]+/\d+)'
_TEST = {
'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
'md5': 'dea205f03120046894db4ebb6159879a',
'info_dict': {
'id': '46301138',
'ext': 'mp4',
'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
'upload_date': '20140224',
'uploader': 'sonypicturesrus',
'uploader_id': 'sonypicturesrus@mail.ru',
'duration': 184,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_data = self._download_json(
'http://videoapi.my.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON')
author = video_data['author']
uploader = author['name']
uploader_id = author['id']
movie = video_data['movie']
content_id = str(movie['contentId'])
title = movie['title']
thumbnail = movie['poster']
duration = movie['duration']
upload_date = datetime.datetime.fromtimestamp(video_data['timestamp']).strftime('%Y%m%d')
view_count = video_data['views_count']
formats = [
{
'url': video['url'],
'format_id': video['name'],
} for video in video_data['videos']
]
return {
'id': content_id,
'title': title,
'thumbnail': thumbnail,
'upload_date': upload_date,
'uploader': uploader,
'uploader_id': uploader_id,
'duration': duration,
'view_count': view_count,
'formats': formats,
}

View File

@ -166,6 +166,7 @@ class MetacafeIE(InfoExtractor):
video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)
video_uploader = self._html_search_regex(
r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
webpage, u'uploader nickname', fatal=False)
@ -183,6 +184,7 @@ class MetacafeIE(InfoExtractor):
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
'thumbnail':thumbnail,
'ext': video_ext,
'age_limit': age_limit,
}

View File

@ -1,24 +1,30 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
compat_urlparse,
clean_html,
ExtractorError,
get_element_by_id,
)
class TechTVMITIE(InfoExtractor):
IE_NAME = u'techtv.mit.edu'
IE_NAME = 'techtv.mit.edu'
_VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
_TEST = {
u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
u'file': u'25418.mp4',
u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
u'info_dict': {
u'title': u'MIT DNA Learning Center Set',
u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
'md5': '1f8cb3e170d41fd74add04d3c9330e5f',
'info_dict': {
'id': '25418',
'ext': 'mp4',
'title': 'MIT DNA Learning Center Set',
'description': 'md5:82313335e8a8a3f243351ba55bc1b474',
},
}
@ -27,12 +33,12 @@ class TechTVMITIE(InfoExtractor):
video_id = mobj.group('id')
raw_page = self._download_webpage(
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
raw_page, u'base url')
formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
u'video formats')
base_url = self._search_regex(
r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url')
formats_json = self._search_regex(
r'bitrates: (\[.+?\])', raw_page, 'video formats')
formats_mit = json.loads(formats_json)
formats = [
{
@ -48,28 +54,31 @@ class TechTVMITIE(InfoExtractor):
title = get_element_by_id('edit-title', clean_page)
description = clean_html(get_element_by_id('edit-description', clean_page))
thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
raw_page, u'thumbnail', flags=re.DOTALL)
thumbnail = self._search_regex(
r'playlist:.*?url: \'(.+?)\'',
raw_page, 'thumbnail', flags=re.DOTALL)
return {'id': video_id,
'title': title,
'formats': formats,
'description': description,
'thumbnail': thumbnail,
}
return {
'id': video_id,
'title': title,
'formats': formats,
'description': description,
'thumbnail': thumbnail,
}
class MITIE(TechTVMITIE):
IE_NAME = u'video.mit.edu'
IE_NAME = 'video.mit.edu'
_VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'
_TEST = {
u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
u'file': u'21783.mp4',
u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
u'info_dict': {
u'title': u'The Government is Profiling You',
u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
'md5': '7db01d5ccc1895fc5010e9c9e13648da',
'info_dict': {
'id': '21783',
'ext': 'mp4',
'title': 'The Government is Profiling You',
'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
},
}
@ -77,7 +86,73 @@ class MITIE(TechTVMITIE):
mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME))
embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
u'embed url')
embed_url = self._search_regex(
r'<iframe .*?src="(.+?)"', webpage, 'embed url')
return self.url_result(embed_url, ie='TechTVMIT')
class OCWMITIE(InfoExtractor):
IE_NAME = 'ocw.mit.edu'
_VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
_BASE_URL = 'http://ocw.mit.edu/'
_TESTS = [
{
'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
'info_dict': {
'id': 'EObHWIEKGjA',
'ext': 'mp4',
'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
#'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
}
},
{
'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
'info_dict': {
'id': '7K1sB05pE0A',
'ext': 'mp4',
'title': 'Session 1: Introduction to Derivatives',
'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
#'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
}
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
topic = mobj.group('topic')
webpage = self._download_webpage(url, topic)
title = self._html_search_meta('WT.cg_s', webpage)
description = self._html_search_meta('Description', webpage)
# search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
if embed_chapter_media:
metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
metadata = re.split(r', ?', metadata)
yt = metadata[1]
subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])
else:
# search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
if embed_media:
metadata = re.sub(r'[\'"]', '', embed_media.group(1))
metadata = re.split(r', ?', metadata)
yt = metadata[1]
subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])
else:
raise ExtractorError('Unable to find embedded YouTube video.')
video_id = YoutubeIE.extract_id(yt)
return {
'_type': 'url_transparent',
'id': video_id,
'title': title,
'description': description,
'url': yt,
'url_transparent'
'subtitles': subs,
'ie_key': 'Youtube',
}

View File

@ -5,18 +5,20 @@ import re
from .common import InfoExtractor
from ..utils import (
unified_strdate,
compat_urllib_parse,
ExtractorError,
)
class MixcloudIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
IE_NAME = 'mixcloud'
_TEST = {
'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
'file': 'dholbach-cryptkeeper.mp3',
'info_dict': {
'id': 'dholbach-cryptkeeper',
'ext': 'mp3',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
@ -45,7 +47,7 @@ class MixcloudIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group(1)
cloudcast_name = mobj.group(2)
track_id = '-'.join((uploader, cloudcast_name))
track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
webpage = self._download_webpage(url, track_id)

View File

@ -61,7 +61,7 @@ class MooshareIE(InfoExtractor):
}
request = compat_urllib_request.Request(
'http://mooshare.biz/8dqtk4bjbp8g', compat_urllib_parse.urlencode(download_form))
'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.to_screen('%s: Waiting for timeout' % video_id)
@ -111,4 +111,4 @@ class MooshareIE(InfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}
}

View File

@ -82,12 +82,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
title_el = find_xpath_attr(
itemdoc, './/{http://search.yahoo.com/mrss/}category',
'scheme', 'urn:mtvn:video_title')
if title_el is None:
title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
if title_el is None:
title_el = itemdoc.find('.//title')
if title_el.text is None:
title_el = None
if title_el is None:
title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
title = title_el.text
if title is None:

View File

@ -1,19 +1,46 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import find_xpath_attr, compat_str
class NBCIE(InfoExtractor):
_VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
_TEST = {
'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
'md5': '54d0fbc33e0b853a65d7b4de5c06d64e',
'info_dict': {
'id': 'u1RInQZRN7QJ',
'ext': 'flv',
'title': 'I Am a Firefighter',
'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url')
if theplatform_url.startswith('//'):
theplatform_url = 'http:' + theplatform_url
return self.url_result(theplatform_url)
class NBCNewsIE(InfoExtractor):
_VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
u'file': u'52753292.flv',
u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
u'info_dict': {
u'title': u'Crew emerges after four-month Mars food study',
u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
'info_dict': {
'id': '52753292',
'ext': 'flv',
'title': 'Crew emerges after four-month Mars food study',
'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
},
}
@ -23,10 +50,11 @@ class NBCNewsIE(InfoExtractor):
all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
info = all_info.find('video')
return {'id': video_id,
'title': info.find('headline').text,
'ext': 'flv',
'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
'description': compat_str(info.find('caption').text),
'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
}
return {
'id': video_id,
'title': info.find('headline').text,
'ext': 'flv',
'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
'description': compat_str(info.find('caption').text),
'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
}

View File

@ -0,0 +1,89 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class NDRIE(InfoExtractor):
IE_NAME = 'ndr'
IE_DESC = 'NDR.de - Mediathek'
_VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
_TESTS = [
{
'url': 'http://www.ndr.de/fernsehen/sendungen/markt/markt7959.html',
'md5': 'e7a6079ca39d3568f4996cb858dd6708',
'note': 'Video file',
'info_dict': {
'id': '7959',
'ext': 'mp4',
'title': 'Markt - die ganze Sendung',
'description': 'md5:af9179cf07f67c5c12dc6d9997e05725',
'duration': 2655,
},
},
{
'url': 'http://www.ndr.de/info/audio51535.html',
'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
'note': 'Audio file',
'info_dict': {
'id': '51535',
'ext': 'mp3',
'title': 'La Valette entgeht der Hinrichtung',
'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
'duration': 884,
}
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
title = self._og_search_title(page)
description = self._og_search_description(page)
mobj = re.search(
r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
page)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
formats = []
mp3_url = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
if mp3_url:
formats.append({
'url': mp3_url.group('audio'),
'format_id': 'mp3',
})
thumbnail = None
video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
if video_url:
thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
page, 'thumbnail', fatal=False)
if thumbnail:
thumbnail = 'http://www.ndr.de' + thumbnail
for format_id in ['lo', 'hi', 'hq']:
formats.append({
'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
'format_id': format_id,
})
if not formats:
raise ExtractorError('No media links available for %s' % video_id)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}

View File

@ -0,0 +1,94 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
compat_urllib_parse,
)
class NFBIE(InfoExtractor):
IE_NAME = 'nfb'
IE_DESC = 'National Film Board of Canada'
_VALID_URL = r'https?://(?:www\.)?(nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
_TEST = {
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
'info_dict': {
'id': 'qallunaat_why_white_people_are_funny',
'ext': 'mp4',
'title': 'Qallunaat! Why White People Are Funny ',
'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
'duration': 3128,
'uploader': 'Mark Sandiford',
'uploader_id': 'mark-sandiford',
},
'params': {
# rtmp download
'skip_download': True,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage('https://www.nfb.ca/film/%s' % video_id, video_id, 'Downloading film page')
uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
page, 'director id', fatal=False)
uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
page, 'director name', fatal=False)
request = compat_urllib_request.Request('https://www.nfb.ca/film/%s/player_config' % video_id,
compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
config = self._download_xml(request, video_id, 'Downloading player config XML')
title = None
description = None
thumbnail = None
duration = None
formats = []
def extract_thumbnail(media):
thumbnails = {}
for asset in media.findall('assets/asset'):
thumbnails[asset.get('quality')] = asset.find('default/url').text
if not thumbnails:
return None
if 'high' in thumbnails:
return thumbnails['high']
return list(thumbnails.values())[0]
for media in config.findall('./player/stream/media'):
if media.get('type') == 'posterImage':
thumbnail = extract_thumbnail(media)
elif media.get('type') == 'video':
duration = int(media.get('duration'))
title = media.find('title').text
description = media.find('description').text
# It seems assets always go from lower to better quality, so no need to sort
formats = [{
'url': x.find('default/streamerURI').text,
'app': x.find('default/streamerURI').text.split('/', 3)[3],
'play_path': x.find('default/url').text,
'rtmp_live': False,
'ext': 'mp4',
'format_id': x.get('quality'),
} for x in media.findall('assets/asset')]
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'formats': formats,
}

View File

@ -1,61 +1,51 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unified_strdate,
)
class NormalbootsIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
_VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
_TEST = {
u'url': u'http://normalboots.com/video/home-alone-games-jontron/',
u'file': u'home-alone-games-jontron.mp4',
u'md5': u'8bf6de238915dd501105b44ef5f1e0f6',
u'info_dict': {
u'title': u'Home Alone Games - JonTron - NormalBoots',
u'description': u'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/',
u'uploader': u'JonTron',
u'upload_date': u'20140125',
'url': 'http://normalboots.com/video/home-alone-games-jontron/',
'md5': '8bf6de238915dd501105b44ef5f1e0f6',
'info_dict': {
'id': 'home-alone-games-jontron',
'ext': 'mp4',
'title': 'Home Alone Games - JonTron - NormalBoots',
'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for Tense Battle Theme:\xa0http://www.youtube.com/Kiamet/',
'uploader': 'JonTron',
'upload_date': '20140125',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('videoid')
info = {
'id': video_id,
'uploader': None,
'upload_date': None,
}
if url[:4] != 'http':
url = 'http://' + url
webpage = self._download_webpage(url, video_id)
video_title = self._og_search_title(webpage)
video_description = self._og_search_description(webpage)
video_thumbnail = self._og_search_thumbnail(webpage)
video_uploader = self._html_search_regex(r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>',
webpage, 'uploader')
raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
webpage, 'date')
video_upload_date = unified_strdate(raw_upload_date)
video_upload_date = unified_strdate(raw_upload_date)
player_url = self._html_search_regex(r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', webpage, 'url')
player_page = self._download_webpage(player_url, video_id)
video_url = u'http://player.screenwavemedia.com/' + self._html_search_regex(r"'file':\s'(?P<file>[0-9A-Za-z-_\.]+)'", player_page, 'file')
info['url'] = video_url
info['title'] = video_title
info['description'] = video_description
info['thumbnail'] = video_thumbnail
info['uploader'] = video_uploader
info['upload_date'] = video_upload_date
return info
video_url = self._html_search_regex(r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file')
return {
'id': video_id,
'url': video_url,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'uploader': video_uploader,
'upload_date': video_upload_date,
}

View File

@ -9,14 +9,25 @@ from ..utils import (
)
class NovamovIE(InfoExtractor):
_VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?v=)(?P<videoid>[a-z\d]{13})'
class NovaMovIE(InfoExtractor):
IE_NAME = 'novamov'
IE_DESC = 'NovaMov'
_VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'novamov\.com'}
_HOST = 'www.novamov.com'
_FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>'
_FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";'
_TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>'
_DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>'
_TEST = {
'url': 'http://www.novamov.com/video/4rurhn9x446jj',
'file': '4rurhn9x446jj.flv',
'md5': '7205f346a52bbeba427603ba10d4b935',
'info_dict': {
'id': '4rurhn9x446jj',
'ext': 'flv',
'title': 'search engine optimization',
'description': 'search engine optimization is used to rank the web page in the google search engine'
},
@ -27,31 +38,26 @@ class NovamovIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
page = self._download_webpage('http://www.novamov.com/video/%s' % video_id,
video_id, 'Downloading video page')
page = self._download_webpage(
'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page')
if re.search(r'This file no longer exists on our servers!</h2>', page) is not None:
if re.search(self._FILE_DELETED_REGEX, page) is not None:
raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
filekey = self._search_regex(
r'flashvars\.filekey="(?P<filekey>[^"]+)";', page, 'filekey')
filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey')
title = self._html_search_regex(
r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>',
page, 'title', fatal=False)
title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)
description = self._html_search_regex(
r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>',
page, 'description', fatal=False)
description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False)
api_response = self._download_webpage(
'http://www.novamov.com/api/player.api.php?key=%s&file=%s' % (filekey, video_id),
video_id, 'Downloading video api response')
'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id,
'Downloading video api response')
response = compat_urlparse.parse_qs(api_response)
if 'error_msg' in response:
raise ExtractorError('novamov returned error: %s' % response['error_msg'][0], expected=True)
raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True)
video_url = response['url'][0]
@ -60,4 +66,4 @@ class NovamovIE(InfoExtractor):
'url': video_url,
'title': title,
'description': description
}
}

View File

@ -1,46 +1,28 @@
import re
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import compat_urlparse
from .novamov import NovaMovIE
class NowVideoIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.(?:ch|sx)/video/(?P<id>\w+)'
class NowVideoIE(NovaMovIE):
IE_NAME = 'nowvideo'
IE_DESC = 'NowVideo'
_VALID_URL = r'http://(?:(?:www\.)?%(host)s/video/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<videoid>[a-z\d]{13})' % {'host': 'nowvideo\.(?:ch|sx|eu)'}
_HOST = 'www.nowvideo.ch'
_FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
_FILEKEY_REGEX = r'var fkzd="([^"]+)";'
_TITLE_REGEX = r'<h4>([^<]+)</h4>'
_DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>'
_TEST = {
u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
u'file': u'0mw0yow7b6dxa.flv',
u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
u'info_dict': {
u"title": u"youtubedl test video _BaW_jenozKc.mp4"
'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
'md5': 'f8fbbc8add72bd95b7850c6a02fc8817',
'info_dict': {
'id': '0mw0yow7b6dxa',
'ext': 'flv',
'title': 'youtubedl test video _BaW_jenozKc.mp4',
'description': 'Description',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'http://www.nowvideo.ch/video/' + video_id
embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id
webpage = self._download_webpage(webpage_url, video_id)
embed_page = self._download_webpage(embed_url, video_id,
u'Downloading embed page')
self.report_extraction(video_id)
video_title = self._html_search_regex(r'<h4>(.*)</h4>',
webpage, u'video title')
video_key = self._search_regex(r'var fkzd="(.*)";',
embed_page, u'video key')
api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
api_response = self._download_webpage(api_call, video_id,
u'Downloading API page')
video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
return [{
'id': video_id,
'url': video_url,
'ext': 'flv',
'title': video_title,
}]
}

View File

@ -8,6 +8,7 @@ from .common import InfoExtractor
from ..utils import (
HEADRequest,
unified_strdate,
ExtractorError,
)
@ -35,7 +36,15 @@ class ORFIE(InfoExtractor):
data_json = self._search_regex(
r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
all_data = json.loads(data_json)
sdata = all_data[0]['values']['segments']
def get_segments(all_data):
for data in all_data:
if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
return data['values']['segments']
sdata = get_segments(all_data)
if not sdata:
raise ExtractorError('Unable to extract segments')
def quality_to_int(s):
m = re.search('([0-9]+)', s)

View File

@ -9,7 +9,7 @@ class PBSIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?:
# Direct video URL
video\.pbs\.org/video/(?P<id>[0-9]+)/? |
video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
# Article with embedded player
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
# Player

View File

@ -1,7 +1,10 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import int_or_none
class PodomaticIE(InfoExtractor):
@ -9,14 +12,14 @@ class PodomaticIE(InfoExtractor):
_VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
_TEST = {
u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
u"file": u"2009-01-02T16_03_35-08_00.mp3",
u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
u"info_dict": {
u"uploader": u"Science Teaching Tips",
u"uploader_id": u"scienceteachingtips",
u"title": u"64. When the Moon Hits Your Eye",
u"duration": 446,
"url": "http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
"file": "2009-01-02T16_03_35-08_00.mp3",
"md5": "84bb855fcf3429e6bf72460e1eed782d",
"info_dict": {
"uploader": "Science Teaching Tips",
"uploader_id": "scienceteachingtips",
"title": "64. When the Moon Hits Your Eye",
"duration": 446,
}
}
@ -36,7 +39,7 @@ class PodomaticIE(InfoExtractor):
uploader = data['podcast']
title = data['title']
thumbnail = data['imageLocation']
duration = int(data['length'] / 1000.0)
duration = int_or_none(data.get('length'), 1000)
return {
'id': video_id,

View File

@ -0,0 +1,297 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from hashlib import sha1
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
unified_strdate,
clean_html,
RegexNotFoundError,
)
class ProSiebenSat1IE(InfoExtractor):
IE_NAME = 'prosiebensat1'
IE_DESC = 'ProSiebenSat.1 Digital'
_VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
_TESTS = [
{
'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
'info_dict': {
'id': '2104602',
'ext': 'mp4',
'title': 'Staffel 2, Episode 18 - Jahresrückblick',
'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
'upload_date': '20131231',
'duration': 5845.04,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html',
'info_dict': {
'id': '2570327',
'ext': 'mp4',
'title': 'Lady-Umstyling für Audrina',
'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d',
'upload_date': '20131014',
'duration': 606.76,
},
'params': {
# rtmp download
'skip_download': True,
},
'skip': 'Seems to be broken',
},
{
'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
'info_dict': {
'id': '2429369',
'ext': 'mp4',
'title': 'Countdown für die Autowerkstatt',
'description': 'md5:809fc051a457b5d8666013bc40698817',
'upload_date': '20140223',
'duration': 2595.04,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
'info_dict': {
'id': '2904997',
'ext': 'mp4',
'title': 'Sexy laufen in Ugg Boots',
'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6',
'upload_date': '20140122',
'duration': 245.32,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
'info_dict': {
'id': '2906572',
'ext': 'mp4',
'title': 'Im Interview: Kai Wiesinger',
'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
'upload_date': '20140225',
'duration': 522.56,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
'info_dict': {
'id': '2992323',
'ext': 'mp4',
'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
'description': 'md5:2669cde3febe9bce13904f701e774eb6',
'upload_date': '20140225',
'duration': 2410.44,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
'info_dict': {
'id': '3004256',
'ext': 'mp4',
'title': 'Schalke: Tönnies möchte Raul zurück',
'description': 'md5:4b5b271d9bcde223b54390754c8ece3f',
'upload_date': '20140226',
'duration': 228.96,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
'info_dict': {
'id': '2572814',
'ext': 'mp4',
'title': 'Andreas Kümmert: Rocket Man',
'description': 'md5:6ddb02b0781c6adf778afea606652e38',
'upload_date': '20131017',
'duration': 469.88,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html',
'info_dict': {
'id': '2156342',
'ext': 'mp4',
'title': 'Kurztrips zum Valentinstag',
'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
'upload_date': '20130206',
'duration': 307.24,
},
'params': {
# rtmp download
'skip_download': True,
},
},
]
_CLIPID_REGEXES = [
r'"clip_id"\s*:\s+"(\d+)"',
r'clipid: "(\d+)"',
]
_TITLE_REGEXES = [
r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
r'<header class="clearfix">\s*<h3>(.+?)</h3>',
r'<!-- start video -->\s*<h1>(.+?)</h1>',
r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>',
]
_DESCRIPTION_REGEXES = [
r'<p itemprop="description">\s*(.+?)</p>',
r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">',
]
_UPLOAD_DATE_REGEXES = [
r'<meta property="og:published_time" content="(.+?)">',
r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"',
r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr',
r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
def extract(patterns, name, page, fatal=False):
for pattern in patterns:
mobj = re.search(pattern, page)
if mobj:
return clean_html(mobj.group(1))
if fatal:
raise RegexNotFoundError(u'Unable to extract %s' % name)
return None
clip_id = extract(self._CLIPID_REGEXES, 'clip id', page, fatal=True)
access_token = 'testclient'
client_name = 'kolibri-1.2.5'
client_location = url
videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
'access_token': access_token,
'client_location': client_location,
'client_name': client_name,
'ids': clip_id,
})
videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
duration = float(videos[0]['duration'])
source_ids = [source['id'] for source in videos[0]['sources']]
source_ids_str = ','.join(map(str, source_ids))
g = '01!8d8F_)r9]4s[qeuXfP%'
client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
.encode('utf-8')).hexdigest()
sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({
'access_token': access_token,
'client_id': client_id,
'client_location': client_location,
'client_name': client_name,
}))
sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
server_id = sources['server_id']
client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
client_location, source_ids_str, g, client_name])
.encode('utf-8')).hexdigest()
url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({
'access_token': access_token,
'client_id': client_id,
'client_location': client_location,
'client_name': client_name,
'server_id': server_id,
'source_ids': source_ids_str,
}))
urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
title = extract(self._TITLE_REGEXES, 'title', page, fatal=True)
description = extract(self._DESCRIPTION_REGEXES, 'description', page)
thumbnail = self._og_search_thumbnail(page)
upload_date = extract(self._UPLOAD_DATE_REGEXES, 'upload date', page)
if upload_date:
upload_date = unified_strdate(upload_date)
formats = []
urls_sources = urls['sources']
if isinstance(urls_sources, dict):
urls_sources = urls_sources.values()
def fix_bitrate(bitrate):
return bitrate / 1000 if bitrate % 1000 == 0 else bitrate
for source in urls_sources:
protocol = source['protocol']
if protocol == 'rtmp' or protocol == 'rtmpe':
mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
if not mobj:
continue
formats.append({
'url': mobj.group('url'),
'app': mobj.group('app'),
'play_path': mobj.group('playpath'),
'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
'page_url': 'http://www.prosieben.de',
'vbr': fix_bitrate(source['bitrate']),
'ext': 'mp4',
'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
})
else:
formats.append({
'url': source['url'],
'vbr': fix_bitrate(source['bitrate']),
})
self._sort_formats(formats)
return {
'id': clip_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
'duration': duration,
'formats': formats,
}

View File

@ -1,148 +1,165 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError,
clean_html,
unified_strdate,
int_or_none,
)
class RTLnowIE(InfoExtractor):
"""Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
_VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
_TESTS = [{
'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
'file': '90419.flv',
'info_dict': {
'upload_date': '20070416',
'title': 'Ahornallee - Folge 1 - Der Einzug',
'description': 'Folge 1 - Der Einzug',
_VALID_URL = r'''(?x)
(?:https?://)?
(?P<url>
(?P<domain>
rtl-now\.rtl\.de|
rtl2now\.rtl2\.de|
(?:www\.)?voxnow\.de|
(?:www\.)?rtlnitronow\.de|
(?:www\.)?superrtlnow\.de|
(?:www\.)?n-tvnow\.de)
/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
(?:container_id|film_id)=(?P<video_id>[0-9]+)&
player=1(?:&season=[0-9]+)?(?:&.*)?
)'''
_TESTS = [
{
'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
'info_dict': {
'id': '90419',
'ext': 'flv',
'title': 'Ahornallee - Folge 1 - Der Einzug',
'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
'upload_date': '20070416',
'duration': 1685,
},
'params': {
'skip_download': True,
},
'skip': 'Only works from Germany',
},
'params': {
'skip_download': True,
{
'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
'info_dict': {
'id': '69756',
'ext': 'flv',
'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
'upload_date': '20120519',
'duration': 1245,
},
'params': {
'skip_download': True,
},
'skip': 'Only works from Germany',
},
'skip': 'Only works from Germany',
},
{
'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
'file': '69756.flv',
'info_dict': {
'upload_date': '20120519',
'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...',
'description': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
{
'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
'info_dict': {
'id': '13883',
'ext': 'flv',
'title': 'Voxtours - Südafrika-Reporter II',
'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
'upload_date': '20090627',
'duration': 1800,
},
'params': {
'skip_download': True,
},
},
'params': {
'skip_download': True,
{
'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
'info_dict': {
'id': '99205',
'ext': 'flv',
'title': 'Medicopter 117 - Angst!',
'description': 'md5:895b1df01639b5f61a04fc305a5cb94d',
'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
'upload_date': '20080928',
'duration': 2691,
},
'params': {
'skip_download': True,
},
},
'skip': 'Only works from Germany',
},
{
'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
'file': '13883.flv',
'info_dict': {
'upload_date': '20090627',
'title': 'Voxtours - Südafrika-Reporter II',
'description': 'Südafrika-Reporter II',
{
'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
'info_dict': {
'id': '153819',
'ext': 'flv',
'title': 'Deluxe - Alles was Spaß macht - Thema u.a.: Luxushotel für Vierbeiner',
'description': 'md5:c3705e1bb32e1a5b2bcd634fc065c631',
'thumbnail': 'http://autoimg.static-fra.de/ntvnow/383157/1500x1500/image2.jpg',
'upload_date': '20140221',
'duration': 2429,
},
'skip': 'Only works from Germany',
},
'params': {
'skip_download': True,
},
},
{
'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
'file': '99205.flv',
'info_dict': {
'upload_date': '20080928',
'title': 'Medicopter 117 - Angst!',
'description': 'Angst!',
'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg'
},
'params': {
'skip_download': True,
},
},
{
'url': 'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
'file': '124903.flv',
'info_dict': {
'upload_date': '20130101',
'title': 'Top Gear vom 01.01.2013',
'description': 'Episode 1',
},
'params': {
'skip_download': True,
},
'skip': 'Only works from Germany',
}]
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
webpage_url = 'http://' + mobj.group('url')
video_page_url = 'http://' + mobj.group('domain') + '/'
video_page_url = 'http://%s/' % mobj.group('domain')
video_id = mobj.group('video_id')
webpage = self._download_webpage(webpage_url, video_id)
webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
note_m = re.search(r'''(?sx)
<div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?)
<div[ ]id="playerteaser">''', webpage)
if note_m:
msg = clean_html(note_m.group(1))
raise ExtractorError(msg)
mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
if mobj:
raise ExtractorError(clean_html(mobj.group(1)), expected=True)
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage, default=None)
upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
duration = int(mobj.group('seconds')) if mobj else None
video_title = self._html_search_regex(
r'<title>(?P<title>[^<]+?)( \| [^<]*)?</title>',
webpage, 'title')
playerdata_url = self._html_search_regex(
r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'',
webpage, 'playerdata_url')
r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
playerdata = self._download_webpage(playerdata_url, video_id)
mobj = re.search(r'<title><!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]></title>', playerdata)
if mobj:
video_description = mobj.group('description')
if mobj.group('upload_date_Y'):
video_upload_date = mobj.group('upload_date_Y')
elif mobj.group('upload_date_y'):
video_upload_date = '20' + mobj.group('upload_date_y')
playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
videoinfo = playerdata.find('./playlist/videoinfo')
formats = []
for filename in videoinfo.findall('filename'):
mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
if mobj:
fmt = {
'url': mobj.group('url'),
'play_path': 'mp4:' + mobj.group('play_path'),
'page_url': video_page_url,
'player_url': video_page_url + 'includes/vodplayer.swf',
}
else:
video_upload_date = None
if video_upload_date:
video_upload_date += mobj.group('upload_date_m') + mobj.group('upload_date_d')
else:
video_description = None
video_upload_date = None
self._downloader.report_warning('Unable to extract description and upload date')
# Thumbnail: not every video has an thumbnail
mobj = re.search(r'<meta property="og:image" content="(?P<thumbnail>[^"]+)">', webpage)
if mobj:
video_thumbnail = mobj.group('thumbnail')
else:
video_thumbnail = None
mobj = re.search(r'<filename [^>]+><!\[CDATA\[(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>[^\]]+)\]\]></filename>', playerdata)
if mobj is None:
raise ExtractorError('Unable to extract media URL')
video_url = mobj.group('url')
video_play_path = 'mp4:' + mobj.group('play_path')
video_player_url = video_page_url + 'includes/vodplayer.swf'
fmt = {
'url': filename.text,
}
fmt.update({
'width': int_or_none(filename.get('width')),
'height': int_or_none(filename.get('height')),
'vbr': int_or_none(filename.get('bitrate')),
'ext': 'flv',
})
formats.append(fmt)
return {
'id': video_id,
'url': video_url,
'play_path': video_play_path,
'page_url': video_page_url,
'player_url': video_player_url,
'ext': 'flv',
'title': video_title,
'description': video_description,
'upload_date': video_upload_date,
'thumbnail': video_thumbnail,
}
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
'duration': duration,
'formats': formats,
}

View File

@ -0,0 +1,37 @@
# coding: utf-8
from __future__ import unicode_literals
import os.path
import re
from .common import InfoExtractor
class SaveFromIE(InfoExtractor):
IE_NAME = 'savefrom.net'
_VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$'
_TEST = {
'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com',
'info_dict': {
'id': 'UlVRAPW2WJY',
'ext': 'mp4',
'title': 'About Team Radical MMA | MMA Fighting',
'upload_date': '20120816',
'uploader': 'Howcast',
'uploader_id': 'Howcast',
'description': 'md5:4f0aac94361a12e1ce57d74f85265175',
},
'params': {
'skip_download': True
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = os.path.splitext(url.split('/')[-1])[0]
return {
'_type': 'url',
'id': video_id,
'url': mobj.group('url'),
}

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
import json
@ -12,11 +14,12 @@ class SlideshareIE(InfoExtractor):
_VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'
_TEST = {
u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
u'file': u'25665706.mp4',
u'info_dict': {
u'title': u'Managing Scale and Complexity',
u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix',
'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
'info_dict': {
'id': '25665706',
'ext': 'mp4',
'title': 'Managing Scale and Complexity',
'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
},
}
@ -26,15 +29,17 @@ class SlideshareIE(InfoExtractor):
webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex(
r'var slideshare_object = ({.*?}); var user_info =',
webpage, u'slideshare object')
webpage, 'slideshare object')
info = json.loads(slideshare_obj)
if info['slideshow']['type'] != u'video':
raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
if info['slideshow']['type'] != 'video':
raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
doc = info['doc']
bucket = info['jsplayer']['video_bucket']
ext = info['jsplayer']['video_extension']
video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
description = self._html_search_regex(
r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description')
return {
'_type': 'video',
@ -43,5 +48,5 @@ class SlideshareIE(InfoExtractor):
'ext': ext,
'url': video_url,
'thumbnail': info['slideshow']['pin_image_url'],
'description': self._og_search_description(webpage),
'description': description,
}

View File

@ -20,6 +20,7 @@ class SmotriIE(InfoExtractor):
IE_DESC = 'Smotri.com'
IE_NAME = 'smotri'
_VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
_NETRC_MACHINE = 'smotri'
_TESTS = [
# real video id 2610366

View File

@ -17,6 +17,7 @@ class SohuIE(InfoExtractor):
u'info_dict': {
u'title': u'MVFar East Movement《The Illest》',
},
u'skip': u'Only available from China',
}
def _real_extract(self, url):

View File

@ -217,7 +217,7 @@ class SoundcloudIE(InfoExtractor):
return self._extract_info_dict(info, full_title, secret_token=token)
class SoundcloudSetIE(SoundcloudIE):
_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
_VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
IE_NAME = 'soundcloud:set'
# it's in tests/test_playlists.py
_TESTS = []

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@ -8,14 +10,14 @@ from ..utils import RegexNotFoundError, ExtractorError
class SpaceIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
_TEST = {
u'add_ie': ['Brightcove'],
u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
u'info_dict': {
u'id': u'2780937028001',
u'ext': u'mp4',
u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61',
u'uploader': u'TechMedia Networks',
'add_ie': ['Brightcove'],
'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
'info_dict': {
'id': '2780937028001',
'ext': 'mp4',
'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61',
'uploader': 'TechMedia Networks',
},
}

View File

@ -1,6 +1,5 @@
from __future__ import unicode_literals
import os
import re
from .common import InfoExtractor
@ -8,23 +7,27 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
unified_strdate,
str_to_int,
int_or_none,
)
from ..aes import (
aes_decrypt_text
)
from ..aes import aes_decrypt_text
class SpankwireIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
_VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
_TEST = {
'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
'file': '103545.mp4',
'md5': '1b3f55e345500552dbc252a3e9c1af43',
'md5': '8bbfde12b101204b39e4b9fe7eb67095',
'info_dict': {
"uploader": "oreusz",
"title": "Buckcherry`s X Rated Music Video Crazy Bitch",
"description": "Crazy Bitch X rated music video.",
"age_limit": 18,
'id': '103545',
'ext': 'mp4',
'title': 'Buckcherry`s X Rated Music Video Crazy Bitch',
'description': 'Crazy Bitch X rated music video.',
'uploader': 'oreusz',
'uploader_id': '124697',
'upload_date': '20070508',
'age_limit': 18,
}
}
@ -37,13 +40,26 @@ class SpankwireIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(
r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
description = self._html_search_regex(
r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
uploader = self._html_search_regex(
r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
uploader_id = self._html_search_regex(
r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False)
upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False)
if upload_date:
upload_date = unified_strdate(upload_date)
view_count = self._html_search_regex(
r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = int_or_none(self._html_search_regex(
r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False))
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
if webpage.find('flashvars\.encrypted = "true"') != -1:
@ -53,16 +69,13 @@ class SpankwireIE(InfoExtractor):
formats = []
for video_url in video_urls:
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[4].split('_')[:2]
resolution, bitrate_str = format
format = "-".join(format)
height = int(resolution.rstrip('P'))
tbr = int(bitrate_str.rstrip('K'))
height = int(resolution.rstrip('Pp'))
tbr = int(bitrate_str.rstrip('Kk'))
formats.append({
'url': video_url,
'ext': extension,
'resolution': resolution,
'format': format,
'tbr': tbr,
@ -75,10 +88,14 @@ class SpankwireIE(InfoExtractor):
return {
'id': video_id,
'uploader': video_uploader,
'title': video_title,
'thumbnail': thumbnail,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
'view_count': view_count,
'comment_count': comment_count,
'formats': formats,
'age_limit': age_limit,
}

View File

@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import int_or_none
class StreamCZIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<videoid>.+)'
_TEST = {
'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
'md5': '6d3ca61a8d0633c9c542b92fcb936b0c',
'info_dict': {
'id': '765767',
'ext': 'mp4',
'title': 'Peklo na talíři: Éčka pro děti',
'description': 'md5:49ace0df986e95e331d0fe239d421519',
'thumbnail': 'http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
'duration': 256,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
webpage = self._download_webpage(url, video_id)
data = self._html_search_regex(r'Stream\.Data\.Episode\((.+?)\);', webpage, 'stream data')
jsonData = json.loads(data)
formats = []
for video in jsonData['instances']:
for video_format in video['instances']:
format_id = video_format['quality']
if format_id == '240p':
quality = 0
elif format_id == '360p':
quality = 1
elif format_id == '480p':
quality = 2
elif format_id == '720p':
quality = 3
formats.append({
'format_id': '%s-%s' % (video_format['type'].split('/')[1], format_id),
'url': video_format['source'],
'quality': quality,
})
self._sort_formats(formats)
return {
'id': str(jsonData['id']),
'title': self._og_search_title(webpage),
'thumbnail': jsonData['episode_image_original_url'].replace('//', 'http://'),
'formats': formats,
'description': self._og_search_description(webpage),
'duration': int_or_none(jsonData['duration']),
'view_count': int_or_none(jsonData['stats_total']),
}

View File

@ -0,0 +1,27 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class SyfyIE(InfoExtractor):
_VALID_URL = r'https?://www\.syfy\.com/videos/.+?vid:(?P<id>\d+)'
_TEST = {
'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458',
'md5': 'e07de1d52c7278adbb9b9b1c93a66849',
'info_dict': {
'id': 'NmqMrGnXvmO1',
'ext': 'flv',
'title': 'George Lucas has Advice for his Daughter',
'description': 'Listen to what insights George Lucas give his daughter Amanda.',
},
'add_ie': ['ThePlatform'],
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
return self.url_result(self._og_search_video_url(webpage))

View File

@ -6,115 +6,111 @@ import re
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
RegexNotFoundError,
compat_str,
)
class TEDIE(SubtitlesInfoExtractor):
_VALID_URL=r'''http://www\.ted\.com/
(
((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
'''
_VALID_URL = r'''(?x)http://www\.ted\.com/
(
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
'''
_TEST = {
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'file': '102.mp4',
'md5': '4ea1dada91e4174b53dac2bb8ace429d',
'info_dict': {
"description": "md5:c6fa72e6eedbd938c9caf6b2702f5922",
"title": "Dan Dennett: The illusion of consciousness"
'id': '102',
'ext': 'mp4',
'title': 'The illusion of consciousness',
'description': ('Philosopher Dan Dennett makes a compelling '
'argument that not only don\'t we understand our own '
'consciousness, but that half the time our brains are '
'actively fooling us.'),
'uploader': 'Dan Dennett',
}
}
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
_FORMATS_PREFERENCE = {
'low': 1,
'medium': 2,
'high': 3,
}
def _extract_info(self, webpage):
info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
webpage, 'info json')
return json.loads(info_json)
def _real_extract(self, url):
m=re.match(self._VALID_URL, url, re.VERBOSE)
m = re.match(self._VALID_URL, url, re.VERBOSE)
name = m.group('name')
if m.group('type_talk'):
return self._talk_info(url)
else :
playlist_id=m.group('playlist_id')
name=m.group('name')
self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
return [self._playlist_videos_info(url,name,playlist_id)]
return self._talk_info(url, name)
else:
return self._playlist_videos_info(url, name)
def _playlist_videos_info(self, url, name, playlist_id):
def _playlist_videos_info(self, url, name):
'''Returns the videos of the playlist'''
webpage = self._download_webpage(
url, playlist_id, 'Downloading playlist webpage')
matches = re.finditer(
r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
webpage)
playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
webpage, 'playlist title')
webpage = self._download_webpage(url, name,
'Downloading playlist webpage')
info = self._extract_info(webpage)
playlist_info = info['playlist']
playlist_entries = [
self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
for m in matches
self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
for talk in info['talks']
]
return self.playlist_result(
playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
playlist_entries,
playlist_id=compat_str(playlist_info['id']),
playlist_title=playlist_info['title'])
def _talk_info(self, url, video_id=0):
"""Return the video for the talk in the url"""
m = re.match(self._VALID_URL, url,re.VERBOSE)
video_name = m.group('name')
webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name)
self.report_extraction(video_name)
# If the url includes the language we get the title translated
title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
webpage, 'title')
json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
webpage, 'json data')
info = json.loads(json_data)
desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
webpage, 'description', flags = re.DOTALL)
thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
webpage, 'thumbnail')
talk_info = self._extract_info(webpage)['talks'][0]
formats = [{
'ext': 'mp4',
'url': stream['file'],
'format': stream['id']
} for stream in info['htmlStreams']]
video_id = info['id']
'url': format_url,
'format_id': format_id,
'format': format_id,
'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
} for (format_id, format_url) in talk_info['nativeDownloads'].items()]
self._sort_formats(formats)
video_id = compat_str(talk_info['id'])
# subtitles
video_subtitles = self.extract_subtitles(video_id, webpage)
video_subtitles = self.extract_subtitles(video_id, talk_info)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, webpage)
self._list_available_subtitles(video_id, talk_info)
return
return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'description': desc,
'title': talk_info['title'],
'uploader': talk_info['speaker'],
'thumbnail': talk_info['thumb'],
'description': self._og_search_description(webpage),
'subtitles': video_subtitles,
'formats': formats,
}
def _get_available_subtitles(self, video_id, webpage):
try:
options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
languages = re.findall(r'(?:<option value=")(\S+)"', options)
if languages:
sub_lang_list = {}
for l in languages:
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
sub_lang_list[l] = url
return sub_lang_list
except RegexNotFoundError:
def _get_available_subtitles(self, video_id, talk_info):
languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
if languages:
sub_lang_list = {}
for l in languages:
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
sub_lang_list[l] = url
return sub_lang_list
else:
self._downloader.report_warning(u'video doesn\'t have subtitles')
return {}
return {}

View File

@ -0,0 +1,68 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class TestURLIE(InfoExtractor):
""" Allows adressing of the test cases as test:yout.*be_1 """
IE_DESC = False # Do not list
_VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$'
def _real_extract(self, url):
from ..extractor import gen_extractors
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
extractor_id = mobj.group('extractor')
all_extractors = gen_extractors()
rex = re.compile(extractor_id, flags=re.IGNORECASE)
matching_extractors = [
e for e in all_extractors if rex.search(e.IE_NAME)]
if len(matching_extractors) == 0:
raise ExtractorError(
'No extractors matching %r found' % extractor_id,
expected=True)
elif len(matching_extractors) > 1:
# Is it obvious which one to pick?
try:
extractor = next(
ie for ie in matching_extractors
if ie.IE_NAME.lower() == extractor_id.lower())
except StopIteration:
raise ExtractorError(
('Found multiple matching extractors: %s' %
' '.join(ie.IE_NAME for ie in matching_extractors)),
expected=True)
else:
extractor = matching_extractors[0]
num_str = mobj.group('num')
num = int(num_str) if num_str else 0
testcases = []
t = getattr(extractor, '_TEST', None)
if t:
testcases.append(t)
testcases.extend(getattr(extractor, '_TESTS', []))
try:
tc = testcases[num]
except IndexError:
raise ExtractorError(
('Test case %d not found, got only %d tests' %
(num, len(testcases))),
expected=True)
self.to_screen('Test URL: %s' % tc['url'])
return {
'_type': 'url',
'url': tc['url'],
'id': video_id,
}

View File

@ -11,7 +11,10 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
class ThePlatformIE(InfoExtractor):
_VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)'
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
|theplatform:)(?P<id>[^/\?&]+)'''
_TEST = {
# from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
@ -29,9 +32,7 @@ class ThePlatformIE(InfoExtractor):
},
}
def _get_info(self, video_id):
smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
'format=smil&mbr=true'.format(video_id))
def _get_info(self, video_id, smil_url):
meta = self._download_xml(smil_url, video_id)
try:
@ -50,26 +51,39 @@ class ThePlatformIE(InfoExtractor):
head = meta.find(_x('smil:head'))
body = meta.find(_x('smil:body'))
base_url = head.find(_x('smil:meta')).attrib['base']
switch = body.find(_x('smil:switch'))
formats = []
for f in switch.findall(_x('smil:video')):
attr = f.attrib
width = int(attr['width'])
height = int(attr['height'])
vbr = int(attr['system-bitrate']) // 1000
format_id = '%dx%d_%dk' % (width, height, vbr)
formats.append({
'format_id': format_id,
'url': base_url,
'play_path': 'mp4:' + attr['src'],
'ext': 'flv',
'width': width,
'height': height,
'vbr': vbr,
})
self._sort_formats(formats)
f4m_node = body.find(_x('smil:seq/smil:video'))
if f4m_node is not None:
f4m_url = f4m_node.attrib['src']
if 'manifest.f4m?' not in f4m_url:
f4m_url += '?'
# the parameters are from syfy.com, other sites may use others,
# they also work for nbc.com
f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
formats = [{
'ext': 'flv',
'url': f4m_url,
}]
else:
base_url = head.find(_x('smil:meta')).attrib['base']
switch = body.find(_x('smil:switch'))
formats = []
for f in switch.findall(_x('smil:video')):
attr = f.attrib
width = int(attr['width'])
height = int(attr['height'])
vbr = int(attr['system-bitrate']) // 1000
format_id = '%dx%d_%dk' % (width, height, vbr)
formats.append({
'format_id': format_id,
'url': base_url,
'play_path': 'mp4:' + attr['src'],
'ext': 'flv',
'width': width,
'height': height,
'vbr': vbr,
})
self._sort_formats(formats)
return {
'id': video_id,
@ -83,4 +97,14 @@ class ThePlatformIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
return self._get_info(video_id)
if mobj.group('config'):
config_url = url+ '&form=json'
config_url = config_url.replace('swf/', 'config/')
config_url = config_url.replace('onsite/', 'onsite/config/')
config_json = self._download_webpage(config_url, video_id, u'Downloading config')
config = json.loads(config_json)
smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
else:
smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
'format=smil&mbr=true'.format(video_id))
return self._get_info(video_id, smil_url)

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from youtube_dl.utils import ExtractorError
from ..utils import ExtractorError
class TinyPicIE(InfoExtractor):

View File

@ -0,0 +1,44 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class TruTubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P<id>[0-9]+)/.*'
_TEST = {
'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-',
'md5': 'c5b6e301b0a2040b074746cbeaa26ca1',
'info_dict': {
'id': '14880',
'ext': 'flv',
'title': 'Ramses II - Proven To Be A Red Headed Caucasoid',
'thumbnail': 're:^http:.*\.jpg$',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_title = self._og_search_title(webpage).strip()
thumbnail = self._search_regex(
r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False)
all_formats = re.finditer(
r"var (?P<key>[a-z]+)_video_file\s*=\s*'(?P<url>[^']+)';", webpage)
formats = [{
'format_id': m.group('key'),
'quality': -i,
'url': m.group('url'),
} for i, m in enumerate(all_formats)]
self._sort_formats(formats)
return {
'id': video_id,
'title': video_title,
'formats': formats,
'thumbnail': thumbnail,
}

View File

@ -11,7 +11,7 @@ from ..aes import (
)
class Tube8IE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/[^/]+/[^/]+/(?P<videoid>[0-9]+)/?)'
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/.+?/(?P<videoid>\d+)/?)$'
_TEST = {
u'url': u'http://www.tube8.com/teen/kasia-music-video/229795/',
u'file': u'229795.mp4',

View File

@ -0,0 +1,84 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
unified_strdate,
clean_html,
int_or_none,
)
class TvigleIE(InfoExtractor):
IE_NAME = 'tvigle'
IE_DESC = 'Интернет-телевидение Tvigle.ru'
_VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)'
_TESTS = [
{
'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
'md5': '09afba4616666249f087efc6dcf83cb3',
'info_dict': {
'id': '503081',
'ext': 'flv',
'title': 'Брат 2 ',
'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
'upload_date': '20110919',
},
},
{
'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433',
'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
'info_dict': {
'id': '676433',
'ext': 'flv',
'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
'description': 'md5:027f7dc872948f14c96d19b4178428a4',
'upload_date': '20121218',
},
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_data = self._download_xml(
'http://www.tvigle.ru/xml/single.php?obj=%s' % video_id, video_id, 'Downloading video XML')
video = video_data.find('./video')
title = video.get('name')
description = video.get('anons')
if description:
description = clean_html(description)
thumbnail = video_data.get('img')
upload_date = unified_strdate(video.get('date'))
like_count = int_or_none(video.get('vtp'))
formats = []
for num, (format_id, format_note) in enumerate([['low_file', 'SQ'], ['file', 'HQ'], ['hd', 'HD 720']]):
video_url = video.get(format_id)
if not video_url:
continue
formats.append({
'url': video_url,
'format_id': format_id,
'format_note': format_note,
'quality': num,
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
'like_count': like_count,
'age_limit': 18,
'formats': formats,
}

View File

@ -4,6 +4,7 @@ import re
import json
from .common import InfoExtractor
from ..utils import compat_urllib_request
class VeohIE(InfoExtractor):
@ -24,6 +25,13 @@ class VeohIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
age_limit = 0
if 'class="adultwarning-container"' in webpage:
self.report_age_confirmation()
age_limit = 18
request = compat_urllib_request.Request(url)
request.add_header('Cookie', 'confirmedAdult=true')
webpage = self._download_webpage(request, video_id)
m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
if m_youtube is not None:
@ -44,4 +52,5 @@ class VeohIE(InfoExtractor):
'thumbnail': info.get('highResImage') or info.get('medResImage'),
'description': info['description'],
'view_count': info['views'],
'age_limit': age_limit,
}

View File

@ -0,0 +1,236 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none
)
class VestiIE(InfoExtractor):
IE_NAME = 'vesti'
IE_DESC = 'Вести.Ru'
_VALID_URL = r'http://(?:.+?\.)?(?:vesti\.ru|russia\.tv)/(?P<id>.+)'
_TESTS = [
{
'url': 'http://www.vesti.ru/videos?vid=575582&cid=1',
'info_dict': {
'id': '765035',
'ext': 'mp4',
'title': 'Вести.net: биткоины в России не являются законными',
'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b',
'duration': 302,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/doc.html?id=1349233',
'info_dict': {
'id': '773865',
'ext': 'mp4',
'title': 'Участники митинга штурмуют Донецкую областную администрацию',
'description': 'md5:1a160e98b3195379b4c849f2f4958009',
'duration': 210,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/only_video.html?vid=576180',
'info_dict': {
'id': '766048',
'ext': 'mp4',
'title': 'США заморозило, Британию затопило',
'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1',
'duration': 87,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://hitech.vesti.ru/news/view/id/4000',
'info_dict': {
'id': '766888',
'ext': 'mp4',
'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
'duration': 279,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://russia.tv/video/show/brand_id/5169/episode_id/970443/video_id/975648',
'info_dict': {
'id': '771852',
'ext': 'mp4',
'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
'duration': 3096,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://russia.tv/brand/show/brand_id/57638',
'info_dict': {
'id': '774016',
'ext': 'mp4',
'title': 'Чужой в семье Сталина',
'description': '',
'duration': 2539,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
'info_dict': {
'id': '766403',
'ext': 'mp4',
'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы',
'description': 'md5:55805dfd35763a890ff50fa9e35e31b3',
'duration': 271,
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Blocked outside Russia'
},
{
'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
'info_dict': {
'id': '51499',
'ext': 'flv',
'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
},
'params': {
# rtmp download
'skip_download': True,
},
'skip': 'Translation has finished'
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
mobj = re.search(
r'<meta property="og:video" content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
page)
if mobj:
video_id = mobj.group('id')
page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
'Downloading video page')
mobj = re.search(
r'<meta property="og:video" content="http://player\.rutv\.ru/flash2v/container\.swf\?id=(?P<id>\d+)', page)
if mobj:
video_type = 'video'
video_id = mobj.group('id')
else:
mobj = re.search(
r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>',
page)
if not mobj:
raise ExtractorError('No media found', expected=True)
video_type = mobj.group('type')
video_id = mobj.group('id')
json_data = self._download_json(
'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
video_id, 'Downloading JSON')
if json_data['errors']:
raise ExtractorError('vesti returned error: %s' % json_data['errors'], expected=True)
playlist = json_data['data']['playlist']
medialist = playlist['medialist']
media = medialist[0]
if media['errors']:
raise ExtractorError('vesti returned error: %s' % media['errors'], expected=True)
view_count = playlist.get('count_views')
priority_transport = playlist['priority_transport']
thumbnail = media['picture']
width = int_or_none(media['width'])
height = int_or_none(media['height'])
description = media['anons']
title = media['title']
duration = int_or_none(media.get('duration'))
formats = []
for transport, links in media['sources'].items():
for quality, url in links.items():
if transport == 'rtmp':
mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
if not mobj:
continue
fmt = {
'url': mobj.group('url'),
'play_path': mobj.group('playpath'),
'app': mobj.group('app'),
'page_url': 'http://player.rutv.ru',
'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22',
'rtmp_live': True,
'ext': 'flv',
'vbr': int(quality),
}
elif transport == 'm3u8':
fmt = {
'url': url,
'ext': 'mp4',
}
else:
fmt = {
'url': url
}
fmt.update({
'width': width,
'height': height,
'format_id': '%s-%s' % (transport, quality),
'preference': -1 if priority_transport == transport else -2,
})
formats.append(fmt)
if not formats:
raise ExtractorError('No media links available for %s' % video_id)
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'view_count': view_count,
'duration': duration,
'formats': formats,
}

View File

@ -24,9 +24,10 @@ class VevoIE(InfoExtractor):
(?P<id>[^&?#]+)'''
_TESTS = [{
'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
'file': 'GB1101300280.mp4',
"md5": "06bea460acb744eab74a9d7dcb4bfd61",
'info_dict': {
'id': 'GB1101300280',
'ext': 'mp4',
"upload_date": "20130624",
"uploader": "Hurts",
"title": "Somebody to Die For",
@ -34,6 +35,33 @@ class VevoIE(InfoExtractor):
"width": 1920,
"height": 1080,
}
}, {
'note': 'v3 SMIL format',
'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
'md5': '893ec0e0d4426a1d96c01de8f2bdff58',
'info_dict': {
'id': 'USUV71302923',
'ext': 'mp4',
'upload_date': '20140219',
'uploader': 'Cassadee Pope',
'title': 'I Wish I Could Break Your Heart',
'duration': 226.101,
'age_limit': 0,
}
}, {
'note': 'Age-limited video',
'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
'info_dict': {
'id': 'USRV81300282',
'ext': 'mp4',
'age_limit': 18,
'title': 'Tunnel Vision (Explicit)',
'uploader': 'Justin Timberlake',
'upload_date': '20130704',
},
'params': {
'skip_download': 'true',
}
}]
_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
@ -105,9 +133,31 @@ class VevoIE(InfoExtractor):
video_info = self._download_json(json_url, video_id)['video']
formats = self._formats_from_json(video_info)
is_explicit = video_info.get('isExplicit')
if is_explicit is True:
age_limit = 18
elif is_explicit is False:
age_limit = 0
else:
age_limit = None
# Download SMIL
smil_blocks = sorted((
f for f in video_info['videoVersions']
if f['sourceType'] == 13),
key=lambda f: f['version'])
smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
self._SMIL_BASE_URL, video_id, video_id.lower())
if smil_blocks:
smil_url_m = self._search_regex(
r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL',
fatal=False)
if smil_url_m is not None:
smil_url = smil_url_m
try:
smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
self._SMIL_BASE_URL, video_id, video_id.lower())
smil_xml = self._download_webpage(smil_url, video_id,
'Downloading SMIL info')
formats.extend(self._formats_from_smil(smil_xml))
@ -128,4 +178,5 @@ class VevoIE(InfoExtractor):
'upload_date': upload_date.strftime('%Y%m%d'),
'uploader': video_info['mainArtists'][0]['artistName'],
'duration': video_info['duration'],
'age_limit': age_limit,
}

View File

@ -0,0 +1,80 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import int_or_none
class VideoBamIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)'
_TESTS = [
{
'url': 'http://videobam.com/OiJQM',
'md5': 'db471f27763a531f10416a0c58b5a1e0',
'info_dict': {
'id': 'OiJQM',
'ext': 'mp4',
'title': 'Is Alcohol Worse Than Ecstasy?',
'description': 'md5:d25b96151515c91debc42bfbb3eb2683',
'uploader': 'frihetsvinge',
},
},
{
'url': 'http://videobam.com/pqLvq',
'md5': 'd9a565b5379a99126ef94e1d7f9a383e',
'note': 'HD video',
'info_dict': {
'id': 'pqLvq',
'ext': 'mp4',
}
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page')
formats = []
for preference, format_id in enumerate(['low', 'high']):
mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page)
if not mobj:
continue
formats.append({
'url': mobj.group('url'),
'ext': 'mp4',
'format_id': format_id,
'preference': preference,
})
if not formats:
player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config'))
formats = [{
'url': item['url'],
'ext': 'mp4',
} for item in player_config['playlist'] if 'autoPlay' in item]
self._sort_formats(formats)
title = self._og_search_title(page, default='VideoBam', fatal=False)
description = self._og_search_description(page, default=None)
thumbnail = self._og_search_thumbnail(page)
uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None)
view_count = int_or_none(
self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False))
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'view_count': view_count,
'formats': formats,
'age_limit': 18,
}

View File

@ -37,13 +37,14 @@ class VimeoIE(SubtitlesInfoExtractor):
_TESTS = [
{
'url': 'http://vimeo.com/56015672#at=0',
'file': '56015672.mp4',
'md5': '8879b6cc097e987f02484baf890129e5',
'info_dict': {
"upload_date": "20121220",
"description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
"uploader_id": "user7108434",
"uploader": "Filippo Valsorda",
'id': '56015672',
'ext': 'mp4',
"upload_date": "20121220",
"description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
"uploader_id": "user7108434",
"uploader": "Filippo Valsorda",
"title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
},
},
@ -220,7 +221,9 @@ class VimeoIE(SubtitlesInfoExtractor):
# Extract video thumbnail
video_thumbnail = config["video"].get("thumbnail")
if video_thumbnail is None:
_, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1]
video_thumbs = config["video"].get("thumbs")
if video_thumbs and isinstance(video_thumbs, dict):
_, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1]
# Extract video description
video_description = None

View File

@ -1,8 +1,10 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import unified_strdate
class VineIE(InfoExtractor):
@ -13,31 +15,46 @@ class VineIE(InfoExtractor):
'info_dict': {
'id': 'b9KOOWX7HUx',
'ext': 'mp4',
'uploader': 'Jack Dorsey',
'title': 'Chicken.',
'description': 'Chicken.',
'upload_date': '20130519',
'uploader': 'Jack Dorsey',
'uploader_id': '76',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'https://vine.co/v/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
video_url = self._html_search_meta('twitter:player:stream', webpage,
'video URL')
data = json.loads(self._html_search_regex(
r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
webpage, 'uploader', fatal=False, flags=re.DOTALL)
formats = [
{
'url': data['videoLowURL'],
'ext': 'mp4',
'format_id': 'low',
},
{
'url': data['videoUrl'],
'ext': 'mp4',
'format_id': 'standard',
}
]
return {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'uploader': uploader,
}
'description': data['description'],
'thumbnail': data['thumbnailUrl'],
'upload_date': unified_strdate(data['created']),
'uploader': data['username'],
'uploader_id': data['userIdStr'],
'like_count': data['likes']['count'],
'comment_count': data['comments']['count'],
'repost_count': data['reposts']['count'],
'formats': formats,
}

View File

@ -6,6 +6,9 @@ import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_request,
compat_urllib_parse,
compat_str,
unescapeHTML,
)
@ -13,32 +16,96 @@ from ..utils import (
class VKIE(InfoExtractor):
IE_NAME = 'vk.com'
_VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
_VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk'
_TESTS = [{
'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
'file': '162222515.flv',
'md5': '0deae91935c54e00003c2a00646315f0',
'info_dict': {
'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 'Noize MC',
_TESTS = [
{
'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
'md5': '0deae91935c54e00003c2a00646315f0',
'info_dict': {
'id': '162222515',
'ext': 'flv',
'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 'Noize MC',
'duration': 195,
},
},
},
{
'url': 'http://vk.com/video4643923_163339118',
'file': '163339118.mp4',
'md5': 'f79bccb5cd182b1f43502ca5685b2b36',
'info_dict': {
'uploader': 'Elvira Dzhonik',
'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
{
'url': 'http://vk.com/video4643923_163339118',
'md5': 'f79bccb5cd182b1f43502ca5685b2b36',
'info_dict': {
'id': '163339118',
'ext': 'mp4',
'uploader': 'Elvira Dzhonik',
'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
'duration': 558,
}
},
{
'note': 'Embedded video',
'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
'info_dict': {
'id': '162925554',
'ext': 'mp4',
'uploader': 'Vladimir Gavrin',
'title': 'Lin Dan',
'duration': 101,
}
},
{
'url': 'http://vk.com/video-8871596_164049491',
'md5': 'a590bcaf3d543576c9bd162812387666',
'note': 'Only available for registered users',
'info_dict': {
'id': '164049491',
'ext': 'mp4',
'uploader': 'Триллеры',
'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]\u00a0',
'duration': 8352,
},
'skip': 'Requires vk account credentials',
},
]
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
login_form = {
'act': 'login',
'role': 'al_frame',
'expire': '1',
'email': username,
'pass': password,
}
}]
request = compat_urllib_request.Request('https://login.vk.com/?act=login',
compat_urllib_parse.urlencode(login_form).encode('utf-8'))
login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
if re.search(r'onLoginFailed', login_page):
raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
def _real_initialize(self):
self._login()
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = mobj.group('videoid')
if not video_id:
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
info_page = self._download_webpage(info_url, video_id)
if re.search(r'<!>Please log in or <', info_page):
raise ExtractorError('This video is only available for registered users, '
'use --username and --password options to provide account credentials.', expected=True)
m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
if m_yt is not None:
self.to_screen(u'Youtube video detected')
@ -60,4 +127,5 @@ class VKIE(InfoExtractor):
'title': unescapeHTML(data['md_title']),
'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'),
'duration': data.get('duration')
}

View File

@ -6,14 +6,15 @@ from .common import InfoExtractor
class WimpIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?wimp\.com/([^/]+)/'
_VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/'
_TEST = {
'url': 'http://www.wimp.com/deerfence/',
'file': 'deerfence.flv',
'md5': '8b215e2e0168c6081a1cf84b2846a2b5',
'url': 'http://www.wimp.com/maruexhausted/',
'md5': 'f1acced123ecb28d9bb79f2479f2b6a1',
'info_dict': {
"title": "Watch Till End: Herd of deer jump over a fence.",
"description": "These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.",
'id': 'maruexhausted',
'ext': 'flv',
'title': 'Maru is exhausted.',
'description': 'md5:57e099e857c0a4ea312542b684a869b8',
}
}
@ -30,4 +31,4 @@ class WimpIE(InfoExtractor):
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
}
}

View File

@ -22,8 +22,8 @@ class WorldStarHipHopIE(InfoExtractor):
webpage_src = self._download_webpage(url, video_id)
m_vevo_id = re.search(r'videoId=(.*?)&amp?',
webpage_src)
webpage_src)
if m_vevo_id is not None:
self.to_screen(u'Vevo video detected:')
return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')

View File

@ -4,51 +4,51 @@ import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
unified_strdate,
str_to_int,
int_or_none,
parse_duration,
)
class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
_VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
_TESTS = [{
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
'file': '1509445.mp4',
'md5': '8281348b8d3c53d39fffb377d24eac4e',
'info_dict': {
"upload_date": "20121014",
"uploader_id": "Ruseful2011",
"title": "FemaleAgent Shy beauty takes the bait",
"age_limit": 18,
_VALID_URL = r'http://(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
_TESTS = [
{
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
'md5': '8281348b8d3c53d39fffb377d24eac4e',
'info_dict': {
'id': '1509445',
'ext': 'mp4',
'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014',
'uploader_id': 'Ruseful2011',
'duration': 893,
'age_limit': 18,
}
},
{
'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
'md5': '4cbd8d56708ecb4fb4124c23e4acb81a',
'info_dict': {
'id': '2221348',
'ext': 'mp4',
'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914',
'uploader_id': 'jojo747400',
'duration': 200,
'age_limit': 18,
}
}
},
{
'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
'file': '2221348.flv',
'md5': 'e767b9475de189320f691f49c679c4c7',
'info_dict': {
"upload_date": "20130914",
"uploader_id": "jojo747400",
"title": "Britney Spears Sexy Booty",
"age_limit": 18,
}
}]
]
def _real_extract(self,url):
def extract_video_url(webpage):
mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
if mobj is None:
raise ExtractorError('Unable to extract media URL')
if len(mobj.group('server')) == 0:
return compat_urllib_parse.unquote(mobj.group('file'))
else:
return mobj.group('server')+'/key='+mobj.group('file')
def extract_mp4_video_url(webpage):
mp4 = re.search(r'<a href=\"(.+?)\" class=\"mp4Play\"',webpage)
mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
if mp4 is None:
return None
raise ExtractorError('Unable to extract media URL')
else:
return mp4.group(1)
@ -62,50 +62,49 @@ class XHamsterIE(InfoExtractor):
mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
webpage = self._download_webpage(mrss_url, video_id)
video_title = self._html_search_regex(
r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
# Only a few videos have an description
mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
video_description = mobj.group(1) if mobj else None
description = mobj.group(1) if mobj else None
mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
if mobj:
video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
else:
video_upload_date = None
self._downloader.report_warning('Unable to extract upload date')
upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',
webpage, 'upload date', fatal=False)
if upload_date:
upload_date = unified_strdate(upload_date)
video_uploader_id = self._html_search_regex(
r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
webpage, 'uploader id', default='anonymous')
video_thumbnail = self._search_regex(
r'\'image\':\'(?P<thumbnail>[^\']+)\'',
webpage, 'thumbnail', fatal=False)
thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)
duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
webpage, 'duration', fatal=False))
view_count = self._html_search_regex(r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)
(like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)
mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
comment_count = mobj.group('commentcount') if mobj else 0
age_limit = self._rta_search(webpage)
hd = is_hd(webpage)
video_url = extract_video_url(webpage)
formats = [{
'url': video_url,
'format_id': 'hd' if hd else 'sd',
'preference': 0,
'preference': 1,
}]
video_mp4_url = extract_mp4_video_url(webpage)
if video_mp4_url is not None:
formats.append({
'url': video_mp4_url,
'ext': 'mp4',
'format_id': 'mp4-hd' if hd else 'mp4-sd',
'preference': 1,
})
if not hd:
webpage = self._download_webpage(
mrss_url + '?hd', video_id, note='Downloading HD webpage')
mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
if is_hd(webpage):
video_url = extract_video_url(webpage)
formats.append({
@ -118,11 +117,16 @@ class XHamsterIE(InfoExtractor):
return {
'id': video_id,
'title': video_title,
'formats': formats,
'description': video_description,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'thumbnail': video_thumbnail,
'title': title,
'description': description,
'upload_date': upload_date,
'uploader_id': uploader_id,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
'like_count': int_or_none(like_count),
'dislike_count': int_or_none(dislike_count),
'comment_count': int_or_none(comment_count),
'age_limit': age_limit,
'formats': formats,
}

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import os
import re
@ -5,19 +7,24 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
parse_duration,
str_to_int,
)
class XTubeIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_TEST = {
u'url': u'http://www.xtube.com/watch.php?v=kVTUy_G222_',
u'file': u'kVTUy_G222_.mp4',
u'md5': u'092fbdd3cbe292c920ef6fc6a8a9cdab',
u'info_dict': {
u"title": u"strange erotica",
u"description": u"surreal gay themed erotica...almost an ET kind of thing",
u"uploader": u"greenshowers",
u"age_limit": 18,
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
'info_dict': {
'id': 'kVTUy_G222_',
'ext': 'mp4',
'title': 'strange erotica',
'description': 'surreal gay themed erotica...almost an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
'age_limit': 18,
}
}
@ -30,10 +37,23 @@ class XTubeIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title')
video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False)
video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', fatal=False)
video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/')
video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(
r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
duration = parse_duration(self._html_search_regex(
r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
view_count = self._html_search_regex(
r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = self._html_search_regex(
r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
if comment_count:
comment_count = str_to_int(comment_count)
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
@ -46,6 +66,9 @@ class XTubeIE(InfoExtractor):
'title': video_title,
'uploader': video_uploader,
'description': video_description,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
'url': video_url,
'ext': extension,
'format': format,

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import itertools
import json
import re
@ -12,25 +14,25 @@ from ..utils import (
class YahooIE(InfoExtractor):
IE_DESC = u'Yahoo screen'
IE_DESC = 'Yahoo screen'
_VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
_TESTS = [
{
u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
u'file': u'214727115.mp4',
u'md5': u'4962b075c08be8690a922ee026d05e69',
u'info_dict': {
u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
u'description': u'Julian and Travis watch Julian Smith',
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
'file': '214727115.mp4',
'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': {
'title': 'Julian Smith & Travis Legg Watch Julian Smith',
'description': 'Julian and Travis watch Julian Smith',
},
},
{
u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
u'file': u'103000935.mp4',
u'md5': u'd6e6fc6e1313c608f316ddad7b82b306',
u'info_dict': {
u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
'file': '103000935.mp4',
'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
'info_dict': {
'title': 'Codefellas - The Cougar Lies with Spanish Moss',
'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
},
]
@ -41,7 +43,7 @@ class YahooIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
items_json = self._search_regex(r'mediaItems: ({.*?})$',
webpage, u'items', flags=re.MULTILINE)
webpage, 'items', flags=re.MULTILINE)
items = json.loads(items_json)
info = items['mediaItems']['query']['results']['mediaObj'][0]
# The 'meta' field is not always in the video webpage, we request it
@ -60,7 +62,7 @@ class YahooIE(InfoExtractor):
})
query_result_json = self._download_webpage(
'http://video.query.yahoo.com/v1/public/yql?' + data,
video_id, u'Downloading video info')
video_id, 'Downloading video info')
query_result = json.loads(query_result_json)
info = query_result['query']['results']['mediaObj'][0]
meta = info['meta']
@ -103,13 +105,13 @@ class YahooNewsIE(YahooIE):
_VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
_TEST = {
u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
u'md5': u'67010fdf3a08d290e060a4dd96baa07b',
u'info_dict': {
u'id': u'104538833',
u'ext': u'mp4',
u'title': u'China Moses Is Crazy About the Blues',
u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0',
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
'md5': '67010fdf3a08d290e060a4dd96baa07b',
'info_dict': {
'id': '104538833',
'ext': 'mp4',
'title': 'China Moses Is Crazy About the Blues',
'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
},
}
@ -120,14 +122,14 @@ class YahooNewsIE(YahooIE):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id')
long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')
return self._get_info(long_id, video_id)
class YahooSearchIE(SearchInfoExtractor):
IE_DESC = u'Yahoo screen search'
IE_DESC = 'Yahoo screen search'
_MAX_RESULTS = 1000
IE_NAME = u'screen.yahoo:search'
IE_NAME = 'screen.yahoo:search'
_SEARCH_KEY = 'yvsearch'
def _get_n_results(self, query, n):
@ -139,12 +141,12 @@ class YahooSearchIE(SearchInfoExtractor):
'entries': []
}
for pagenum in itertools.count(0):
result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
webpage = self._download_webpage(result_url, query,
note='Downloading results page '+str(pagenum+1))
info = json.loads(webpage)
m = info[u'm']
results = info[u'results']
m = info['m']
results = info['results']
for (i, r) in enumerate(results):
if (pagenum * 30) +i >= n:
@ -152,7 +154,7 @@ class YahooSearchIE(SearchInfoExtractor):
mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
res['entries'].append(e)
if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1)):
if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)):
break
return res

Some files were not shown because too many files have changed in this diff Show More