Compare commits

...

123 Commits

Author SHA1 Message Date
0db492c02a release 2017.07.23 2017-07-23 01:09:09 +07:00
425f41319a [ChangeLog] Actualize 2017-07-23 01:06:08 +07:00
71dde5eecf [itv] Fix production id extraction (closes #13671) 2017-07-23 00:59:07 +07:00
935d6c20c0 [vidio] Make duration non fatal and fix typo 2017-07-23 00:44:50 +07:00
e0f1fb0a27 [mtv] Skip missing video parts (closes #13690) 2017-07-23 00:25:23 +07:00
0017d9ad6d [YoutubeDL] Improve default format specification (closes #13704) 2017-07-23 00:12:01 +07:00
327c8364f1 [sportbox:embed] Fix extraction 2017-07-22 21:35:14 +07:00
359aa2fdd1 [npo] Add support for npo3.nl URLs 2017-07-22 19:15:55 +07:00
f76c02c87b [dramafever] Fix tests 2017-07-22 11:41:40 +07:00
7d9a1db111 [dramafever] Remove video id from title (closes #13699) 2017-07-22 11:40:46 +07:00
0396806f67 [YoutubeDL] Do not override id, extractor and extractor_key in url_transparent
All these meta fields must be borrowed from final extractor that actually performs extraction.
This commit fixes extractor id in download archives for url_transparent downloads. Previously, 'transparent' extractor was erroneously
used for extractor archive id, e.g. 'eggheadlesson 4n8ugwwj5t' instead of 'wistia 4n8ugwwj5t'.
2017-07-21 00:13:32 +07:00
dc6520aa3d [egghead:lesson] Add extractor (#6635) 2017-07-20 23:22:36 +07:00
c653326a14 [funnyordie] Extract more metadata (closes #13677) 2017-07-20 22:50:56 +07:00
3fcf346ac1 [youku:show] Refine playlist extraction
Handle playlists whose initial page is not the first page
2017-07-20 23:20:46 +08:00
fa63cf6c23 [youku:show] Fix playlist extraction (closes #13248) 2017-07-20 22:57:51 +08:00
85f5a74b6c [tbs] Mark as broken and skip invalid tests 2017-07-20 21:19:09 +08:00
d20b1c6725 [dispeak] Recognize sevt subdomain (closes #13276) 2017-07-20 18:14:14 +08:00
bb176df3bb [spiegel:article] Move test 2017-07-17 22:19:40 +07:00
83d00044c1 [adn] Improve error reporting (#13663) 2017-07-16 20:50:32 +07:00
7abed4e06c [crunchyroll] Relax series and season regex (closes #13659) 2017-07-16 12:40:45 +07:00
13eb526f11 [nexx:embed] PEP 8 2017-07-16 05:23:19 +07:00
00d06e3cfc [spiegel:article] Add support for nexx iframe embeds (closes #13029) 2017-07-16 04:38:20 +07:00
749ca5eced [extractor/common] Fix playlist_from_matches 2017-07-16 04:33:14 +07:00
3f59b0154a [nexx:embed] Add extractor for iframe embeds 2017-07-16 04:32:37 +07:00
089b97cfee [nexx] Improve JS embed extraction 2017-07-16 04:30:48 +07:00
decf86044d [pearvideo] Improve (closes #13031) 2017-07-16 03:06:04 +07:00
94b817edeb [pearvideo] Add extractor 2017-07-16 03:02:31 +07:00
cea931a9e5 release 2017.07.15 2017-07-15 07:36:05 +07:00
ef78563e9c [ChangeLog] Actualize 2017-07-15 07:33:26 +07:00
961ea474b6 [YoutubeDL] PEP 8 2017-07-15 07:02:57 +07:00
ea3f20494f [youtube] PEP 8 2017-07-15 07:02:57 +07:00
c7604d79e9 [spiegeltv] Delegate extraction to nexx (closes #13159) 2017-07-15 07:02:57 +07:00
4e826cd9ae [nexx] Add extractor (closes #10807, closes #13465) 2017-07-15 07:02:57 +07:00
2583c0b54e Fix bugs caused by typos 2017-07-14 23:08:32 +07:00
7d02dcfaa2 [youtube] Don't capture YouTube Red ad for creator meta field (closes #13621) 2017-07-14 22:37:04 +07:00
00dbdfc1f7 [slideshare] Fix extraction 2017-07-14 22:11:07 +07:00
f354d84807 [5tv] Add another video URL pattern (closes #13354) 2017-07-14 22:10:17 +07:00
15da37c7dc [YoutubeDL] Don't expand env variables in meta fields (closes #13637) 2017-07-14 00:42:12 +07:00
9a0942ad55 [drtv] Make HLS and HDS extraction non fatal 2017-07-11 22:59:56 +07:00
f2bb33a986 [ted] Fix subtitles extraction (closes #13628, closes #13629) 2017-07-11 21:36:45 +07:00
3615bfe1b4 [twitter] Fix remaining tests 2017-07-11 16:46:37 +08:00
e8f20ffa03 [vine] Make sure the title won't be empty
And fix a relevant TwitterCard test case
2017-07-11 16:05:15 +08:00
9be31e771c [twitter] Support HLS streams in vmap URLs 2017-07-11 15:48:48 +08:00
7f176ac477 [periscope] Support pscp.tv URLs in embedded frames
And fix a relevant twitter test
2017-07-11 15:35:19 +08:00
2edfd745df [twitter] Extract mp4 urls via mobile API (closes #12726) 2017-07-11 15:19:36 +08:00
708f6f511e [niconico] Fix authentication error handling (closes #12486) 2017-07-11 15:04:45 +08:00
bb13949197 [niconico] Check login errors (#12486) 2017-07-11 15:03:11 +08:00
c3c94ca4a4 [giantbomb] Extract m3u8 formats (closes #13626) 2017-07-10 21:34:27 +08:00
e3cd1fcdd1 [vlive:playlist] Relax and simplify 2017-07-10 04:32:24 +07:00
b71c18b434 [vlive:playlist] Add extractor 2017-07-10 04:24:04 +07:00
7bf539edcc [eagleplatform] Fix test 2017-07-10 00:14:41 +07:00
65c416dda8 release 2017.07.09 2017-07-09 20:16:38 +07:00
207acd8465 [ChangeLog] Actualize 2017-07-09 20:15:15 +07:00
71a1db8919 [dailymail] Add support for embeds 2017-07-09 20:06:24 +07:00
6e925598d6 [cjsw] Add coding cookie 2017-07-09 19:18:12 +07:00
73cf76a93f [joj] Rewrite and add support for generic embeds (closes #13268) 2017-07-09 19:17:54 +07:00
256a746d21 [joj] Add extractor 2017-07-09 19:17:38 +07:00
58179eb7d9 [abc.net.au:iview] Extract more formats (closes #13492, closes #13489) 2017-07-09 17:55:40 +07:00
485cb37576 [egghead:course] Improve (closes #13370) 2017-07-09 17:30:49 +07:00
ed84454d35 [egghead:course] Fix extraction 2017-07-09 17:30:25 +07:00
a02682fd13 Keep in sync with ffmpeg's current malformed AAC bitstream wording (closes #13587) 2017-07-09 17:09:44 +07:00
0d2f0b0357 [cjsw] Make description optional 2017-07-09 17:05:11 +07:00
c319d1c483 [cjsw] Fix issues and improve extraction (closes #13525) 2017-07-09 17:01:05 +07:00
d2b9f362fa [cjsw] Add extractor 2017-07-09 17:01:00 +07:00
4328ddf82b [extractor/common] Add support for AMP tags in _parse_html5_media_entries 2017-07-09 16:29:52 +07:00
250b042c7e [generic] Add tests for #13557 2017-07-09 16:02:38 +07:00
665e945246 [eagleplatform] Add support for referrer protected videos (closes #13557) 2017-07-09 15:57:58 +07:00
5af2fd7fa0 [eagleplatform] Add support for another embed pattern (#13557) 2017-07-09 15:55:04 +07:00
15237fcd51 [veoh] Extend _VALID_URL 2017-07-09 14:54:52 +07:00
7a57730907 [npo:live] Fix live stream id extraction (closes #13568) 2017-07-09 14:21:40 +07:00
8b347a389e [googledrive] Fix height extraction (closes #13603) 2017-07-09 00:26:13 +07:00
a49804816c [dailymotion] Add support for new layout (closes #13580) 2017-07-08 18:12:15 +07:00
eadd313321 [yam] Remove extractor
mymedia.yam.com is dead. A Wikipedia user also pointed out that Yam's
blog service is no longer available. [1]

[1] https://zh.wikipedia.org/zh-tw/%E5%A4%A9%E7%A9%BA%E9%83%A8%E8%90%BD
2017-07-08 15:48:05 +08:00
d852c6bc59 [xhamster] Extract all formats and fix duration extraction (#13593) 2017-07-07 22:49:11 +07:00
00e5c36315 [xhamster] Add support for new URL schema (closes #13593) 2017-07-07 22:27:34 +07:00
8a04ade86b Credit @parmjitv for #13322, #13503, #13541, #13549 2017-07-06 23:15:23 +07:00
ab328411d5 Credit @orng for ruv (#13396) 2017-07-06 23:15:16 +07:00
ddeff4be3f Credit @gfabiano for #13382, #13385, #13415 2017-07-06 23:15:09 +07:00
60d4401c5e [espn] Extend _VALID_URL (fixes #13244) 2017-07-06 22:55:59 +07:00
dee2ff1d81 [test_utils] Fix tests under Windows 2017-07-06 00:25:37 +07:00
6554708252 [kaltura] Fix typo in subtitles extraction (closes #13569) 2017-07-05 23:20:50 +07:00
0a2e1b2e30 [vier] Adapt extraction to redesign (#13575) 2017-07-05 22:52:47 +07:00
babbc04d45 [xuite] Move to the new HTML5 API and reduce # of requests 2017-07-05 23:27:12 +08:00
609ff8ca19 [utils] Support attributes with no values in get_elements_by_attribute() 2017-07-05 23:27:12 +08:00
b6c9fe4162 release 2017.07.02 2017-07-02 20:17:10 +07:00
4d9ba27bba [ChangeLog] Actualize 2017-07-02 20:12:40 +07:00
50ae3f646e [thisoldhouse] Add more fallbacks for video id (closes #13541) 2017-07-02 20:06:15 +07:00
99a7e76240 [thisoldhouse] Update test 2017-07-02 20:05:11 +07:00
a3a6d01a96 [thisoldhouse] Fix video id extraction (closes #13540) 2017-07-02 20:04:51 +07:00
02d61a65e2 [xfileshare] Extend format regex (closes #13536) 2017-07-02 08:00:22 +07:00
9b35297be1 [extractors] Add import for tastytrade 2017-07-01 18:39:29 +07:00
4917478803 [ted] Fix extraction (closes #13535) 2017-07-01 18:39:01 +07:00
54faac2235 [tastytrade] Add extractor (closes #13521) 2017-06-30 22:20:30 +07:00
c69701c6ab [extractor/common] Improve _json_ld 2017-06-30 22:19:06 +07:00
d4f8ce6e91 [dplayit] Relax video id regex (closes #13524) 2017-06-30 21:55:45 +07:00
b311b0ead2 [generic] Extract more generic metadata (closes #13527) 2017-06-30 21:42:04 +07:00
72d256c434 [bbccouk] Extend _VALID_URL 2017-06-29 22:29:28 +07:00
b2ed954fc6 [bbccouk] Capture and output error message (closes #13518) 2017-06-29 22:27:53 +07:00
a919ca0ad6 [cbsnews] Actualize test 2017-06-28 22:30:12 +07:00
88d6b7c2bd [cbsnews] Relax video info regex (fixes #13284) 2017-06-28 22:21:35 +07:00
fd1c5fba6b [facebook] Add test for plugin video embed (#13493) 2017-06-27 22:38:59 +07:00
0646e34c7d [facebook] Add support for plugin video embeds and multiple embeds (closes #13493) 2017-06-27 22:38:54 +07:00
bf2dc9cc6e [soundcloud] Fix tests 2017-06-27 21:26:46 +07:00
f1c051009b [soundcloud] Switch to https for API requests 2017-06-27 21:20:18 +07:00
33ffb645a6 [pandatv] Switch to https for API and download URLs 2017-06-26 22:11:09 +07:00
35544690e4 [pandatv] Add support for https URLs 2017-06-26 22:00:31 +07:00
136503e302 [ChangeLog] Update after #13494 2017-06-26 19:56:07 +08:00
4a87de72df [niconico] fix sp subdomain links 2017-06-25 21:30:05 +02:00
a7ce8f16c4 release 2017.06.25 2017-06-25 05:16:06 +07:00
a5aea53fc8 [ChangeLog] Actualize 2017-06-25 05:13:12 +07:00
0c7a631b61 [adobepass] Add support for ATTOTT MSO (DIRECTV NOW) (closes #13472) 2017-06-25 05:03:17 +07:00
fd9ee4de8c [wsj] Add support for barrons.com (closes #13470) 2017-06-25 02:15:35 +07:00
5744cf6c03 [ign] Add another video id pattern (closes #13328) 2017-06-25 01:59:15 +07:00
9c48b5a193 [raiplay:live] Improve and add test (closes #13414) 2017-06-25 01:49:27 +07:00
449c665776 [raiplay:live] Add extractor 2017-06-25 01:48:54 +07:00
23aec3d623 [redbulltv] Restore hls format prefix 2017-06-25 01:10:31 +07:00
27449ad894 [redbulltv] Add support for lives and segments (closes #13486) 2017-06-25 01:09:12 +07:00
bd65f18153 [onetpl] Add support for videos embedded via pulsembed (closes #13482) 2017-06-24 18:33:31 +07:00
73af5cc817 [YoutubeDL] Skip malformed formats for better extraction robustness 2017-06-23 21:18:33 +07:00
b5f523ed62 [ooyala] Add test for missing stream['url']['data'] 2017-06-23 20:56:48 +07:00
4f4dd8d797 [ooyala] Make more robust 2017-06-23 20:56:21 +07:00
4cb18ab1b9 [ooyala] Skip empty format URLs (closes #13471, closes #13476) 2017-06-23 20:50:48 +07:00
ac7409eec5 [hgtv.com:show] Fix typo 2017-06-23 02:54:12 +07:00
77 changed files with 2084 additions and 754 deletions

View File

@ -6,8 +6,8 @@
--- ---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.06.23*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. ### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.07.23*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.06.23** - [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.07.23**
### Before submitting an *issue* make sure you have: ### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
[debug] User config: [] [debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2017.06.23 [debug] youtube-dl version 2017.07.23
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {} [debug] Proxy map: {}

View File

@ -220,3 +220,6 @@ gritstub
Adam Voss Adam Voss
Mike Fährmann Mike Fährmann
Jan Kundrát Jan Kundrát
Giuseppe Fabiano
Örn Guðjónsson
Parmjit Virk

116
ChangeLog
View File

@ -1,3 +1,119 @@
version 2017.07.23
Core
* [YoutubeDL] Improve default format specification (#13704)
* [YoutubeDL] Do not override id, extractor and extractor_key for
url_transparent entities
* [extractor/common] Fix playlist_from_matches
Extractors
* [itv] Fix production id extraction (#13671, #13703)
* [vidio] Make duration non fatal and fix typo
* [mtv] Skip missing video parts (#13690)
* [sportbox:embed] Fix extraction
+ [npo] Add support for npo3.nl URLs (#13695)
* [dramafever] Remove video id from title (#13699)
+ [egghead:lesson] Add support for lessons (#6635)
* [funnyordie] Extract more metadata (#13677)
* [youku:show] Fix playlist extraction (#13248)
+ [dispeak] Recognize sevt subdomain (#13276)
* [adn] Improve error reporting (#13663)
* [crunchyroll] Relax series and season regex (#13659)
+ [spiegel:article] Add support for nexx iframe embeds (#13029)
+ [nexx:embed] Add support for iframe embeds
* [nexx] Improve JS embed extraction
+ [pearvideo] Add support for pearvideo.com (#13031)
version 2017.07.15
Core
* [YoutubeDL] Don't expand environment variables in meta fields (#13637)
Extractors
* [spiegeltv] Delegate extraction to nexx extractor (#13159)
+ [nexx] Add support for nexx.cloud (#10807, #13465)
* [generic] Fix rutube embeds extraction (#13641)
* [karrierevideos] Fix title extraction (#13641)
* [youtube] Don't capture YouTube Red ad for creator meta field (#13621)
* [slideshare] Fix extraction (#13617)
+ [5tv] Add another video URL pattern (#13354, #13606)
* [drtv] Make HLS and HDS extraction non fatal
* [ted] Fix subtitles extraction (#13628, #13629)
* [vine] Make sure the title won't be empty
+ [twitter] Support HLS streams in vmap URLs
+ [periscope] Support pscp.tv URLs in embedded frames
* [twitter] Extract mp4 urls via mobile API (#12726)
* [niconico] Fix authentication error handling (#12486)
* [giantbomb] Extract m3u8 formats (#13626)
+ [vlive:playlist] Add support for playlists (#13613)
version 2017.07.09
Core
+ [extractor/common] Add support for AMP tags in _parse_html5_media_entries
+ [utils] Support attributes with no values in get_elements_by_attribute
Extractors
+ [dailymail] Add support for embeds
+ [joj] Add support for joj.sk (#13268)
* [abc.net.au:iview] Extract more formats (#13492, #13489)
* [egghead:course] Fix extraction (#6635, #13370)
+ [cjsw] Add support for cjsw.com (#13525)
+ [eagleplatform] Add support for referrer protected videos (#13557)
+ [eagleplatform] Add support for another embed pattern (#13557)
* [veoh] Extend URL regular expression (#13601)
* [npo:live] Fix live stream id extraction (#13568, #13605)
* [googledrive] Fix height extraction (#13603)
+ [dailymotion] Add support for new layout (#13580)
- [yam] Remove extractor
* [xhamster] Extract all formats and fix duration extraction (#13593)
+ [xhamster] Add support for new URL schema (#13593)
* [espn] Extend URL regular expression (#13244, #13549)
* [kaltura] Fix typo in subtitles extraction (#13569)
* [vier] Adapt extraction to redesign (#13575)
version 2017.07.02
Core
* [extractor/common] Improve _json_ld
Extractors
+ [thisoldhouse] Add more fallbacks for video id
* [thisoldhouse] Fix video id extraction (#13540, #13541)
* [xfileshare] Extend format regular expression (#13536)
* [ted] Fix extraction (#13535)
+ [tastytrade] Add support for tastytrade.com (#13521)
* [dplayit] Relax video id regular expression (#13524)
+ [generic] Extract more generic metadata (#13527)
+ [bbccouk] Capture and output error message (#13501, #13518)
* [cbsnews] Relax video info regular expression (#13284, #13503)
+ [facebook] Add support for plugin video embeds and multiple embeds (#13493)
* [soundcloud] Switch to https for API requests (#13502)
* [pandatv] Switch to https for API and download URLs
+ [pandatv] Add support for https URLs (#13491)
+ [niconico] Support sp subdomain (#13494)
version 2017.06.25
Core
+ [adobepass] Add support for DIRECTV NOW (mso ATTOTT) (#13472)
* [YoutubeDL] Skip malformed formats for better extraction robustness
Extractors
+ [wsj] Add support for barrons.com (#13470)
+ [ign] Add another video id pattern (#13328)
+ [raiplay:live] Add support for live streams (#13414)
+ [redbulltv] Add support for live videos and segments (#13486)
+ [onetpl] Add support for videos embedded via pulsembed (#13482)
* [ooyala] Make more robust
* [ooyala] Skip empty format URLs (#13471, #13476)
* [hgtv.com:show] Fix typo
version 2017.06.23 version 2017.06.23
Core Core

View File

@ -42,7 +42,7 @@
- **Allocine** - **Allocine**
- **AlphaPorno** - **AlphaPorno**
- **AMCNetworks** - **AMCNetworks**
- **anderetijden**: npo.nl and ntr.nl - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **AnimeOnDemand** - **AnimeOnDemand**
- **anitube.se** - **anitube.se**
- **Anvato** - **Anvato**
@ -154,6 +154,7 @@
- **chirbit** - **chirbit**
- **chirbit:profile** - **chirbit:profile**
- **Cinchcast** - **Cinchcast**
- **CJSW**
- **Clipfish** - **Clipfish**
- **cliphunter** - **cliphunter**
- **ClipRs** - **ClipRs**
@ -237,6 +238,7 @@
- **EbaumsWorld** - **EbaumsWorld**
- **EchoMsk** - **EchoMsk**
- **egghead:course**: egghead.io course - **egghead:course**: egghead.io course
- **egghead:lesson**: egghead.io lesson
- **eHow** - **eHow**
- **Einthusan** - **Einthusan**
- **eitb.tv** - **eitb.tv**
@ -369,6 +371,7 @@
- **Jamendo** - **Jamendo**
- **JamendoAlbum** - **JamendoAlbum**
- **JeuxVideo** - **JeuxVideo**
- **Joj**
- **Jove** - **Jove**
- **jpopsuki.tv** - **jpopsuki.tv**
- **JWPlatform** - **JWPlatform**
@ -519,6 +522,8 @@
- **NextMedia**: 蘋果日報 - **NextMedia**: 蘋果日報
- **NextMediaActionNews**: 蘋果日報 - 動新聞 - **NextMediaActionNews**: 蘋果日報 - 動新聞
- **NextTV**: 壹電視 - **NextTV**: 壹電視
- **Nexx**
- **NexxEmbed**
- **nfb**: National Film Board of Canada - **nfb**: National Film Board of Canada
- **nfl.com** - **nfl.com**
- **NhkVod** - **NhkVod**
@ -549,7 +554,7 @@
- **NowTVList** - **NowTVList**
- **nowvideo**: NowVideo - **nowvideo**: NowVideo
- **Noz** - **Noz**
- **npo**: npo.nl and ntr.nl - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **npo.nl:live** - **npo.nl:live**
- **npo.nl:radio** - **npo.nl:radio**
- **npo.nl:radio:fragment** - **npo.nl:radio:fragment**
@ -593,6 +598,7 @@
- **Patreon** - **Patreon**
- **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pbs**: Public 
Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC)
- **pcmag** - **pcmag**
- **PearVideo**
- **People** - **People**
- **periscope**: Periscope - **periscope**: Periscope
- **periscope:user**: Periscope user videos - **periscope:user**: Periscope user videos
@ -644,6 +650,7 @@
- **RadioJavan** - **RadioJavan**
- **Rai** - **Rai**
- **RaiPlay** - **RaiPlay**
- **RaiPlayLive**
- **RBMARadio** - **RBMARadio**
- **RDS**: RDS.ca - **RDS**: RDS.ca
- **RedBullTV** - **RedBullTV**
@ -767,7 +774,8 @@
- **Tagesschau** - **Tagesschau**
- **tagesschau:player** - **tagesschau:player**
- **Tass** - **Tass**
- **TBS** - **TastyTrade**
- **TBS** (Currently broken)
- **TDSLifeway** - **TDSLifeway**
- **teachertube**: teachertube.com videos - **teachertube**: teachertube.com videos
- **teachertube:user:collection**: teachertube.com user and collection videos - **teachertube:user:collection**: teachertube.com user and collection videos
@ -938,13 +946,14 @@
- **vk:wallpost** - **vk:wallpost**
- **vlive** - **vlive**
- **vlive:channel** - **vlive:channel**
- **vlive:playlist**
- **Vodlocker** - **Vodlocker**
- **VODPl** - **VODPl**
- **VODPlatform** - **VODPlatform**
- **VoiceRepublic** - **VoiceRepublic**
- **VoxMedia** - **VoxMedia**
- **Vporn** - **Vporn**
- **vpro**: npo.nl and ntr.nl - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **Vrak** - **Vrak**
- **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be
- **vrv** - **vrv**
@ -970,7 +979,7 @@
- **wholecloud**: WholeCloud - **wholecloud**: WholeCloud
- **Wimp** - **Wimp**
- **Wistia** - **Wistia**
- **wnl**: npo.nl and ntr.nl - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **WorldStarHipHop** - **WorldStarHipHop**
- **wrzuta.pl** - **wrzuta.pl**
- **wrzuta.pl:playlist** - **wrzuta.pl:playlist**
@ -994,7 +1003,6 @@
- **XVideos** - **XVideos**
- **XXXYMovies** - **XXXYMovies**
- **Yahoo**: Yahoo screen and movies - **Yahoo**: Yahoo screen and movies
- **Yam**: 蕃薯藤yam天空部落
- **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:album**: Яндекс.Музыка - Альбом
- **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист
- **yandexmusic:track**: Яндекс.Музыка - Трек - **yandexmusic:track**: Яндекс.Музыка - Трек

View File

@ -41,6 +41,7 @@ def _make_result(formats, **kwargs):
'id': 'testid', 'id': 'testid',
'title': 'testttitle', 'title': 'testttitle',
'extractor': 'testex', 'extractor': 'testex',
'extractor_key': 'TestEx',
} }
res.update(**kwargs) res.update(**kwargs)
return res return res
@ -448,6 +449,17 @@ class TestFormatSelection(unittest.TestCase):
pass pass
self.assertEqual(ydl.downloaded_info_dicts, []) self.assertEqual(ydl.downloaded_info_dicts, [])
def test_default_format_spec(self):
ydl = YDL({'simulate': True})
self.assertEqual(ydl._default_format_spec({}), 'bestvideo+bestaudio/best')
ydl = YDL({'outtmpl': '-'})
self.assertEqual(ydl._default_format_spec({}), 'best')
ydl = YDL({})
self.assertEqual(ydl._default_format_spec({}, download=False), 'bestvideo+bestaudio/best')
self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best')
class TestYoutubeDL(unittest.TestCase): class TestYoutubeDL(unittest.TestCase):
def test_subtitles(self): def test_subtitles(self):
@ -527,6 +539,8 @@ class TestYoutubeDL(unittest.TestCase):
'ext': 'mp4', 'ext': 'mp4',
'width': None, 'width': None,
'height': 1080, 'height': 1080,
'title1': '$PATH',
'title2': '%PATH%',
} }
def fname(templ): def fname(templ):
@ -545,10 +559,14 @@ class TestYoutubeDL(unittest.TestCase):
self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4') self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4')
self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4') self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4')
self.assertEqual(fname('%(height) 0 6d.%(ext)s'), ' 01080.mp4') self.assertEqual(fname('%(height) 0 6d.%(ext)s'), ' 01080.mp4')
self.assertEqual(fname('%%'), '%')
self.assertEqual(fname('%%%%'), '%%')
self.assertEqual(fname('%%(height)06d.%(ext)s'), '%(height)06d.mp4') self.assertEqual(fname('%%(height)06d.%(ext)s'), '%(height)06d.mp4')
self.assertEqual(fname('%(width)06d.%(ext)s'), 'NA.mp4') self.assertEqual(fname('%(width)06d.%(ext)s'), 'NA.mp4')
self.assertEqual(fname('%(width)06d.%%(ext)s'), 'NA.%(ext)s') self.assertEqual(fname('%(width)06d.%%(ext)s'), 'NA.%(ext)s')
self.assertEqual(fname('%%(width)06d.%(ext)s'), '%(width)06d.mp4') self.assertEqual(fname('%%(width)06d.%(ext)s'), '%(width)06d.mp4')
self.assertEqual(fname('Hello %(title1)s'), 'Hello $PATH')
self.assertEqual(fname('Hello %(title2)s'), 'Hello %PATH%')
def test_format_note(self): def test_format_note(self):
ydl = YoutubeDL() ydl = YoutubeDL()
@ -755,7 +773,8 @@ class TestYoutubeDL(unittest.TestCase):
'_type': 'url_transparent', '_type': 'url_transparent',
'url': 'foo2:', 'url': 'foo2:',
'ie_key': 'Foo2', 'ie_key': 'Foo2',
'title': 'foo1 title' 'title': 'foo1 title',
'id': 'foo1_id',
} }
class Foo2IE(InfoExtractor): class Foo2IE(InfoExtractor):
@ -781,6 +800,9 @@ class TestYoutubeDL(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0] downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['url'], TEST_URL) self.assertEqual(downloaded['url'], TEST_URL)
self.assertEqual(downloaded['title'], 'foo1 title') self.assertEqual(downloaded['title'], 'foo1 title')
self.assertEqual(downloaded['id'], 'testid')
self.assertEqual(downloaded['extractor'], 'testex')
self.assertEqual(downloaded['extractor_key'], 'TestEx')
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -98,6 +98,7 @@ from youtube_dl.compat import (
compat_chr, compat_chr,
compat_etree_fromstring, compat_etree_fromstring,
compat_getenv, compat_getenv,
compat_os_name,
compat_setenv, compat_setenv,
compat_urlparse, compat_urlparse,
compat_parse_qs, compat_parse_qs,
@ -448,7 +449,9 @@ class TestUtil(unittest.TestCase):
def test_shell_quote(self): def test_shell_quote(self):
args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')] args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')]
self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""") self.assertEqual(
shell_quote(args),
"""ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''')
def test_str_to_int(self): def test_str_to_int(self):
self.assertEqual(str_to_int('123,456'), 123456) self.assertEqual(str_to_int('123,456'), 123456)
@ -932,7 +935,7 @@ class TestUtil(unittest.TestCase):
def test_args_to_str(self): def test_args_to_str(self):
self.assertEqual( self.assertEqual(
args_to_str(['foo', 'ba/r', '-baz', '2 be', '']), args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
'foo ba/r -baz \'2 be\' \'\'' 'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""'
) )
def test_parse_filesize(self): def test_parse_filesize(self):
@ -1228,6 +1231,12 @@ part 3</font></u>
self.assertEqual(get_element_by_attribute('class', 'foo', html), None) self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
html = '''
<div itemprop="author" itemscope>foo</div>
'''
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
def test_get_elements_by_class(self): def test_get_elements_by_class(self):
html = ''' html = '''
<span class="foo bar">nice</span><span class="foo bar">also nice</span> <span class="foo bar">nice</span><span class="foo bar">also nice</span>

View File

@ -26,6 +26,8 @@ import tokenize
import traceback import traceback
import random import random
from string import ascii_letters
from .compat import ( from .compat import (
compat_basestring, compat_basestring,
compat_cookiejar, compat_cookiejar,
@ -674,7 +676,19 @@ class YoutubeDL(object):
FORMAT_RE.format(numeric_field), FORMAT_RE.format(numeric_field),
r'%({0})s'.format(numeric_field), outtmpl) r'%({0})s'.format(numeric_field), outtmpl)
filename = expand_path(outtmpl % template_dict) # expand_path translates '%%' into '%' and '$$' into '$'
# respectively, which is not what we want since we need to keep
# '%%' intact for the template dict substitution step. Working around
# this with a boundary-like separator hack.
sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
# outtmpl should be expand_path'ed before template dict substitution
# because meta fields may contain env variables we don't want to
# be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
# title "Hello $PATH", we don't want `$PATH` to be expanded.
filename = expand_path(outtmpl).replace(sep, '') % template_dict
# Temporary fix for #4787 # Temporary fix for #4787
# 'Treat' all problem characters by passing filename through preferredencoding # 'Treat' all problem characters by passing filename through preferredencoding
# to workaround encoding issues with subprocess on python2 @ Windows # to workaround encoding issues with subprocess on python2 @ Windows
@ -846,7 +860,7 @@ class YoutubeDL(object):
force_properties = dict( force_properties = dict(
(k, v) for k, v in ie_result.items() if v is not None) (k, v) for k, v in ie_result.items() if v is not None)
for f in ('_type', 'url', 'ie_key'): for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
if f in force_properties: if f in force_properties:
del force_properties[f] del force_properties[f]
new_result = info.copy() new_result = info.copy()
@ -1050,6 +1064,25 @@ class YoutubeDL(object):
return op(actual_value, comparison_value) return op(actual_value, comparison_value)
return _filter return _filter
def _default_format_spec(self, info_dict, download=True):
req_format_list = []
def can_have_partial_formats():
if self.params.get('simulate', False):
return True
if not download:
return True
if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
return False
if info_dict.get('is_live'):
return False
merger = FFmpegMergerPP(self)
return merger.available and merger.can_merge()
if can_have_partial_formats():
req_format_list.append('bestvideo+bestaudio')
req_format_list.append('best')
return '/'.join(req_format_list)
def build_format_selector(self, format_spec): def build_format_selector(self, format_spec):
def syntax_error(note, start): def syntax_error(note, start):
message = ( message = (
@ -1448,17 +1481,25 @@ class YoutubeDL(object):
if not formats: if not formats:
raise ExtractorError('No video formats found!') raise ExtractorError('No video formats found!')
def is_wellformed(f):
    """Check that format dict f carries a usable "url" entry.

    The result is truthy only when f['url'] is a non-empty compat_str.
    Otherwise a warning is reported and the falsy outcome of the check
    is returned unchanged.
    """
    url = f.get('url')
    valid_url = url and isinstance(url, compat_str)
    if valid_url:
        return valid_url
    self.report_warning(
        '"url" field is missing or empty - skipping format, '
        'there is an error in extractor')
    return valid_url
# Filter out malformed formats for better extraction robustness
formats = list(filter(is_wellformed, formats))
formats_dict = {} formats_dict = {}
# We check that all the formats have the format and format_id fields # We check that all the formats have the format and format_id fields
for i, format in enumerate(formats): for i, format in enumerate(formats):
if 'url' not in format:
raise ExtractorError('Missing "url" key in result (index %d)' % i)
sanitize_string_field(format, 'format_id') sanitize_string_field(format, 'format_id')
sanitize_numeric_fields(format) sanitize_numeric_fields(format)
format['url'] = sanitize_url(format['url']) format['url'] = sanitize_url(format['url'])
if format.get('format_id') is None: if format.get('format_id') is None:
format['format_id'] = compat_str(i) format['format_id'] = compat_str(i)
else: else:
@ -1512,14 +1553,10 @@ class YoutubeDL(object):
req_format = self.params.get('format') req_format = self.params.get('format')
if req_format is None: if req_format is None:
req_format_list = [] req_format = self._default_format_spec(info_dict, download=download)
if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and if self.params.get('verbose'):
not info_dict.get('is_live')): self.to_stdout('[debug] Default format spec: %s' % req_format)
merger = FFmpegMergerPP(self)
if merger.available and merger.can_merge():
req_format_list.append('bestvideo+bestaudio')
req_format_list.append('best')
req_format = '/'.join(req_format_list)
format_selector = self.build_format_selector(req_format) format_selector = self.build_format_selector(req_format)
# While in format selection we may need to have an access to the original # While in format selection we may need to have an access to the original
@ -1882,7 +1919,7 @@ class YoutubeDL(object):
info_dict.get('protocol') == 'm3u8' and info_dict.get('protocol') == 'm3u8' and
self.params.get('hls_prefer_native')): self.params.get('hls_prefer_native')):
if fixup_policy == 'warn': if fixup_policy == 'warn':
self.report_warning('%s: malformated aac bitstream.' % ( self.report_warning('%s: malformed AAC bitstream detected.' % (
info_dict['id'])) info_dict['id']))
elif fixup_policy == 'detect_or_warn': elif fixup_policy == 'detect_or_warn':
fixup_pp = FFmpegFixupM3u8PP(self) fixup_pp = FFmpegFixupM3u8PP(self)
@ -1891,7 +1928,7 @@ class YoutubeDL(object):
info_dict['__postprocessors'].append(fixup_pp) info_dict['__postprocessors'].append(fixup_pp)
else: else:
self.report_warning( self.report_warning(
'%s: malformated aac bitstream. %s' '%s: malformed AAC bitstream detected. %s'
% (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
else: else:
assert fixup_policy in ('ignore', 'never') assert fixup_policy in ('ignore', 'never')

View File

@ -98,7 +98,7 @@ def write_piff_header(stream, params):
if is_audio: if is_audio:
smhd_payload = s88.pack(0) # balance smhd_payload = s88.pack(0) # balance
smhd_payload = u16.pack(0) # reserved smhd_payload += u16.pack(0) # reserved
media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header
else: else:
vmhd_payload = u16.pack(0) # graphics mode vmhd_payload = u16.pack(0) # graphics mode
@ -126,7 +126,6 @@ def write_piff_header(stream, params):
if fourcc == 'AACL': if fourcc == 'AACL':
sample_entry_box = box(b'mp4a', sample_entry_payload) sample_entry_box = box(b'mp4a', sample_entry_payload)
else: else:
sample_entry_payload = sample_entry_payload
sample_entry_payload += u16.pack(0) # pre defined sample_entry_payload += u16.pack(0) # pre defined
sample_entry_payload += u16.pack(0) # reserved sample_entry_payload += u16.pack(0) # reserved
sample_entry_payload += u32.pack(0) * 3 # pre defined sample_entry_payload += u32.pack(0) * 3 # pre defined

View File

@ -3,11 +3,13 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
js_to_json, js_to_json,
int_or_none, int_or_none,
parse_iso8601, parse_iso8601,
try_get,
) )
@ -124,7 +126,20 @@ class ABCIViewIE(InfoExtractor):
title = video_params.get('title') or video_params['seriesTitle'] title = video_params.get('title') or video_params['seriesTitle']
stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') stream = next(s for s in video_params['playlist'] if s.get('type') == 'program')
formats = self._extract_akamai_formats(stream['hds-unmetered'], video_id) format_urls = [
try_get(stream, lambda x: x['hds-unmetered'], compat_str)]
# May have higher quality video
sd_url = try_get(
stream, lambda x: x['streams']['hds']['sd'], compat_str)
if sd_url:
format_urls.append(sd_url.replace('metered', 'um'))
formats = []
for format_url in format_urls:
if format_url:
formats.extend(
self._extract_akamai_formats(format_url, video_id))
self._sort_formats(formats) self._sort_formats(formats)
subtitles = {} subtitles = {}

View File

@ -107,11 +107,13 @@ class ADNIE(InfoExtractor):
metas = options.get('metas') or {} metas = options.get('metas') or {}
title = metas.get('title') or video_info['title'] title = metas.get('title') or video_info['title']
links = player_config.get('links') or {} links = player_config.get('links') or {}
error = None
if not links: if not links:
links_url = player_config['linksurl'] links_url = player_config['linksurl']
links_data = self._download_json(urljoin( links_data = self._download_json(urljoin(
self._BASE_URL, links_url), video_id) self._BASE_URL, links_url), video_id)
links = links_data.get('links') or {} links = links_data.get('links') or {}
error = links_data.get('error')
formats = [] formats = []
for format_id, qualities in links.items(): for format_id, qualities in links.items():
@ -130,7 +132,8 @@ class ADNIE(InfoExtractor):
for f in m3u8_formats: for f in m3u8_formats:
f['language'] = 'fr' f['language'] = 'fr'
formats.extend(m3u8_formats) formats.extend(m3u8_formats)
error = options.get('error') if not error:
error = options.get('error')
if not formats and error: if not formats and error:
raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
self._sort_formats(formats) self._sort_formats(formats)

View File

@ -15,6 +15,7 @@ from ..utils import (
urlencode_postdata, urlencode_postdata,
unified_timestamp, unified_timestamp,
ExtractorError, ExtractorError,
NO_DEFAULT,
) )
@ -24,6 +25,11 @@ MSO_INFO = {
'username_field': 'username', 'username_field': 'username',
'password_field': 'password', 'password_field': 'password',
}, },
'ATTOTT': {
'name': 'DIRECTV NOW',
'username_field': 'email',
'password_field': 'loginpassword',
},
'Rogers': { 'Rogers': {
'name': 'Rogers', 'name': 'Rogers',
'username_field': 'UserName', 'username_field': 'UserName',
@ -1316,6 +1322,8 @@ class AdobePassIE(InfoExtractor):
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
_MVPD_CACHE = 'ap-mvpd' _MVPD_CACHE = 'ap-mvpd'
_DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page'
def _download_webpage_handle(self, *args, **kwargs): def _download_webpage_handle(self, *args, **kwargs):
headers = kwargs.get('headers', {}) headers = kwargs.get('headers', {})
headers.update(self.geo_verification_headers()) headers.update(self.geo_verification_headers())
@ -1365,6 +1373,21 @@ class AdobePassIE(InfoExtractor):
'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier '
'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True)
def extract_redirect_url(html, url=None, fatal=False):
    """Extract the target URL of a meta refresh redirect found in html.

    html -- markup to search for a <meta http-equiv="refresh"> tag
    url -- optional base URL; when given, the extracted target is
           resolved against it with urljoin
    fatal -- forwarded to _search_regex; when true no default is
             supplied for a failed match

    Returns the HTML-unescaped redirect URL, or None when nothing was
    extracted.
    """
    # TODO: eliminate code duplication with generic extractor and move
    # redirection code into _download_webpage_handle
    REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
    redirect_url = self._search_regex(
        r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
        r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
        html, 'meta refresh redirect',
        default=NO_DEFAULT if fatal else None, fatal=fatal)
    if not redirect_url:
        return None
    # The value is taken from an HTML attribute, so character references
    # (e.g. &amp;) have to be decoded whether or not a base URL is given.
    redirect_url = unescapeHTML(redirect_url)
    if url:
        redirect_url = compat_urlparse.urljoin(url, redirect_url)
    return redirect_url
mvpd_headers = { mvpd_headers = {
'ap_42': 'anonymous', 'ap_42': 'anonymous',
'ap_11': 'Linux i686', 'ap_11': 'Linux i686',
@ -1414,16 +1437,15 @@ class AdobePassIE(InfoExtractor):
if '<form name="signin"' in provider_redirect_page: if '<form name="signin"' in provider_redirect_page:
provider_login_page_res = provider_redirect_page_res provider_login_page_res = provider_redirect_page_res
elif 'http-equiv="refresh"' in provider_redirect_page: elif 'http-equiv="refresh"' in provider_redirect_page:
oauth_redirect_url = self._html_search_regex( oauth_redirect_url = extract_redirect_url(
r'content="0;\s*url=([^\'"]+)', provider_redirect_page, fatal=True)
provider_redirect_page, 'meta refresh redirect')
provider_login_page_res = self._download_webpage_handle( provider_login_page_res = self._download_webpage_handle(
oauth_redirect_url, video_id, oauth_redirect_url, video_id,
'Downloading Provider Login Page') self._DOWNLOADING_LOGIN_PAGE)
else: else:
provider_login_page_res = post_form( provider_login_page_res = post_form(
provider_redirect_page_res, provider_redirect_page_res,
'Downloading Provider Login Page') self._DOWNLOADING_LOGIN_PAGE)
mvpd_confirm_page_res = post_form( mvpd_confirm_page_res = post_form(
provider_login_page_res, 'Logging in', { provider_login_page_res, 'Logging in', {
@ -1470,8 +1492,17 @@ class AdobePassIE(InfoExtractor):
'Content-Type': 'application/x-www-form-urlencoded' 'Content-Type': 'application/x-www-form-urlencoded'
}) })
else: else:
# Some providers (e.g. DIRECTV NOW) have another meta refresh
# based redirect that should be followed.
provider_redirect_page, urlh = provider_redirect_page_res
provider_refresh_redirect_url = extract_redirect_url(
provider_redirect_page, url=urlh.geturl())
if provider_refresh_redirect_url:
provider_redirect_page_res = self._download_webpage_handle(
provider_refresh_redirect_url, video_id,
'Downloading Provider Redirect Page (meta refresh)')
provider_login_page_res = post_form( provider_login_page_res = post_form(
provider_redirect_page_res, 'Downloading Provider Login Page') provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE)
mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {
mso_info.get('username_field', 'username'): username, mso_info.get('username_field', 'username'): username,
mso_info.get('password_field', 'password'): password, mso_info.get('password_field', 'password'): password,

View File

@ -43,7 +43,7 @@ class AudioBoomIE(InfoExtractor):
def from_clip(field): def from_clip(field):
if clip: if clip:
clip.get(field) return clip.get(field)
audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
'audio', webpage, 'audio url') 'audio', webpage, 'audio url')

View File

@ -36,7 +36,7 @@ class BBCCoUkIE(InfoExtractor):
(?: (?:
programmes/(?!articles/)| programmes/(?!articles/)|
iplayer(?:/[^/]+)?/(?:episode/|playlist/)| iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
music/clips[/#]| music/(?:clips|audiovideo/popular)[/#]|
radio/player/ radio/player/
) )
(?P<id>%s)(?!/(?:episodes|broadcasts|clips)) (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
@ -229,8 +229,10 @@ class BBCCoUkIE(InfoExtractor):
}, { }, {
'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
'only_matching': True, 'only_matching': True,
} }, {
] 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
'only_matching': True,
}]
_USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
@ -523,6 +525,12 @@ class BBCCoUkIE(InfoExtractor):
webpage = self._download_webpage(url, group_id, 'Downloading video page') webpage = self._download_webpage(url, group_id, 'Downloading video page')
error = self._search_regex(
r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
webpage, 'error', default=None)
if error:
raise ExtractorError(error, expected=True)
programme_id = None programme_id = None
duration = None duration = None

View File

@ -84,9 +84,10 @@ class BuzzFeedIE(InfoExtractor):
continue continue
entries.append(self.url_result(video['url'])) entries.append(self.url_result(video['url']))
facebook_url = FacebookIE._extract_url(webpage) facebook_urls = FacebookIE._extract_urls(webpage)
if facebook_url: entries.extend([
entries.append(self.url_result(facebook_url)) self.url_result(facebook_url)
for facebook_url in facebook_urls])
return { return {
'_type': 'playlist', '_type': 'playlist',

View File

@ -15,19 +15,23 @@ class CBSNewsIE(CBSIE):
_TESTS = [ _TESTS = [
{ {
'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/', # 60 minutes
'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/',
'info_dict': { 'info_dict': {
'id': 'tesla-and-spacex-elon-musks-industrial-empire', 'id': '_B6Ga3VJrI4iQNKsir_cdFo9Re_YJHE_',
'ext': 'flv', 'ext': 'mp4',
'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire', 'title': 'Artificial Intelligence',
'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg', 'description': 'md5:8818145f9974431e0fb58a1b8d69613c',
'duration': 791, 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1606,
'uploader': 'CBSI-NEW',
'timestamp': 1498431900,
'upload_date': '20170625',
}, },
'params': { 'params': {
# rtmp download # m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'skip': 'Subscribers only',
}, },
{ {
'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
@ -52,6 +56,22 @@ class CBSNewsIE(CBSIE):
'skip_download': True, 'skip_download': True,
}, },
}, },
{
# 48 hours
'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/',
'info_dict': {
'id': 'QpM5BJjBVEAUFi7ydR9LusS69DPLqPJ1',
'ext': 'mp4',
'title': 'Cold as Ice',
'description': 'Can a childhood memory of a friend\'s murder solve a 1957 cold case? "48 Hours" correspondent Erin Moriarty has the latest.',
'upload_date': '20170604',
'timestamp': 1496538000,
'uploader': 'CBSI-NEW',
},
'params': {
'skip_download': True,
},
},
] ]
def _real_extract(self, url): def _real_extract(self, url):
@ -60,7 +80,7 @@ class CBSNewsIE(CBSIE):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_info = self._parse_json(self._html_search_regex( video_info = self._parse_json(self._html_search_regex(
r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', r'(?:<ul class="media-list items" id="media-related-items"[^>]*><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
webpage, 'video JSON info', default='{}'), video_id, fatal=False) webpage, 'video JSON info', default='{}'), video_id, fatal=False)
if video_info: if video_info:

View File

@ -0,0 +1,72 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
unescapeHTML,
)
class CJSWIE(InfoExtractor):
    """Extractor for cjsw.com program episode pages."""

    _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620',
        'md5': 'cee14d40f1e9433632c56e3d14977120',
        'info_dict': {
            'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41',
            'ext': 'mp3',
            'title': 'Freshly Squeezed Episode June 20, 2017',
            'description': 'md5:c967d63366c3898a80d0c7b0ff337202',
            'series': 'Freshly Squeezed',
            'episode_id': '20170620',
        },
    }, {
        # no description
        'url': 'http://cjsw.com/program/road-pops/episode/20170707/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the episode page at url.

        The program slug and episode id are taken from the URL; title,
        audio URL, description and series name are searched for in the
        downloaded page.
        """
        url_match = re.match(self._VALID_URL, url)
        program = url_match.group('program')
        episode_id = url_match.group('id')

        webpage = self._download_webpage(url, episode_id)

        title_patterns = (
            r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)',
            r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1',
        )
        raw_title = self._search_regex(
            title_patterns, webpage, 'title', group='title')
        title = unescapeHTML(raw_title)

        audio_url = self._search_regex(
            r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1',
            webpage, 'audio url', group='url')

        # Use the UUID from the mp3 file name as id when the audio URL has
        # one; otherwise fall back to "<program>/<episode id>".
        fallback_id = '%s/%s' % (program, episode_id)
        audio_id = self._search_regex(
            r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3',
            audio_url, 'audio id', default=fallback_id)

        audio_format = {
            'url': audio_url,
            'ext': determine_ext(audio_url, 'mp3'),
            'vcodec': 'none',
        }

        # Description is optional (see the second test URL)
        description = self._html_search_regex(
            r'<p>(?P<description>.+?)</p>', webpage, 'description',
            default=None)

        # Fall back to the program slug from the URL when the page has no
        # data-showname attribute
        series = self._search_regex(
            r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage,
            'series', default=program, group='name')

        return {
            'id': audio_id,
            'title': title,
            'description': description,
            'formats': [audio_format],
            'series': series,
            'episode_id': episode_id,
        }

View File

@ -730,12 +730,12 @@ class InfoExtractor(object):
video_info['title'] = video_title video_info['title'] = video_title
return video_info return video_info
def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
urlrs = orderedSet( urls = orderedSet(
self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches) for m in matches)
return self.playlist_result( return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title) urls, playlist_id=playlist_id, playlist_title=playlist_title)
@staticmethod @staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
@ -1002,17 +1002,17 @@ class InfoExtractor(object):
item_type = e.get('@type') item_type = e.get('@type')
if expected_type is not None and expected_type != item_type: if expected_type is not None and expected_type != item_type:
return info return info
if item_type == 'TVEpisode': if item_type in ('TVEpisode', 'Episode'):
info.update({ info.update({
'episode': unescapeHTML(e.get('name')), 'episode': unescapeHTML(e.get('name')),
'episode_number': int_or_none(e.get('episodeNumber')), 'episode_number': int_or_none(e.get('episodeNumber')),
'description': unescapeHTML(e.get('description')), 'description': unescapeHTML(e.get('description')),
}) })
part_of_season = e.get('partOfSeason') part_of_season = e.get('partOfSeason')
if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
info['series'] = unescapeHTML(part_of_series.get('name')) info['series'] = unescapeHTML(part_of_series.get('name'))
elif item_type == 'Article': elif item_type == 'Article':
info.update({ info.update({
@ -1022,10 +1022,10 @@ class InfoExtractor(object):
}) })
elif item_type == 'VideoObject': elif item_type == 'VideoObject':
extract_video_object(e) extract_video_object(e)
elif item_type == 'WebPage': continue
video = e.get('video') video = e.get('video')
if isinstance(video, dict) and video.get('@type') == 'VideoObject': if isinstance(video, dict) and video.get('@type') == 'VideoObject':
extract_video_object(video) extract_video_object(video)
break break
return dict((k, v) for k, v in info.items() if v is not None) return dict((k, v) for k, v in info.items() if v is not None)
@ -2132,15 +2132,18 @@ class InfoExtractor(object):
return is_plain_url, formats return is_plain_url, formats
entries = [] entries = []
# amp-video and amp-audio are very similar to their HTML5 counterparts
# so we will include them right here (see
# https://www.ampproject.org/docs/reference/components/amp-video)
media_tags = [(media_tag, media_type, '') media_tags = [(media_tag, media_type, '')
for media_tag, media_type for media_tag, media_type
in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
media_tags.extend(re.findall( media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'. # We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see # Allowing more characters may end up in significant slow down (see
# https://github.com/rg3/youtube-dl/issues/11979, example URL: # https://github.com/rg3/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml). # http://www.porntrex.com/maps/videositemap.xml).
r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
for media_tag, media_type, media_content in media_tags: for media_tag, media_type, media_content in media_tags:
media_info = { media_info = {
'formats': [], 'formats': [],

View File

@ -510,7 +510,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
# webpage provide more accurate data than series_title from XML # webpage provide more accurate data than series_title from XML
series = self._html_search_regex( series = self._html_search_regex(
r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)', r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d',
webpage, 'series', fatal=False) webpage, 'series', fatal=False)
season = xpath_text(metadata, 'series_title') season = xpath_text(metadata, 'series_title')
@ -518,7 +518,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
episode_number = int_or_none(xpath_text(metadata, 'episode_number')) episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
season_number = int_or_none(self._search_regex( season_number = int_or_none(self._search_regex(
r'(?s)<h4[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h4>\s*<h4>\s*Season (\d+)', r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
webpage, 'season number', default=None)) webpage, 'season number', default=None))
return { return {

View File

@ -1,6 +1,8 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
from ..utils import ( from ..utils import (
@ -12,8 +14,8 @@ from ..utils import (
class DailyMailIE(InfoExtractor): class DailyMailIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)'
_TEST = { _TESTS = [{
'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html',
'md5': 'f6129624562251f628296c3a9ffde124', 'md5': 'f6129624562251f628296c3a9ffde124',
'info_dict': { 'info_dict': {
@ -22,7 +24,16 @@ class DailyMailIE(InfoExtractor):
'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'',
'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84',
} }
} }, {
'url': 'http://www.dailymail.co.uk/embed/video/1295863.html',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -147,7 +147,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
view_count_str = self._search_regex( view_count_str = self._search_regex(
(r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"', (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
r'video_views_count[^>]+>\s+([\s\d\,.]+)'), r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
webpage, 'view count', fatal=False) webpage, 'view count', default=None)
if view_count_str: if view_count_str:
view_count_str = re.sub(r'\s', '', view_count_str) view_count_str = re.sub(r'\s', '', view_count_str)
view_count = str_to_int(view_count_str) view_count = str_to_int(view_count_str)
@ -159,7 +159,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
[r'buildPlayer\(({.+?})\);\n', # See https://github.com/rg3/youtube-dl/issues/7826 [r'buildPlayer\(({.+?})\);\n', # See https://github.com/rg3/youtube-dl/issues/7826
r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
r'buildPlayer\(({.+?})\);', r'buildPlayer\(({.+?})\);',
r'var\s+config\s*=\s*({.+?});'], r'var\s+config\s*=\s*({.+?});',
# New layout regex (see https://github.com/rg3/youtube-dl/issues/13580)
r'__PLAYER_CONFIG__\s*=\s*({.+?});'],
webpage, 'player v5', default=None) webpage, 'player v5', default=None)
if player_v5: if player_v5:
player = self._parse_json(player_v5, video_id) player = self._parse_json(player_v5, video_id)

View File

@ -13,7 +13,7 @@ from ..utils import (
class DigitallySpeakingIE(InfoExtractor): class DigitallySpeakingIE(InfoExtractor):
_VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml' _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
_TESTS = [{ _TESTS = [{
# From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor):
# From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml', 'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
'only_matching': True, 'only_matching': True,
}, {
# From http://www.gdcvault.com/play/1013700/Advanced-Material
'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
'only_matching': True,
}] }]
def _parse_mp4(self, metadata): def _parse_mp4(self, metadata):

View File

@ -184,7 +184,7 @@ class DPlayItIE(InfoExtractor):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
info_url = self._search_regex( info_url = self._search_regex(
r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
webpage, 'video id') webpage, 'video id')
title = remove_end(self._og_search_title(webpage), ' | Dplay') title = remove_end(self._og_search_title(webpage), ' | Dplay')

View File

@ -12,6 +12,7 @@ from ..utils import (
ExtractorError, ExtractorError,
clean_html, clean_html,
int_or_none, int_or_none,
remove_end,
sanitized_Request, sanitized_Request,
urlencode_postdata urlencode_postdata
) )
@ -72,15 +73,15 @@ class DramaFeverIE(DramaFeverBaseIE):
'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
'info_dict': { 'info_dict': {
'id': '4512.1', 'id': '4512.1',
'ext': 'mp4', 'ext': 'flv',
'title': 'Cooking with Shin 4512.1', 'title': 'Cooking with Shin',
'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
'episode': 'Episode 1', 'episode': 'Episode 1',
'episode_number': 1, 'episode_number': 1,
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
'timestamp': 1404336058, 'timestamp': 1404336058,
'upload_date': '20140702', 'upload_date': '20140702',
'duration': 343, 'duration': 344,
}, },
'params': { 'params': {
# m3u8 download # m3u8 download
@ -90,15 +91,15 @@ class DramaFeverIE(DramaFeverBaseIE):
'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1', 'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1',
'info_dict': { 'info_dict': {
'id': '4826.4', 'id': '4826.4',
'ext': 'mp4', 'ext': 'flv',
'title': 'Mnet Asian Music Awards 2015 4826.4', 'title': 'Mnet Asian Music Awards 2015',
'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91', 'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91',
'episode': 'Mnet Asian Music Awards 2015 - Part 3', 'episode': 'Mnet Asian Music Awards 2015 - Part 3',
'episode_number': 4, 'episode_number': 4,
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
'timestamp': 1450213200, 'timestamp': 1450213200,
'upload_date': '20151215', 'upload_date': '20151215',
'duration': 5602, 'duration': 5359,
}, },
'params': { 'params': {
# m3u8 download # m3u8 download
@ -122,6 +123,10 @@ class DramaFeverIE(DramaFeverBaseIE):
countries=self._GEO_COUNTRIES) countries=self._GEO_COUNTRIES)
raise raise
# title is postfixed with video id for some reason, removing
if info.get('title'):
info['title'] = remove_end(info['title'], video_id).strip()
series_id, episode_number = video_id.split('.') series_id, episode_number = video_id.split('.')
episode_info = self._download_json( episode_info = self._download_json(
# We only need a single episode info, so restricting page size to one episode # We only need a single episode info, so restricting page size to one episode

View File

@ -118,7 +118,7 @@ class DRTVIE(InfoExtractor):
if target == 'HDS': if target == 'HDS':
f4m_formats = self._extract_f4m_formats( f4m_formats = self._extract_f4m_formats(
uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
video_id, preference, f4m_id=format_id) video_id, preference, f4m_id=format_id, fatal=False)
if kind == 'AudioResource': if kind == 'AudioResource':
for f in f4m_formats: for f in f4m_formats:
f['vcodec'] = 'none' f['vcodec'] = 'none'
@ -126,7 +126,8 @@ class DRTVIE(InfoExtractor):
elif target == 'HLS': elif target == 'HLS':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
uri, video_id, 'mp4', entry_protocol='m3u8_native', uri, video_id, 'mp4', entry_protocol='m3u8_native',
preference=preference, m3u8_id=format_id)) preference=preference, m3u8_id=format_id,
fatal=False))
else: else:
bitrate = link.get('Bitrate') bitrate = link.get('Bitrate')
if bitrate: if bitrate:

View File

@ -11,6 +11,7 @@ from ..compat import (
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
unsmuggle_url,
) )
@ -50,6 +51,10 @@ class EaglePlatformIE(InfoExtractor):
'view_count': int, 'view_count': int,
}, },
'skip': 'Georestricted', 'skip': 'Georestricted',
}, {
# referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/)
'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306',
'only_matching': True,
}] }]
@staticmethod @staticmethod
@ -60,16 +65,40 @@ class EaglePlatformIE(InfoExtractor):
webpage) webpage)
if mobj is not None: if mobj is not None:
return mobj.group('url') return mobj.group('url')
# Basic usage embedding (see http://dultonmedia.github.io/eplayer/) PLAYER_JS_RE = r'''
<script[^>]+
src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs)
.+?
'''
# "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/)
mobj = re.search( mobj = re.search(
r'''(?xs) r'''(?xs)
<script[^>]+ %s
src=(?P<q1>["\'])(?:https?:)?//(?P<host>.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1)
.+?
<div[^>]+ <div[^>]+
class=(?P<q2>["\'])eagleplayer(?P=q2)[^>]+ class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+
data-id=["\'](?P<id>\d+) data-id=["\'](?P<id>\d+)
''', webpage) ''' % PLAYER_JS_RE, webpage)
if mobj is not None:
return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
# Generalization of "Javascript code usage", "Combined usage" and
# "Usage without attaching to DOM" embeddings (see
# http://dultonmedia.github.io/eplayer/)
mobj = re.search(
r'''(?xs)
%s
<script>
.+?
new\s+EaglePlayer\(
(?:[^,]+\s*,\s*)?
{
.+?
\bid\s*:\s*["\']?(?P<id>\d+)
.+?
}
\s*\)
.+?
</script>
''' % PLAYER_JS_RE, webpage)
if mobj is not None: if mobj is not None:
return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
@ -79,9 +108,10 @@ class EaglePlatformIE(InfoExtractor):
if status != 200: if status != 200:
raise ExtractorError(' '.join(response['errors']), expected=True) raise ExtractorError(' '.join(response['errors']), expected=True)
def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs): def _download_json(self, url_or_request, video_id, *args, **kwargs):
try: try:
response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) response = super(EaglePlatformIE, self)._download_json(
url_or_request, video_id, *args, **kwargs)
except ExtractorError as ee: except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError): if isinstance(ee.cause, compat_HTTPError):
response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) response = self._parse_json(ee.cause.read().decode('utf-8'), video_id)
@ -93,11 +123,24 @@ class EaglePlatformIE(InfoExtractor):
return self._download_json(url_or_request, video_id, note)['data'][0] return self._download_json(url_or_request, video_id, note)['data'][0]
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')
headers = {}
query = {
'id': video_id,
}
referrer = smuggled_data.get('referrer')
if referrer:
headers['Referer'] = referrer
query['referrer'] = referrer
player_data = self._download_json( player_data = self._download_json(
'http://%s/api/player_data?id=%s' % (host, video_id), video_id) 'http://%s/api/player_data' % host, video_id,
headers=headers, query=query)
media = player_data['data']['playlist']['viewports'][0]['medialist'][0] media = player_data['data']['playlist']['viewports'][0]['medialist'][0]

View File

@ -1,15 +1,18 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
int_or_none,
try_get,
unified_timestamp,
)
class EggheadCourseIE(InfoExtractor): class EggheadCourseIE(InfoExtractor):
IE_DESC = 'egghead.io course' IE_DESC = 'egghead.io course'
IE_NAME = 'egghead:course' IE_NAME = 'egghead:course'
_VALID_URL = r'https://egghead\.io/courses/(?P<id>[a-zA-Z_0-9-]+)' _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
_TEST = { _TEST = {
'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
'playlist_count': 29, 'playlist_count': 29,
@ -22,18 +25,60 @@ class EggheadCourseIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
playlist_id = self._match_id(url) playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
title = self._html_search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title') course = self._download_json(
ul = self._search_regex(r'(?s)<ul class="series-lessons-list">(.*?)</ul>', webpage, 'session list') 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id)
found = re.findall(r'(?s)<a class="[^"]*"\s*href="([^"]+)">\s*<li class="item', ul) entries = [
entries = [self.url_result(m) for m in found] self.url_result(
'wistia:%s' % lesson['wistia_id'], ie='Wistia',
video_id=lesson['wistia_id'], video_title=lesson.get('title'))
for lesson in course['lessons'] if lesson.get('wistia_id')]
return self.playlist_result(
entries, playlist_id, course.get('title'),
course.get('description'))
class EggheadLessonIE(InfoExtractor):
IE_DESC = 'egghead.io lesson'
IE_NAME = 'egghead:lesson'
_VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
'info_dict': {
'id': 'fv5yotjxcg',
'ext': 'mp4',
'title': 'Create linear data flow with container style types (Box)',
'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e',
'thumbnail': r're:^https?:.*\.jpg$',
'timestamp': 1481296768,
'upload_date': '20161209',
'duration': 304,
'view_count': 0,
'tags': ['javascript', 'free'],
},
'params': {
'skip_download': True,
},
}
def _real_extract(self, url):
lesson_id = self._match_id(url)
lesson = self._download_json(
'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
return { return {
'_type': 'playlist', '_type': 'url_transparent',
'id': playlist_id, 'ie_key': 'Wistia',
'title': title, 'url': 'wistia:%s' % lesson['wistia_id'],
'description': self._og_search_description(webpage), 'id': lesson['wistia_id'],
'entries': entries, 'title': lesson.get('title'),
'description': lesson.get('summary'),
'thumbnail': lesson.get('thumb_nail'),
'timestamp': unified_timestamp(lesson.get('published_at')),
'duration': int_or_none(lesson.get('duration')),
'view_count': int_or_none(lesson.get('plays_count')),
'tags': try_get(lesson, lambda x: x['tag_list'], list),
} }

View File

@ -10,7 +10,25 @@ from ..utils import (
class ESPNIE(InfoExtractor): class ESPNIE(InfoExtractor):
_VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/video/clip(?:\?.*?\bid=|/_/id/)(?P<id>\d+)' _VALID_URL = r'''(?x)
https?://
(?:
(?:(?:\w+\.)+)?espn\.go|
(?:www\.)?espn
)\.com/
(?:
(?:
video/clip|
watch/player
)
(?:
\?.*?\bid=|
/_/id/
)
)
(?P<id>\d+)
'''
_TESTS = [{ _TESTS = [{
'url': 'http://espn.go.com/video/clip?id=10365079', 'url': 'http://espn.go.com/video/clip?id=10365079',
'info_dict': { 'info_dict': {
@ -25,20 +43,34 @@ class ESPNIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, { }, {
# intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season 'url': 'https://broadband.espn.go.com/video/clip?id=18910086',
'url': 'http://espn.go.com/video/clip?id=2743663',
'info_dict': { 'info_dict': {
'id': '2743663', 'id': '18910086',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Must-See Moments: Best of the MLS season', 'title': 'Kyrie spins around defender for two',
'description': 'md5:4c2d7232beaea572632bec41004f0aeb', 'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b',
'timestamp': 1449446454, 'timestamp': 1489539155,
'upload_date': '20151207', 'upload_date': '20170315',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
'expected_warnings': ['Unable to download f4m manifest'], 'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672',
'only_matching': True,
}, {
'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774',
'only_matching': True,
}, {
'url': 'http://www.espn.com/watch/player?id=19141491',
'only_matching': True,
}, {
'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875',
'only_matching': True,
}, {
'url': 'http://www.espn.com/watch/player/_/id/19141491',
'only_matching': True,
}, { }, {
'url': 'http://www.espn.com/video/clip?id=10365079', 'url': 'http://www.espn.com/video/clip?id=10365079',
'only_matching': True, 'only_matching': True,

View File

@ -185,6 +185,7 @@ from .chirbit import (
ChirbitProfileIE, ChirbitProfileIE,
) )
from .cinchcast import CinchcastIE from .cinchcast import CinchcastIE
from .cjsw import CJSWIE
from .clipfish import ClipfishIE from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE from .cliphunter import CliphunterIE
from .cliprs import ClipRsIE from .cliprs import ClipRsIE
@ -297,7 +298,10 @@ from .dw import (
from .eagleplatform import EaglePlatformIE from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE from .echomsk import EchoMskIE
from .egghead import EggheadCourseIE from .egghead import (
EggheadCourseIE,
EggheadLessonIE,
)
from .ehow import EHowIE from .ehow import EHowIE
from .eighttracks import EightTracksIE from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE from .einthusan import EinthusanIE
@ -469,6 +473,7 @@ from .jamendo import (
) )
from .jeuxvideo import JeuxVideoIE from .jeuxvideo import JeuxVideoIE
from .jove import JoveIE from .jove import JoveIE
from .joj import JojIE
from .jwplatform import JWPlatformIE from .jwplatform import JWPlatformIE
from .jpopsukitv import JpopsukiIE from .jpopsukitv import JpopsukiIE
from .kaltura import KalturaIE from .kaltura import KalturaIE
@ -651,6 +656,10 @@ from .nextmedia import (
AppleDailyIE, AppleDailyIE,
NextTVIE, NextTVIE,
) )
from .nexx import (
NexxIE,
NexxEmbedIE,
)
from .nfb import NFBIE from .nfb import NFBIE
from .nfl import NFLIE from .nfl import NFLIE
from .nhk import NhkVodIE from .nhk import NhkVodIE
@ -759,6 +768,7 @@ from .pandoratv import PandoraTVIE
from .parliamentliveuk import ParliamentLiveUKIE from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE from .patreon import PatreonIE
from .pbs import PBSIE from .pbs import PBSIE
from .pearvideo import PearVideoIE
from .people import PeopleIE from .people import PeopleIE
from .periscope import ( from .periscope import (
PeriscopeIE, PeriscopeIE,
@ -824,6 +834,7 @@ from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE from .radiofrance import RadioFranceIE
from .rai import ( from .rai import (
RaiPlayIE, RaiPlayIE,
RaiPlayLiveIE,
RaiIE, RaiIE,
) )
from .rbmaradio import RBMARadioIE from .rbmaradio import RBMARadioIE
@ -972,6 +983,7 @@ from .tagesschau import (
TagesschauIE, TagesschauIE,
) )
from .tass import TassIE from .tass import TassIE
from .tastytrade import TastyTradeIE
from .tbs import TBSIE from .tbs import TBSIE
from .tdslifeway import TDSLifewayIE from .tdslifeway import TDSLifewayIE
from .teachertube import ( from .teachertube import (
@ -1202,7 +1214,8 @@ from .vk import (
) )
from .vlive import ( from .vlive import (
VLiveIE, VLiveIE,
VLiveChannelIE VLiveChannelIE,
VLivePlaylistIE
) )
from .vodlocker import VodlockerIE from .vodlocker import VodlockerIE
from .vodpl import VODPlIE from .vodpl import VODPlIE
@ -1278,7 +1291,6 @@ from .yahoo import (
YahooIE, YahooIE,
YahooSearchIE, YahooSearchIE,
) )
from .yam import YamIE
from .yandexmusic import ( from .yandexmusic import (
YandexMusicTrackIE, YandexMusicTrackIE,
YandexMusicAlbumIE, YandexMusicAlbumIE,

View File

@ -203,19 +203,19 @@ class FacebookIE(InfoExtractor):
}] }]
@staticmethod @staticmethod
def _extract_url(webpage): def _extract_urls(webpage):
mobj = re.search( urls = []
r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) for mobj in re.finditer(
if mobj is not None: r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
return mobj.group('url') webpage):
urls.append(mobj.group('url'))
# Facebook API embed # Facebook API embed
# see https://developers.facebook.com/docs/plugins/embedded-video-player # see https://developers.facebook.com/docs/plugins/embedded-video-player
mobj = re.search(r'''(?x)<div[^>]+ for mobj in re.finditer(r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage) data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage):
if mobj is not None: urls.append(mobj.group('url'))
return mobj.group('url') return urls
def _login(self): def _login(self):
(useremail, password) = self._get_login_info() (useremail, password) = self._get_login_info()

View File

@ -43,7 +43,7 @@ class FiveTVIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'glavnoe', 'id': 'glavnoe',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Итоги недели с 8 по 14 июня 2015 года', 'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
}, },
}, { }, {
@ -70,7 +70,8 @@ class FiveTVIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_url = self._search_regex( video_url = self._search_regex(
r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"', [r'<div[^>]+?class="flowplayer[^>]+?data-href="([^"]+)"',
r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
webpage, 'video url') webpage, 'video url')
title = self._og_search_title(webpage, default=None) or self._search_regex( title = self._og_search_title(webpage, default=None) or self._search_regex(

View File

@ -1,10 +1,14 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
unified_timestamp,
)
class FunnyOrDieIE(InfoExtractor): class FunnyOrDieIE(InfoExtractor):
@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):
'title': 'Heart-Shaped Box: Literal Video Version', 'title': 'Heart-Shaped Box: Literal Video Version',
'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338', 'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
'thumbnail': r're:^http:.*\.jpg$', 'thumbnail': r're:^http:.*\.jpg$',
'uploader': 'DASjr',
'timestamp': 1317904928,
'upload_date': '20111006',
'duration': 318.3,
}, },
}, { }, {
'url': 'http://www.funnyordie.com/embed/e402820827', 'url': 'http://www.funnyordie.com/embed/e402820827',
@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):
'title': 'Please Use This Song (Jon Lajoie)', 'title': 'Please Use This Song (Jon Lajoie)',
'description': 'Please use this to sell something. www.jonlajoie.com', 'description': 'Please use this to sell something. www.jonlajoie.com',
'thumbnail': r're:^http:.*\.jpg$', 'thumbnail': r're:^http:.*\.jpg$',
'timestamp': 1398988800,
'upload_date': '20140502',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):
'url': 'http://www.funnyordie.com%s' % src, 'url': 'http://www.funnyordie.com%s' % src,
}] }]
post_json = self._search_regex( timestamp = unified_timestamp(self._html_search_meta(
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details') 'uploadDate', webpage, 'timestamp', default=None))
post = json.loads(post_json)
uploader = self._html_search_regex(
r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
webpage, 'uploader', default=None)
title, description, thumbnail, duration = [None] * 4
medium = self._parse_json(
self._search_regex(
r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
default='{}'),
video_id, fatal=False)
if medium:
title = medium.get('title')
duration = float_or_none(medium.get('duration'))
if not timestamp:
timestamp = unified_timestamp(medium.get('publishDate'))
post = self._parse_json(
self._search_regex(
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
default='{}'),
video_id, fatal=False)
if post:
if not title:
title = post.get('name')
description = post.get('description')
thumbnail = post.get('picture')
if not title:
title = self._og_search_title(webpage)
if not description:
description = self._og_search_description(webpage)
if not duration:
duration = int_or_none(self._html_search_meta(
('video:duration', 'duration'), webpage, 'duration', default=False))
return { return {
'id': video_id, 'id': video_id,
'title': post['name'], 'title': title,
'description': post.get('description'), 'description': description,
'thumbnail': post.get('picture'), 'thumbnail': thumbnail,
'uploader': uploader,
'timestamp': timestamp,
'duration': duration,
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} }

View File

@ -36,6 +36,10 @@ from .brightcove import (
BrightcoveLegacyIE, BrightcoveLegacyIE,
BrightcoveNewIE, BrightcoveNewIE,
) )
from .nexx import (
NexxIE,
NexxEmbedIE,
)
from .nbc import NBCSportsVPlayerIE from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE from .ooyala import OoyalaIE
from .rutv import RUTVIE from .rutv import RUTVIE
@ -57,6 +61,7 @@ from .dailymotion import (
DailymotionIE, DailymotionIE,
DailymotionCloudIE, DailymotionCloudIE,
) )
from .dailymail import DailyMailIE
from .onionstudios import OnionStudiosIE from .onionstudios import OnionStudiosIE
from .viewlift import ViewLiftEmbedIE from .viewlift import ViewLiftEmbedIE
from .mtv import MTVServicesEmbeddedIE from .mtv import MTVServicesEmbeddedIE
@ -91,6 +96,7 @@ from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE from .washingtonpost import WashingtonPostIE
from .wistia import WistiaIE from .wistia import WistiaIE
from .mediaset import MediasetIE from .mediaset import MediasetIE
from .joj import JojIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -759,6 +765,20 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': ['Dailymotion'], 'add_ie': ['Dailymotion'],
}, },
# DailyMail embed
{
'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot',
'info_dict': {
'id': '1495629',
'ext': 'mp4',
'title': 'Care worker punches elderly dementia patient in head 11 times',
'description': 'md5:3a743dee84e57e48ec68bf67113199a5',
},
'add_ie': ['DailyMail'],
'params': {
'skip_download': True,
},
},
# YouTube embed # YouTube embed
{ {
'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
@ -1185,7 +1205,7 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': ['Kaltura'], 'add_ie': ['Kaltura'],
}, },
# Eagle.Platform embed (generic URL) # EaglePlatform embed (generic URL)
{ {
'url': 'http://lenta.ru/news/2015/03/06/navalny/', 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
# Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@ -1199,8 +1219,26 @@ class GenericIE(InfoExtractor):
'view_count': int, 'view_count': int,
'age_limit': 0, 'age_limit': 0,
}, },
'params': {
'skip_download': True,
},
}, },
# ClipYou (Eagle.Platform) embed (custom URL) # referrer protected EaglePlatform embed
{
'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
'info_dict': {
'id': '582306',
'ext': 'mp4',
'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 3382,
'view_count': int,
},
'params': {
'skip_download': True,
},
},
# ClipYou (EaglePlatform) embed (custom URL)
{ {
'url': 'http://muz-tv.ru/play/7129/', 'url': 'http://muz-tv.ru/play/7129/',
# Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@ -1212,6 +1250,9 @@ class GenericIE(InfoExtractor):
'duration': 216, 'duration': 216,
'view_count': int, 'view_count': int,
}, },
'params': {
'skip_download': True,
},
}, },
# Pladform embed # Pladform embed
{ {
@ -1512,6 +1553,22 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': ['BrightcoveLegacy'], 'add_ie': ['BrightcoveLegacy'],
}, },
# Nexx embed
{
'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503',
'info_dict': {
'id': '247746',
'ext': 'mp4',
'title': "Yesterday's Jam (OV)",
'description': 'md5:09bc0984723fed34e2581624a84e05f0',
'timestamp': 1492594816,
'upload_date': '20170419',
},
'params': {
'format': 'bestvideo',
'skip_download': True,
},
},
# Facebook <iframe> embed # Facebook <iframe> embed
{ {
'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
@ -1522,6 +1579,21 @@ class GenericIE(InfoExtractor):
'title': 'Facebook video #599637780109885', 'title': 'Facebook video #599637780109885',
}, },
}, },
# Facebook <iframe> embed, plugin video
{
'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/',
'info_dict': {
'id': '1754168231264132',
'ext': 'mp4',
'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...',
'uploader': 'Tariq Ramadan (official)',
'timestamp': 1496758379,
'upload_date': '20170606',
},
'params': {
'skip_download': True,
},
},
# Facebook API embed # Facebook API embed
{ {
'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
@ -1734,6 +1806,26 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': [MediasetIE.ie_key()], 'add_ie': [MediasetIE.ie_key()],
}, },
{
# JOJ.sk embeds
'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
'info_dict': {
'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
'title': 'Slovenskom sa prehnala vlna silných búrok',
},
'playlist_mincount': 5,
'add_ie': [JojIE.ie_key()],
},
{
# AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
'url': 'https://tvrain.ru/amp/418921/',
'md5': 'cc00413936695987e8de148b67d14f1d',
'info_dict': {
'id': '418921',
'ext': 'mp4',
'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
},
},
# { # {
# # TODO: find another test # # TODO: find another test
# # http://schema.org/VideoObject # # http://schema.org/VideoObject
@ -2033,6 +2125,13 @@ class GenericIE(InfoExtractor):
video_description = self._og_search_description(webpage, default=None) video_description = self._og_search_description(webpage, default=None)
video_thumbnail = self._og_search_thumbnail(webpage, default=None) video_thumbnail = self._og_search_thumbnail(webpage, default=None)
info_dict.update({
'title': video_title,
'description': video_description,
'thumbnail': video_thumbnail,
'age_limit': age_limit,
})
# Look for Brightcove Legacy Studio embeds # Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls: if bc_urls:
@ -2054,6 +2153,16 @@ class GenericIE(InfoExtractor):
if bc_urls: if bc_urls:
return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
# Look for Nexx embeds
nexx_urls = NexxIE._extract_urls(webpage)
if nexx_urls:
return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key())
# Look for Nexx iFrame embeds
nexx_embed_urls = NexxEmbedIE._extract_urls(webpage)
if nexx_embed_urls:
return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key())
# Look for ThePlatform embeds # Look for ThePlatform embeds
tp_urls = ThePlatformIE._extract_urls(webpage) tp_urls = ThePlatformIE._extract_urls(webpage)
if tp_urls: if tp_urls:
@ -2126,6 +2235,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for DailyMail embeds
dailymail_urls = DailyMailIE._extract_urls(webpage)
if dailymail_urls:
return self.playlist_from_matches(
dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
# Look for embedded Wistia player # Look for embedded Wistia player
wistia_url = WistiaIE._extract_url(webpage) wistia_url = WistiaIE._extract_url(webpage)
if wistia_url: if wistia_url:
@ -2222,9 +2337,9 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url')) return self.url_result(mobj.group('url'))
# Look for embedded Facebook player # Look for embedded Facebook player
facebook_url = FacebookIE._extract_url(webpage) facebook_urls = FacebookIE._extract_urls(webpage)
if facebook_url is not None: if facebook_urls:
return self.url_result(facebook_url, 'Facebook') return self.playlist_from_matches(facebook_urls, video_id, video_title)
# Look for embedded VK player # Look for embedded VK player
mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
@ -2421,12 +2536,12 @@ class GenericIE(InfoExtractor):
if kaltura_url: if kaltura_url:
return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
# Look for Eagle.Platform embeds # Look for EaglePlatform embeds
eagleplatform_url = EaglePlatformIE._extract_url(webpage) eagleplatform_url = EaglePlatformIE._extract_url(webpage)
if eagleplatform_url: if eagleplatform_url:
return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
# Look for ClipYou (uses Eagle.Platform) embeds # Look for ClipYou (uses EaglePlatform) embeds
mobj = re.search( mobj = re.search(
r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
if mobj is not None: if mobj is not None:
@ -2655,7 +2770,7 @@ class GenericIE(InfoExtractor):
rutube_urls = RutubeIE._extract_urls(webpage) rutube_urls = RutubeIE._extract_urls(webpage)
if rutube_urls: if rutube_urls:
return self.playlist_from_matches( return self.playlist_from_matches(
rutube_urls, ie=RutubeIE.ie_key()) rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
# Look for WashingtonPost embeds # Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage) wapo_urls = WashingtonPostIE._extract_urls(webpage)
@ -2669,18 +2784,32 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
# Look for JOJ.sk embeds
joj_urls = JojIE._extract_urls(webpage)
if joj_urls:
return self.playlist_from_matches(
joj_urls, video_id, video_title, ie=JojIE.ie_key())
def merge_dicts(dict1, dict2):
    """Merge two dicts, giving precedence to the values of dict1.

    None values are dropped from both inputs. A value from dict2 is used
    when the key is missing from the result so far, or when the value
    already present is an empty string and the dict2 value is a non-empty
    string.
    """
    # Python 2.6 compatible construction (no dict comprehension)
    merged = dict(
        (key, value) for key, value in dict1.items() if value is not None)
    for key, candidate in dict2.items():
        if candidate is None:
            continue
        if key in merged:
            current = merged[key]
            # Only an empty string may be overridden, and only by a
            # non-empty string
            fills_empty_string = (
                isinstance(candidate, compat_str) and candidate and
                isinstance(current, compat_str) and not current)
            if not fills_empty_string:
                continue
        merged[key] = candidate
    return merged
# Looking for http://schema.org/VideoObject # Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld( json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject') webpage, video_id, default={}, expected_type='VideoObject')
if json_ld.get('url'): if json_ld.get('url'):
info_dict.update({ return merge_dicts(json_ld, info_dict)
'title': video_title or info_dict['title'],
'description': video_description,
'thumbnail': video_thumbnail,
'age_limit': age_limit
})
info_dict.update(json_ld)
return info_dict
# Look for HTML5 media # Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
@ -2698,9 +2827,7 @@ class GenericIE(InfoExtractor):
if jwplayer_data: if jwplayer_data:
info = self._parse_jwplayer_data( info = self._parse_jwplayer_data(
jwplayer_data, video_id, require_title=False, base_url=url) jwplayer_data, video_id, require_title=False, base_url=url)
if not info.get('title'): return merge_dicts(info, info_dict)
info['title'] = video_title
return info
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):

View File

@ -5,9 +5,10 @@ import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
unescapeHTML, determine_ext,
qualities,
int_or_none, int_or_none,
qualities,
unescapeHTML,
) )
@ -15,7 +16,7 @@ class GiantBombIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)' _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'
_TEST = { _TEST = {
'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/', 'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/',
'md5': '57badeface303ecf6b98b812de1b9018', 'md5': 'c8ea694254a59246a42831155dec57ac',
'info_dict': { 'info_dict': {
'id': '2300-9782', 'id': '2300-9782',
'display_id': 'quick-look-destiny-the-dark-below', 'display_id': 'quick-look-destiny-the-dark-below',
@ -51,11 +52,16 @@ class GiantBombIE(InfoExtractor):
for format_id, video_url in video['videoStreams'].items(): for format_id, video_url in video['videoStreams'].items():
if format_id == 'f4m_stream': if format_id == 'f4m_stream':
continue continue
if video_url.endswith('.f4m'): ext = determine_ext(video_url)
if ext == 'f4m':
f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id) f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id)
if f4m_formats: if f4m_formats:
f4m_formats[0]['quality'] = quality(format_id) f4m_formats[0]['quality'] = quality(format_id)
formats.extend(f4m_formats) formats.extend(f4m_formats)
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, display_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
else: else:
formats.append({ formats.append({
'url': video_url, 'url': video_url,

View File

@ -92,7 +92,7 @@ class GoogleDriveIE(InfoExtractor):
if resolution: if resolution:
f.update({ f.update({
'width': resolution[0], 'width': resolution[0],
'height': resolution[0], 'height': resolution[1],
}) })
formats.append(f) formats.append(f)
self._sort_formats(formats) self._sort_formats(formats)

View File

@ -28,7 +28,7 @@ class HGTVComShowIE(InfoExtractor):
config = self._parse_json( config = self._parse_json(
self._search_regex( self._search_regex(
r'(?s)data-(?:deferred)?-module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script', r'(?s)data-(?:deferred-)?module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script',
webpage, 'video config'), webpage, 'video config'),
display_id)['channels'][0] display_id)['channels'][0]

View File

@ -89,6 +89,11 @@ class IGNIE(InfoExtractor):
'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
'only_matching': True, 'only_matching': True,
}, },
{
# videoId pattern
'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
'only_matching': True,
},
] ]
def _find_video_id(self, webpage): def _find_video_id(self, webpage):
@ -98,6 +103,8 @@ class IGNIE(InfoExtractor):
r'data-video-id="(.+?)"', r'data-video-id="(.+?)"',
r'<object id="vid_(.+?)"', r'<object id="vid_(.+?)"',
r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
r'videoId&quot;\s*:\s*&quot;(.+?)&quot;',
r'videoId["\']\s*:\s*["\']([^"\']+?)["\']',
] ]
return self._search_regex(res_id, webpage, 'video id', default=None) return self._search_regex(res_id, webpage, 'video id', default=None)

View File

@ -59,12 +59,18 @@ class ITVIE(InfoExtractor):
def _add_sub_element(element, name): def _add_sub_element(element, name):
return etree.SubElement(element, _add_ns(name)) return etree.SubElement(element, _add_ns(name))
production_id = (
params.get('data-video-autoplay-id') or
'%s#001' % (
params.get('data-video-episode-id') or
video_id.replace('a', '/')))
req_env = etree.Element(_add_ns('soapenv:Envelope')) req_env = etree.Element(_add_ns('soapenv:Envelope'))
_add_sub_element(req_env, 'soapenv:Header') _add_sub_element(req_env, 'soapenv:Header')
body = _add_sub_element(req_env, 'soapenv:Body') body = _add_sub_element(req_env, 'soapenv:Body')
get_playlist = _add_sub_element(body, ('tem:GetPlaylist')) get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
request = _add_sub_element(get_playlist, 'tem:request') request = _add_sub_element(get_playlist, 'tem:request')
_add_sub_element(request, 'itv:ProductionId').text = params['data-video-id'] _add_sub_element(request, 'itv:ProductionId').text = production_id
_add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper() _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
vodcrid = _add_sub_element(request, 'itv:Vodcrid') vodcrid = _add_sub_element(request, 'itv:Vodcrid')
_add_sub_element(vodcrid, 'com:Id') _add_sub_element(vodcrid, 'com:Id')

100
youtube_dl/extractor/joj.py Executable file
View File

@ -0,0 +1,100 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
js_to_json,
try_get,
)
class JojIE(InfoExtractor):
    """Extractor for the media.joj.sk embed player.

    Accepts either a ``joj:<uuid>`` pseudo URL or a
    ``media.joj.sk/embed/<uuid>`` URL.
    """
    _VALID_URL = r'''(?x)
                    (?:
                        joj:|
                        https?://media\.joj\.sk/embed/
                    )
                    (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
                '''
    _TESTS = [{
        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
        'info_dict': {
            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
            'ext': 'mp4',
            'title': 'NOVÉ BÝVANIE',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 3118,
        }
    }, {
        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the list of media.joj.sk embed iframe URLs found in webpage."""
        return re.findall(
            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
            webpage)

    def _real_extract(self, url):
        """Download the embed page for the video id and build its info dict."""
        video_id = self._match_id(url)

        # Always fetch the canonical embed page, regardless of the input form
        webpage = self._download_webpage(
            'https://media.joj.sk/embed/%s' % video_id, video_id)

        title = self._search_regex(
            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'<title>(?P<title>[^<]+)'), webpage, 'title',
            default=None, group='title') or self._og_search_title(webpage)

        bitrates = self._parse_json(
            self._search_regex(
                r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',
                default='{}'),
            video_id, transform_source=js_to_json, fatal=False)

        formats = []
        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
            if isinstance(format_url, compat_str):
                height = self._search_regex(
                    r'(\d+)[pP]\.', format_url, 'height', default=None)
                formats.append({
                    'url': format_url,
                    'format_id': '%sp' % height if height else None,
                    # height is None when the URL carries no "<N>p." marker;
                    # int_or_none keeps such formats instead of raising
                    # TypeError on int(None)
                    'height': int_or_none(height),
                })

        # Fall back to the XML playlist service when the embed page did not
        # yield any format
        if not formats:
            playlist = self._download_xml(
                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
                video_id)
            for file_el in playlist.findall('./files/file'):
                path = file_el.get('path')
                if not path:
                    continue
                format_id = file_el.get('id') or file_el.get('label')
                formats.append({
                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
                        'dat/', '', 1),
                    'format_id': format_id,
                    'height': int_or_none(self._search_regex(
                        r'(\d+)[pP]', format_id or path, 'height',
                        default=None)),
                })
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)

        duration = int_or_none(self._search_regex(
            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }

View File

@ -324,7 +324,7 @@ class KalturaIE(InfoExtractor):
if captions: if captions:
for caption in captions.get('objects', []): for caption in captions.get('objects', []):
# Continue if caption is not ready # Continue if caption is not ready
if f.get('status') != 2: if caption.get('status') != 2:
continue continue
if not caption.get('id'): if not caption.get('id'):
continue continue

View File

@ -48,7 +48,7 @@ class KarriereVideosIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = (self._html_search_meta('title', webpage, default=None) or title = (self._html_search_meta('title', webpage, default=None) or
self._search_regex(r'<h1 class="title">([^<]+)</h1>')) self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'video title'))
video_id = self._search_regex( video_id = self._search_regex(
r'/config/video/(.+?)\.xml', webpage, 'video id') r'/config/video/(.+?)\.xml', webpage, 'video id')

View File

@ -83,7 +83,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
hls_url = rendition.find('./src').text hls_url = rendition.find('./src').text
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', hls_url, video_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id='hls')) m3u8_id='hls', fatal=False))
else: else:
# fms # fms
try: try:
@ -106,7 +106,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
}]) }])
except (KeyError, TypeError): except (KeyError, TypeError):
raise ExtractorError('Invalid rendition field.') raise ExtractorError('Invalid rendition field.')
self._sort_formats(formats) if formats:
self._sort_formats(formats)
return formats return formats
def _extract_subtitles(self, mdoc, mtvn_id): def _extract_subtitles(self, mdoc, mtvn_id):
@ -133,8 +134,11 @@ class MTVServicesInfoExtractor(InfoExtractor):
mediagen_url += 'acceptMethods=' mediagen_url += 'acceptMethods='
mediagen_url += 'hls' if use_hls else 'fms' mediagen_url += 'hls' if use_hls else 'fms'
mediagen_doc = self._download_xml(mediagen_url, video_id, mediagen_doc = self._download_xml(
'Downloading video urls') mediagen_url, video_id, 'Downloading video urls', fatal=False)
if mediagen_doc is False:
return None
item = mediagen_doc.find('./video/item') item = mediagen_doc.find('./video/item')
if item is not None and item.get('type') == 'text': if item is not None and item.get('type') == 'text':
@ -174,6 +178,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id) formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id)
# Some parts of complete video may be missing (e.g. missing Act 3 in
# http://www.southpark.de/alle-episoden/s14e01-sexual-healing)
if not formats:
return None
self._sort_formats(formats)
return { return {
'title': title, 'title': title,
'formats': formats, 'formats': formats,
@ -205,9 +216,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
title = xpath_text(idoc, './channel/title') title = xpath_text(idoc, './channel/title')
description = xpath_text(idoc, './channel/description') description = xpath_text(idoc, './channel/description')
entries = []
for item in idoc.findall('.//item'):
info = self._get_video_info(item, use_hls)
if info:
entries.append(info)
return self.playlist_result( return self.playlist_result(
[self._get_video_info(item, use_hls) for item in idoc.findall('.//item')], entries, playlist_title=title, playlist_description=description)
playlist_title=title, playlist_description=description)
def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
triforce_feed = self._parse_json(self._search_regex( triforce_feed = self._parse_json(self._search_regex(

View File

@ -0,0 +1,271 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import random
import re
import time
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
parse_duration,
try_get,
urlencode_postdata,
)
class NexxIE(InfoExtractor):
    """Extractor for videos addressed through the nexx API (videos/byid)."""
    _VALID_URL = r'https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/(?P<id>\d+)'
    _TESTS = [{
        # movie
        'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
        'md5': '16746bfc28c42049492385c989b26c4a',
        'info_dict': {
            'id': '128907',
            'ext': 'mp4',
            'title': 'Stiftung Warentest',
            'alt_title': 'Wie ein Test abläuft',
            'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
            'release_year': 2013,
            'creator': 'SPIEGEL TV',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 2509,
            'timestamp': 1384264416,
            'upload_date': '20131112',
        },
        'params': {
            'format': 'bestvideo',
        },
    }, {
        # episode
        'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858',
        'info_dict': {
            'id': '247858',
            'ext': 'mp4',
            'title': 'Return of the Golden Child (OV)',
            'description': 'md5:5d969537509a92b733de21bae249dc63',
            'release_year': 2017,
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1397,
            'timestamp': 1495033267,
            'upload_date': '20170517',
            'episode_number': 2,
            'season_number': 2,
        },
        'params': {
            'format': 'bestvideo',
            'skip_download': True,
        },
    }, {
        'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return API URLs for nexx videos embedded in webpage via JavaScript."""
        # Reference:
        # 1. https://nx-s.akamaized.net/files/201510/44.pdf

        entries = []

        # JavaScript Integration
        mobj = re.search(
            r'<script\b[^>]+\bsrc=["\']https?://require\.nexx(?:\.cloud|cdn\.com)/(?P<id>\d+)',
            webpage)
        if mobj:
            domain_id = mobj.group('id')
            for video_id in re.findall(
                    r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)',
                    webpage):
                entries.append(
                    'https://api.nexx.cloud/v3/%s/videos/byid/%s'
                    % (domain_id, video_id))

        # TODO: support more embed formats

        return entries

    @staticmethod
    def _extract_url(webpage):
        """Return the first embedded nexx API URL, or None if there is none."""
        urls = NexxIE._extract_urls(webpage)
        # Avoid IndexError on pages without a recognizable nexx embed
        return urls[0] if urls else None

    def _handle_error(self, response):
        """Raise ExtractorError when the API response reports a non-2xx status.

        A missing status is treated as 200.
        """
        status = int_or_none(try_get(
            response, lambda x: x['metadata']['status']) or 200)
        if 200 <= status < 300:
            return
        raise ExtractorError(
            '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']),
            expected=True)

    def _call_api(self, domain_id, path, video_id, data=None, headers={}):
        """POST form-encoded data to the nexx API and return its 'result'.

        Raises ExtractorError (via _handle_error) on an API-level error.
        """
        # Work on a copy: the default dict is shared between calls and the
        # caller's dict must not be modified either
        headers = dict(headers)
        headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
        result = self._download_json(
            'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id,
            'Downloading %s JSON' % path, data=urlencode_postdata(data),
            headers=headers)
        self._handle_error(result)
        return result['result']

    def _real_extract(self, url):
        """Initialize an API session, fetch the video metadata and formats."""
        mobj = re.match(self._VALID_URL, url)
        domain_id, video_id = mobj.group('domain_id', 'id')

        # Reverse engineered from JS code (see getDeviceID function)
        # NOTE: integer bounds only, random.randint rejects floats such as
        # 1e4 on Python 3.12+
        device_id = '%d:%d:%d%d' % (
            random.randint(1, 4), int(time.time()),
            random.randint(10000, 99999), random.randint(1, 9))

        result = self._call_api(domain_id, 'session/init', video_id, data={
            'nxp_devh': device_id,
            'nxp_userh': '',
            'precid': '0',
            'playlicense': '0',
            'screenx': '1920',
            'screeny': '1080',
            'playerversion': '6.0.00',
            'gateway': 'html5',
            'adGateway': '',
            'explicitlanguage': 'en-US',
            'addTextTemplates': '1',
            'addDomainData': '1',
            'addAdModel': '1',
        }, headers={
            'X-Request-Enable-Auth-Fallback': '1',
        })

        cid = result['general']['cid']

        # As described in [1] X-Request-Token generation algorithm is
        # as follows:
        #   md5( operation + domain_id + domain_secret )
        # where domain_secret is a static value that will be given by nexx.tv
        # as per [1]. Here is how this "secret" is generated (reversed
        # from _play.api.init function, search for clienttoken). So it's
        # actually not static and not that much of a secret.
        # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf
        secret = result['device']['clienttoken'][int(device_id[0]):]
        secret = secret[0:len(secret) - int(device_id[-1])]

        op = 'byid'

        # Reversed from JS code for _play.api.call function (search for
        # X-Request-Token)
        request_token = hashlib.md5(
            ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest()

        video = self._call_api(
            domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={
                'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description',
                'addInteractionOptions': '1',
                'addStatusDetails': '1',
                'addStreamDetails': '1',
                'addCaptions': '1',
                'addScenes': '1',
                'addHotSpots': '1',
                'addBumpers': '1',
                'captionFormat': 'data',
            }, headers={
                'X-Request-CID': cid,
                'X-Request-Token': request_token,
            })

        general = video['general']
        title = general['title']

        stream_data = video['streamdata']
        language = general.get('language_raw') or ''

        # TODO: reverse more cdns and formats

        cdn = stream_data['cdnType']
        # Explicit check instead of assert: asserts are stripped under -O and
        # this validates remote data
        if cdn != 'azure':
            raise ExtractorError('Unsupported cdn type: %s' % cdn)

        azure_locator = stream_data['azureLocator']

        AZURE_URL = 'http://nx-p%02d.akamaized.net/'

        # Prefer the HTTPS CDN shield, then the HTTP one; otherwise derive
        # the host from the numeric suffix of the azure account name
        for secure in ('s', ''):
            cdn_shield = stream_data.get('cdnShieldHTTP%s' % secure.upper())
            if cdn_shield:
                azure_base = 'http%s://%s' % (secure, cdn_shield)
                break
        else:
            azure_base = AZURE_URL % int(stream_data['azureAccount'].replace('nexxplayplus', ''))

        # A comma in the raw language field selects the "_manifest" variant
        is_ml = ',' in language
        azure_m3u8_url = '%s%s/%s_src%s.ism/Manifest(format=m3u8-aapl)' % (
            azure_base, azure_locator, video_id, ('_manifest' if is_ml else ''))

        protection_token = try_get(
            video, lambda x: x['protectiondata']['token'], compat_str)
        if protection_token:
            azure_m3u8_url += '?hdnts=%s' % protection_token

        formats = self._extract_m3u8_formats(
            azure_m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='%s-hls' % cdn)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'alt_title': general.get('subtitle'),
            'description': general.get('description'),
            'release_year': int_or_none(general.get('year')),
            'creator': general.get('studio') or general.get('studio_adref'),
            'thumbnail': try_get(
                video, lambda x: x['imagedata']['thumb'], compat_str),
            'duration': parse_duration(general.get('runtime')),
            'timestamp': int_or_none(general.get('uploaded')),
            'episode_number': int_or_none(try_get(
                video, lambda x: x['episodedata']['episode'])),
            'season_number': int_or_none(try_get(
                video, lambda x: x['episodedata']['season'])),
            'formats': formats,
        }
class NexxEmbedIE(InfoExtractor):
    """Extractor for embed.nexx.cloud / embed.nexxcdn.com iframe pages.

    Downloads the embed page and delegates to NexxIE with the API URL that
    NexxIE finds in it.
    """
    _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
        'md5': '16746bfc28c42049492385c989b26c4a',
        'info_dict': {
            'id': '161464',
            'ext': 'mp4',
            'title': 'Nervenkitzel Achterbahn',
            'alt_title': 'Karussellbauer in Deutschland',
            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
            'release_year': 2005,
            'creator': 'SPIEGEL TV',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 2761,
            'timestamp': 1394021479,
            'upload_date': '20140305',
        },
        'params': {
            'format': 'bestvideo',
            'skip_download': True,
        },
    }

    @staticmethod
    def _extract_urls(webpage):
        """Collect the src URLs of all nexx embed iframes in webpage."""
        # Reference:
        # 1. https://nx-s.akamaized.net/files/201510/44.pdf

        # iFrame Embed Integration
        iframe_re = r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1'
        embed_urls = []
        for match in re.finditer(iframe_re, webpage):
            embed_urls.append(match.group('url'))
        return embed_urls

    def _real_extract(self, url):
        """Resolve the embed page to the underlying NexxIE API URL."""
        embed_id = self._match_id(url)

        embed_page = self._download_webpage(url, embed_id)

        api_url = NexxIE._extract_url(embed_page)
        return self.url_result(api_url, ie=NexxIE.ie_key())

View File

@ -1,23 +1,22 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
import json import json
import datetime import datetime
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_parse_qs,
compat_urlparse, compat_urlparse,
) )
from ..utils import ( from ..utils import (
determine_ext,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
sanitized_Request,
xpath_text,
determine_ext,
urlencode_postdata, urlencode_postdata,
xpath_text,
) )
@ -83,9 +82,12 @@ class NiconicoIE(InfoExtractor):
'uploader_id': '312', 'uploader_id': '312',
}, },
'skip': 'The viewing period of the video you were searching for has expired.', 'skip': 'The viewing period of the video you were searching for has expired.',
}, {
'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
'only_matching': True,
}] }]
_VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico' _NETRC_MACHINE = 'niconico'
def _real_initialize(self): def _real_initialize(self):
@ -98,19 +100,24 @@ class NiconicoIE(InfoExtractor):
return True return True
# Log in # Log in
login_ok = True
login_form_strs = { login_form_strs = {
'mail': username, 'mail_tel': username,
'password': password, 'password': password,
} }
login_data = urlencode_postdata(login_form_strs) urlh = self._request_webpage(
request = sanitized_Request( 'https://account.nicovideo.jp/api/v1/login', None,
'https://secure.nicovideo.jp/secure/login', login_data) note='Logging in', errnote='Unable to log in',
login_results = self._download_webpage( data=urlencode_postdata(login_form_strs))
request, None, note='Logging in', errnote='Unable to log in') if urlh is False:
if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: login_ok = False
else:
parts = compat_urlparse.urlparse(urlh.geturl())
if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
login_ok = False
if not login_ok:
self._downloader.report_warning('unable to log in: bad username or password') self._downloader.report_warning('unable to log in: bad username or password')
return False return login_ok
return True
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -28,7 +28,7 @@ class NPOBaseIE(InfoExtractor):
class NPOIE(NPOBaseIE): class NPOIE(NPOBaseIE):
IE_NAME = 'npo' IE_NAME = 'npo'
IE_DESC = 'npo.nl and ntr.nl' IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?: (?:
npo:| npo:|
@ -38,7 +38,7 @@ class NPOIE(NPOBaseIE):
npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}|
ntr\.nl/(?:[^/]+/){2,}| ntr\.nl/(?:[^/]+/){2,}|
omroepwnl\.nl/video/fragment/[^/]+__| omroepwnl\.nl/video/fragment/[^/]+__|
zapp\.nl/[^/]+/[^/]+/ (?:zapp|npo3)\.nl/(?:[^/]+/){2}
) )
) )
(?P<id>[^/?#]+) (?P<id>[^/?#]+)
@ -146,6 +146,9 @@ class NPOIE(NPOBaseIE):
}, { }, {
'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870',
'only_matching': True,
}, { }, {
# live stream # live stream
'url': 'npo:LI_NL1_4188102', 'url': 'npo:LI_NL1_4188102',
@ -341,7 +344,7 @@ class NPOLiveIE(NPOBaseIE):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
live_id = self._search_regex( live_id = self._search_regex(
r'data-prid="([^"]+)"', webpage, 'live id') [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id')
return { return {
'_type': 'url_transparent', '_type': 'url_transparent',

View File

@ -11,6 +11,7 @@ from ..utils import (
get_element_by_class, get_element_by_class,
int_or_none, int_or_none,
js_to_json, js_to_json,
NO_DEFAULT,
parse_iso8601, parse_iso8601,
remove_start, remove_start,
strip_or_none, strip_or_none,
@ -198,6 +199,19 @@ class OnetPlIE(InfoExtractor):
'upload_date': '20170214', 'upload_date': '20170214',
'timestamp': 1487078046, 'timestamp': 1487078046,
}, },
}, {
# embedded via pulsembed
'url': 'http://film.onet.pl/pensjonat-nad-rozlewiskiem-relacja-z-planu-serialu/y428n0',
'info_dict': {
'id': '501235.965429946',
'ext': 'mp4',
'title': '"Pensjonat nad rozlewiskiem": relacja z planu serialu',
'upload_date': '20170622',
'timestamp': 1498159955,
},
'params': {
'skip_download': True,
},
}, { }, {
'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3', 'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3',
'only_matching': True, 'only_matching': True,
@ -212,13 +226,25 @@ class OnetPlIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
def _search_mvp_id(self, webpage, default=NO_DEFAULT):
    """Search webpage for the value of a data-mvp / data-params-mvp attribute.

    The default argument is forwarded unchanged to _search_regex.
    """
    mvp_id_re = r'data-(?:params-)?mvp=["\'](\d+\.\d+)'
    return self._search_regex(
        mvp_id_re, webpage, 'mvp id', default=default)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
mvp_id = self._search_regex( mvp_id = self._search_mvp_id(webpage, default=None)
r'data-params-mvp=["\'](\d+\.\d+)', webpage, 'mvp id')
if not mvp_id:
pulsembed_url = self._search_regex(
r'data-src=(["\'])(?P<url>(?:https?:)?//pulsembed\.eu/.+?)\1',
webpage, 'pulsembed url', group='url')
webpage = self._download_webpage(
pulsembed_url, video_id, 'Downloading pulsembed webpage')
mvp_id = self._search_mvp_id(webpage)
return self.url_result( return self.url_result(
'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id) 'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id)

View File

@ -3,12 +3,14 @@ import re
import base64 import base64
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
int_or_none,
float_or_none,
ExtractorError,
unsmuggle_url,
determine_ext, determine_ext,
ExtractorError,
float_or_none,
int_or_none,
try_get,
unsmuggle_url,
) )
from ..compat import compat_urllib_parse_urlencode from ..compat import compat_urllib_parse_urlencode
@ -39,13 +41,15 @@ class OoyalaBaseIE(InfoExtractor):
formats = [] formats = []
if cur_auth_data['authorized']: if cur_auth_data['authorized']:
for stream in cur_auth_data['streams']: for stream in cur_auth_data['streams']:
s_url = base64.b64decode( url_data = try_get(stream, lambda x: x['url']['data'], compat_str)
stream['url']['data'].encode('ascii')).decode('utf-8') if not url_data:
if s_url in urls: continue
s_url = base64.b64decode(url_data.encode('ascii')).decode('utf-8')
if not s_url or s_url in urls:
continue continue
urls.append(s_url) urls.append(s_url)
ext = determine_ext(s_url, None) ext = determine_ext(s_url, None)
delivery_type = stream['delivery_type'] delivery_type = stream.get('delivery_type')
if delivery_type == 'hls' or ext == 'm3u8': if delivery_type == 'hls' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native', re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native',
@ -65,7 +69,7 @@ class OoyalaBaseIE(InfoExtractor):
else: else:
formats.append({ formats.append({
'url': s_url, 'url': s_url,
'ext': ext or stream.get('delivery_type'), 'ext': ext or delivery_type,
'vcodec': stream.get('video_codec'), 'vcodec': stream.get('video_codec'),
'format_id': delivery_type, 'format_id': delivery_type,
'width': int_or_none(stream.get('width')), 'width': int_or_none(stream.get('width')),
@ -136,6 +140,11 @@ class OoyalaIE(OoyalaBaseIE):
'title': 'Divide Tool Path.mp4', 'title': 'Divide Tool Path.mp4',
'duration': 204.405, 'duration': 204.405,
} }
},
{
# empty stream['url']['data']
'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is',
'only_matching': True,
} }
] ]

View File

@ -10,13 +10,13 @@ from ..utils import (
class PandaTVIE(InfoExtractor): class PandaTVIE(InfoExtractor):
IE_DESC = '熊猫TV' IE_DESC = '熊猫TV'
_VALID_URL = r'http://(?:www\.)?panda\.tv/(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?panda\.tv/(?P<id>[0-9]+)'
_TEST = { _TESTS = [{
'url': 'http://www.panda.tv/10091', 'url': 'http://www.panda.tv/66666',
'info_dict': { 'info_dict': {
'id': '10091', 'id': '66666',
'title': 're:.+', 'title': 're:.+',
'uploader': '囚徒', 'uploader': '刘杀鸡',
'ext': 'flv', 'ext': 'flv',
'is_live': True, 'is_live': True,
}, },
@ -24,13 +24,16 @@ class PandaTVIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
'skip': 'Live stream is offline', 'skip': 'Live stream is offline',
} }, {
'url': 'https://www.panda.tv/66666',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
config = self._download_json( config = self._download_json(
'http://www.panda.tv/api_room?roomid=%s' % video_id, video_id) 'https://www.panda.tv/api_room?roomid=%s' % video_id, video_id)
error_code = config.get('errno', 0) error_code = config.get('errno', 0)
if error_code is not 0: if error_code is not 0:
@ -74,7 +77,7 @@ class PandaTVIE(InfoExtractor):
continue continue
for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))): for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))):
formats.append({ formats.append({
'url': 'http://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s' 'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s'
% (pl, plflag1, room_key, live_panda, suffix[quality], ext), % (pl, plflag1, room_key, live_panda, suffix[quality], ext),
'format_id': '%s-%s' % (k, ext), 'format_id': '%s-%s' % (k, ext),
'quality': quality, 'quality': quality,

View File

@ -0,0 +1,63 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
qualities,
unified_timestamp,
)
class PearVideoIE(InfoExtractor):
    """Extractor for pearvideo.com video pages."""
    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.pearvideo.com/video_1076290',
        'info_dict': {
            'id': '1076290',
            'ext': 'mp4',
            'title': '小浣熊在主人家玻璃上滚石头:没砸',
            'description': 'md5:01d576b747de71be0ee85eb7cac25f9d',
            'timestamp': 1494275280,
            'upload_date': '20170508',
        }
    }

    def _real_extract(self, url):
        """Scrape formats, title, description and timestamp from the page."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # Format ids listed from lowest to highest preference
        quality = qualities(
            ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src'))

        # Every "<id>Url = '<url>'" assignment in the page is one format
        formats = []
        for match in re.finditer(
                r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2',
                webpage):
            format_id = match.group('id')
            formats.append({
                'url': match.group('url'),
                'format_id': format_id,
                'quality': quality(format_id),
            })
        self._sort_formats(formats)

        title = self._search_regex(
            (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)',
             r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'),
            webpage, 'title', group='value')

        description = self._search_regex(
            (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)',
             r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'),
            webpage, 'description', default=None, group='value')
        if not description:
            # Fall back to the page's Description meta tag
            description = self._html_search_meta('Description', webpage)

        date_text = self._search_regex(
            r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)',
            webpage, 'timestamp', fatal=False)
        timestamp = unified_timestamp(date_text)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
        }

View File

@ -49,7 +49,7 @@ class PeriscopeIE(PeriscopeBaseIE):
@staticmethod @staticmethod
def _extract_url(webpage): def _extract_url(webpage):
mobj = re.search( mobj = re.search(
r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?periscope\.tv/(?:(?!\1).)+)\1', webpage) r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage)
if mobj: if mobj:
return mobj.group('url') return mobj.group('url')

View File

@ -191,11 +191,12 @@ class RaiPlayIE(RaiBaseIE):
info = { info = {
'id': video_id, 'id': video_id,
'title': title, 'title': self._live_title(title) if relinker_info.get(
'is_live') else title,
'alt_title': media.get('subtitle'), 'alt_title': media.get('subtitle'),
'description': media.get('description'), 'description': media.get('description'),
'uploader': media.get('channel'), 'uploader': strip_or_none(media.get('channel')),
'creator': media.get('editor'), 'creator': strip_or_none(media.get('editor')),
'duration': parse_duration(video.get('duration')), 'duration': parse_duration(video.get('duration')),
'timestamp': timestamp, 'timestamp': timestamp,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
@ -208,10 +209,46 @@ class RaiPlayIE(RaiBaseIE):
} }
info.update(relinker_info) info.update(relinker_info)
return info return info
class RaiPlayLiveIE(RaiBaseIE):
    # Matches raiplay.it "dirette" pages; the path segment after /dirette/
    # is captured as the id (used below as the display id).
    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'http://www.raiplay.it/dirette/rainews24',
        'info_dict': {
            'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c',
            'display_id': 'rainews24',
            'ext': 'mp4',
            'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': 'md5:6eca31500550f9376819f174e5644754',
            'uploader': 'Rai News 24',
            'creator': 'Rai News 24',
            'is_live': True,
        },
        'params': {
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Resolve a "dirette" page to its ContentItem and delegate to RaiPlayIE.

        The page is downloaded only to find the content id embedded in a
        ``data-uniquename`` attribute; the result is a ``url_transparent``
        entry pointing at the matching ContentItem page.
        """
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)

        content_id_re = (
            r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE)
        video_id = self._search_regex(content_id_re, page, 'content id')

        result = {
            '_type': 'url_transparent',
            'ie_key': RaiPlayIE.ie_key(),
        }
        result['url'] = (
            'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id)
        result['id'] = video_id
        result['display_id'] = display_id
        return result
class RaiIE(RaiBaseIE): class RaiIE(RaiBaseIE):
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_TESTS = [{ _TESTS = [{

View File

@ -13,7 +13,7 @@ from ..utils import (
class RedBullTVIE(InfoExtractor): class RedBullTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?P<id>AP-\w+)' _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film|live)/(?:AP-\w+/segment/)?(?P<id>AP-\w+)'
_TESTS = [{ _TESTS = [{
# film # film
'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', 'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc',
@ -42,6 +42,22 @@ class RedBullTVIE(InfoExtractor):
'season_number': 2, 'season_number': 2,
'episode_number': 4, 'episode_number': 4,
}, },
'params': {
'skip_download': True,
},
}, {
# segment
'url': 'https://www.redbull.tv/live/AP-1R5DX49XS1W11/segment/AP-1QSAQJ6V52111/semi-finals',
'info_dict': {
'id': 'AP-1QSAQJ6V52111',
'ext': 'mp4',
'title': 'Semi Finals - Vans Park Series Pro Tour',
'description': 'md5:306a2783cdafa9e65e39aa62f514fd97',
'duration': 11791.991,
},
'params': {
'skip_download': True,
},
}, { }, {
'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion', 'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion',
'only_matching': True, 'only_matching': True,
@ -82,7 +98,8 @@ class RedBullTVIE(InfoExtractor):
title = info['title'].strip() title = info['title'].strip()
formats = self._extract_m3u8_formats( formats = self._extract_m3u8_formats(
video['url'], video_id, 'mp4', 'm3u8_native') video['url'], video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
self._sort_formats(formats) self._sort_formats(formats)
subtitles = {} subtitles = {}

View File

@ -31,7 +31,7 @@ class SlideshareIE(InfoExtractor):
page_title = mobj.group('title') page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title) webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex( slideshare_obj = self._search_regex(
r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',
webpage, 'slideshare object') webpage, 'slideshare object')
info = json.loads(slideshare_obj) info = json.loads(slideshare_obj)
if info['slideshow']['type'] != 'video': if info['slideshow']['type'] != 'video':

View File

@ -136,7 +136,7 @@ class SoundcloudIE(InfoExtractor):
@classmethod @classmethod
def _resolv_url(cls, url): def _resolv_url(cls, url):
return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
track_id = compat_str(info['id']) track_id = compat_str(info['id'])
@ -174,7 +174,7 @@ class SoundcloudIE(InfoExtractor):
# We have to retrieve the url # We have to retrieve the url
format_dict = self._download_json( format_dict = self._download_json(
'http://api.soundcloud.com/i1/tracks/%s/streams' % track_id, 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id,
track_id, 'Downloading track url', query={ track_id, 'Downloading track url', query={
'client_id': self._CLIENT_ID, 'client_id': self._CLIENT_ID,
'secret_token': secret_token, 'secret_token': secret_token,
@ -236,7 +236,7 @@ class SoundcloudIE(InfoExtractor):
track_id = mobj.group('track_id') track_id = mobj.group('track_id')
if track_id is not None: if track_id is not None:
info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
full_title = track_id full_title = track_id
token = mobj.group('secret_token') token = mobj.group('secret_token')
if token: if token:
@ -261,7 +261,7 @@ class SoundcloudIE(InfoExtractor):
self.report_resolve(full_title) self.report_resolve(full_title)
url = 'http://soundcloud.com/%s' % resolve_title url = 'https://soundcloud.com/%s' % resolve_title
info_json_url = self._resolv_url(url) info_json_url = self._resolv_url(url)
info = self._download_json(info_json_url, full_title, 'Downloading info JSON') info = self._download_json(info_json_url, full_title, 'Downloading info JSON')
@ -290,7 +290,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
'id': '2284613', 'id': '2284613',
'title': 'The Royal Concept EP', 'title': 'The Royal Concept EP',
}, },
'playlist_mincount': 6, 'playlist_mincount': 5,
}, { }, {
'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
'only_matching': True, 'only_matching': True,
@ -304,7 +304,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
# extract simple title (uploader + slug of song title) # extract simple title (uploader + slug of song title)
slug_title = mobj.group('slug_title') slug_title = mobj.group('slug_title')
full_title = '%s/sets/%s' % (uploader, slug_title) full_title = '%s/sets/%s' % (uploader, slug_title)
url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
token = mobj.group('token') token = mobj.group('token')
if token: if token:
@ -380,7 +380,7 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
'url': 'https://soundcloud.com/grynpyret/spotlight', 'url': 'https://soundcloud.com/grynpyret/spotlight',
'info_dict': { 'info_dict': {
'id': '7098329', 'id': '7098329',
'title': 'GRYNPYRET (Spotlight)', 'title': 'Grynpyret (Spotlight)',
}, },
'playlist_mincount': 1, 'playlist_mincount': 1,
}] }]
@ -410,7 +410,7 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
uploader = mobj.group('user') uploader = mobj.group('user')
url = 'http://soundcloud.com/%s/' % uploader url = 'https://soundcloud.com/%s/' % uploader
resolv_url = self._resolv_url(url) resolv_url = self._resolv_url(url)
user = self._download_json( user = self._download_json(
resolv_url, uploader, 'Downloading user info') resolv_url, uploader, 'Downloading user info')
@ -473,7 +473,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
_VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
IE_NAME = 'soundcloud:playlist' IE_NAME = 'soundcloud:playlist'
_TESTS = [{ _TESTS = [{
'url': 'http://api.soundcloud.com/playlists/4110309', 'url': 'https://api.soundcloud.com/playlists/4110309',
'info_dict': { 'info_dict': {
'id': '4110309', 'id': '4110309',
'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from .nexx import NexxEmbedIE
from .spiegeltv import SpiegeltvIE from .spiegeltv import SpiegeltvIE
from ..compat import compat_urlparse from ..compat import compat_urlparse
from ..utils import ( from ..utils import (
@ -121,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor):
}, },
'playlist_count': 6, 'playlist_count': 6,
}, {
# Nexx iFrame embed
'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
'info_dict': {
'id': '161464',
'ext': 'mp4',
'title': 'Nervenkitzel Achterbahn',
'alt_title': 'Karussellbauer in Deutschland',
'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
'release_year': 2005,
'creator': 'SPIEGEL TV',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2761,
'timestamp': 1394021479,
'upload_date': '20140305',
},
'params': {
'format': 'bestvideo',
'skip_download': True,
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -143,6 +164,9 @@ class SpiegelArticleIE(InfoExtractor):
entries = [ entries = [
self.url_result(compat_urlparse.urljoin( self.url_result(compat_urlparse.urljoin(
self.http_scheme() + '//spiegel.de/', embed_path)) self.http_scheme() + '//spiegel.de/', embed_path))
for embed_path in embeds for embed_path in embeds]
] if embeds:
return self.playlist_result(entries) return self.playlist_result(entries)
return self.playlist_from_matches(
NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())

View File

@ -1,114 +1,17 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse from .nexx import NexxIE
from ..utils import (
determine_ext,
float_or_none,
)
class SpiegeltvIE(InfoExtractor): class SpiegeltvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?spiegel\.tv/(?:#/)?filme/(?P<id>[\-a-z0-9]+)' _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)'
_TESTS = [{ _TEST = {
'url': 'http://www.spiegel.tv/filme/flug-mh370/', 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/',
'info_dict': {
'id': 'flug-mh370',
'ext': 'm4v',
'title': 'Flug MH370',
'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines',
'thumbnail': r're:http://.*\.jpg$',
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
'url': 'http://www.spiegel.tv/#/filme/alleskino-die-wahrheit-ueber-maenner/',
'only_matching': True, 'only_matching': True,
}] }
def _real_extract(self, url): def _real_extract(self, url):
if '/#/' in url: return self.url_result(
url = url.replace('/#/', '/') 'https://api.nexx.cloud/v3/748/videos/byid/%s'
video_id = self._match_id(url) % self._match_id(url), ie=NexxIE.ie_key())
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title')
apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com'
version_json = self._download_json(
'%s/version.json' % apihost, video_id,
note='Downloading version information')
version_name = version_json['version_name']
slug_json = self._download_json(
'%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id),
video_id,
note='Downloading object information')
oid = slug_json['object_id']
media_json = self._download_json(
'%s/%s/restapi/media/%s.json' % (apihost, version_name, oid),
video_id, note='Downloading media information')
uuid = media_json['uuid']
is_wide = media_json['is_wide']
server_json = self._download_json(
'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
video_id, note='Downloading server information')
format = '16x9' if is_wide else '4x3'
formats = []
for streamingserver in server_json['streamingserver']:
endpoint = streamingserver.get('endpoint')
if not endpoint:
continue
play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
if endpoint.startswith('rtmp'):
formats.append({
'url': endpoint,
'format_id': 'rtmp',
'app': compat_urllib_parse_urlparse(endpoint).path[1:],
'play_path': play_path,
'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
'ext': 'flv',
'rtmp_live': True,
})
elif determine_ext(endpoint) == 'm3u8':
formats.append({
'url': endpoint.replace('[video]', play_path),
'ext': 'm4v',
'format_id': 'hls', # Prefer hls since it allows to workaround georestriction
'protocol': 'm3u8',
'preference': 1,
'http_headers': {
'Accept-Encoding': 'deflate', # gzip causes trouble on the server side
},
})
else:
formats.append({
'url': endpoint,
})
self._check_formats(formats, video_id)
thumbnails = []
for image in media_json['images']:
thumbnails.append({
'url': image['url'],
'width': image['width'],
'height': image['height'],
})
description = media_json['subtitle']
duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
return {
'id': video_id,
'title': title,
'description': description,
'duration': duration,
'thumbnails': thumbnails,
'formats': formats,
}

View File

@ -4,7 +4,11 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import js_to_json from ..utils import (
determine_ext,
int_or_none,
js_to_json,
)
class SportBoxEmbedIE(InfoExtractor): class SportBoxEmbedIE(InfoExtractor):
@ -14,8 +18,10 @@ class SportBoxEmbedIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '211355', 'id': '211355',
'ext': 'mp4', 'ext': 'mp4',
'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'title': '211355',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 292,
'view_count': int,
}, },
'params': { 'params': {
# m3u8 download # m3u8 download
@ -24,6 +30,9 @@ class SportBoxEmbedIE(InfoExtractor):
}, { }, {
'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://news.sportbox.ru/vdl/player/media/193095',
'only_matching': True,
}] }]
@staticmethod @staticmethod
@ -37,36 +46,34 @@ class SportBoxEmbedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
wjplayer_data = self._parse_json(
self._search_regex(
r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'),
video_id, transform_source=js_to_json)
formats = [] formats = []
for source in wjplayer_data['sources']:
def cleanup_js(code): src = source.get('src')
# desktop_advert_config contains complex Javascripts and we don't need it if not src:
return js_to_json(re.sub(r'desktop_advert_config.*', '', code)) continue
if determine_ext(src) == 'm3u8':
jwplayer_data = self._parse_json(self._search_regex( formats.extend(self._extract_m3u8_formats(
r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id, src, video_id, 'mp4', entry_protocol='m3u8_native',
transform_source=cleanup_js) m3u8_id='hls', fatal=False))
else:
hls_url = jwplayer_data.get('hls_url') formats.append({
if hls_url: 'url': src,
formats.extend(self._extract_m3u8_formats( })
hls_url, video_id, ext='mp4', m3u8_id='hls'))
rtsp_url = jwplayer_data.get('rtsp_url')
if rtsp_url:
formats.append({
'url': rtsp_url,
'format_id': 'rtsp',
})
self._sort_formats(formats) self._sort_formats(formats)
title = jwplayer_data['node_title'] view_count = int_or_none(self._search_regex(
thumbnail = jwplayer_data.get('image_url') r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': video_id,
'thumbnail': thumbnail, 'thumbnail': wjplayer_data.get('poster'),
'duration': int_or_none(wjplayer_data.get('duration')),
'view_count': view_count,
'formats': formats, 'formats': formats,
} }

View File

@ -0,0 +1,43 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from .ooyala import OoyalaIE
class TastyTradeIE(InfoExtractor):
    # Matches tastytrade.com episode pages; the final path segment is
    # captured as the id (used below as the display id).
    _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017',
        'info_dict': {
            'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
            'ext': 'mp4',
            'title': 'A History of Teaming',
            'description': 'md5:2a9033db8da81f2edffa4c99888140b3',
            'duration': 422.255,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': ['Ooyala'],
    }, {
        'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand an episode page over to the Ooyala extractor.

        The Ooyala embed code is read from the page's ``data-media-id``
        attribute. Metadata found via JSON-LD (looked up non-fatally) is
        kept and the delegation fields are layered on top of it.
        """
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)

        ooyala_code = self._search_regex(
            r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1',
            page, 'ooyala code', group='code')

        # Fields that turn the result into a transparent redirect to Ooyala
        delegation = {
            '_type': 'url_transparent',
            'ie_key': OoyalaIE.ie_key(),
            'url': 'ooyala:%s' % ooyala_code,
            'display_id': display_id,
        }

        info = self._search_json_ld(page, display_id, fatal=False)
        info.update(delegation)
        return info

View File

@ -8,6 +8,9 @@ from ..utils import extract_attributes
class TBSIE(TurnerBaseIE): class TBSIE(TurnerBaseIE):
# https://github.com/rg3/youtube-dl/issues/13658
_WORKING = False
_VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html' _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'
_TESTS = [{ _TESTS = [{
'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', 'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html',
@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Theatrical Trailer', 'title': 'Theatrical Trailer',
'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', 'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.',
} },
'skip': 'TBS videos are deleted after a while',
}, { }, {
'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', 'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',
'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', 'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56',
@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'title': 'You Better Run', 'title': 'You Better Run',
'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.', 'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.',
} },
'skip': 'TBS videos are deleted after a while',
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -6,7 +6,10 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
from ..utils import int_or_none from ..utils import (
int_or_none,
try_get,
)
class TEDIE(InfoExtractor): class TEDIE(InfoExtractor):
@ -113,8 +116,9 @@ class TEDIE(InfoExtractor):
} }
def _extract_info(self, webpage): def _extract_info(self, webpage):
info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>', info_json = self._search_regex(
webpage, 'info json') r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
webpage, 'info json')
return json.loads(info_json) return json.loads(info_json)
def _real_extract(self, url): def _real_extract(self, url):
@ -136,11 +140,16 @@ class TEDIE(InfoExtractor):
webpage = self._download_webpage(url, name, webpage = self._download_webpage(url, name,
'Downloading playlist webpage') 'Downloading playlist webpage')
info = self._extract_info(webpage) info = self._extract_info(webpage)
playlist_info = info['playlist']
playlist_info = try_get(
info, lambda x: x['__INITIAL_DATA__']['playlist'],
dict) or info['playlist']
playlist_entries = [ playlist_entries = [
self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
for talk in info['talks'] for talk in try_get(
info, lambda x: x['__INITIAL_DATA__']['talks'],
dict) or info['talks']
] ]
return self.playlist_result( return self.playlist_result(
playlist_entries, playlist_entries,
@ -149,9 +158,14 @@ class TEDIE(InfoExtractor):
def _talk_info(self, url, video_name): def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name) webpage = self._download_webpage(url, video_name)
self.report_extraction(video_name)
talk_info = self._extract_info(webpage)['talks'][0] info = self._extract_info(webpage)
talk_info = try_get(
info, lambda x: x['__INITIAL_DATA__']['talks'][0],
dict) or info['talks'][0]
title = talk_info['title'].strip()
external = talk_info.get('external') external = talk_info.get('external')
if external: if external:
@ -165,19 +179,27 @@ class TEDIE(InfoExtractor):
'url': ext_url or external['uri'], 'url': ext_url or external['uri'],
} }
native_downloads = try_get(
talk_info, lambda x: x['downloads']['nativeDownloads'],
dict) or talk_info['nativeDownloads']
formats = [{ formats = [{
'url': format_url, 'url': format_url,
'format_id': format_id, 'format_id': format_id,
'format': format_id, 'format': format_id,
} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None] } for (format_id, format_url) in native_downloads.items() if format_url is not None]
if formats: if formats:
for f in formats: for f in formats:
finfo = self._NATIVE_FORMATS.get(f['format_id']) finfo = self._NATIVE_FORMATS.get(f['format_id'])
if finfo: if finfo:
f.update(finfo) f.update(finfo)
player_talk = talk_info['player_talks'][0]
resources_ = player_talk.get('resources') or talk_info.get('resources')
http_url = None http_url = None
for format_id, resources in talk_info['resources'].items(): for format_id, resources in resources_.items():
if format_id == 'h264': if format_id == 'h264':
for resource in resources: for resource in resources:
h264_url = resource.get('file') h264_url = resource.get('file')
@ -237,14 +259,11 @@ class TEDIE(InfoExtractor):
video_id = compat_str(talk_info['id']) video_id = compat_str(talk_info['id'])
thumbnail = talk_info['thumb']
if not thumbnail.startswith('http'):
thumbnail = 'http://' + thumbnail
return { return {
'id': video_id, 'id': video_id,
'title': talk_info['title'].strip(), 'title': title,
'uploader': talk_info['speaker'], 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
'thumbnail': thumbnail, 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
'description': self._og_search_description(webpage), 'description': self._og_search_description(webpage),
'subtitles': self._get_subtitles(video_id, talk_info), 'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats, 'formats': formats,
@ -252,20 +271,22 @@ class TEDIE(InfoExtractor):
} }
def _get_subtitles(self, video_id, talk_info): def _get_subtitles(self, video_id, talk_info):
languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] sub_lang_list = {}
if languages: for language in try_get(
sub_lang_list = {} talk_info,
for l in languages: (lambda x: x['downloads']['languages'],
sub_lang_list[l] = [ lambda x: x['languages']), list):
{ lang_code = language.get('languageCode') or language.get('ianaCode')
'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), if not lang_code:
'ext': ext, continue
} sub_lang_list[lang_code] = [
for ext in ['ted', 'srt'] {
] 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
return sub_lang_list 'ext': ext,
else: }
return {} for ext in ['ted', 'srt']
]
return sub_lang_list
def _watch_info(self, url, name): def _watch_info(self, url, name):
webpage = self._download_webpage(url, name) webpage = self._download_webpage(url, name)

View File

@ -2,13 +2,15 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import try_get
class ThisOldHouseIE(InfoExtractor): class ThisOldHouseIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)' _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench',
'md5': '946f05bbaa12a33f9ae35580d2dfcfe3', 'md5': '568acf9ca25a639f0c4ff905826b662f',
'info_dict': { 'info_dict': {
'id': '2REGtUDQ', 'id': '2REGtUDQ',
'ext': 'mp4', 'ext': 'mp4',
@ -28,8 +30,15 @@ class ThisOldHouseIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
drupal_settings = self._parse_json(self._search_regex( video_id = self._search_regex(
r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'drupal settings'), display_id) r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'),
video_id = drupal_settings['jwplatform']['video_id'] webpage, 'video id', default=None, group='id')
if not video_id:
drupal_settings = self._parse_json(self._search_regex(
r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
webpage, 'drupal settings'), display_id)
video_id = try_get(
drupal_settings, lambda x: x['jwplatform']['video_id'],
compat_str) or list(drupal_settings['comScore'])[0]
return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id) return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id)

View File

@ -7,20 +7,38 @@ from .common import InfoExtractor
from ..compat import compat_urlparse from ..compat import compat_urlparse
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
float_or_none, dict_get,
xpath_text,
remove_end,
int_or_none,
ExtractorError, ExtractorError,
float_or_none,
int_or_none,
remove_end,
try_get,
xpath_text,
) )
from .periscope import PeriscopeIE from .periscope import PeriscopeIE
class TwitterBaseIE(InfoExtractor): class TwitterBaseIE(InfoExtractor):
def _get_vmap_video_url(self, vmap_url, video_id): def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_data = self._download_xml(vmap_url, video_id) vmap_data = self._download_xml(vmap_url, video_id)
return xpath_text(vmap_data, './/MediaFile').strip() video_url = xpath_text(vmap_data, './/MediaFile').strip()
if determine_ext(video_url) == 'm3u8':
return self._extract_m3u8_formats(
video_url, video_id, ext='mp4', m3u8_id='hls',
entry_protocol='m3u8_native')
return [{
'url': video_url,
}]
@staticmethod
def _search_dimensions_in_video_url(a_format, video_url):
m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
if m:
a_format.update({
'width': int(m.group('width')),
'height': int(m.group('height')),
})
class TwitterCardIE(TwitterBaseIE): class TwitterCardIE(TwitterBaseIE):
@ -36,7 +54,8 @@ class TwitterCardIE(TwitterBaseIE):
'title': 'Twitter Card', 'title': 'Twitter Card',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 30.033, 'duration': 30.033,
} },
'skip': 'Video gone',
}, },
{ {
'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
@ -48,6 +67,7 @@ class TwitterCardIE(TwitterBaseIE):
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
'duration': 80.155, 'duration': 80.155,
}, },
'skip': 'Video gone',
}, },
{ {
'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
@ -65,7 +85,7 @@ class TwitterCardIE(TwitterBaseIE):
}, },
{ {
'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568',
'md5': 'ab2745d0b0ce53319a534fccaa986439', 'md5': '6dabeaca9e68cbb71c99c322a4b42a11',
'info_dict': { 'info_dict': {
'id': 'iBb2x00UVlv', 'id': 'iBb2x00UVlv',
'ext': 'mp4', 'ext': 'mp4',
@ -73,16 +93,17 @@ class TwitterCardIE(TwitterBaseIE):
'uploader_id': '1189339351084113920', 'uploader_id': '1189339351084113920',
'uploader': 'ArsenalTerje', 'uploader': 'ArsenalTerje',
'title': 'Vine by ArsenalTerje', 'title': 'Vine by ArsenalTerje',
'timestamp': 1447451307,
}, },
'add_ie': ['Vine'], 'add_ie': ['Vine'],
}, { }, {
'url': 'https://twitter.com/i/videos/tweet/705235433198714880', 'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
'md5': '3846d0a07109b5ab622425449b59049d', 'md5': '884812a2adc8aaf6fe52b15ccbfa3b88',
'info_dict': { 'info_dict': {
'id': '705235433198714880', 'id': '705235433198714880',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Twitter web player', 'title': 'Twitter web player',
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*',
}, },
}, { }, {
'url': 'https://twitter.com/i/videos/752274308186120192', 'url': 'https://twitter.com/i/videos/752274308186120192',
@ -90,6 +111,59 @@ class TwitterCardIE(TwitterBaseIE):
}, },
] ]
def _parse_media_info(self, media_info, video_id):
formats = []
for media_variant in media_info.get('variants', []):
media_url = media_variant['url']
if media_url.endswith('.m3u8'):
formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
elif media_url.endswith('.mpd'):
formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
else:
vbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000)
a_format = {
'url': media_url,
'format_id': 'http-%d' % vbr if vbr else 'http',
'vbr': vbr,
}
# Reported bitRate may be zero
if not a_format['vbr']:
del a_format['vbr']
self._search_dimensions_in_video_url(a_format, media_url)
formats.append(a_format)
return formats
def _extract_mobile_formats(self, username, video_id):
    """Collect formats for a tweet through the mobile site and the API.

    Steps, in order:
      1. download the mobile status page of `username`/`video_id` with a
         mobile User-Agent;
      2. locate and download the page's main script;
      3. read BEARER_TOKEN from the script and the `gt` guest token from
         the page;
      4. request the conversation timeline JSON with both tokens;
      5. hand the first media entry's 'video_info' (or an empty dict when
         that path is absent) to _parse_media_info.

    Returns whatever _parse_media_info returns.
    """
    status_url = 'https://mobile.twitter.com/%s/status/%s' % (username, video_id)
    page_headers = {
        # A recent mobile UA is necessary for `gt` cookie
        'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0',
    }
    webpage = self._download_webpage(
        status_url, video_id, 'Downloading mobile webpage',
        headers=page_headers)

    script_url = self._html_search_regex(
        r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL')
    script_source = self._download_webpage(
        script_url, video_id, 'Downloading main script')

    bearer_token = self._search_regex(
        r'BEARER_TOKEN\s*:\s*"([^"]+)"',
        script_source, 'bearer token')
    guest_token = self._search_regex(
        r'document\.cookie\s*=\s*decodeURIComponent\("gt=(\d+)',
        webpage, 'guest token')

    api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json' % video_id
    api_headers = {
        'Authorization': 'Bearer ' + bearer_token,
        'x-guest-token': guest_token,
    }
    api_data = self._download_json(
        api_url, video_id, 'Downloading mobile API data',
        headers=api_headers)

    def first_video_info(o):
        tweet = o['globalObjects']['tweets'][video_id]
        return tweet['extended_entities']['media'][0]['video_info']

    media_info = try_get(api_data, first_video_info) or {}
    return self._parse_media_info(media_info, video_id)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -117,14 +191,6 @@ class TwitterCardIE(TwitterBaseIE):
if periscope_url: if periscope_url:
return self.url_result(periscope_url, PeriscopeIE.ie_key()) return self.url_result(periscope_url, PeriscopeIE.ie_key())
def _search_dimensions_in_video_url(a_format, video_url):
m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
if m:
a_format.update({
'width': int(m.group('width')),
'height': int(m.group('height')),
})
video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')
if video_url: if video_url:
@ -135,15 +201,14 @@ class TwitterCardIE(TwitterBaseIE):
'url': video_url, 'url': video_url,
} }
_search_dimensions_in_video_url(f, video_url) self._search_dimensions_in_video_url(f, video_url)
formats.append(f) formats.append(f)
vmap_url = config.get('vmapUrl') or config.get('vmap_url') vmap_url = config.get('vmapUrl') or config.get('vmap_url')
if vmap_url: if vmap_url:
formats.append({ formats.extend(
'url': self._get_vmap_video_url(vmap_url, video_id), self._extract_formats_from_vmap_url(vmap_url, video_id))
})
media_info = None media_info = None
@ -152,29 +217,14 @@ class TwitterCardIE(TwitterBaseIE):
media_info = entity['mediaInfo'] media_info = entity['mediaInfo']
if media_info: if media_info:
for media_variant in media_info['variants']: formats.extend(self._parse_media_info(media_info, video_id))
media_url = media_variant['url']
if media_url.endswith('.m3u8'):
formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
elif media_url.endswith('.mpd'):
formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
else:
vbr = int_or_none(media_variant.get('bitRate'), scale=1000)
a_format = {
'url': media_url,
'format_id': 'http-%d' % vbr if vbr else 'http',
'vbr': vbr,
}
# Reported bitRate may be zero
if not a_format['vbr']:
del a_format['vbr']
_search_dimensions_in_video_url(a_format, media_url)
formats.append(a_format)
duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9)
username = config.get('user', {}).get('screen_name')
if username:
formats.extend(self._extract_mobile_formats(username, video_id))
self._remove_duplicate_formats(formats)
self._sort_formats(formats) self._sort_formats(formats)
title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title') title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
@ -255,10 +305,10 @@ class TwitterIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '700207533655363584', 'id': '700207533655363584',
'ext': 'mp4', 'ext': 'mp4',
'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', 'title': 'Donte - BEAT PROD: @suhmeduh #Damndaniel',
'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', 'description': 'Donte on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
'uploader': 'JG', 'uploader': 'Donte',
'uploader_id': 'jaydingeer', 'uploader_id': 'jaydingeer',
}, },
'params': { 'params': {
@ -270,9 +320,11 @@ class TwitterIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'MIOxnrUteUd', 'id': 'MIOxnrUteUd',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', 'title': 'FilmDrunk - Vine of the day',
'uploader': 'TAKUMA', 'description': 'FilmDrunk on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"',
'uploader_id': '1004126642786242560', 'uploader': 'FilmDrunk',
'uploader_id': 'Filmdrunk',
'timestamp': 1402826626,
'upload_date': '20140615', 'upload_date': '20140615',
}, },
'add_ie': ['Vine'], 'add_ie': ['Vine'],
@ -294,13 +346,28 @@ class TwitterIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '1zqKVVlkqLaKB', 'id': '1zqKVVlkqLaKB',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence',
'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"',
'upload_date': '20160923', 'upload_date': '20160923',
'uploader_id': 'OPP_HSD', 'uploader_id': 'OPP_HSD',
'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', 'uploader': 'Sgt Kerry Schmidt',
'timestamp': 1474613214, 'timestamp': 1474613214,
}, },
'add_ie': ['Periscope'], 'add_ie': ['Periscope'],
}, {
# has mp4 formats via mobile API
'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
'info_dict': {
'id': '852138619213144067',
'ext': 'mp4',
'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"',
'uploader': 'عالم الأخبار',
'uploader_id': 'news_al3alm',
},
'params': {
'format': 'best[format_id^=http-]',
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -393,7 +460,7 @@ class TwitterAmplifyIE(TwitterBaseIE):
vmap_url = self._html_search_meta( vmap_url = self._html_search_meta(
'twitter:amplify:vmap', webpage, 'vmap url') 'twitter:amplify:vmap', webpage, 'vmap url')
video_url = self._get_vmap_video_url(vmap_url, video_id) formats = self._extract_formats_from_vmap_url(vmap_url, video_id)
thumbnails = [] thumbnails = []
thumbnail = self._html_search_meta( thumbnail = self._html_search_meta(
@ -415,11 +482,10 @@ class TwitterAmplifyIE(TwitterBaseIE):
}) })
video_w, video_h = _find_dimension('player') video_w, video_h = _find_dimension('player')
formats = [{ formats[0].update({
'url': video_url,
'width': video_w, 'width': video_w,
'height': video_h, 'height': video_h,
}] })
return { return {
'id': video_id, 'id': video_id,

View File

@ -12,47 +12,46 @@ from ..utils import (
class VeohIE(InfoExtractor): class VeohIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
_TESTS = [ _TESTS = [{
{ 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', 'md5': '620e68e6a3cff80086df3348426c9ca3',
'md5': '620e68e6a3cff80086df3348426c9ca3', 'info_dict': {
'info_dict': { 'id': '56314296',
'id': '56314296', 'ext': 'mp4',
'ext': 'mp4', 'title': 'Straight Backs Are Stronger',
'title': 'Straight Backs Are Stronger', 'uploader': 'LUMOback',
'uploader': 'LUMOback', 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
},
}, },
{ }, {
'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
'info_dict': { 'info_dict': {
'id': '27701988', 'id': '27701988',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Chile workers cover up to avoid skin damage', 'title': 'Chile workers cover up to avoid skin damage',
'description': 'md5:2bd151625a60a32822873efc246ba20d', 'description': 'md5:2bd151625a60a32822873efc246ba20d',
'uploader': 'afp-news', 'uploader': 'afp-news',
'duration': 123, 'duration': 123,
},
'skip': 'This video has been deleted.',
}, },
{ 'skip': 'This video has been deleted.',
'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', }, {
'md5': '4fde7b9e33577bab2f2f8f260e30e979', 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
'note': 'Embedded ooyala video', 'md5': '4fde7b9e33577bab2f2f8f260e30e979',
'info_dict': { 'note': 'Embedded ooyala video',
'id': '69525809', 'info_dict': {
'ext': 'mp4', 'id': '69525809',
'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', 'ext': 'mp4',
'description': 'md5:f5a11c51f8fb51d2315bca0937526891', 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
'uploader': 'newsy-videos', 'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
}, 'uploader': 'newsy-videos',
'skip': 'This video has been deleted.',
}, },
] 'skip': 'This video has been deleted.',
}, {
'url': 'http://www.veoh.com/watch/e152215AJxZktGS',
'only_matching': True,
}]
def _extract_formats(self, source): def _extract_formats(self, source):
formats = [] formats = []

View File

@ -121,7 +121,11 @@ class VH1IE(MTVIE):
idoc = self._download_xml( idoc = self._download_xml(
doc_url, video_id, doc_url, video_id,
'Downloading info', transform_source=fix_xml_ampersands) 'Downloading info', transform_source=fix_xml_ampersands)
return self.playlist_result(
[self._get_video_info(item) for item in idoc.findall('.//item')], entries = []
playlist_id=video_id, for item in idoc.findall('.//item'):
) info = self._get_video_info(item)
if info:
entries.append(info)
return self.playlist_result(entries, playlist_id=video_id)

View File

@ -56,7 +56,8 @@ class VidioIE(InfoExtractor):
self._sort_formats(formats) self._sort_formats(formats)
duration = int_or_none(duration or self._search_regex( duration = int_or_none(duration or self._search_regex(
r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration')) r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage,
'duration', fatal=False, group='duration'))
thumbnail = thumbnail or self._og_search_thumbnail(webpage) thumbnail = thumbnail or self._og_search_thumbnail(webpage)
like_count = int_or_none(self._search_regex( like_count = int_or_none(self._search_regex(

View File

@ -15,7 +15,21 @@ from ..utils import (
class VierIE(InfoExtractor): class VierIE(InfoExtractor):
IE_NAME = 'vier' IE_NAME = 'vier'
IE_DESC = 'vier.be and vijf.be' IE_DESC = 'vier.be and vijf.be'
_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' _VALID_URL = r'''(?x)
https?://
(?:www\.)?(?P<site>vier|vijf)\.be/
(?:
(?:
[^/]+/videos|
video(?:/[^/]+)*
)/
(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|
(?:
video/v3/embed|
embed/video/public
)/(?P<embed_id>\d+)
)
'''
_NETRC_MACHINE = 'vier' _NETRC_MACHINE = 'vier'
_TESTS = [{ _TESTS = [{
'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
@ -83,6 +97,15 @@ class VierIE(InfoExtractor):
}, { }, {
'url': 'http://www.vier.be/video/v3/embed/16129', 'url': 'http://www.vier.be/video/v3/embed/16129',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.vijf.be/embed/video/public/4093',
'only_matching': True,
}, {
'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics',
'only_matching': True,
}, {
'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6',
'only_matching': True,
}] }]
def _real_initialize(self): def _real_initialize(self):
@ -133,14 +156,20 @@ class VierIE(InfoExtractor):
video_id = self._search_regex( video_id = self._search_regex(
[r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
webpage, 'video id', default=video_id or display_id) webpage, 'video id', default=video_id or display_id)
application = self._search_regex(
[r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
webpage, 'application', default=site + '_vod')
filename = self._search_regex(
[r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
webpage, 'filename')
playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) playlist_url = self._search_regex(
r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1',
webpage, 'm3u8 url', default=None, group='url')
if not playlist_url:
application = self._search_regex(
[r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
webpage, 'application', default=site + '_vod')
filename = self._search_regex(
[r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
webpage, 'filename')
playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
formats = self._extract_wowza_formats( formats = self._extract_wowza_formats(
playlist_url, display_id, skip_protocols=['dash']) playlist_url, display_id, skip_protocols=['dash'])
self._sort_formats(formats) self._sort_formats(formats)

View File

@ -92,10 +92,12 @@ class VineIE(InfoExtractor):
username = data.get('username') username = data.get('username')
alt_title = 'Vine by %s' % username if username else None
return { return {
'id': video_id, 'id': video_id,
'title': data.get('description'), 'title': data.get('description') or alt_title or 'Vine video',
'alt_title': 'Vine by %s' % username if username else None, 'alt_title': alt_title,
'thumbnail': data.get('thumbnailUrl'), 'thumbnail': data.get('thumbnailUrl'),
'timestamp': unified_timestamp(data.get('created')), 'timestamp': unified_timestamp(data.get('created')),
'uploader': username, 'uploader': username,

View File

@ -49,6 +49,10 @@ class VLiveIE(InfoExtractor):
}, },
}] }]
@classmethod
def suitable(cls, url):
    """Reject URLs that VLivePlaylistIE accepts; otherwise defer to the parent check."""
    if VLivePlaylistIE.suitable(url):
        return False
    return super(VLiveIE, cls).suitable(url)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -261,3 +265,54 @@ class VLiveChannelIE(InfoExtractor):
return self.playlist_result( return self.playlist_result(
entries, channel_code, channel_name) entries, channel_code, channel_name)
class VLivePlaylistIE(InfoExtractor):
    # Handles vlive.tv URLs of the form /video/<video_id>/playlist/<id>.
    IE_NAME = 'vlive:playlist'
    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.vlive.tv/video/22867/playlist/22912',
        'info_dict': {
            'id': '22912',
            'title': 'Valentine Day Message from TWICE'
        },
        'playlist_mincount': 9
    }

    def _real_extract(self, url):
        """Return the playlist, or only the video named in the URL.

        When the 'noplaylist' downloader param is set, a url_result for the
        single video is returned.  Otherwise the playlist page is fetched,
        the `playlistVideoSeqs` JSON array is read from it, and one
        url_result per listed id is returned as a playlist.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id, playlist_id = url_match.group('video_id', 'id')

        video_url_tmpl = 'http://www.vlive.tv/video/%s'

        if self._downloader.params.get('noplaylist'):
            self.to_screen(
                'Downloading just video %s because of --no-playlist' % video_id)
            return self.url_result(
                video_url_tmpl % video_id,
                ie=VLiveIE.ie_key(), video_id=video_id)

        self.to_screen(
            'Downloading playlist %s - add --no-playlist to just download video'
            % playlist_id)

        playlist_page_url = (
            'http://www.vlive.tv/video/%s/playlist/%s'
            % (video_id, playlist_id))
        webpage = self._download_webpage(playlist_page_url, playlist_id)

        seqs_json = self._search_regex(
            r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage,
            'playlist video seqs')
        item_ids = self._parse_json(seqs_json, playlist_id)

        entries = []
        for item_id in item_ids:
            entries.append(self.url_result(
                video_url_tmpl % item_id, ie=VLiveIE.ie_key(),
                video_id=compat_str(item_id)))

        # The title is optional: a page without it still yields a playlist
        playlist_name = self._html_search_regex(
            r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)',
            webpage, 'playlist title', fatal=False)

        return self.playlist_result(entries, playlist_id, playlist_name)

View File

@ -13,7 +13,7 @@ class WSJIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?: (?:
https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
https?://(?:www\.)?wsj\.com/video/[^/]+/| https?://(?:www\.)?(?:wsj|barrons)\.com/video/[^/]+/|
wsj: wsj:
) )
(?P<id>[a-fA-F0-9-]{36}) (?P<id>[a-fA-F0-9-]{36})
@ -35,6 +35,9 @@ class WSJIE(InfoExtractor):
}, { }, {
'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html', 'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',
'only_matching': True, 'only_matching': True,
}, {
'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -157,7 +157,7 @@ class XFileShareIE(InfoExtractor):
def extract_formats(default=NO_DEFAULT): def extract_formats(default=NO_DEFAULT):
urls = [] urls = []
for regex in ( for regex in (
r'file\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1', r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)', r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
clean_html, clean_html,
dict_get, dict_get,
@ -14,12 +15,21 @@ from ..utils import (
class XHamsterIE(InfoExtractor): class XHamsterIE(InfoExtractor):
_VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?' _VALID_URL = r'''(?x)
https?://
(?:.+?\.)?xhamster\.com/
(?:
movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html|
videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+)
)
'''
_TESTS = [{ _TESTS = [{
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
'md5': '8281348b8d3c53d39fffb377d24eac4e', 'md5': '8281348b8d3c53d39fffb377d24eac4e',
'info_dict': { 'info_dict': {
'id': '1509445', 'id': '1509445',
'display_id': 'femaleagent_shy_beauty_takes_the_bait',
'ext': 'mp4', 'ext': 'mp4',
'title': 'FemaleAgent Shy beauty takes the bait', 'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014', 'upload_date': '20121014',
@ -32,6 +42,7 @@ class XHamsterIE(InfoExtractor):
'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
'info_dict': { 'info_dict': {
'id': '2221348', 'id': '2221348',
'display_id': 'britney_spears_sexy_booty',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Britney Spears Sexy Booty', 'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914', 'upload_date': '20130914',
@ -66,26 +77,18 @@ class XHamsterIE(InfoExtractor):
# This video is visible for marcoalfa123456's friends only # This video is visible for marcoalfa123456's friends only
'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html', 'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html',
'only_matching': True, 'only_matching': True,
}, {
# new URL schema
'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
def extract_video_url(webpage, name):
    # Try each known markup variant in turn and return the captured 'mp4'
    # group; `name` is the label passed on to _search_regex.
    video_url_patterns = [
        r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
        r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
        r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>''',
    ]
    return self._search_regex(
        video_url_patterns, webpage, name, group='mp4')
def is_hd(webpage):
    # True when the page contains the HD icon markup
    hd_icon_markup = '<div class=\'icon iconHD\''
    return hd_icon_markup in webpage
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') or mobj.group('id_2')
display_id = mobj.group('display_id') or mobj.group('display_id_2')
video_id = mobj.group('id') webpage = self._download_webpage(url, video_id)
seo = mobj.group('seo')
proto = mobj.group('proto')
mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
webpage = self._download_webpage(mrss_url, video_id)
error = self._html_search_regex( error = self._html_search_regex(
r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',
@ -99,6 +102,39 @@ class XHamsterIE(InfoExtractor):
r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'], r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
webpage, 'title') webpage, 'title')
formats = []
format_urls = set()
sources = self._parse_json(
self._search_regex(
r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
default='{}'),
video_id, fatal=False)
for format_id, format_url in sources.items():
if not isinstance(format_url, compat_str):
continue
if format_url in format_urls:
continue
format_urls.add(format_url)
formats.append({
'format_id': format_id,
'url': format_url,
'height': int_or_none(self._search_regex(
r'^(\d+)[pP]', format_id, 'height', default=None))
})
video_url = self._search_regex(
[r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
webpage, 'video url', group='mp4', default=None)
if video_url and video_url not in format_urls:
formats.append({
'url': video_url,
})
self._sort_formats(formats)
# Only a few videos have an description # Only a few videos have an description
mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
description = mobj.group(1) if mobj else None description = mobj.group(1) if mobj else None
@ -117,7 +153,8 @@ class XHamsterIE(InfoExtractor):
webpage, 'thumbnail', fatal=False, group='thumbnail') webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = parse_duration(self._search_regex( duration = parse_duration(self._search_regex(
r'Runtime:\s*</span>\s*([\d:]+)', webpage, [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
r'Runtime:\s*</span>\s*([\d:]+)'], webpage,
'duration', fatal=False)) 'duration', fatal=False))
view_count = int_or_none(self._search_regex( view_count = int_or_none(self._search_regex(
@ -132,30 +169,6 @@ class XHamsterIE(InfoExtractor):
age_limit = self._rta_search(webpage) age_limit = self._rta_search(webpage)
hd = is_hd(webpage)
format_id = 'hd' if hd else 'sd'
video_url = extract_video_url(webpage, format_id)
formats = [{
'url': video_url,
'format_id': 'hd' if hd else 'sd',
'preference': 1,
}]
if not hd:
mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
if is_hd(webpage):
video_url = extract_video_url(webpage, 'hd')
formats.append({
'url': video_url,
'format_id': 'hd',
'preference': 2,
})
self._sort_formats(formats)
categories_html = self._search_regex( categories_html = self._search_regex(
r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage, r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage,
'categories', default=None) 'categories', default=None)
@ -164,6 +177,7 @@ class XHamsterIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'display_id': display_id,
'title': title, 'title': title,
'description': description, 'description': description,
'upload_date': upload_date, 'upload_date': upload_date,

View File

@ -1,14 +1,13 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import base64
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
float_or_none,
get_element_by_attribute,
parse_iso8601, parse_iso8601,
parse_duration, remove_end,
) )
@ -24,6 +23,7 @@ class XuiteIE(InfoExtractor):
'id': '3860914', 'id': '3860914',
'ext': 'mp3', 'ext': 'mp3',
'title': '孤單南半球-歐德陽', 'title': '孤單南半球-歐德陽',
'description': '孤單南半球-歐德陽',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 247.246, 'duration': 247.246,
'timestamp': 1314932940, 'timestamp': 1314932940,
@ -44,7 +44,7 @@ class XuiteIE(InfoExtractor):
'duration': 596.458, 'duration': 596.458,
'timestamp': 1454242500, 'timestamp': 1454242500,
'upload_date': '20160131', 'upload_date': '20160131',
'uploader': 'yan12125', 'uploader': '屁姥',
'uploader_id': '12158353', 'uploader_id': '12158353',
'categories': ['個人短片'], 'categories': ['個人短片'],
'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4', 'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4',
@ -72,10 +72,10 @@ class XuiteIE(InfoExtractor):
# from http://forgetfulbc.blogspot.com/2016/06/date.html # from http://forgetfulbc.blogspot.com/2016/06/date.html
'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0', 'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
'info_dict': { 'info_dict': {
'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==', 'id': '27447336',
'ext': 'mp4', 'ext': 'mp4',
'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)', 'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
'description': 'md5:f0abdcb69df300f522a5442ef3146f2a', 'description': 'md5:1223810fa123b179083a3aed53574706',
'timestamp': 1466160960, 'timestamp': 1466160960,
'upload_date': '20160617', 'upload_date': '20160617',
'uploader': 'B.C. & Lowy', 'uploader': 'B.C. & Lowy',
@ -86,29 +86,9 @@ class XuiteIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def base64_decode_utf8(data):
return base64.b64decode(data.encode('utf-8')).decode('utf-8')
@staticmethod
def base64_encode_utf8(data):
return base64.b64encode(data.encode('utf-8')).decode('utf-8')
def _extract_flv_config(self, encoded_media_id):
    """Download the flash player config XML and return its properties as a dict.

    Each './property' element contributes one entry: the key is the
    base64-decoded 'id' attribute, the value is the base64-decoded element
    text passed through compat_urllib_parse_unquote.  Properties with no
    text are left out.
    """
    config_url = 'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id
    flv_config = self._download_xml(config_url, 'flv config')

    prop_dict = {}
    for prop in flv_config.findall('./property'):
        prop_id = self.base64_decode_utf8(prop.attrib['id'])
        # CDATA may be empty in flv config
        if prop.text:
            decoded_text = self.base64_decode_utf8(prop.text)
            prop_dict[prop_id] = compat_urllib_parse_unquote(decoded_text)
    return prop_dict
def _real_extract(self, url): def _real_extract(self, url):
# /play/ URLs provide embedded video URL and more metadata
url = url.replace('/embed/', '/play/')
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
@ -121,51 +101,53 @@ class XuiteIE(InfoExtractor):
'%s returned error: %s' % (self.IE_NAME, error_msg), '%s returned error: %s' % (self.IE_NAME, error_msg),
expected=True) expected=True)
encoded_media_id = self._search_regex( media_info = self._parse_json(self._search_regex(
r'attributes\.name\s*=\s*"([^"]+)"', webpage, r'var\s+mediaInfo\s*=\s*({.*});', webpage, 'media info'), video_id)
'encoded media id', default=None)
if encoded_media_id is None:
video_id = self._html_search_regex(
r'data-mediaid="(\d+)"', webpage, 'media id')
encoded_media_id = self.base64_encode_utf8(video_id)
flv_config = self._extract_flv_config(encoded_media_id)
FORMATS = { video_id = media_info['MEDIA_ID']
'audio': 'mp3',
'video': 'mp4',
}
formats = [] formats = []
for format_tag in ('src', 'hq_src'): for key in ('html5Url', 'html5HQUrl'):
video_url = flv_config.get(format_tag) video_url = media_info.get(key)
if not video_url: if not video_url:
continue continue
format_id = self._search_regex( format_id = self._search_regex(
r'\bq=(.+?)\b', video_url, 'format id', default=format_tag) r'\bq=(.+?)\b', video_url, 'format id', default=None)
formats.append({ formats.append({
'url': video_url, 'url': video_url,
'ext': FORMATS.get(flv_config['type'], 'mp4'), 'ext': 'mp4' if format_id.isnumeric() else format_id,
'format_id': format_id, 'format_id': format_id,
'height': int(format_id) if format_id.isnumeric() else None, 'height': int(format_id) if format_id.isnumeric() else None,
}) })
self._sort_formats(formats) self._sort_formats(formats)
timestamp = flv_config.get('publish_datetime') timestamp = media_info.get('PUBLISH_DATETIME')
if timestamp: if timestamp:
timestamp = parse_iso8601(timestamp + ' +0800', ' ') timestamp = parse_iso8601(timestamp + ' +0800', ' ')
category = flv_config.get('category') category = media_info.get('catName')
categories = [category] if category else [] categories = [category] if category else []
uploader = media_info.get('NICKNAME')
uploader_url = None
author_div = get_element_by_attribute('itemprop', 'author', webpage)
if author_div:
uploader = uploader or self._html_search_meta('name', author_div)
uploader_url = self._html_search_regex(
r'<link[^>]+itemprop="url"[^>]+href="([^"]+)"', author_div,
'uploader URL', fatal=False)
return { return {
'id': video_id, 'id': video_id,
'title': flv_config['title'], 'title': media_info['TITLE'],
'description': flv_config.get('description'), 'description': remove_end(media_info.get('metaDesc'), ' (Xuite 影音)'),
'thumbnail': flv_config.get('thumb'), 'thumbnail': media_info.get('ogImageUrl'),
'timestamp': timestamp, 'timestamp': timestamp,
'uploader': flv_config.get('author_name'), 'uploader': uploader,
'uploader_id': flv_config.get('author_id'), 'uploader_id': media_info.get('MEMBER_ID'),
'duration': parse_duration(flv_config.get('duration')), 'uploader_url': uploader_url,
'duration': float_or_none(media_info.get('MEDIA_DURATION'), 1000000),
'categories': categories, 'categories': categories,
'formats': formats, 'formats': formats,
} }

View File

@ -1,123 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
float_or_none,
month_by_abbreviation,
ExtractorError,
get_element_by_attribute,
)
class YamIE(InfoExtractor):
    """Extractor for media pages hosted on mymedia.yam.com.

    A media page is either an error page carrying a system message, a page
    that embeds a YouTube player (extraction is then delegated to the
    YouTube extractor), or a page whose media URL and duration are served
    by a separate API endpoint as a URL-encoded query string.
    """

    IE_DESC = '蕃薯藤yam天空部落'
    _VALID_URL = r'https?://mymedia\.yam\.com/m/(?P<id>\d+)'

    _TESTS = [{
        # An audio hosted on Yam
        'url': 'http://mymedia.yam.com/m/2283921',
        'md5': 'c011b8e262a52d5473d9c2e3c9963b9c',
        'info_dict': {
            'id': '2283921',
            'ext': 'mp3',
            'title': '發現 - 趙薇 京華煙雲主題曲',
            'description': '發現 - 趙薇 京華煙雲主題曲',
            'uploader_id': 'princekt',
            'upload_date': '20080807',
            'duration': 313.0,
        }
    }, {
        # An external video hosted on YouTube
        'url': 'http://mymedia.yam.com/m/3599430',
        'md5': '03127cf10d8f35d120a9e8e52e3b17c6',
        'info_dict': {
            'id': 'CNpEoQlrIgA',
            'ext': 'mp4',
            'upload_date': '20150306',
            'uploader': '新莊社大瑜伽社',
            'description': 'md5:11e2e405311633ace874f2e6226c8b17',
            'uploader_id': '2323agoy',
            'title': '20090412陽明山二子坪-1',
        },
        'skip': 'Video does not exist',
    }, {
        'url': 'http://mymedia.yam.com/m/3598173',
        'info_dict': {
            'id': '3598173',
            'ext': 'mp4',
        },
        'skip': 'cause Yam system error',
    }, {
        'url': 'http://mymedia.yam.com/m/3599437',
        'info_dict': {
            'id': '3599437',
            'ext': 'mp4',
        },
        'skip': 'invalid YouTube URL',
    }, {
        'url': 'http://mymedia.yam.com/m/2373534',
        'md5': '7ff74b91b7a817269d83796f8c5890b1',
        'info_dict': {
            'id': '2373534',
            'ext': 'mp3',
            'title': '林俊傑&蔡卓妍-小酒窩',
            'description': 'md5:904003395a0fcce6cfb25028ff468420',
            'upload_date': '20080928',
            'uploader_id': 'onliner2',
        }
    }]

    def _real_extract(self, url):
        """Extract media information for a single Yam media page.

        Returns either a URL result pointing at the embedded YouTube video
        or an info dict with the direct media URL and metadata.

        Raises ExtractorError (expected) when the page shows a system
        message instead of a media item.
        """
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        # Check for errors
        system_msg = self._html_search_regex(
            r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message',
            default=None)
        if system_msg:
            raise ExtractorError(system_msg, expected=True)

        # Is it hosted externally on YouTube?  Dots are escaped so that
        # only the real host matches, and https embeds are accepted too.
        youtube_url = self._html_search_regex(
            r'<embed src="(https?://www\.youtube\.com/[^"]+)"',
            page, 'YouTube url', default=None)
        if youtube_url:
            return self.url_result(youtube_url, 'Youtube')

        title = self._html_search_regex(
            r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title')

        api_page = self._download_webpage(
            'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
            note='Downloading API page')
        # The API answers with a query string; parse_qs maps every key to
        # a list of values.
        api_result_obj = compat_urlparse.parse_qs(api_page)

        # The info table may be absent; searching None would raise a
        # TypeError that fatal=False does not cover, so guard explicitly.
        info_table = get_element_by_attribute('class', 'info', page)
        uploader_id = None
        if info_table:
            uploader_id = self._html_search_regex(
                r'<!-- 發表作者 -->[\n ]+<a href="/([a-z0-9]+)"',
                info_table, 'uploader id', fatal=False)

        # upload_date is optional: leave it unset when the date is missing
        # or the month abbreviation is not recognised.
        upload_date = None
        mobj = re.search(
            r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+'
            r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
        if mobj:
            month = month_by_abbreviation(mobj.group('mon'))
            if month is not None:
                upload_date = '%s%02d%02d' % (
                    mobj.group('year'), month, int(mobj.group('day')))

        # duration is optional metadata; a missing 'totaltime' field must
        # not abort the extraction.  The value is divided by 1000.
        total_time = api_result_obj.get('totaltime')
        duration = float_or_none(
            total_time[0] if total_time else None, scale=1000)

        return {
            'id': video_id,
            # The media URL is mandatory, so a missing key is still fatal.
            'url': api_result_obj['mp3file'][0],
            'title': title,
            'description': self._html_search_meta('description', page),
            'duration': duration,
            'uploader_id': uploader_id,
            'upload_date': upload_date,
        }

View File

@ -1,7 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools
import random import random
import re import re
import string import string
@ -14,7 +13,6 @@ from ..utils import (
js_to_json, js_to_json,
str_or_none, str_or_none,
strip_jsonp, strip_jsonp,
urljoin,
) )
@ -222,17 +220,42 @@ class YoukuShowIE(InfoExtractor):
_VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'
IE_NAME = 'youku:show' IE_NAME = 'youku:show'
_TEST = { _TESTS = [{
'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', 'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',
'info_dict': { 'info_dict': {
'id': 'zc7c670be07ff11e48b3f', 'id': 'zc7c670be07ff11e48b3f',
'title': '花千骨 未删减', 'title': '花千骨 DVD',
'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', 'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',
}, },
'playlist_count': 50, 'playlist_count': 50,
} }, {
# Episode number not starting from 1
'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html',
'info_dict': {
'id': 'zefbfbd70efbfbd780bef',
'title': '超级飞侠3',
'description': 'md5:275715156abebe5ccc2a1992e9d56b98',
},
'playlist_count': 24,
}, {
# Ongoing playlist. The initial page is the last one
'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
'only_matching': True,
}]
_PAGE_SIZE = 40 def _extract_entries(self, playlist_data_url, show_id, note, query):
query['callback'] = 'cb'
playlist_data = self._download_json(
playlist_data_url, show_id, query=query, note=note,
transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
get_element_by_class('p-drama-half-row', playlist_data))
if drama_list is None:
raise ExtractorError('No episodes found')
video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
return playlist_data, [
self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
for video_url in video_urls]
def _real_extract(self, url): def _real_extract(self, url):
show_id = self._match_id(url) show_id = self._match_id(url)
@ -242,30 +265,29 @@ class YoukuShowIE(InfoExtractor):
page_config = self._parse_json(self._search_regex( page_config = self._parse_json(self._search_regex(
r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'), r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
show_id, transform_source=js_to_json) show_id, transform_source=js_to_json)
for idx in itertools.count(0): first_page, initial_entries = self._extract_entries(
if idx == 0: 'http://list.youku.com/show/module', show_id,
playlist_data_url = 'http://list.youku.com/show/module' note='Downloading initial playlist data page',
query = {'id': page_config['showid'], 'tab': 'point'} query={
else: 'id': page_config['showid'],
playlist_data_url = 'http://list.youku.com/show/point' 'tab': 'showInfo',
query = { })
'id': page_config['showid'], first_page_reload_id = self._html_search_regex(
'stage': 'reload_%d' % (self._PAGE_SIZE * idx + 1), r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
} # The first reload_id has the same items as first_page
query['callback'] = 'cb' reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
playlist_data = self._download_json( for idx, reload_id in enumerate(reload_ids):
playlist_data_url, show_id, query=query, if reload_id == first_page_reload_id:
entries.extend(initial_entries)
continue
_, new_entries = self._extract_entries(
'http://list.youku.com/show/episode', show_id,
note='Downloading playlist data page %d' % (idx + 1), note='Downloading playlist data page %d' % (idx + 1),
transform_source=lambda s: js_to_json(strip_jsonp(s)))['html'] query={
video_urls = re.findall( 'id': page_config['showid'],
r'<div[^>]+class="p-thumb"[^<]+<a[^>]+href="([^"]+)"', 'stage': reload_id,
playlist_data) })
new_entries = [
self.url_result(urljoin(url, video_url), YoukuIE.ie_key())
for video_url in video_urls]
entries.extend(new_entries) entries.extend(new_entries)
if len(new_entries) < self._PAGE_SIZE:
break
desc = self._html_search_meta('description', webpage, fatal=False) desc = self._html_search_meta('description', webpage, fatal=False)
playlist_title = desc.split(',')[0] if desc else None playlist_title = desc.split(',')[0] if desc else None

View File

@ -673,6 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}, },
}, },
# video_info is None (https://github.com/rg3/youtube-dl/issues/4421) # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
# YouTube Red ad is not captured for creator
{ {
'url': '__2ABJjxzNo', 'url': '__2ABJjxzNo',
'info_dict': { 'info_dict': {
@ -1649,7 +1650,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_webpage, 'license', default=None) video_webpage, 'license', default=None)
m_music = re.search( m_music = re.search(
r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', r'''(?x)
<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
<ul[^>]*>\s*
<li>(?P<title>.+?)
by (?P<creator>.+?)
(?:
\(.+?\)|
<a[^>]*
(?:
\bhref=["\']/red[^>]*>| # drop possible
>\s*Listen ad-free with YouTube Red # YouTube Red ad
)
.*?
)?</li
''',
video_webpage) video_webpage)
if m_music: if m_music:
video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))

View File

@ -542,7 +542,7 @@ class FFmpegFixupM3u8PP(FFmpegPostProcessor):
temp_filename = prepend_extension(filename, 'temp') temp_filename = prepend_extension(filename, 'temp')
options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename) self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename)
self.run_ffmpeg(filename, temp_filename, options) self.run_ffmpeg(filename, temp_filename, options)
os.remove(encodeFilename(filename)) os.remove(encodeFilename(filename))

View File

@ -365,9 +365,9 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
retlist = [] retlist = []
for m in re.finditer(r'''(?xs) for m in re.finditer(r'''(?xs)
<([a-zA-Z0-9:._-]+) <([a-zA-Z0-9:._-]+)
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s+%s=['"]?%s['"]? \s+%s=['"]?%s['"]?
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s*> \s*>
(?P<content>.*?) (?P<content>.*?)
</\1> </\1>

View File

@ -1,3 +1,3 @@
from __future__ import unicode_literals from __future__ import unicode_literals
__version__ = '2017.06.23' __version__ = '2017.07.23'