Compare commits

...

55 Commits

Author SHA1 Message Date
9b92a5917b release 2017.02.11 2017-02-11 03:24:00 +07:00
3e2274c8b7 [ChangeLog] Actualize 2017-02-11 17:08:22 +07:00
3d7e3aaa0e [pluralsight:course] Fix extraction (closes #12075) 2017-02-11 17:00:52 +07:00
624c4b92ff [facebook] Add coding cookie 2017-02-11 16:18:45 +07:00
2af12ad9d2 Introduce get_elements_by_class and get_elements_by_attribute utility functions 2017-02-11 17:16:54 +08:00
97eb9bd2ac [bbc] extract m3u8 formats with 320k audio 2017-02-10 19:46:15 +01:00
71cdd75628 [facebook] Relax video id matching (closes #11017, closes #12055, closes #12056) 2017-02-11 01:05:22 +07:00
c7d6f614f3 [corus] Add new extractor(closes #12060)(#9164) 2017-02-10 17:00:09 +01:00
08a00eef79 [extractor/common] skip m3u8 manifests protected with Adobe Flash Access 2017-02-10 17:00:09 +01:00
9dd5408c99 [pluralsight] Detect blocked account error message (#12070) 2017-02-10 22:48:11 +07:00
9510709575 [bloomberg] Add another video id regex (closes #12062) 2017-02-10 22:16:20 +07:00
5abcca9060 [sixplay] use raw string for regex 2017-02-10 09:34:59 +01:00
e01bfc19c3 [extractor/commonmistakes] Restrict _VALID_URL (closes #12050) 2017-02-10 09:39:24 +07:00
4d32b63851 [tvplayer] Add new extractor 2017-02-09 23:09:21 +01:00
55d4de2283 release 2017.02.10 2017-02-10 01:27:33 +07:00
61ee556aea [ChangeLog] Actualize 2017-02-10 01:26:00 +07:00
ff24261ba0 [kaltura] Add explicit port to regexes
They should not match e.g. cdnapi.kaltura.computernetworks.com/...
2017-02-10 01:24:14 +07:00
fbc6dc525e [xtube] Fix shortcuts 2017-02-10 01:06:23 +07:00
9150d1eb69 [xtube] Fix extraction (closes #12023) 2017-02-10 01:03:35 +07:00
b7f9843bec [pornhub] Simplify (closes #12018) 2017-02-10 00:57:44 +07:00
e64b0fca14 [pornhub] Fix extraction (closes #12007) 2017-02-10 00:56:12 +07:00
78ef214d2d [facebook] Improve JS data regex (closes #12042) 2017-02-09 23:42:40 +07:00
be670b8e8f [external:ffmpeg] do not assume that ffmpeg unknown version format is new 2017-02-09 17:36:59 +01:00
37084f6641 [kaltura] improve embed partner id extraction(fixes #12041) 2017-02-09 16:24:54 +01:00
b04975733c [sprout] Add new extractor 2017-02-09 09:13:29 +01:00
c8b8fb0a99 [sixplay] improve extraction
- skip drm protected formats
- extract more and better formats
- skip duplicate asset urls
2017-02-08 22:56:10 +01:00
8298018273 [scrippsnetworks:watch] Add new extractor(closes #10765) 2017-02-08 20:44:23 +01:00
ae8d5a5c59 [go] add support for adobe pass auth(closes #11468)(closes #10831) 2017-02-08 18:57:07 +01:00
b9c9cb5f79 [6play] Fix extraction (closes #12011) 2017-02-08 23:15:39 +07:00
fdf9b959bc [nbc] add support adobe pass auth(closes #12006) 2017-02-08 16:23:42 +01:00
013877298d release 2017.02.07 2017-02-07 02:04:50 +07:00
c87f95f991 [ChangeLog] Actualize 2017-02-07 01:58:57 +07:00
f28aeff264 [pornhub] Fix extraction (closes #11997) 2017-02-07 01:52:59 +07:00
242a14a1f6 [extractor/common] Fix audio only with audio group in m3u8 (closes #11995) 2017-02-07 00:22:16 +07:00
d5d904ff7d [canalplus] Add support for cstar.fr (#11990) 2017-02-06 23:53:42 +07:00
5620f840f6 [extractor/generic] Add test for #11993 and more metadata for rtmp 2017-02-06 23:31:58 +07:00
b7a8c1bcfa [extractor/generic] Improve rtmp support (closes #11993) 2017-02-06 23:23:40 +07:00
7097bffba6 [downloader/fragment] Respect --no-part 2017-02-06 23:07:59 +07:00
2aec7256ae [extractor/common] Speed-up media tags regex (closes #11979) 2017-02-06 00:20:30 +07:00
815482d4eb Credit @motophil for gaskrank.py (#11685) 2017-02-06 00:38:22 +08:00
9c14fe9681 [gaskrank] Minor change and update ChangeLog after #11685 2017-02-06 00:25:28 +08:00
e705755739 [gaskrank] Add new extractor (#11685)
* [gaskrank] Add new extractor

* [gaskrank] Add new extractor - fixes as requested

* [gaskrank] Add new extractor - style fix

* [Gaskrank] Add new extractor - requested fixes

* [Gaskrank] Add new extractor - fix md5 checksum

* [gaskrank] Add new extractor - more requested fixes

* [Gaskrank] Add new extractor - fixed all but one quantified code issues

* [Gaskrank] add new extractor - more fields extracted, added second test

* [Gaskrank] Add new extractor - requested fixes.

* [Gaskrank] Add new extractor - requested changes.

* [Gaskrank] Add new extractor - final(?) fixes.
2017-02-06 00:19:37 +08:00
019f4c0371 [bandcamp] Fix extraction for incomplete albums
Closes #11727
2017-02-05 22:47:04 +08:00
2ab2c0d1f5 [iwara] Add width (closes #11724)
The heuristic is from #11724
2017-02-05 22:30:13 +08:00
caf0f5f8b7 [iwara] Fix extraction (closes #11781) 2017-02-05 21:48:13 +08:00
e4e50f60b1 [googledrive] Fix extraction on Python 3.6
Since Python 3.6, invalid escape sequences are deprecated. It's likely
that there are invalid escape sequences somewhere on the webpage, so
instead of unescaping the whole webpage, just unescape the URL.

See https://bugs.python.org/issue27364. That change was designed for
string literals, while it affects the 'unicode_escape' encoding as well.
The code path is:

str.decode('unicode_escape')
    codecs.unicode_escape_decode()
        PyUnicode_DecodeUnicodeEscape()
2017-02-05 21:41:08 +08:00
6ef3e65a7b [videopress] Add extractor 2017-02-05 13:37:27 +07:00
6fd138bed8 [sportbox] PEP 8 2017-02-05 13:36:52 +07:00
49bd8d5e2e [travis] Add python 3.6 2017-02-05 02:41:22 +07:00
3d2c2752c5 [afreecatv] extract rtmp formats 2017-02-04 18:18:28 +01:00
a713a86755 release 2017.02.04.1 2017-02-04 23:26:39 +07:00
7bccd5fc8a [ChangeLog] Actualize 2017-02-04 23:23:38 +07:00
3144eccf55 [ChangeLog] Actualize 2017-02-04 23:22:28 +07:00
9db8f6c540 [twitch:stream] Improve _VALID_URL (closes #11971) 2017-02-04 23:21:07 +07:00
8e4041cf3f [radiocanada] fix extraction for toutv rtmp formats 2017-02-04 17:05:35 +01:00
40 changed files with 1102 additions and 186 deletions

View File

@ -6,8 +6,8 @@
--- ---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.04*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. ### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.11*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.04** - [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.11**
### Before submitting an *issue* make sure you have: ### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
[debug] User config: [] [debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2017.02.04 [debug] youtube-dl version 2017.02.11
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {} [debug] Proxy map: {}

View File

@ -6,6 +6,7 @@ python:
- "3.3" - "3.3"
- "3.4" - "3.4"
- "3.5" - "3.5"
- "3.6"
sudo: false sudo: false
script: nosetests test --verbose script: nosetests test --verbose
notifications: notifications:

View File

@ -201,3 +201,4 @@ Stephen Chen
Fabian Stahl Fabian Stahl
Bagira Bagira
Odd Stråbø Odd Stråbø
Philip Herzog

View File

@ -1,12 +1,91 @@
version 2017.02.11
Core
+ [utils] Introduce get_elements_by_class and get_elements_by_attribute
utility functions
+ [extractor/common] Skip m3u8 manifests protected with Adobe Flash Access
Extractor
* [pluralsight:course] Fix extraction (#12075)
+ [bbc] Extract m3u8 formats with 320k audio
* [facebook] Relax video id matching (#11017, #12055, #12056)
+ [corus] Add support for Corus Entertainment sites (#12060, #9164)
+ [pluralsight] Detect blocked account error message (#12070)
+ [bloomberg] Add another video id pattern (#12062)
* [extractor/commonmistakes] Restrict URL regular expression (#12050)
+ [tvplayer] Add support for tvplayer.com
version 2017.02.10
Extractors
* [xtube] Fix extraction (#12023)
* [pornhub] Fix extraction (#12007, #12018)
* [facebook] Improve JS data regular expression (#12042)
* [kaltura] Improve embed partner id extraction (#12041)
+ [sprout] Add support for sproutonline.com
* [6play] Improve extraction
+ [scrippsnetworks:watch] Add support for Scripps Networks sites (#10765)
+ [go] Add support for Adobe Pass authentication (#11468, #10831)
* [6play] Fix extraction (#12011)
+ [nbc] Add support for Adobe Pass authentication (#12006)
version 2017.02.07
Core
* [extractor/common] Fix audio only with audio group in m3u8 (#11995)
+ [downloader/fragment] Respect --no-part
* [extractor/common] Speed-up HTML5 media entries extraction (#11979)
Extractors
* [pornhub] Fix extraction (#11997)
+ [canalplus] Add support for cstar.fr (#11990)
+ [extractor/generic] Improve RTMP support (#11993)
+ [gaskrank] Add support for gaskrank.tv (#11685)
* [bandcamp] Fix extraction for incomplete albums (#11727)
* [iwara] Fix extraction (#11781)
* [googledrive] Fix extraction on Python 3.6
+ [videopress] Add support for videopress.com
+ [afreecatv] Extract RTMP formats
version 2017.02.04.1
Extractors
+ [twitch:stream] Add support for player.twitch.tv (#11971)
* [radiocanada] Fix extraction for toutv rtmp formats
version 2017.02.04 version 2017.02.04
Core Core
+ Add --playlist-random to shuffle playlists (#11889, #11901) + Add --playlist-random to shuffle playlists (#11889, #11901)
* [utils] Improve comments processing in js_to_json (#11947)
* [utils] Handle single-line comments in js_to_json
* [downloader/external:ffmpeg] Minimize the use of aac_adtstoasc filter
Extractors Extractors
+ [piksel] Add another app token pattern (#11969)
+ [vk] Capture and output author blocked error message (#11965)
+ [turner] Fix secure HLS formats downloading with ffmpeg (#11358, #11373,
#11800)
+ [drtv] Add support for live and radio sections (#1827, #3427)
* [myspace] Fix extraction and extract HLS and HTTP formats
+ [youtube] Add format info for itag 325 and 328
* [vine] Fix extraction (#11955)
- [sportbox] Remove extractor (#11954)
+ [filmon] Add support for filmon.com (#11187)
+ [infoq] Add audio only formats (#11565) + [infoq] Add audio only formats (#11565)
* [douyutv] Improve room id regular expression (#11931)
* [iprima] Fix extraction (#11920, #11896)
* [youtube] Fix ytsearch when cookies are provided (#11924) * [youtube] Fix ytsearch when cookies are provided (#11924)
* [go] Relax video id regular expression (#11937)
* [facebook] Fix title extraction (#11941)
+ [youtube:playlist] Recognize TL playlists (#11945)
+ [bilibili] Support new Bangumi URLs (#11845) + [bilibili] Support new Bangumi URLs (#11845)
+ [cbc:watch] Extract audio codec for audio only formats (#11893)
+ [elpais] Fix extraction for some URLs (#11765)
version 2017.02.01 version 2017.02.01
@ -18,7 +97,6 @@ Extractors
+ [vimeo] Extract upload timestamp + [vimeo] Extract upload timestamp
+ [vimeo] Extract license (#8726, #11880) + [vimeo] Extract license (#8726, #11880)
+ [nrk:series] Add support for series (#11571, #11711) + [nrk:series] Add support for series (#11571, #11711)
+ [elpais] Fix extraction for some URLs (#11765)
version 2017.01.31 version 2017.01.31

View File

@ -11,6 +11,7 @@
- **4tube** - **4tube**
- **56.com** - **56.com**
- **5min** - **5min**
- **6play**
- **8tracks** - **8tracks**
- **91porn** - **91porn**
- **9c9media** - **9c9media**
@ -168,6 +169,7 @@
- **ComedyCentralShortname** - **ComedyCentralShortname**
- **ComedyCentralTV** - **ComedyCentralTV**
- **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED
- **Corus**
- **Coub** - **Coub**
- **Cracked** - **Cracked**
- **Crackle** - **Crackle**
@ -282,6 +284,7 @@
- **Gamersyde** - **Gamersyde**
- **GameSpot** - **GameSpot**
- **GameStar** - **GameStar**
- **Gaskrank**
- **Gazeta** - **Gazeta**
- **GDCVault** - **GDCVault**
- **generic**: Generic downloader that works on some sites - **generic**: Generic downloader that works on some sites
@ -307,7 +310,6 @@
- **HellPorno** - **HellPorno**
- **Helsinki**: helsinki.fi - **Helsinki**: helsinki.fi
- **HentaiStigma** - **HentaiStigma**
- **HGTV**
- **hgtv.com:show** - **hgtv.com:show**
- **HistoricFilms** - **HistoricFilms**
- **history:topic**: History.com Topic - **history:topic**: History.com Topic
@ -666,6 +668,7 @@
- **screen.yahoo:search**: Yahoo screen search - **screen.yahoo:search**: Yahoo screen search
- **Screencast** - **Screencast**
- **ScreencastOMatic** - **ScreencastOMatic**
- **scrippsnetworks:watch**
- **Seeker** - **Seeker**
- **SenateISVP** - **SenateISVP**
- **SendtoNews** - **SendtoNews**
@ -675,7 +678,6 @@
- **Shared**: shared.sx - **Shared**: shared.sx
- **ShowRoomLive** - **ShowRoomLive**
- **Sina** - **Sina**
- **SixPlay**
- **skynewsarabia:article** - **skynewsarabia:article**
- **skynewsarabia:video** - **skynewsarabia:video**
- **SkySports** - **SkySports**
@ -710,6 +712,7 @@
- **SportBoxEmbed** - **SportBoxEmbed**
- **SportDeutschland** - **SportDeutschland**
- **Sportschau** - **Sportschau**
- **Sprout**
- **sr:mediathek**: Saarländischer Rundfunk - **sr:mediathek**: Saarländischer Rundfunk
- **SRGSSR** - **SRGSSR**
- **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites
@ -803,6 +806,7 @@
- **tvp**: Telewizja Polska - **tvp**: Telewizja Polska
- **tvp:embed**: Telewizja Polska - **tvp:embed**: Telewizja Polska
- **tvp:series** - **tvp:series**
- **TVPlayer**
- **Tweakers** - **Tweakers**
- **twitch:chapter** - **twitch:chapter**
- **twitch:clips** - **twitch:clips**
@ -859,6 +863,7 @@
- **videomore:season** - **videomore:season**
- **videomore:video** - **videomore:video**
- **VideoPremium** - **VideoPremium**
- **VideoPress**
- **videoweed**: VideoWeed - **videoweed**: VideoWeed
- **Vidio** - **Vidio**
- **vidme** - **vidme**

View File

@ -34,6 +34,9 @@ from youtube_dl.utils import (
find_xpath_attr, find_xpath_attr,
fix_xml_ampersands, fix_xml_ampersands,
get_element_by_class, get_element_by_class,
get_element_by_attribute,
get_elements_by_class,
get_elements_by_attribute,
InAdvancePagedList, InAdvancePagedList,
intlist_to_bytes, intlist_to_bytes,
is_html, is_html,
@ -1124,6 +1127,32 @@ The first line
self.assertEqual(get_element_by_class('foo', html), 'nice') self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None) self.assertEqual(get_element_by_class('no-such-class', html), None)
def test_get_element_by_attribute(self):
html = '''
<span class="foo bar">nice</span>
'''
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
def test_get_elements_by_class(self):
html = '''
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
'''
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('no-such-class', html), [])
def test_get_elements_by_attribute(self):
html = '''
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
'''
self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -275,7 +275,7 @@ class FFmpegFD(ExternalFD):
args += ['-f', 'mpegts'] args += ['-f', 'mpegts']
else: else:
args += ['-f', 'mp4'] args += ['-f', 'mp4']
if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')):
args += ['-bsf:a', 'aac_adtstoasc'] args += ['-bsf:a', 'aac_adtstoasc']
elif protocol == 'rtmp': elif protocol == 'rtmp':
args += ['-f', 'flv'] args += ['-f', 'flv']

View File

@ -61,6 +61,7 @@ class FragmentFD(FileDownloader):
'noprogress': True, 'noprogress': True,
'ratelimit': self.params.get('ratelimit'), 'ratelimit': self.params.get('ratelimit'),
'retries': self.params.get('retries', 0), 'retries': self.params.get('retries', 0),
'nopart': self.params.get('nopart', False),
'test': self.params.get('test', False), 'test': self.params.get('test', False),
} }
) )

View File

@ -221,10 +221,23 @@ class AfreecaTVGlobalIE(AfreecaTVIE):
s_url = s.get('purl') s_url = s.get('purl')
if not s_url: if not s_url:
continue continue
# TODO: extract rtmp formats stype = s.get('stype')
if s.get('stype') == 'HLS': if stype == 'HLS':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
s_url, channel_id, 'mp4', fatal=False)) s_url, channel_id, 'mp4', m3u8_id=stype, fatal=False))
elif stype == 'RTMP':
format_id = [stype]
label = s.get('label')
if label:
format_id.append(label)
formats.append({
'format_id': '-'.join(format_id),
'url': s_url,
'tbr': int_or_none(s.get('bps')),
'height': int_or_none(s.get('brt')),
'ext': 'flv',
'rtmp_live': True,
})
self._sort_formats(formats) self._sort_formats(formats)
info.update({ info.update({

View File

@ -209,6 +209,15 @@ class BandcampAlbumIE(InfoExtractor):
'id': 'entropy-ep', 'id': 'entropy-ep',
}, },
'playlist_mincount': 3, 'playlist_mincount': 3,
}, {
# not all tracks have songs
'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
'info_dict': {
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
},
'playlist_count': 2,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -217,12 +226,16 @@ class BandcampAlbumIE(InfoExtractor):
album_id = mobj.group('album_id') album_id = mobj.group('album_id')
playlist_id = album_id or uploader_id playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id) webpage = self._download_webpage(url, playlist_id)
tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage) track_elements = re.findall(
if not tracks_paths: r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
if not track_elements:
raise ExtractorError('The page doesn\'t contain any tracks') raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
entries = [ entries = [
self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
for t_path in tracks_paths] for elem_content, t_path in track_elements
if self._html_search_meta('duration', elem_content, default=None)]
title = self._html_search_regex( title = self._html_search_regex(
r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
webpage, 'title', fatal=False) webpage, 'title', fatal=False)

View File

@ -225,6 +225,8 @@ class BBCCoUkIE(InfoExtractor):
} }
] ]
_USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
class MediaSelectionError(Exception): class MediaSelectionError(Exception):
def __init__(self, id): def __init__(self, id):
self.id = id self.id = id
@ -336,6 +338,15 @@ class BBCCoUkIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
href, programme_id, ext='mp4', entry_protocol='m3u8_native', href, programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)) m3u8_id=format_id, fatal=False))
if re.search(self._USP_RE, href):
usp_formats = self._extract_m3u8_formats(
re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
for f in usp_formats:
if f.get('height') and f['height'] > 720:
continue
formats.append(f)
elif transfer_format == 'hds': elif transfer_format == 'hds':
formats.extend(self._extract_f4m_formats( formats.extend(self._extract_f4m_formats(
href, programme_id, f4m_id=format_id, fatal=False)) href, programme_id, f4m_id=format_id, fatal=False))

View File

@ -33,6 +33,10 @@ class BloombergIE(InfoExtractor):
'params': { 'params': {
'format': 'best[format_id^=hds]', 'format': 'best[format_id^=hds]',
}, },
}, {
# data-bmmrid=
'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money',
'only_matching': True,
}, { }, {
'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
'only_matching': True, 'only_matching': True,
@ -45,9 +49,10 @@ class BloombergIE(InfoExtractor):
name = self._match_id(url) name = self._match_id(url)
webpage = self._download_webpage(url, name) webpage = self._download_webpage(url, name)
video_id = self._search_regex( video_id = self._search_regex(
(r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
r'videoId\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1'), r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'id', group='url', default=None) r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'),
webpage, 'id', group='id', default=None)
if not video_id: if not video_id:
bplayer_data = self._parse_json(self._search_regex( bplayer_data = self._parse_json(self._search_regex(
r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name)

View File

@ -27,6 +27,7 @@ class CanalplusIE(InfoExtractor):
(?:www\.)?d8\.tv| (?:www\.)?d8\.tv|
(?:www\.)?c8\.fr| (?:www\.)?c8\.fr|
(?:www\.)?d17\.tv| (?:www\.)?d17\.tv|
(?:(?:football|www)\.)?cstar\.fr|
(?:www\.)?itele\.fr (?:www\.)?itele\.fr
)/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?| )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?|
player\.canalplus\.fr/#/(?P<id>\d+) player\.canalplus\.fr/#/(?P<id>\d+)
@ -40,6 +41,7 @@ class CanalplusIE(InfoExtractor):
'd8': 'd8', 'd8': 'd8',
'c8': 'd8', 'c8': 'd8',
'd17': 'd17', 'd17': 'd17',
'cstar': 'd17',
'itele': 'itele', 'itele': 'itele',
} }
@ -86,6 +88,19 @@ class CanalplusIE(InfoExtractor):
'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.', 'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.',
'upload_date': '20161014', 'upload_date': '20161014',
}, },
}, {
'url': 'http://football.cstar.fr/cstar-minisite-foot/pid7566-feminines-videos.html?vid=1416769',
'info_dict': {
'id': '1416769',
'display_id': 'pid7566-feminines-videos',
'ext': 'mp4',
'title': 'France - Albanie : les temps forts de la soirée - 20/09/2016',
'description': 'md5:c3f30f2aaac294c1c969b3294de6904e',
'upload_date': '20160921',
},
'params': {
'skip_download': True,
},
}, { }, {
'url': 'http://m.canalplus.fr/?vid=1398231', 'url': 'http://m.canalplus.fr/?vid=1398231',
'only_matching': True, 'only_matching': True,

View File

@ -1208,6 +1208,9 @@ class InfoExtractor(object):
m3u8_doc, urlh = res m3u8_doc, urlh = res
m3u8_url = urlh.geturl() m3u8_url = urlh.geturl()
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
format_url = lambda u: ( format_url = lambda u: (
@ -1315,8 +1318,8 @@ class InfoExtractor(object):
'abr': abr, 'abr': abr,
}) })
f.update(parse_codecs(last_info.get('CODECS'))) f.update(parse_codecs(last_info.get('CODECS')))
if audio_in_video_stream.get(last_info.get('AUDIO')) is False: if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
# TODO: update acodec for for audio only formats with the same GROUP-ID # TODO: update acodec for audio only formats with the same GROUP-ID
f['acodec'] = 'none' f['acodec'] = 'none'
formats.append(f) formats.append(f)
last_info = {} last_info = {}
@ -1959,7 +1962,12 @@ class InfoExtractor(object):
media_tags = [(media_tag, media_type, '') media_tags = [(media_tag, media_type, '')
for media_tag, media_type for media_tag, media_type
in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage)) media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see
# https://github.com/rg3/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml).
r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
for media_tag, media_type, media_content in media_tags: for media_tag, media_type, media_content in media_tags:
media_info = { media_info = {
'formats': [], 'formats': [],

View File

@ -7,7 +7,7 @@ from ..utils import ExtractorError
class CommonMistakesIE(InfoExtractor): class CommonMistakesIE(InfoExtractor):
IE_DESC = False # Do not list IE_DESC = False # Do not list
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?:url|URL) (?:url|URL)$
''' '''
_TESTS = [{ _TESTS = [{

View File

@ -0,0 +1,72 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .theplatform import ThePlatformFeedIE
from ..utils import int_or_none
class CorusIE(ThePlatformFeedIE):
_VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:globaltv|etcanada)\.com|(?:hgtv|foodnetwork|slice)\.ca)/(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=))(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/',
'md5': '05dcbca777bf1e58c2acbb57168ad3a6',
'info_dict': {
'id': '870923331648',
'ext': 'mp4',
'title': 'Movie Night Popcorn with Bryan',
'description': 'Bryan whips up homemade popcorn, the old fashion way for Jojo and Lincoln.',
'uploader': 'SHWM-NEW',
'upload_date': '20170206',
'timestamp': 1486392197,
},
}, {
'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753',
'only_matching': True,
}, {
'url': 'http://etcanada.com/video/873675331955/meet-the-survivor-game-changers-castaways-part-2/',
'only_matching': True,
}]
_TP_FEEDS = {
'globaltv': {
'feed_id': 'ChQqrem0lNUp',
'account_id': 2269680845,
},
'etcanada': {
'feed_id': 'ChQqrem0lNUp',
'account_id': 2269680845,
},
'hgtv': {
'feed_id': 'L0BMHXi2no43',
'account_id': 2414428465,
},
'foodnetwork': {
'feed_id': 'ukK8o58zbRmJ',
'account_id': 2414429569,
},
'slice': {
'feed_id': '5tUJLgV2YNJ5',
'account_id': 2414427935,
},
}
def _real_extract(self, url):
domain, video_id = re.match(self._VALID_URL, url).groups()
feed_info = self._TP_FEEDS[domain.split('.')[0]]
return self._extract_feed_info('dtjsEC', feed_info['feed_id'], 'byId=' + video_id, video_id, lambda e: {
'episode_number': int_or_none(e.get('pl1$episode')),
'season_number': int_or_none(e.get('pl1$season')),
'series': e.get('pl1$show'),
}, {
'HLS': {
'manifest': 'm3u',
},
'DesktopHLS Default': {
'manifest': 'm3u',
},
'MP4 MBR': {
'manifest': 'm3u',
},
}, feed_info['account_id'])

View File

@ -202,6 +202,7 @@ from .commonprotocols import (
RtmpIE, RtmpIE,
) )
from .condenast import CondeNastIE from .condenast import CondeNastIE
from .corus import CorusIE
from .cracked import CrackedIE from .cracked import CrackedIE
from .crackle import CrackleIE from .crackle import CrackleIE
from .criterion import CriterionIE from .criterion import CriterionIE
@ -349,6 +350,7 @@ from .gameone import (
from .gamersyde import GamersydeIE from .gamersyde import GamersydeIE
from .gamespot import GameSpotIE from .gamespot import GameSpotIE
from .gamestar import GameStarIE from .gamestar import GameStarIE
from .gaskrank import GaskrankIE
from .gazeta import GazetaIE from .gazeta import GazetaIE
from .gdcvault import GDCVaultIE from .gdcvault import GDCVaultIE
from .generic import GenericIE from .generic import GenericIE
@ -380,10 +382,7 @@ from .heise import HeiseIE
from .hellporno import HellPornoIE from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE from .hentaistigma import HentaiStigmaIE
from .hgtv import ( from .hgtv import HGTVComShowIE
HGTVIE,
HGTVComShowIE,
)
from .historicfilms import HistoricFilmsIE from .historicfilms import HistoricFilmsIE
from .hitbox import HitboxIE, HitboxLiveIE from .hitbox import HitboxIE, HitboxLiveIE
from .hitrecord import HitRecordIE from .hitrecord import HitRecordIE
@ -837,6 +836,7 @@ from .sbs import SBSIE
from .scivee import SciVeeIE from .scivee import SciVeeIE
from .screencast import ScreencastIE from .screencast import ScreencastIE
from .screencastomatic import ScreencastOMaticIE from .screencastomatic import ScreencastOMaticIE
from .scrippsnetworks import ScrippsNetworksWatchIE
from .seeker import SeekerIE from .seeker import SeekerIE
from .senateisvp import SenateISVPIE from .senateisvp import SenateISVPIE
from .sendtonews import SendtoNewsIE from .sendtonews import SendtoNewsIE
@ -894,6 +894,7 @@ from .sport5 import Sport5IE
from .sportbox import SportBoxEmbedIE from .sportbox import SportBoxEmbedIE
from .sportdeutschland import SportDeutschlandIE from .sportdeutschland import SportDeutschlandIE
from .sportschau import SportschauIE from .sportschau import SportschauIE
from .sprout import SproutIE
from .srgssr import ( from .srgssr import (
SRGSSRIE, SRGSSRIE,
SRGSSRPlayIE, SRGSSRPlayIE,
@ -1016,6 +1017,7 @@ from .tvplay import (
TVPlayIE, TVPlayIE,
ViafreeIE, ViafreeIE,
) )
from .tvplayer import TVPlayerIE
from .tweakers import TweakersIE from .tweakers import TweakersIE
from .twentyfourvideo import TwentyFourVideoIE from .twentyfourvideo import TwentyFourVideoIE
from .twentymin import TwentyMinutenIE from .twentymin import TwentyMinutenIE
@ -1095,6 +1097,7 @@ from .videomore import (
VideomoreSeasonIE, VideomoreSeasonIE,
) )
from .videopremium import VideoPremiumIE from .videopremium import VideoPremiumIE
from .videopress import VideoPressIE
from .vidio import VidioIE from .vidio import VidioIE
from .vidme import ( from .vidme import (
VidmeIE, VidmeIE,

View File

@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
@ -134,6 +135,46 @@ class FacebookIE(InfoExtractor):
'upload_date': '20161030', 'upload_date': '20161030',
'uploader': 'CNN', 'uploader': 'CNN',
}, },
}, {
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
'info_dict': {
'id': '1417995061575415',
'ext': 'mp4',
'title': 'md5:a7b86ca673f51800cd54687b7f4012fe',
'timestamp': 1486648217,
'upload_date': '20170209',
'uploader': 'Yaroslav Korpan',
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471',
'info_dict': {
'id': '1072691702860471',
'ext': 'mp4',
'title': 'md5:ae2d22a93fbb12dad20dc393a869739d',
'timestamp': 1477305000,
'upload_date': '20161024',
'uploader': 'La Guía Del Varón',
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
'info_dict': {
'id': '1396382447100162',
'ext': 'mp4',
'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3',
'timestamp': 1486035494,
'upload_date': '20170202',
'uploader': 'Elisabeth Ahtn',
},
'params': {
'skip_download': True,
},
}, { }, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True, 'only_matching': True,
@ -249,7 +290,7 @@ class FacebookIE(InfoExtractor):
for item in instances: for item in instances:
if item[1][0] == 'VideoConfig': if item[1][0] == 'VideoConfig':
video_item = item[2][0] video_item = item[2][0]
if video_item.get('video_id') == video_id: if video_item.get('video_id'):
return video_item['videoData'] return video_item['videoData']
server_js_data = self._parse_json(self._search_regex( server_js_data = self._parse_json(self._search_regex(
@ -262,7 +303,7 @@ class FacebookIE(InfoExtractor):
if not video_data: if not video_data:
server_js_data = self._parse_json( server_js_data = self._parse_json(
self._search_regex( self._search_regex(
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+stream_pagelet', r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall)',
webpage, 'js data', default='{}'), webpage, 'js data', default='{}'),
video_id, transform_source=js_to_json, fatal=False) video_id, transform_source=js_to_json, fatal=False)
if server_js_data: if server_js_data:

View File

@ -0,0 +1,123 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
js_to_json,
unified_strdate,
)
class GaskrankIE(InfoExtractor):
"""InfoExtractor for gaskrank.tv"""
_VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.html?'
_TESTS = [
{
'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm',
'md5': '1ae88dbac97887d85ebd1157a95fc4f9',
'info_dict': {
'id': '201601/26955',
'ext': 'mp4',
'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*',
'thumbnail': r're:^https?://.*\.jpg$',
'categories': ['motorrad-fun'],
'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden',
'uploader_id': 'Bikefun',
'upload_date': '20170110',
'uploader_url': None,
}
},
{
'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm',
'md5': 'c33ee32c711bc6c8224bfcbe62b23095',
'info_dict': {
'id': '201106/15920',
'ext': 'mp4',
'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken',
'thumbnail': r're:^https?://.*\.jpg$',
'categories': ['racing'],
'display_id': 'isle-of-man-tt-2011-michael-du-15920',
'uploader_id': 'IOM',
'upload_date': '20160506',
'uploader_url': 'www.iomtt.com',
}
}
]
def _real_extract(self, url):
"""extract information from gaskrank.tv"""
def fix_json(code):
"""Removes trailing comma in json: {{},} --> {{}}"""
return re.sub(r',\s*}', r'}', js_to_json(code))
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
categories = [re.match(self._VALID_URL, url).group('categories')]
title = self._search_regex(
r'movieName\s*:\s*\'([^\']*)\'',
webpage, 'title')
thumbnail = self._search_regex(
r'poster\s*:\s*\'([^\']*)\'',
webpage, 'thumbnail', default=None)
mobj = re.search(
r'Video von:\s*(?P<uploader_id>[^|]*?)\s*\|\s*vom:\s*(?P<upload_date>[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])',
webpage)
if mobj is not None:
uploader_id = mobj.groupdict().get('uploader_id')
upload_date = unified_strdate(mobj.groupdict().get('upload_date'))
uploader_url = self._search_regex(
r'Homepage:\s*<[^>]*>(?P<uploader_url>[^<]*)',
webpage, 'uploader_url', default=None)
tags = re.findall(
r'/tv/tags/[^/]+/"\s*>(?P<tag>[^<]*?)<',
webpage)
view_count = self._search_regex(
r'class\s*=\s*"gkRight"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P<view_count>[0-9\.]*)',
webpage, 'view_count', default=None)
if view_count:
view_count = int_or_none(view_count.replace('.', ''))
average_rating = self._search_regex(
r'itemprop\s*=\s*"ratingValue"[^>]*>\s*(?P<average_rating>[0-9,]+)',
webpage, 'average_rating')
if average_rating:
average_rating = float_or_none(average_rating.replace(',', '.'))
playlist = self._parse_json(
self._search_regex(
r'playlist\s*:\s*\[([^\]]*)\]',
webpage, 'playlist', default='{}'),
display_id, transform_source=fix_json, fatal=False)
video_id = self._search_regex(
r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4',
playlist.get('0').get('src'), 'video id')
formats = []
for key in playlist:
formats.append({
'url': playlist[key]['src'],
'format_id': key,
'quality': playlist[key].get('quality')})
self._sort_formats(formats, field_preference=['format_id'])
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
'categories': categories,
'display_id': display_id,
'uploader_id': uploader_id,
'upload_date': upload_date,
'uploader_url': uploader_url,
'tags': tags,
'view_count': view_count,
'average_rating': average_rating,
}

View File

@ -29,6 +29,7 @@ from ..utils import (
UnsupportedError, UnsupportedError,
xpath_text, xpath_text,
) )
from .commonprotocols import RtmpIE
from .brightcove import ( from .brightcove import (
BrightcoveLegacyIE, BrightcoveLegacyIE,
BrightcoveNewIE, BrightcoveNewIE,
@ -81,6 +82,7 @@ from .videa import VideaIE
from .twentymin import TwentyMinutenIE from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE from .ustream import UstreamIE
from .openload import OpenloadIE from .openload import OpenloadIE
from .videopress import VideoPressIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -946,6 +948,19 @@ class GenericIE(InfoExtractor):
'title': 'Webinar: Using Discovery, The National Archives online catalogue', 'title': 'Webinar: Using Discovery, The National Archives online catalogue',
}, },
}, },
# jwplayer rtmp
{
'url': 'http://www.suffolk.edu/sjc/',
'info_dict': {
'id': 'sjclive',
'ext': 'flv',
'title': 'Massachusetts Supreme Judicial Court Oral Arguments',
'uploader': 'www.suffolk.edu',
},
'params': {
'skip_download': True,
}
},
# rtl.nl embed # rtl.nl embed
{ {
'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
@ -1473,6 +1488,21 @@ class GenericIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
'add_ie': [TwentyMinutenIE.ie_key()], 'add_ie': [TwentyMinutenIE.ie_key()],
},
{
# VideoPress embed
'url': 'https://en.support.wordpress.com/videopress/',
'info_dict': {
'id': 'OcobLTqC',
'ext': 'm4v',
'title': 'IMG_5786',
'timestamp': 1435711927,
'upload_date': '20150701',
},
'params': {
'skip_download': True,
},
'add_ie': [VideoPressIE.ie_key()],
} }
# { # {
# # TODO: find another test # # TODO: find another test
@ -2438,6 +2468,12 @@ class GenericIE(InfoExtractor):
return _playlist_from_matches( return _playlist_from_matches(
openload_urls, ie=OpenloadIE.ie_key()) openload_urls, ie=OpenloadIE.ie_key())
# Look for VideoPress embeds
videopress_urls = VideoPressIE._extract_urls(webpage)
if videopress_urls:
return _playlist_from_matches(
videopress_urls, ie=VideoPressIE.ie_key())
# Looking for http://schema.org/VideoObject # Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld( json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject') webpage, video_id, default={}, expected_type='VideoObject')
@ -2465,6 +2501,8 @@ class GenericIE(InfoExtractor):
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):
return True return True
if RtmpIE.suitable(vurl):
return True
vpath = compat_urlparse.urlparse(vurl).path vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath) vext = determine_ext(vpath)
return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js') return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js')
@ -2572,6 +2610,15 @@ class GenericIE(InfoExtractor):
'age_limit': age_limit, 'age_limit': age_limit,
} }
if RtmpIE.suitable(video_url):
entry_info_dict.update({
'_type': 'url_transparent',
'ie_key': RtmpIE.ie_key(),
'url': video_url,
})
entries.append(entry_info_dict)
continue
ext = determine_ext(video_url) ext = determine_ext(video_url)
if ext == 'smil': if ext == 'smil':
entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id) entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .adobepass import AdobePassIE
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
determine_ext, determine_ext,
@ -13,15 +13,30 @@ from ..utils import (
) )
class GoIE(InfoExtractor): class GoIE(AdobePassIE):
_BRANDS = { _SITE_INFO = {
'abc': '001', 'abc': {
'freeform': '002', 'brand': '001',
'watchdisneychannel': '004', 'requestor_id': 'ABC',
'watchdisneyjunior': '008', },
'watchdisneyxd': '009', 'freeform': {
'brand': '002',
'requestor_id': 'ABCFamily',
},
'watchdisneychannel': {
'brand': '004',
'requestor_id': 'Disney',
},
'watchdisneyjunior': {
'brand': '008',
'requestor_id': 'DisneyJunior',
},
'watchdisneyxd': {
'brand': '009',
'requestor_id': 'DisneyXD',
}
} }
_VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_BRANDS.keys()) _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_TESTS = [{ _TESTS = [{
'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
'info_dict': { 'info_dict': {
@ -47,7 +62,8 @@ class GoIE(InfoExtractor):
# There may be inner quotes, e.g. data-video-id="'VDKA3609139'" # There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
# from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id') r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id')
brand = self._BRANDS[sub_domain] site_info = self._SITE_INFO[sub_domain]
brand = site_info['brand']
video_data = self._download_json( video_data = self._download_json(
'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id),
video_id)['video'][0] video_id)['video'][0]
@ -63,14 +79,26 @@ class GoIE(InfoExtractor):
if ext == 'm3u8': if ext == 'm3u8':
video_type = video_data.get('type') video_type = video_data.get('type')
if video_type == 'lf': if video_type == 'lf':
data = {
'video_id': video_data['id'],
'video_type': video_type,
'brand': brand,
'device': '001',
}
if video_data.get('accesslevel') == '1':
requestor_id = site_info['requestor_id']
resource = self._get_mvpd_resource(
requestor_id, title, video_id, None)
auth = self._extract_mvpd_auth(
url, video_id, requestor_id, resource)
data.update({
'token': auth,
'token_type': 'ap',
'adobe_requestor_id': requestor_id,
})
entitlement = self._download_json( entitlement = self._download_json(
'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
video_id, data=urlencode_postdata({ video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers())
'video_id': video_data['id'],
'video_type': video_type,
'brand': brand,
'device': '001',
}))
errors = entitlement.get('errors', {}).get('errors', []) errors = entitlement.get('errors', {}).get('errors', [])
if errors: if errors:
error_message = ', '.join([error['message'] for error in errors]) error_message = ', '.join([error['message'] for error in errors])

View File

@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
lowercase_escape,
) )
@ -13,12 +14,12 @@ class GoogleDriveIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
_TESTS = [{ _TESTS = [{
'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
'md5': '881f7700aec4f538571fa1e0eed4a7b6', 'md5': 'd109872761f7e7ecf353fa108c0dbe1e',
'info_dict': { 'info_dict': {
'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Big Buck Bunny.mp4', 'title': 'Big Buck Bunny.mp4',
'duration': 46, 'duration': 45,
} }
}, { }, {
# video id is longer than 28 characters # video id is longer than 28 characters
@ -55,7 +56,7 @@ class GoogleDriveIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage( webpage = self._download_webpage(
'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') 'http://docs.google.com/file/d/%s' % video_id, video_id)
reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
if reason: if reason:
@ -74,7 +75,7 @@ class GoogleDriveIE(InfoExtractor):
resolution = fmt.split('/')[1] resolution = fmt.split('/')[1]
width, height = resolution.split('x') width, height = resolution.split('x')
formats.append({ formats.append({
'url': fmt_url, 'url': lowercase_escape(fmt_url),
'format_id': fmt_id, 'format_id': fmt_id,
'resolution': resolution, 'resolution': resolution,
'width': int_or_none(width), 'width': int_or_none(width),

View File

@ -2,50 +2,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
int_or_none,
js_to_json,
smuggle_url,
)
class HGTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?hgtv\.ca/[^/]+/video/(?P<id>[^/]+)/video.html'
_TEST = {
'url': 'http://www.hgtv.ca/homefree/video/overnight-success/video.html?v=738081859718&p=1&s=da#video',
'md5': '',
'info_dict': {
'id': 'aFH__I_5FBOX',
'ext': 'mp4',
'title': 'Overnight Success',
'description': 'After weeks of hard work, high stakes, breakdowns and pep talks, the final 2 contestants compete to win the ultimate dream.',
'uploader': 'SHWM-NEW',
'timestamp': 1470320034,
'upload_date': '20160804',
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
embed_vars = self._parse_json(self._search_regex(
r'(?s)embed_vars\s*=\s*({.*?});',
webpage, 'embed vars'), display_id, js_to_json)
return {
'_type': 'url_transparent',
'url': smuggle_url(
'http://link.theplatform.com/s/dtjsEC/%s?mbr=true&manifest=m3u' % embed_vars['pid'], {
'force_smil_url': True
}),
'series': embed_vars.get('show'),
'season_number': int_or_none(embed_vars.get('season')),
'episode_number': int_or_none(embed_vars.get('episode')),
'ie_key': 'ThePlatform',
}
class HGTVComShowIE(InfoExtractor): class HGTVComShowIE(InfoExtractor):

View File

@ -3,14 +3,18 @@ from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse from ..compat import compat_urllib_parse_urlparse
from ..utils import remove_end from ..utils import (
int_or_none,
mimetype2ext,
remove_end,
)
class IwaraIE(InfoExtractor): class IwaraIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)' _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD',
'md5': '1d53866b2c514b23ed69e4352fdc9839', # md5 is unstable
'info_dict': { 'info_dict': {
'id': 'amVwUl1EHpAD9RD', 'id': 'amVwUl1EHpAD9RD',
'ext': 'mp4', 'ext': 'mp4',
@ -23,17 +27,17 @@ class IwaraIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc',
'ext': 'mp4', 'ext': 'mp4',
'title': '[3D Hentai] Kyonyu Ã\x97 Genkai Ã\x97 Emaki Shinobi Girls.mp4', 'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4',
'age_limit': 18, 'age_limit': 18,
}, },
'add_ie': ['GoogleDrive'], 'add_ie': ['GoogleDrive'],
}, { }, {
'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq',
'md5': '1d85f1e5217d2791626cff5ec83bb189', # md5 is unstable
'info_dict': { 'info_dict': {
'id': '6liAP9s2Ojc', 'id': '6liAP9s2Ojc',
'ext': 'mp4', 'ext': 'mp4',
'age_limit': 0, 'age_limit': 18,
'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)',
'description': 'md5:590c12c0df1443d833fbebe05da8c47a', 'description': 'md5:590c12c0df1443d833fbebe05da8c47a',
'upload_date': '20160910', 'upload_date': '20160910',
@ -52,9 +56,9 @@ class IwaraIE(InfoExtractor):
# ecchi is 'sexy' in Japanese # ecchi is 'sexy' in Japanese
age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0
entries = self._parse_html5_media_entries(url, webpage, video_id) video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id)
if not entries: if not video_data:
iframe_url = self._html_search_regex( iframe_url = self._html_search_regex(
r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1',
webpage, 'iframe URL', group='url') webpage, 'iframe URL', group='url')
@ -67,11 +71,25 @@ class IwaraIE(InfoExtractor):
title = remove_end(self._html_search_regex( title = remove_end(self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara') r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
info_dict = entries[0] formats = []
info_dict.update({ for a_format in video_data:
format_id = a_format.get('resolution')
height = int_or_none(self._search_regex(
r'(\d+)p', format_id, 'height', default=None))
formats.append({
'url': a_format['uri'],
'format_id': format_id,
'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
'height': height,
'width': int_or_none(height / 9.0 * 16.0 if height else None),
'quality': 1 if format_id == 'Source' else 0,
})
self._sort_formats(formats)
return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'age_limit': age_limit, 'age_limit': age_limit,
}) 'formats': formats,
}
return info_dict

View File

@ -23,11 +23,11 @@ class KalturaIE(InfoExtractor):
(?: (?:
kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)| kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
https?:// https?://
(:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/ (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/
(?: (?:
(?: (?:
# flash player # flash player
index\.php/kwidget| index\.php/(?:kwidget|extwidget/preview)|
# html5 player # html5 player
html5/html5lib/[^/]+/mwEmbedFrame\.php html5/html5lib/[^/]+/mwEmbedFrame\.php
) )
@ -94,6 +94,14 @@ class KalturaIE(InfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
},
{
'url': 'https://www.kaltura.com/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto',
'only_matching': True,
},
{
'url': 'https://www.kaltura.com:443/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto',
'only_matching': True,
} }
] ]
@ -112,7 +120,7 @@ class KalturaIE(InfoExtractor):
re.search( re.search(
r'''(?xs) r'''(?xs)
(?P<q1>["\']) (?P<q1>["\'])
(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/(?:(?!(?P=q1)).)*(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)* (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)*
(?P=q1).*? (?P=q1).*?
(?: (?:
entry_?[Ii]d| entry_?[Ii]d|
@ -209,6 +217,8 @@ class KalturaIE(InfoExtractor):
partner_id = params['wid'][0][1:] partner_id = params['wid'][0][1:]
elif 'p' in params: elif 'p' in params:
partner_id = params['p'][0] partner_id = params['p'][0]
elif 'partner_id' in params:
partner_id = params['partner_id'][0]
else: else:
raise ExtractorError('Invalid URL', expected=True) raise ExtractorError('Invalid URL', expected=True)
if 'entry_id' in params: if 'entry_id' in params:

View File

@ -4,23 +4,26 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from .theplatform import ThePlatformIE from .theplatform import ThePlatformIE
from .adobepass import AdobePassIE
from ..compat import compat_urllib_parse_urlparse
from ..utils import ( from ..utils import (
find_xpath_attr, find_xpath_attr,
lowercase_escape, lowercase_escape,
smuggle_url, smuggle_url,
unescapeHTML, unescapeHTML,
update_url_query, update_url_query,
int_or_none,
) )
class NBCIE(InfoExtractor): class NBCIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
_TESTS = [ _TESTS = [
{ {
'url': 'http://www.nbc.com/the-tonight-show/segments/112966', 'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237',
'info_dict': { 'info_dict': {
'id': '112966', 'id': '2848237',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
@ -69,7 +72,7 @@ class NBCIE(InfoExtractor):
# HLS streams requires the 'hdnea3' cookie # HLS streams requires the 'hdnea3' cookie
'url': 'http://www.nbc.com/Kings/video/goliath/n1806', 'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
'info_dict': { 'info_dict': {
'id': 'n1806', 'id': '101528f5a9e8127b107e98c5e6ce4638',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Goliath', 'title': 'Goliath',
'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.', 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.',
@ -87,21 +90,57 @@ class NBCIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( info = {
[
r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',
r'"embedURL"\s*:\s*"([^"]+)"'
],
webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
if theplatform_url.startswith('//'):
theplatform_url = 'http:' + theplatform_url
return {
'_type': 'url_transparent', '_type': 'url_transparent',
'ie_key': 'ThePlatform', 'ie_key': 'ThePlatform',
'url': smuggle_url(theplatform_url, {'source_url': url}),
'id': video_id, 'id': video_id,
} }
video_data = None
preload = self._search_regex(
r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None)
if preload:
preload_data = self._parse_json(preload, video_id)
path = compat_urllib_parse_urlparse(url).path.rstrip('/')
entity_id = preload_data.get('xref', {}).get(path)
video_data = preload_data.get('entities', {}).get(entity_id)
if video_data:
query = {
'mbr': 'true',
'manifest': 'm3u',
}
video_id = video_data['guid']
title = video_data['title']
if video_data.get('entitlement') == 'auth':
resource = self._get_mvpd_resource(
'nbcentertainment', title, video_id,
video_data.get('vChipRating'))
query['auth'] = self._extract_mvpd_auth(
url, video_id, 'nbcentertainment', resource)
theplatform_url = smuggle_url(update_url_query(
'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
query), {'force_smil_url': True})
info.update({
'id': video_id,
'title': title,
'url': theplatform_url,
'description': video_data.get('description'),
'keywords': video_data.get('keywords'),
'season_number': int_or_none(video_data.get('seasonNumber')),
'episode_number': int_or_none(video_data.get('episodeNumber')),
'series': video_data.get('showName'),
})
else:
theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
[
r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',
r'"embedURL"\s*:\s*"([^"]+)"'
],
webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
if theplatform_url.startswith('//'):
theplatform_url = 'http:' + theplatform_url
info['url'] = smuggle_url(theplatform_url, {'source_url': url})
return info
class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsVPlayerIE(InfoExtractor):

View File

@ -18,6 +18,7 @@ from ..utils import (
parse_duration, parse_duration,
qualities, qualities,
srt_subtitles_timecode, srt_subtitles_timecode,
update_url_query,
urlencode_postdata, urlencode_postdata,
) )
@ -92,6 +93,10 @@ class PluralsightIE(PluralsightBaseIE):
raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to login: %s' % error, expected=True)
if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')):
BLOCKED = 'Your account has been blocked due to suspicious activity'
if BLOCKED in response:
raise ExtractorError(
'Unable to login: %s' % BLOCKED, expected=True)
raise ExtractorError('Unable to log in') raise ExtractorError('Unable to log in')
def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): def _get_subtitles(self, author, clip_id, lang, name, duration, video_id):
@ -327,25 +332,44 @@ class PluralsightCourseIE(PluralsightBaseIE):
# TODO: PSM cookie # TODO: PSM cookie
course = self._download_json( course = self._download_json(
'%s/data/course/%s' % (self._API_BASE, course_id), '%s/player/functions/rpc' % self._API_BASE, course_id,
course_id, 'Downloading course JSON') 'Downloading course JSON',
data=json.dumps({
'fn': 'bootstrapPlayer',
'payload': {
'courseId': course_id,
}
}).encode('utf-8'),
headers={
'Content-Type': 'application/json;charset=utf-8'
})['payload']['course']
title = course['title'] title = course['title']
course_name = course['name']
course_data = course['modules']
description = course.get('description') or course.get('shortDescription') description = course.get('description') or course.get('shortDescription')
course_data = self._download_json(
'%s/data/course/content/%s' % (self._API_BASE, course_id),
course_id, 'Downloading course data JSON')
entries = [] entries = []
for num, module in enumerate(course_data, 1): for num, module in enumerate(course_data, 1):
author = module.get('author')
module_name = module.get('name')
if not author or not module_name:
continue
for clip in module.get('clips', []): for clip in module.get('clips', []):
player_parameters = clip.get('playerParameters') clip_index = int_or_none(clip.get('index'))
if not player_parameters: if clip_index is None:
continue continue
clip_url = update_url_query(
'%s/player' % self._API_BASE, query={
'mode': 'live',
'course': course_name,
'author': author,
'name': module_name,
'clip': clip_index,
})
entries.append({ entries.append({
'_type': 'url_transparent', '_type': 'url_transparent',
'url': '%s/training/player?%s' % (self._API_BASE, player_parameters), 'url': clip_url,
'ie_key': PluralsightIE.ie_key(), 'ie_key': PluralsightIE.ie_key(),
'chapter': module.get('title'), 'chapter': module.get('title'),
'chapter_number': num, 'chapter_number': num,

View File

@ -156,7 +156,18 @@ class PornHubIE(InfoExtractor):
comment_count = self._extract_count( comment_count = self._extract_count(
r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage))) video_variables = {}
for video_variablename, quote, video_variable in re.findall(
r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage):
video_variables[video_variablename] = video_variable
video_urls = []
for encoded_video_url in re.findall(
r'player_quality_[0-9]{3,4}p\s*=(.+?);', webpage):
for varname, varval in video_variables.items():
encoded_video_url = encoded_video_url.replace(varname, varval)
video_urls.append(re.sub(r'[\s+]', '', encoded_video_url))
if webpage.find('"encrypted":true') != -1: if webpage.find('"encrypted":true') != -1:
password = compat_urllib_parse_unquote_plus( password = compat_urllib_parse_unquote_plus(
self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))

View File

@ -54,9 +54,8 @@ class RadioCanadaIE(InfoExtractor):
raise ExtractorError('This video is DRM protected.', expected=True) raise ExtractorError('This video is DRM protected.', expected=True)
device_types = ['ipad'] device_types = ['ipad']
if app_code != 'toutv':
device_types.append('flash')
if not smuggled_data: if not smuggled_data:
device_types.append('flash')
device_types.append('android') device_types.append('android')
formats = [] formats = []
@ -103,7 +102,7 @@ class RadioCanadaIE(InfoExtractor):
continue continue
f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url)
protocol = determine_protocol({'url': f_url}) protocol = determine_protocol({'url': f_url})
formats.append({ f = {
'format_id': '%s-%d' % (protocol, tbr), 'format_id': '%s-%d' % (protocol, tbr),
'url': f_url, 'url': f_url,
'ext': 'flv' if protocol == 'rtmp' else ext, 'ext': 'flv' if protocol == 'rtmp' else ext,
@ -111,7 +110,14 @@ class RadioCanadaIE(InfoExtractor):
'width': int_or_none(url_e.get('width')), 'width': int_or_none(url_e.get('width')),
'height': int_or_none(url_e.get('height')), 'height': int_or_none(url_e.get('height')),
'tbr': tbr, 'tbr': tbr,
}) }
mobj = re.match(r'(?P<url>rtmp://[^/]+/[^/]+)/(?P<playpath>[^?]+)(?P<auth>\?.+)', f_url)
if mobj:
f.update({
'url': mobj.group('url') + mobj.group('auth'),
'play_path': mobj.group('playpath'),
})
formats.append(f)
if protocol == 'rtsp': if protocol == 'rtsp':
base_url = self._search_regex( base_url = self._search_regex(
r'rtsp://([^?]+)', f_url, 'base url', default=None) r'rtsp://([^?]+)', f_url, 'base url', default=None)

View File

@ -0,0 +1,60 @@
# coding: utf-8
from __future__ import unicode_literals
from .adobepass import AdobePassIE
from ..utils import (
int_or_none,
smuggle_url,
update_url_query,
)
class ScrippsNetworksWatchIE(AdobePassIE):
IE_NAME = 'scrippsnetworks:watch'
_VALID_URL = r'https?://watch\.(?:hgtv|foodnetwork|travelchannel|diynetwork|cookingchanneltv)\.com/player\.[A-Z0-9]+\.html#(?P<id>\d+)'
_TEST = {
'url': 'http://watch.hgtv.com/player.HNT.html#0256538',
'md5': '26545fd676d939954c6808274bdb905a',
'info_dict': {
'id': '0256538',
'ext': 'mp4',
'title': 'Seeking a Wow House',
'description': 'Buyers retiring in Palm Springs, California, want a modern house with major wow factor. They\'re also looking for a pool and a large, open floorplan with tall windows looking out at the views.',
'uploader': 'SCNI',
'upload_date': '20170207',
'timestamp': 1486450493,
},
'skip': 'requires TV provider authentication',
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
channel = self._parse_json(self._search_regex(
r'"channels"\s*:\s*(\[.+\])',
webpage, 'channels'), video_id)[0]
video_data = next(v for v in channel['videos'] if v.get('nlvid') == video_id)
title = video_data['title']
release_url = video_data['releaseUrl']
if video_data.get('restricted'):
requestor_id = self._search_regex(
r'requestorId\s*=\s*"([^"]+)";', webpage, 'requestor id')
resource = self._get_mvpd_resource(
requestor_id, title, video_id,
video_data.get('ratings', [{}])[0].get('rating'))
auth = self._extract_mvpd_auth(
url, video_id, requestor_id, resource)
release_url = update_url_query(release_url, {'auth': auth})
return {
'_type': 'url_transparent',
'id': video_id,
'title': title,
'url': smuggle_url(release_url, {'force_smil_url': True}),
'description': video_data.get('description'),
'thumbnail': video_data.get('thumbnailUrl'),
'series': video_data.get('showTitle'),
'season_number': int_or_none(video_data.get('season')),
'episode_number': int_or_none(video_data.get('episodeNumber')),
'ie_key': 'ThePlatform',
}

View File

@ -1,64 +1,101 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
qualities,
int_or_none,
mimetype2ext,
determine_ext, determine_ext,
int_or_none,
try_get,
qualities,
) )
class SixPlayIE(InfoExtractor): class SixPlayIE(InfoExtractor):
IE_NAME = '6play'
_VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)' _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)'
_TEST = { _TEST = {
'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320', 'url': 'http://www.6play.fr/le-meilleur-patissier-p_1807/le-meilleur-patissier-special-fetes-mercredi-a-21-00-sur-m6-c_11638450',
'md5': '42310bffe4ba3982db112b9cd3467328', 'md5': '42310bffe4ba3982db112b9cd3467328',
'info_dict': { 'info_dict': {
'id': '11495320', 'id': '11638450',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Jamel et ses amis au Marrakech du rire 2015', 'title': 'Le Meilleur Pâtissier, spécial fêtes mercredi à 21:00 sur M6',
'description': 'md5:ba2149d5c321d5201b78070ee839d872', 'description': 'md5:308853f6a5f9e2d55a30fc0654de415f',
'duration': 39,
'series': 'Le meilleur pâtissier',
},
'params': {
'skip_download': True,
}, },
} }
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
clip_data = self._download_json(
'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id,
video_id)
video_data = clip_data['videoInfo']
data = self._download_json(
'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/6play/videos/clip_%s' % video_id,
video_id, query={
'csa': 5,
'with': 'clips',
})
clip_data = data['clips'][0]
title = clip_data['title']
urls = []
quality_key = qualities(['lq', 'sd', 'hq', 'hd']) quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
formats = [] formats = []
for source in clip_data['sources']: for asset in clip_data['assets']:
source_type, source_url = source.get('type'), source.get('src') asset_url = asset.get('full_physical_path')
if not source_url or source_type == 'hls/primetime': protocol = asset.get('protocol')
if not asset_url or protocol == 'primetime' or asset_url in urls:
continue continue
ext = mimetype2ext(source_type) or determine_ext(source_url) urls.append(asset_url)
if ext == 'm3u8': container = asset.get('video_container')
formats.extend(self._extract_m3u8_formats( ext = determine_ext(asset_url)
source_url, video_id, 'mp4', 'm3u8_native', if container == 'm3u8' or ext == 'm3u8':
m3u8_id='hls', fatal=False)) if protocol == 'usp':
formats.extend(self._extract_f4m_formats( asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url)
source_url.replace('.m3u8', '.f4m'), formats.extend(self._extract_m3u8_formats(
video_id, f4m_id='hds', fatal=False)) asset_url, video_id, 'mp4', 'm3u8_native',
elif ext == 'mp4': m3u8_id='hls', fatal=False))
quality = source.get('quality') formats.extend(self._extract_f4m_formats(
asset_url.replace('.m3u8', '.f4m'),
video_id, f4m_id='hds', fatal=False))
formats.extend(self._extract_mpd_formats(
asset_url.replace('.m3u8', '.mpd'),
video_id, mpd_id='dash', fatal=False))
formats.extend(self._extract_ism_formats(
re.sub(r'/[^/]+\.m3u8', '/Manifest', asset_url),
video_id, ism_id='mss', fatal=False))
else:
formats.extend(self._extract_m3u8_formats(
asset_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
elif container == 'mp4' or ext == 'mp4':
quality = asset.get('video_quality')
formats.append({ formats.append({
'url': source_url, 'url': asset_url,
'format_id': quality, 'format_id': quality,
'quality': quality_key(quality), 'quality': quality_key(quality),
'ext': ext, 'ext': ext,
}) })
self._sort_formats(formats) self._sort_formats(formats)
def get(getter):
for src in (data, clip_data):
v = try_get(src, getter, compat_str)
if v:
return v
return { return {
'id': video_id, 'id': video_id,
'title': video_data['title'].strip(), 'title': title,
'description': video_data.get('description'), 'description': get(lambda x: x['description']),
'duration': int_or_none(video_data.get('duration')), 'duration': int_or_none(clip_data.get('duration')),
'series': video_data.get('titlePgm'), 'series': get(lambda x: x['program']['title']),
'formats': formats, 'formats': formats,
} }

View File

@ -4,11 +4,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urlparse from ..utils import js_to_json
from ..utils import (
js_to_json,
unified_strdate,
)
class SportBoxEmbedIE(InfoExtractor): class SportBoxEmbedIE(InfoExtractor):

View File

@ -0,0 +1,52 @@
# coding: utf-8
from __future__ import unicode_literals
from .adobepass import AdobePassIE
from ..utils import (
extract_attributes,
update_url_query,
smuggle_url,
)
class SproutIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P<id>[^/?#]+)'
_TEST = {
'url': 'http://www.sproutonline.com/watch/cowboy-adventure',
'md5': '74bf14128578d1e040c3ebc82088f45f',
'info_dict': {
'id': '9dexnwtmh8_X',
'ext': 'mp4',
'title': 'A Cowboy Adventure',
'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.',
'timestamp': 1437758640,
'upload_date': '20150724',
'uploader': 'NBCU-SPROUT-NEW',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_component = self._search_regex(
r'(?s)(<div[^>]+data-component="video"[^>]*?>)',
webpage, 'video component', default=None)
if video_component:
options = self._parse_json(extract_attributes(
video_component)['data-options'], video_id)
theplatform_url = options['video']
query = {
'mbr': 'true',
'manifest': 'm3u',
}
if options.get('protected'):
query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout')
theplatform_url = smuggle_url(update_url_query(
theplatform_url, query), {'force_smil_url': True})
else:
iframe = self._search_regex(
r'(<iframe[^>]+id="sproutVideoIframe"[^>]*?>)',
webpage, 'iframe')
theplatform_url = extract_attributes(iframe)['src']
return self.url_result(theplatform_url, 'ThePlatform')

View File

@ -306,9 +306,10 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
}, },
}] }]
def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}): def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query) real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
entry = self._download_json(real_url, video_id)['entries'][0] entry = self._download_json(real_url, video_id)['entries'][0]
main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else None
formats = [] formats = []
subtitles = {} subtitles = {}
@ -333,7 +334,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
if asset_type in asset_types_query: if asset_type in asset_types_query:
query.update(asset_types_query[asset_type]) query.update(asset_types_query[asset_type])
cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query( cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type) main_smil_url or smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
formats.extend(cur_formats) formats.extend(cur_formats)
subtitles = self._merge_subtitles(subtitles, cur_subtitles) subtitles = self._merge_subtitles(subtitles, cur_subtitles)

View File

@ -0,0 +1,75 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
extract_attributes,
urlencode_postdata,
ExtractorError,
)
class TVPlayerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tvplayer\.com/watch/(?P<id>[^/?#]+)'
_TEST = {
'url': 'http://tvplayer.com/watch/bbcone',
'info_dict': {
'id': '89',
'ext': 'mp4',
'title': r're:^BBC One [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
},
'params': {
# m3u8 download
'skip_download': True,
}
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
current_channel = extract_attributes(self._search_regex(
r'(<div[^>]+class="[^"]*current-channel[^"]*"[^>]*>)',
webpage, 'channel element'))
title = current_channel['data-name']
resource_id = self._search_regex(
r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id')
platform = self._search_regex(
r'platform\s*=\s*"([^"]+)"', webpage, 'platform')
token = self._search_regex(
r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null')
validate = self._search_regex(
r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null')
try:
response = self._download_json(
'http://api.tvplayer.com/api/v2/stream/live',
resource_id, headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}, data=urlencode_postdata({
'service': 1,
'platform': platform,
'id': resource_id,
'token': token,
'validate': validate,
}))['tvplayer']['response']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError):
response = self._parse_json(
e.cause.read().decode(), resource_id)['tvplayer']['response']
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
raise
formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4')
self._sort_formats(formats)
return {
'id': resource_id,
'display_id': display_id,
'title': self._live_title(title),
'formats': formats,
'is_live': True,
}

View File

@ -447,7 +447,14 @@ class TwitchHighlightsIE(TwitchVideosBaseIE):
class TwitchStreamIE(TwitchBaseIE): class TwitchStreamIE(TwitchBaseIE):
IE_NAME = 'twitch:stream' IE_NAME = 'twitch:stream'
_VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE _VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?twitch\.tv/|
player\.twitch\.tv/\?.*?\bchannel=
)
(?P<id>[^/#?]+)
'''
_TESTS = [{ _TESTS = [{
'url': 'http://www.twitch.tv/shroomztv', 'url': 'http://www.twitch.tv/shroomztv',
@ -471,8 +478,25 @@ class TwitchStreamIE(TwitchBaseIE):
}, { }, {
'url': 'http://www.twitch.tv/miracle_doto#profile-0', 'url': 'http://www.twitch.tv/miracle_doto#profile-0',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://player.twitch.tv/?channel=lotsofs',
'only_matching': True,
}] }]
@classmethod
def suitable(cls, url):
return (False
if any(ie.suitable(url) for ie in (
TwitchVideoIE,
TwitchChapterIE,
TwitchVodIE,
TwitchProfileIE,
TwitchAllVideosIE,
TwitchUploadsIE,
TwitchPastBroadcastsIE,
TwitchHighlightsIE))
else super(TwitchStreamIE, cls).suitable(url))
def _real_extract(self, url): def _real_extract(self, url):
channel_id = self._match_id(url) channel_id = self._match_id(url)

View File

@ -0,0 +1,99 @@
# coding: utf-8
from __future__ import unicode_literals
import random
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
float_or_none,
parse_age_limit,
qualities,
try_get,
unified_timestamp,
urljoin,
)
class VideoPressIE(InfoExtractor):
_VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)'
_TESTS = [{
'url': 'https://videopress.com/embed/kUJmAcSf',
'md5': '706956a6c875873d51010921310e4bc6',
'info_dict': {
'id': 'kUJmAcSf',
'ext': 'mp4',
'title': 'VideoPress Demo',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 634.6,
'timestamp': 1434983935,
'upload_date': '20150622',
'age_limit': 0,
},
}, {
# 17+, requires birth_* params
'url': 'https://videopress.com/embed/iH3gstfZ',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)',
webpage)
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
video_id, query={
'birth_month': random.randint(1, 12),
'birth_day': random.randint(1, 31),
'birth_year': random.randint(1950, 1995),
})
title = video['title']
def base_url(scheme):
return try_get(
video, lambda x: x['file_url_base'][scheme], compat_str)
base_url = base_url('https') or base_url('http')
QUALITIES = ('std', 'dvd', 'hd')
quality = qualities(QUALITIES)
formats = []
for format_id, f in video['files'].items():
if not isinstance(f, dict):
continue
for ext, path in f.items():
if ext in ('mp4', 'ogg'):
formats.append({
'url': urljoin(base_url, path),
'format_id': '%s-%s' % (format_id, ext),
'ext': determine_ext(path, ext),
'quality': quality(format_id),
})
original_url = try_get(video, lambda x: x['original'], compat_str)
if original_url:
formats.append({
'url': original_url,
'format_id': 'original',
'quality': len(QUALITIES),
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': video.get('description'),
'thumbnail': video.get('poster'),
'duration': float_or_none(video.get('duration'), 1000),
'timestamp': unified_timestamp(video.get('upload_date')),
'age_limit': parse_age_limit(video.get('rating')),
'formats': formats,
}

View File

@ -53,14 +53,15 @@ class XTubeIE(InfoExtractor):
if not display_id: if not display_id:
display_id = video_id display_id = video_id
url = 'http://www.xtube.com/watch.php?v=%s' % video_id url = 'http://www.xtube.com/video-watch/-%s' % video_id
req = sanitized_Request(url) req = sanitized_Request(url)
req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1') req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1')
webpage = self._download_webpage(req, display_id) webpage = self._download_webpage(req, display_id)
sources = self._parse_json(self._search_regex( sources = self._parse_json(self._search_regex(
r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id) r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),',
webpage, 'sources', group='sources'), video_id)
formats = [] formats = []
for format_id, format_url in sources.items(): for format_id, format_url in sources.items():
@ -81,10 +82,10 @@ class XTubeIE(InfoExtractor):
r'<span[^>]+class="nickname"[^>]*>([^<]+)'), r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
webpage, 'uploader', fatal=False) webpage, 'uploader', fatal=False)
duration = parse_duration(self._search_regex( duration = parse_duration(self._search_regex(
r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>', r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>',
webpage, 'duration', fatal=False)) webpage, 'duration', fatal=False))
view_count = str_to_int(self._search_regex( view_count = str_to_int(self._search_regex(
r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>', r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>',
webpage, 'view count', fatal=False)) webpage, 'view count', fatal=False))
comment_count = str_to_int(self._html_search_regex( comment_count = str_to_int(self._html_search_regex(
r'>Comments? \(([\d,\.]+)\)<', r'>Comments? \(([\d,\.]+)\)<',

View File

@ -337,17 +337,30 @@ def get_element_by_id(id, html):
def get_element_by_class(class_name, html): def get_element_by_class(class_name, html):
return get_element_by_attribute( """Return the content of the first tag with the specified class in the passed HTML document"""
retval = get_elements_by_class(class_name, html)
return retval[0] if retval else None
def get_element_by_attribute(attribute, value, html, escape_value=True):
retval = get_elements_by_attribute(attribute, value, html, escape_value)
return retval[0] if retval else None
def get_elements_by_class(class_name, html):
"""Return the content of all tags with the specified class in the passed HTML document as a list"""
return get_elements_by_attribute(
'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
html, escape_value=False) html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True): def get_elements_by_attribute(attribute, value, html, escape_value=True):
"""Return the content of the tag with the specified attribute in the passed HTML document""" """Return the content of the tag with the specified attribute in the passed HTML document"""
value = re.escape(value) if escape_value else value value = re.escape(value) if escape_value else value
m = re.search(r'''(?xs) retlist = []
for m in re.finditer(r'''(?xs)
<([a-zA-Z0-9:._-]+) <([a-zA-Z0-9:._-]+)
(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
\s+%s=['"]?%s['"]? \s+%s=['"]?%s['"]?
@ -355,16 +368,15 @@ def get_element_by_attribute(attribute, value, html, escape_value=True):
\s*> \s*>
(?P<content>.*?) (?P<content>.*?)
</\1> </\1>
''' % (re.escape(attribute), value), html) ''' % (re.escape(attribute), value), html):
res = m.group('content')
if not m: if res.startswith('"') or res.startswith("'"):
return None res = res[1:-1]
res = m.group('content')
if res.startswith('"') or res.startswith("'"): retlist.append(unescapeHTML(res))
res = res[1:-1]
return unescapeHTML(res) return retlist
class HTMLAttributeParser(compat_HTMLParser): class HTMLAttributeParser(compat_HTMLParser):

View File

@ -1,3 +1,3 @@
from __future__ import unicode_literals from __future__ import unicode_literals
__version__ = '2017.02.04' __version__ = '2017.02.11'