Compare commits

...

33 Commits

Author SHA1 Message Date
1e9a9e167d release 2015.02.20 2015-02-20 23:23:12 +01:00
3da0db62e6 [escapist] Fix extraction (fixes #5017) 2015-02-20 23:22:47 +01:00
e14ced7918 Merge branch 'master' of github.com:rg3/youtube-dl 2015-02-20 23:20:14 +01:00
ab9d02f53b Merge branch 'minusf-TED_code' 2015-02-21 00:14:51 +06:00
a461a11989 [ted] Improve external video handling and add test 2015-02-21 00:14:38 +06:00
1bd838608f prefer 'code' to 'uri' if present 2015-02-20 18:24:20 +01:00
50efb383f0 [tv4] Add new extractor (Closes #4839) (Closes #2103) 2015-02-20 14:13:41 +02:00
5e9a033e6e [imgur] Allow alternative values
Every now and then, imgur.com goes crazy and gives us a generic title and description (otherwise it looks all fine though).
Simply update the test case to allow for that craziness.
2015-02-20 02:52:03 +01:00
dd0a58f5f0 [blinkx] Fix extraction 2015-02-20 01:19:38 +06:00
a21420389e release 2015.02.19.3 2015-02-19 19:28:17 +01:00
6140baf4e1 [nationalgeographic] Add extractor (closes #4960) 2015-02-19 18:17:31 +01:00
8fc642eb5b [pornhub] Fix uploader regex 2015-02-19 22:15:49 +06:00
e66e1a0046 [pornhub] Add support for playlists (Closes #4995) 2015-02-19 22:15:19 +06:00
d5c69f1da4 [5min] Cover joystiq.com URLs (Closes #4962) 2015-02-19 21:47:11 +06:00
5c8a3f862a [nbc] Use a test video that works outside the US 2015-02-19 15:00:39 +01:00
a3b9157f49 [cbssports] Add extractor (closes #4996) 2015-02-19 13:06:53 +01:00
b88ba05356 [imgur] Simplify 2015-02-19 05:53:09 +01:00
b74d505577 Merge remote-tracking branch 'jbboehr/imgur-gifv-improvements' 2015-02-19 05:16:11 +01:00
9e2d7dca87 [imgur] improve error check for non-video URLs 2015-02-18 19:47:54 -08:00
d236b37ac9 [imgur] improve regex #4998 2015-02-18 19:28:19 -08:00
e880c66bd8 [theonion] Modernize 2015-02-19 04:12:40 +01:00
383456aa29 [Makefile] Also delete *.avi files in clean 2015-02-19 04:09:52 +01:00
1a13940c8d [imgur] support regular URL 2015-02-18 18:12:48 -08:00
3d54788495 [webofstories] Fix extraction 2015-02-19 02:12:08 +01:00
71d53ace2f [sockshare] Do not require thumbnail anymore
Thumbnail is not present on the website anymore.
2015-02-19 02:04:30 +01:00
f37e3f99f0 [generic] Correct test case
Video has been reuploaded / edited
2015-02-19 02:00:52 +01:00
bd03ffc16e [netzkino] Skip download in test case
Works fine from Germany, but fails from everywhere else
2015-02-19 01:58:54 +01:00
1ac1af9b47 release 2015.02.19.2 2015-02-19 01:43:28 +01:00
3bf5705316 [imgur] Add new extractor 2015-02-19 01:43:20 +01:00
1c2528c8a3 [cbs] Modernize 2015-02-19 01:22:50 +01:00
7bd15b1a03 release 2015.02.19.1 2015-02-19 01:04:24 +01:00
6b961a85fd [patreon] Add support for embedlies (fixes #4969) 2015-02-19 01:04:19 +01:00
7707004043 [patreon] Modernize 2015-02-19 00:38:05 +01:00
23 changed files with 437 additions and 85 deletions

View File

@ -1,7 +1,7 @@
all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
clean:
rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe
rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe
PREFIX ?= /usr/local
BINDIR ?= $(PREFIX)/bin

View File

@ -68,6 +68,7 @@
- **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv
- **CBS**
- **CBSNews**: CBS News
- **CBSSports**
- **CeskaTelevize**
- **channel9**: Channel 9
- **Chilloutzone**
@ -121,6 +122,7 @@
- **EllenTV**
- **EllenTV:clips**
- **ElPais**: El País
- **Embedly**
- **EMPFlix**
- **Engadget**
- **Eporner**
@ -190,6 +192,7 @@
- **ign.com**
- **imdb**: Internet Movie Database trailers
- **imdb:list**: Internet Movie Database lists
- **Imgur**
- **Ina**
- **InfoQ**
- **Instagram**
@ -262,6 +265,7 @@
- **myvideo**
- **MyVidster**
- **n-tv.de**
- **NationalGeographic**
- **Naver**
- **NBA**
- **NBC**
@ -319,6 +323,7 @@
- **podomatic**
- **PornHd**
- **PornHub**
- **PornHubPlaylist**
- **Pornotube**
- **PornoXO**
- **PromptFile**
@ -446,6 +451,7 @@
- **Turbo**
- **Tutv**
- **tv.dfb.de**
- **TV4**: tv4.se and tv4play.se
- **tvigle**: Интернет-телевидение Tvigle.ru
- **tvp.pl**
- **tvp.pl:Series**

View File

@ -58,6 +58,7 @@ from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
from .cbsnews import CBSNewsIE
from .cbssports import CBSSportsIE
from .ccc import CCCIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
@ -121,6 +122,7 @@ from .ellentv import (
EllenTVClipsIE,
)
from .elpais import ElPaisIE
from .embedly import EmbedlyIE
from .empflix import EMPFlixIE
from .engadget import EngadgetIE
from .eporner import EpornerIE
@ -204,6 +206,7 @@ from .imdb import (
ImdbIE,
ImdbListIE
)
from .imgur import ImgurIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE, InstagramUserIE
@ -282,6 +285,7 @@ from .myspace import MySpaceIE, MySpaceAlbumIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .myvidster import MyVidsterIE
from .nationalgeographic import NationalGeographicIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import (
@ -350,7 +354,10 @@ from .playfm import PlayFMIE
from .playvid import PlayvidIE
from .podomatic import PodomaticIE
from .pornhd import PornHdIE
from .pornhub import PornHubIE
from .pornhub import (
PornHubIE,
PornHubPlaylistIE,
)
from .pornotube import PornotubeIE
from .pornoxo import PornoXOIE
from .promptfile import PromptFileIE
@ -483,6 +490,7 @@ from .tumblr import TumblrIE
from .tunein import TuneInIE
from .turbo import TurboIE
from .tutv import TutvIE
from .tv4 import TV4IE
from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE

View File

@ -1,40 +1,35 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import remove_start
from ..utils import (
remove_start,
int_or_none,
)
class BlinkxIE(InfoExtractor):
_VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
_VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
IE_NAME = 'blinkx'
_TEST = {
'url': 'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB',
'md5': '2e9a07364af40163a908edbf10bb2492',
'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
'md5': '337cf7a344663ec79bf93a526a2e06c7',
'info_dict': {
'id': '8aQUy7GV',
'id': 'Da0Gw3xc',
'ext': 'mp4',
'title': 'Police Car Rolls Away',
'uploader': 'stupidvideos.com',
'upload_date': '20131215',
'timestamp': 1387068000,
'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!',
'duration': 14.886,
'thumbnails': [{
'width': 100,
'height': 76,
'resolution': '100x76',
'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg',
}],
'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
'uploader': 'IGN News',
'upload_date': '20150217',
'timestamp': 1424215740,
'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
'duration': 47.743333,
},
}
def _real_extract(self, rl):
m = re.match(self._VALID_URL, rl)
video_id = m.group('id')
def _real_extract(self, url):
video_id = self._match_id(url)
display_id = video_id[:8]
api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' +
@ -60,18 +55,20 @@ class BlinkxIE(InfoExtractor):
elif m['type'] in ('flv', 'mp4'):
vcodec = remove_start(m['vcodec'], 'ff')
acodec = remove_start(m['acodec'], 'ff')
tbr = (int(m['vbr']) + int(m['abr'])) // 1000
vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000)
abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000)
tbr = vbr + abr if vbr and abr else None
format_id = '%s-%sk-%s' % (vcodec, tbr, m['w'])
formats.append({
'format_id': format_id,
'url': m['link'],
'vcodec': vcodec,
'acodec': acodec,
'abr': int(m['abr']) // 1000,
'vbr': int(m['vbr']) // 1000,
'abr': abr,
'vbr': vbr,
'tbr': tbr,
'width': int(m['w']),
'height': int(m['h']),
'width': int_or_none(m.get('w')),
'height': int_or_none(m.get('h')),
})
self._sort_formats(formats)

View File

@ -1,7 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@ -39,8 +37,7 @@ class CBSIE(InfoExtractor):
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
real_id = self._search_regex(
r"video\.settings\.pid\s*=\s*'([^']+)';",

View File

@ -0,0 +1,30 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class CBSSportsIE(InfoExtractor):
    """Extractor for cbssports.com video player pages.

    The page URL carries a section name and a video id. The section's JSON
    listing is searched for the entry whose ``pcid`` equals that id, and the
    result is a ``theplatform:`` URL built from the entry's ``pid``.
    """
    _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)'

    _TEST = {
        'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s',
        'info_dict': {
            'id': '_d5_GbO8p1sT',
            'ext': 'flv',
            'title': 'US Open flashbacks: 1990s',
            'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.',
        },
    }

    def _real_extract(self, url):
        """Resolve *url* to a ``theplatform:`` URL result.

        Raises ExtractorError when the section listing contains no entry for
        the requested video, rather than leaking a bare StopIteration.
        """
        # Imported locally because this module's top-level imports do not
        # bring ExtractorError into scope.
        from ..utils import ExtractorError

        mobj = re.match(self._VALID_URL, url)
        section = mobj.group('section')
        video_id = mobj.group('id')
        all_videos = self._download_json(
            'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section,
            video_id)
        # The json file contains the info of all the videos in the section
        video_info = next(
            (v for v in all_videos if v['pcid'] == video_id), None)
        if video_info is None:
            raise ExtractorError(
                'Unable to find video %s in section %s' % (video_id, section))

        return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform')

View File

@ -0,0 +1,16 @@
# encoding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
class EmbedlyIE(InfoExtractor):
    """Unwrap embedly media-widget URLs.

    The widget URL carries the target page percent-encoded in its ``url``
    query parameter; that value is decoded and returned as a URL result.
    """
    # The optional host prefix is "www." or "cdn."; the dot applies to both
    # alternatives, so it sits outside the inner alternation. (Previously the
    # dot was attached to "cdn" only, so "www.embedly.com" could not match.)
    _VALID_URL = r'https?://(?:(?:www|cdn)\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P<id>[^#&]+)'
    _TESTS = [{
        'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a URL result for the percent-decoded ``url`` parameter."""
        return self.url_result(compat_urllib_parse_unquote(self._match_id(url)))

View File

@ -22,6 +22,7 @@ class EscapistIE(InfoExtractor):
'uploader_id': 'the-escapist-presents',
'uploader': 'The Escapist Presents',
'title': "Breaking Down Baldur's Gate",
'thumbnail': 're:^https?://.*\.jpg$',
}
}
@ -40,9 +41,8 @@ class EscapistIE(InfoExtractor):
raw_title = self._html_search_meta('title', webpage, fatal=True)
title = raw_title.partition(' : ')[2]
player_url = self._og_search_video_url(webpage, name='player URL')
config_url = compat_urllib_parse.unquote(self._search_regex(
r'config=(.*)$', player_url, 'config URL'))
config_url = compat_urllib_parse.unquote(self._html_search_regex(
r'<param name="flashvars" value="config=([^"&]+)', webpage, 'config URL'))
formats = []
@ -81,5 +81,4 @@ class EscapistIE(InfoExtractor):
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
'description': description,
'player_url': player_url,
}

View File

@ -14,6 +14,7 @@ class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
_VALID_URL = r'''(?x)
(?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
https?://(?:(?:massively|www)\.)?joystiq\.com/video/|
5min:)
(?P<id>\d+)
'''

View File

@ -532,7 +532,7 @@ class GenericIE(InfoExtractor):
'info_dict': {
'id': 'Mrj4DVp2zeA',
'ext': 'mp4',
'upload_date': '20150204',
'upload_date': '20150212',
'uploader': 'The National Archives UK',
'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
'uploader_id': 'NationalArchives08',

View File

@ -0,0 +1,97 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
js_to_json,
mimetype2ext,
ExtractorError,
)
class ImgurIE(InfoExtractor):
    """Extractor for imgur.com pages (``imgur.com`` or ``i.imgur.com``).

    One format is produced per ``<source>`` element found inside the page's
    ``video-elements`` div, plus a low-preference GIF format when the page
    defines a ``videoItem`` object.
    """
    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?'

    _TESTS = [{
        'url': 'https://i.imgur.com/A61SaA1.gifv',
        'info_dict': {
            'id': 'A61SaA1',
            'ext': 'mp4',
            'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
            'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$',
        },
    }, {
        'url': 'https://imgur.com/A61SaA1',
        'info_dict': {
            'id': 'A61SaA1',
            'ext': 'mp4',
            'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
            'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$',
        },
    }]

    def _real_extract(self, url):
        """Download the page for *url* and build its info dict.

        Raises ExtractorError (expected) when the page has no
        ``video-elements`` div.
        """
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        # Dimensions are looked up non-fatally; the same pair is attached to
        # every format.
        width_text = self._search_regex(
            r'<param name="width" value="([0-9]+)"',
            page, 'width', fatal=False)
        width = int_or_none(width_text)
        height_text = self._search_regex(
            r'<param name="height" value="([0-9]+)"',
            page, 'height', fatal=False)
        height = int_or_none(height_text)

        sources_html = self._search_regex(
            r'(?s)<div class="video-elements">(.*?)</div>',
            page, 'video elements', default=None)
        if not sources_html:
            raise ExtractorError(
                'No sources found for video %s. Maybe an image?' % video_id,
                expected=True)

        formats = [{
            'format_id': source.group('type').partition('/')[2],
            'url': self._proto_relative_url(source.group('src')),
            'ext': mimetype2ext(source.group('type')),
            'acodec': 'none',
            'width': width,
            'height': height,
            'http_headers': {
                'User-Agent': 'youtube-dl (like wget)',
            },
        } for source in re.finditer(
            r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"',
            sources_html)]

        # The GIF rendition is optional: a missing videoItem object is not
        # fatal.
        video_item_js = self._search_regex(
            r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
            page, 'GIF code', fatal=False)
        if video_item_js:
            video_item = self._parse_json(
                video_item_js, video_id, transform_source=js_to_json)
            gif_format = {
                'format_id': 'gif',
                'preference': -10,
                'width': width,
                'height': height,
                'ext': 'gif',
                'acodec': 'none',
                'vcodec': 'gif',
                'container': 'gif',
                'url': self._proto_relative_url(video_item['gifUrl']),
                'filesize': video_item.get('size'),
                'http_headers': {
                    'User-Agent': 'youtube-dl (like wget)',
                },
            }
            formats.append(gif_format)

        self._sort_formats(formats)

        description = self._og_search_description(page)
        title = self._og_search_title(page)
        return {
            'id': video_id,
            'formats': formats,
            'description': description,
            'title': title,
        }

View File

@ -0,0 +1,38 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
smuggle_url,
url_basename,
)
class NationalGeographicIE(InfoExtractor):
    """Extractor for video.nationalgeographic.com video pages.

    The page exposes a feed URL and a video GUID; the feed entry's media
    content URL yields the id used to build a link.theplatform.com SMIL URL.
    """
    _VALID_URL = r'http://video\.nationalgeographic\.com/video/.*?'

    _TEST = {
        'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
        'info_dict': {
            'id': '4DmDACA6Qtk_',
            'ext': 'flv',
            'title': 'Mating Crabs Busted by Sharks',
            'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
        },
        'add_ie': ['ThePlatform'],
    }

    def _real_extract(self, url):
        """Return a URL result pointing at the forced-SMIL ThePlatform link."""
        display_name = url_basename(url)
        page = self._download_webpage(url, display_name)

        feed_url = self._search_regex(
            r'data-feed-url="([^"]+)"', page, 'feed url')
        guid = self._search_regex(
            r'data-video-guid="([^"]+)"', page, 'guid')

        feed_doc = self._download_xml(
            '%s?byGuid=%s' % (feed_url, guid), display_name)
        media_content = feed_doc.find(
            './/{http://search.yahoo.com/mrss/}content')
        theplatform_id = url_basename(media_content.attrib.get('url'))

        # For some reason, the normal links don't work and we must force the use of f4m
        smil_link = (
            'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m'
            % theplatform_id)
        smuggled = smuggle_url(smil_link, {'force_smil_url': True})
        return self.url_result(smuggled)

View File

@ -18,13 +18,13 @@ class NBCIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
'url': 'http://www.nbc.com/the-tonight-show/segments/112966',
# md5 checksum is not stable
'info_dict': {
'id': 'bTmnLCvIbaaH',
'id': 'c9xnCo0YPOPH',
'ext': 'flv',
'title': 'I Am a Firefighter',
'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
},
},
{

View File

@ -29,6 +29,9 @@ class NetzkinoIE(InfoExtractor):
'timestamp': 1344858571,
'age_limit': 12,
},
'params': {
'skip_download': 'Download only works from Germany',
}
}
def _real_extract(self, url):

View File

@ -1,9 +1,6 @@
# encoding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
js_to_json,
@ -11,7 +8,7 @@ from ..utils import (
class PatreonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(.+)'
_VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P<id>[^&#]+)'
_TESTS = [
{
'url': 'http://www.patreon.com/creation?hid=743933',
@ -35,6 +32,23 @@ class PatreonIE(InfoExtractor):
'thumbnail': 're:^https?://.*$',
},
},
{
'url': 'https://www.patreon.com/creation?hid=1682498',
'info_dict': {
'id': 'SU4fj_aEMVw',
'ext': 'mp4',
'title': 'I\'m on Patreon!',
'uploader': 'TraciJHines',
'thumbnail': 're:^https?://.*$',
'upload_date': '20150211',
'description': 'md5:c5a706b1f687817a3de09db1eb93acd4',
'uploader_id': 'TraciJHines',
},
'params': {
'noplaylist': True,
'skip_download': True,
}
}
]
# Currently Patreon exposes download URL via hidden CSS, so login is not
@ -65,26 +79,29 @@ class PatreonIE(InfoExtractor):
'''
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage).strip()
attach_fn = self._html_search_regex(
r'<div class="attach"><a target="_blank" href="([^"]+)">',
webpage, 'attachment URL', default=None)
embed = self._html_search_regex(
r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"',
webpage, 'embedded URL', default=None)
if attach_fn is not None:
video_url = 'http://www.patreon.com' + attach_fn
thumbnail = self._og_search_thumbnail(webpage)
uploader = self._html_search_regex(
r'<strong>(.*?)</strong> is creating', webpage, 'uploader')
elif embed is not None:
return self.url_result(embed)
else:
playlist_js = self._search_regex(
playlist = self._parse_json(self._search_regex(
r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])',
webpage, 'playlist JSON')
playlist_json = js_to_json(playlist_js)
playlist = json.loads(playlist_json)
webpage, 'playlist JSON'),
video_id, transform_source=js_to_json)
data = playlist[0]
video_url = self._proto_relative_url(data['mp3'])
thumbnail = self._proto_relative_url(data.get('cover'))

View File

@ -56,7 +56,7 @@ class PornHubIE(InfoExtractor):
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|<span class="username)[^>]+>(.+?)<',
r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
if thumbnail:
@ -110,3 +110,33 @@ class PornHubIE(InfoExtractor):
'formats': formats,
'age_limit': 18,
}
class PornHubPlaylistIE(InfoExtractor):
    """Extractor for pornhub.com playlist pages.

    Every distinct ``view_video.php?viewkey=...`` link on the playlist page
    becomes one entry, in the order the links first appear on the page.
    """
    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]

    def _real_extract(self, url):
        """Return a playlist result for the playlist page at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # Deduplicate while keeping page order; going through set() alone
        # would make the order of the entries arbitrary.
        seen = set()
        video_paths = []
        for video_path in re.findall(
                r'href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage):
            if video_path not in seen:
                seen.add(video_path)
                video_paths.append(video_path)

        entries = [
            self.url_result('http://www.pornhub.com/%s' % video_path, 'PornHub')
            for video_path in video_paths
        ]

        playlist = self._parse_json(
            self._search_regex(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
            playlist_id)

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))

View File

@ -25,7 +25,6 @@ class SockshareIE(InfoExtractor):
'id': '437BE28B89D799D7',
'title': 'big_buck_bunny_720p_surround.avi',
'ext': 'avi',
'thumbnail': 're:^http://.*\.jpg$',
}
}
@ -45,7 +44,7 @@ class SockshareIE(InfoExtractor):
''', webpage, 'hash')
fields = {
"hash": confirm_hash,
"hash": confirm_hash.encode('utf-8'),
"confirm": "Continue as Free User"
}
@ -68,7 +67,7 @@ class SockshareIE(InfoExtractor):
webpage, 'title', default=None)
thumbnail = self._html_search_regex(
r'<img\s+src="([^"]*)".+?name="bg"',
webpage, 'thumbnail')
webpage, 'thumbnail', default=None)
formats = [{
'format_id': 'sd',

View File

@ -83,6 +83,22 @@ class TEDIE(SubtitlesInfoExtractor):
'params': {
'skip_download': True,
},
}, {
# YouTube video
'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
'add_ie': ['Youtube'],
'info_dict': {
'id': 'aFBIPO-P7LM',
'ext': 'mp4',
'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
'uploader': 'TEDx Talks',
'uploader_id': 'TEDxTalks',
'upload_date': '20111216',
},
'params': {
'skip_download': True,
},
}]
_NATIVE_FORMATS = {
@ -132,11 +148,16 @@ class TEDIE(SubtitlesInfoExtractor):
talk_info = self._extract_info(webpage)['talks'][0]
if talk_info.get('external') is not None:
self.to_screen('Found video from %s' % talk_info['external']['service'])
external = talk_info.get('external')
if external:
service = external['service']
self.to_screen('Found video from %s' % service)
ext_url = None
if service.lower() == 'youtube':
ext_url = external.get('code')
return {
'_type': 'url',
'url': talk_info['external']['uri'],
'url': ext_url or external['uri'],
}
formats = [{

View File

@ -4,11 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class TheOnionIE(InfoExtractor):
_VALID_URL = r'(?x)https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<article_id>[0-9]+)/?'
_VALID_URL = r'https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<id>[0-9]+)/?'
_TEST = {
'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/',
'md5': '19eaa9a39cf9b9804d982e654dc791ee',
@ -22,10 +21,8 @@ class TheOnionIE(InfoExtractor):
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
article_id = mobj.group('article_id')
webpage = self._download_webpage(url, article_id)
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'"videoId":\s(\d+),', webpage, 'video ID')
@ -34,10 +31,6 @@ class TheOnionIE(InfoExtractor):
thumbnail = self._og_search_thumbnail(webpage)
sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage)
if not sources:
raise ExtractorError(
'No sources found for video %s' % video_id, expected=True)
formats = []
for src, type_ in sources:
if type_ == 'video/mp4':
@ -54,15 +47,15 @@ class TheOnionIE(InfoExtractor):
})
elif type_ == 'application/x-mpegURL':
formats.extend(
self._extract_m3u8_formats(src, video_id, preference=-1))
self._extract_m3u8_formats(src, display_id, preference=-1))
else:
self.report_warning(
'Encountered unexpected format: %s' % type_)
self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,

View File

@ -71,7 +71,9 @@ class ThePlatformIE(SubtitlesInfoExtractor):
if not provider_id:
provider_id = 'dJ5BDC'
if mobj.group('config'):
if smuggled_data.get('force_smil_url', False):
smil_url = url
elif mobj.group('config'):
config_url = url + '&form=json'
config_url = config_url.replace('swf/', 'config/')
config_url = config_url.replace('onsite/', 'onsite/config/')

100
youtube_dl/extractor/tv4.py Normal file
View File

@ -0,0 +1,100 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_iso8601,
)
class TV4IE(InfoExtractor):
    """Extractor for tv4.se clips and tv4play.se videos.

    Metadata comes from the tv4play.se player asset JSON; the media URLs come
    from a second JSON document fetched from prima.tv4play.se.
    """
    IE_DESC = 'tv4.se and tv4play.se'
    # Verbose-mode pattern: whitespace inside it is not significant.
    _VALID_URL = r'''(?x)https?://(?:www\.)?
        (?:
            tv4\.se/(?:[^/]+)/klipp/(?:.*)-|
            tv4play\.se/
            (?:
                (?:program|barn)/(?:[^\?]+)\?video_id=|
                iframe/video/|
                film/|
                sport/|
            )
        )(?P<id>[0-9]+)'''
    _TESTS = [
        {
            'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650',
            'md5': '909d6454b87b10a25aa04c4bdd416a9b',
            'info_dict': {
                'id': '2491650',
                'ext': 'mp4',
                'title': 'Kalla Fakta 5 (english subtitles)',
                'thumbnail': 're:^https?://.*\.jpg$',
                'timestamp': int,
                'upload_date': '20131125',
            },
        },
        {
            'url': 'http://www.tv4play.se/iframe/video/3054113',
            'md5': '77f851c55139ffe0ebd41b6a5552489b',
            'info_dict': {
                'id': '3054113',
                'ext': 'mp4',
                'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder',
                'thumbnail': 're:^https?://.*\.jpg$',
                'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. Två experter på ficktjuvarna avslöjar knepen du ska se upp för.',
                'timestamp': int,
                'upload_date': '20150130',
            },
        },
        {
            'url': 'http://www.tv4play.se/sport/3060959',
            'only_matching': True,
        },
        {
            'url': 'http://www.tv4play.se/film/2378136',
            'only_matching': True,
        },
        {
            'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Fetch metadata and playback sources for *url* and build the info dict.

        Raises ExtractorError (expected) when the asset is flagged as
        requiring a subscription.
        """
        video_id = self._match_id(url)

        metadata = self._download_json(
            'http://www.tv4play.se/player/assets/%s.json' % video_id,
            video_id, 'Downloading video info JSON')

        # If is_geo_restricted is true, it doesn't necessarily mean we can't download it
        if metadata['is_geo_restricted']:
            self.report_warning('This content might not be available in your country due to licensing restrictions.')
        if metadata['requires_subscription']:
            raise ExtractorError('This content requires subscription.', expected=True)

        play_data = self._download_json(
            'https://prima.tv4play.se/api/web/asset/%s/play.json?protocol=http&videoFormat=MP4' % video_id,
            video_id, 'Downloading sources JSON')
        playback = play_data['playback']

        # One format per playback item, identified by media format and bitrate.
        formats = [{
            'format_id': '%s_%s' % (item['mediaFormat'], item['bitrate']),
            'tbr': item['bitrate'],
            'ext': item['mediaFormat'],
            'url': item['url'],
        } for item in playback.get('items', {}).get('item', [])]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': metadata['title'],
            'formats': formats,
            'description': metadata.get('description'),
            'timestamp': parse_iso8601(metadata.get('broadcast_date_time')),
            'duration': metadata.get('duration'),
            'thumbnail': metadata.get('image'),
            'is_live': playback.get('live'),
        }

View File

@ -45,19 +45,17 @@ class WebOfStoriesIE(InfoExtractor):
description = self._html_search_meta('description', webpage)
thumbnail = self._og_search_thumbnail(webpage)
story_filename = self._search_regex(
r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename')
speaker_id = self._search_regex(
r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID')
story_id = self._search_regex(
r'\.storyId\((\d+)\)', webpage, 'story ID')
speaker_type = self._search_regex(
r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type')
great_life = self._search_regex(
r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story')
embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',
webpage, 'embed params').split(',')]
(
_, speaker_id, story_id, story_duration,
speaker_type, great_life, _thumbnail, _has_subtitles,
story_filename, _story_order) = embed_params
is_great_life_series = great_life == 'true'
duration = int_or_none(self._search_regex(
r'\.duration\((\d+)\)', webpage, 'duration', fatal=False))
duration = int_or_none(story_duration)
# URL building, see: http://www.webofstories.com/scripts/player.js
ms_prefix = ''

View File

@ -1,3 +1,3 @@
from __future__ import unicode_literals
__version__ = '2015.02.19'
__version__ = '2015.02.20'