[wat] improve extraction (#10281)

Add an alternative method to extract HTTP formats that
works even if the video is geo-restricted or removed
from public access (in most cases).
This commit is contained in:
Remita Amine 2016-08-10 14:17:22 +01:00
parent 69d8eeeec5
commit 57ce8a6d08

View File

@ -9,6 +9,7 @@ from ..utils import (
ExtractorError, ExtractorError,
unified_strdate, unified_strdate,
HEADRequest, HEADRequest,
int_or_none,
) )
@ -30,34 +31,43 @@ class WatIE(InfoExtractor):
}, },
{ {
'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',
'md5': 'fbc84e4378165278e743956d9c1bf16b', 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c',
'info_dict': { 'info_dict': {
'id': '11713075', 'id': '11713075',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3',
'upload_date': '20140816', 'upload_date': '20140816',
'duration': 2910,
}, },
'skip': "Ce contenu n'est pas disponible pour l'instant.", 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
}, },
] ]
# Candidate direct-HTTP renditions probed by the fallback path in
# _real_extract when the regular URL extraction raises ExtractorError.
# Each entry is (vbr, width, height): vbr is interpolated into the
# '...-%s-64k.mp4' download URL and added to a fixed abr of 64 to build
# the 'http-<tbr>' format id; width and height are copied into the
# format dict as-is.
# NOTE(review): vbr is presumably in kbit/s -- confirm.
_FORMATS = (
(200, 416, 234),
(400, 480, 270),
(600, 640, 360),
(1200, 640, 360),
(1800, 960, 540),
(2500, 1280, 720),
)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
# 'contentv4' is used in the website, but it also returns the related # 'contentv4' is used in the website, but it also returns the related
# videos, we don't need them # videos, we don't need them
video_info = self._download_json( video_data = self._download_json(
'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media'] 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
video_info = video_data['media']
error_desc = video_info.get('error_desc') error_desc = video_info.get('error_desc')
if error_desc: if error_desc:
raise ExtractorError( self.report_warning(
'%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) '%s returned error: %s' % (self.IE_NAME, error_desc))
chapters = video_info['chapters'] chapters = video_info['chapters']
if chapters:
first_chapter = chapters[0] first_chapter = chapters[0]
def video_id_for_chapter(chapter): def video_id_for_chapter(chapter):
@ -69,9 +79,10 @@ class WatIE(InfoExtractor):
return self.playlist_result(entries, video_id, video_info['title']) return self.playlist_result(entries, video_id, video_info['title'])
# Otherwise we can continue and extract just one part, we have to use # Otherwise we can continue and extract just one part, we have to use
# the video id for getting the video url # the video id for getting the video url
else:
first_chapter = video_info
date_diffusion = first_chapter.get('date_diffusion') title = first_chapter['title']
upload_date = unified_strdate(date_diffusion) if date_diffusion else None
def extract_url(path_template, url_type): def extract_url(path_template, url_type):
req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
@ -83,10 +94,10 @@ class WatIE(InfoExtractor):
expected=True) expected=True)
return red_url return red_url
m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
http_url = extract_url('android5/%s.mp4', 'http')
formats = [] formats = []
try:
http_url = extract_url('android5/%s.mp4', 'http')
m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
m3u8_formats = self._extract_m3u8_formats( m3u8_formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
formats.extend(m3u8_formats) formats.extend(m3u8_formats)
@ -97,22 +108,47 @@ class WatIE(InfoExtractor):
vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr')
if not vbr or not abr: if not vbr or not abr:
continue continue
format_id = m3u8_format['format_id'].replace('hls', 'http')
fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url)
if self._is_valid_url(fmt_url, video_id, format_id):
f = m3u8_format.copy() f = m3u8_format.copy()
f.update({ f.update({
'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url), 'url': fmt_url,
'format_id': f['format_id'].replace('hls', 'http'), 'format_id': format_id,
'protocol': 'http', 'protocol': 'http',
}) })
formats.append(f) formats.append(f)
self._sort_formats(formats) self._sort_formats(formats)
except ExtractorError:
abr = 64
for vbr, width, height in self._FORMATS:
tbr = vbr + abr
format_id = 'http-%s' % tbr
fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr)
if self._is_valid_url(fmt_url, video_id, format_id):
formats.append({
'format_id': format_id,
'url': fmt_url,
'vbr': vbr,
'abr': abr,
'width': width,
'height': height,
})
date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4')
upload_date = unified_strdate(date_diffusion) if date_diffusion else None
duration = None
files = video_info['files']
if files:
duration = int_or_none(files[0].get('duration'))
return { return {
'id': video_id, 'id': video_id,
'title': first_chapter['title'], 'title': title,
'thumbnail': first_chapter['preview'], 'thumbnail': first_chapter.get('preview'),
'description': first_chapter['description'], 'description': first_chapter.get('description'),
'view_count': video_info['views'], 'view_count': int_or_none(video_info.get('views')),
'upload_date': upload_date, 'upload_date': upload_date,
'duration': video_info['files'][0]['duration'], 'duration': duration,
'formats': formats, 'formats': formats,
} }