2014-02-11 13:25:24 +05:30
from __future__ import unicode_literals
2013-06-23 20:41:54 +02:00
import itertools
2014-04-04 22:22:11 +02:00
import json
2013-06-23 20:13:52 +02:00
import re
2013-06-23 20:41:54 +02:00
from . common import InfoExtractor , SearchInfoExtractor
2013-06-23 20:13:52 +02:00
from . . utils import (
2013-06-23 20:41:54 +02:00
compat_urllib_parse ,
2013-09-28 21:19:52 +02:00
compat_urlparse ,
clean_html ,
2013-12-25 15:18:40 +01:00
int_or_none ,
2013-06-23 20:13:52 +02:00
)
2013-09-28 21:19:52 +02:00
2013-06-23 20:13:52 +02:00
class YahooIE(InfoExtractor):
    """Information extractor for Yahoo Screen and Yahoo Movies video pages."""

    IE_DESC = 'Yahoo screen and movies'
    # The ``url`` group ends at ``.html``, so anything after it (for example
    # a query string) is not part of the page address that gets downloaded.
    _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
    _TESTS = [
        {
            'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
            'md5': '4962b075c08be8690a922ee026d05e69',
            'info_dict': {
                'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
                'ext': 'mp4',
                'title': 'Julian Smith & Travis Legg Watch Julian Smith',
                'description': 'Julian and Travis watch Julian Smith',
            },
        },
        {
            'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
            'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
            'info_dict': {
                'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
                'ext': 'mp4',
                'title': 'Codefellas - The Cougar Lies with Spanish Moss',
                'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
            },
        },
        {
            'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html',
            'md5': '410b7104aa9893b765bc22787a22f3d9',
            'info_dict': {
                'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845',
                'ext': 'mp4',
                'title': 'The World Loves Spider-Man',
                'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
            }
        },
        {
            'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
            'md5': '60e8ac193d8fb71997caa8fce54c6460',
            'info_dict': {
                'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
                'ext': 'mp4',
                'title': "Yahoo Saves 'Community'",
                'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
            }
        },
    ]

    def _real_extract(self, url):
        """Download the video page, find its content ID and build the info dict.

        The content ID is taken from the ``mediaItems`` JSON embedded in the
        page when it is present; otherwise it is searched for with the
        content-ID regexes, and in that case it is also used as the video id
        of the result.
        """
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        url = match.group('url')
        webpage = self._download_webpage(url, video_id)

        items_json = self._search_regex(
            r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
            default=None)

        if items_json is not None:
            media_items = json.loads(items_json)
            media_obj = media_items['mediaItems']['query']['results']['mediaObj'][0]
            # The 'meta' field is not always in the video webpage, so only
            # the id is taken from here and the rest is requested from
            # another page in _get_info.
            return self._get_info(media_obj['id'], video_id, webpage)

        # No embedded mediaItems JSON: look the content ID up directly.
        CONTENT_ID_REGEXES = [
            r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
            r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"'
        ]
        long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
        return self._get_info(long_id, long_id, webpage)

    def _get_info(self, long_id, video_id, webpage):
        """Query the YQL endpoint for ``long_id`` and return the info dict.

        ``video_id`` is used for the download messages and as the ``id`` of
        the result; ``webpage`` is only consulted for a fallback thumbnail
        when the metadata carries none.
        """
        yql = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
               ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
               ' AND protocol="http"' % long_id)
        encoded_params = compat_urllib_parse.urlencode({
            'q': yql,
            'env': 'prod',
            'format': 'json',
        })
        response = self._download_json(
            'http://video.query.yahoo.com/v1/public/yql?' + encoded_params,
            video_id, 'Downloading video info')
        media_obj = response['query']['results']['mediaObj'][0]
        meta = media_obj['meta']

        formats = []
        for stream in media_obj['streams']:
            fmt = {
                'width': int_or_none(stream.get('width')),
                'height': int_or_none(stream.get('height')),
                'tbr': int_or_none(stream.get('bitrate')),
            }
            host = stream['host']
            path = stream['path']
            if host.startswith('rtmp'):
                # RTMP streams: the host is the URL, the path is the play path.
                fmt['url'] = host
                fmt['play_path'] = path
                fmt['ext'] = 'flv'
            else:
                fmt['url'] = compat_urlparse.urljoin(host, path)
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': meta['title'],
            'formats': formats,
            'description': clean_html(meta['description']),
            # Fall back to the page's og thumbnail when the metadata has none.
            'thumbnail': meta.get('thumbnail') or self._og_search_thumbnail(webpage),
        }
2013-06-23 20:13:52 +02:00
2013-06-23 20:41:54 +02:00
2013-11-29 15:25:43 +01:00
class YahooNewsIE(YahooIE):
    """Information extractor for news.yahoo.com video pages.

    Only the way the content ID is found differs from the parent class; the
    stream lookup is delegated to the inherited ``_get_info``.
    """

    IE_NAME = 'yahoo:news'
    _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
    _TESTS = [{
        'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
        'md5': '67010fdf3a08d290e060a4dd96baa07b',
        'info_dict': {
            'id': '104538833',
            'ext': 'mp4',
            'title': 'China Moses Is Crazy About the Blues',
            'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
        },
    }]

    def _real_extract(self, url):
        """Fetch the news page, read its ``contentId`` and build the info dict."""
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)
        content_id = self._search_regex(
            r'contentId: \'(.+?)\',', page, 'long id')
        return self._get_info(content_id, video_id, page)
2013-11-29 15:25:43 +01:00
2013-06-23 20:41:54 +02:00
class YahooSearchIE(SearchInfoExtractor):
    """Search extractor that turns Yahoo video search results into a playlist."""

    IE_DESC = 'Yahoo screen search'
    _MAX_RESULTS = 1000
    IE_NAME = 'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'
    # Step of the ``b`` (offset) query parameter between result pages.
    _PAGE_SIZE = 30

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one after another until ``n`` entries
        have been collected, a page comes back empty, or the page metadata
        (``m['last']`` / ``m['total']``) says the last result was reached.

        Returns a playlist dict whose ``entries`` are ``url_result`` items
        handled by the 'Yahoo' extractor.
        """
        entries = []
        for pagenum in itertools.count(0):
            result_url = (
                'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d'
                % (compat_urllib_parse.quote_plus(query),
                   pagenum * self._PAGE_SIZE))
            info = self._download_json(
                result_url, query,
                note='Downloading results page ' + str(pagenum + 1))
            m = info['m']
            results = info['results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(
                    r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Result without a screen.yahoo.com link: nothing the
                    # 'Yahoo' extractor could handle, so skip it.
                    continue
                entries.append(
                    self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # Stop on the number of collected entries rather than on the
            # inner loop index: the index is undefined when a page is empty
            # and would otherwise cause one extra page request when ``n`` is
            # an exact multiple of the page size.
            if (not results or len(entries) >= n
                    or m['last'] >= (m['total'] - 1)):
                break

        return {
            '_type': 'playlist',
            'id': query,
            'entries': entries,
        }