2013-06-23 10:58:33 -07:00
# coding: utf-8
2013-09-21 05:19:30 -07:00
import collections
2013-09-22 01:30:02 -07:00
import errno
2013-09-21 05:19:30 -07:00
import io
2013-09-22 01:37:23 -07:00
import itertools
2013-06-23 10:58:33 -07:00
import json
2013-09-21 15:35:03 -07:00
import os . path
2013-06-23 10:58:33 -07:00
import re
import socket
2013-09-21 05:19:30 -07:00
import string
import struct
import traceback
2013-09-22 01:37:23 -07:00
import xml . etree . ElementTree
2013-09-21 05:19:30 -07:00
import zlib
2013-06-23 10:58:33 -07:00
2013-06-23 11:28:15 -07:00
from . common import InfoExtractor , SearchInfoExtractor
2013-09-11 06:51:04 -07:00
from . subtitles import SubtitlesInfoExtractor
2013-06-23 10:58:33 -07:00
from . . utils import (
2013-09-22 01:30:02 -07:00
compat_chr ,
2013-06-23 10:58:33 -07:00
compat_http_client ,
compat_parse_qs ,
compat_urllib_error ,
compat_urllib_parse ,
compat_urllib_request ,
compat_str ,
clean_html ,
get_element_by_id ,
ExtractorError ,
unescapeHTML ,
unified_strdate ,
2013-07-07 04:58:23 -07:00
orderedSet ,
2013-09-22 01:30:02 -07:00
write_json_file ,
2013-06-23 10:58:33 -07:00
)
2013-09-11 06:48:23 -07:00
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors.

    Bundles the session set-up steps run from ``_real_initialize``:
    requesting the language URL, logging in when credentials are
    available, and confirming the age gate.
    """
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request ``_LANG_URL`` (hl=en, gl=US).

        Returns True on success; on a network error a warning is reported
        and False is returned.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the configured credentials.

        Returns True when the login request went through, False when there
        are no credentials, when a network error occurred (a warning is
        reported) or when the response still contains the login form.

        Raises ExtractorError if no credentials are available and
        ``_LOGIN_REQUIRED`` is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden form tokens; either stays None when the page lacks the input
        galx = None
        dsh = None
        match = re.search(r'<input.+?name="GALX".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            galx = match.group(1)
        match = re.search(r'<input.+?name="dsh".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode.  Fields whose value could not be determined
        # (None) are left out instead of crashing on None.encode().
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items()
            if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Getting the login form back means the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to ``_AGE_URL``.

        Returns True on success; raises ExtractorError on a network error.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The request body must be bytes on Python 3 (same as in _login)
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Run language set-up, login and age confirmation, in that order.

        Does nothing without a downloader; stops at the first step that
        reports failure.
        """
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
2013-06-23 10:58:33 -07:00
2013-08-07 23:54:10 -07:00
2013-09-11 06:48:23 -07:00
class YoutubeIE ( YoutubeBaseInfoExtractor , SubtitlesInfoExtractor ) :
2013-07-01 09:52:19 -07:00
# Human-readable description of this extractor.
IE_DESC = u ' YouTube.com '
2013-06-23 10:58:33 -07:00
# Pattern for the URLs (or bare video IDs) this extractor accepts.
# suitable() matches it with re.VERBOSE, so the layout whitespace and the
# trailing "#" remarks inside the literal are not part of the expression.
# Capture group 1 is the optional URL prefix, group 2 the 11-character
# video ID; the final conditional only allows trailing text when group 1
# matched.
_VALID_URL = r """ ^
(
( ? : https ? : / / ) ? # http(s):// (optional)
2013-09-05 13:38:23 -07:00
( ? : ( ? : ( ? : ( ? : \w + \. ) ? youtube ( ? : - nocookie ) ? \. com / |
2013-09-15 03:14:59 -07:00
tube \. majestyc \. net / |
youtube \. googleapis \. com / ) # the various hostnames, with wildcard subdomains
2013-06-23 10:58:33 -07:00
( ? : . * ? \#/)? # handle anchor (#/) redirect urls
( ? : # the various things that can precede the ID:
( ? : ( ? : v | embed | e ) / ) # v/ or embed/ or e/
| ( ? : # or the v= param in all its forms
2013-08-19 01:27:42 -07:00
( ? : ( ? : watch | movie ) ( ? : _popup ) ? ( ? : \. php ) ? ) ? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
2013-06-23 10:58:33 -07:00
( ? : \? | \#!?) # the params delimiter ? or # or #!
( ? : . * ? & ) ? # any other preceding param (like /?s=tuff&v=xxxx)
v =
)
2013-09-05 13:38:23 -07:00
) )
| youtu \. be / # just youtu.be/xxxx
)
2013-06-23 10:58:33 -07:00
) ? # all until now is optional -> you can pass the naked ID
2013-09-09 01:33:12 -07:00
( [ 0 - 9 A - Za - z_ - ] { 11 } ) # here is it! the YouTube video ID
2013-06-23 10:58:33 -07:00
( ? ( 1 ) . + ) ? # if we found the ID, everything can follow
$ """
# Captures the value of a "next_url" query parameter (up to the next "&").
# NOTE(review): its user is outside this section of the file.
_NEXT_URL_RE = r ' [ \ ?&]next_url=([^&]+) '
# Format codes (strings; the same keys used by _video_extensions,
# _video_dimensions and _special_itags below).
# Listed in order of quality
2013-08-30 17:51:50 -07:00
_available_formats = [ ' 38 ' , ' 37 ' , ' 46 ' , ' 22 ' , ' 45 ' , ' 35 ' , ' 44 ' , ' 34 ' , ' 18 ' , ' 43 ' , ' 6 ' , ' 5 ' , ' 36 ' , ' 17 ' , ' 13 ' ,
2013-09-03 18:49:35 -07:00
# Apple HTTP Live Streaming
2013-08-30 17:51:50 -07:00
' 96 ' , ' 95 ' , ' 94 ' , ' 93 ' , ' 92 ' , ' 132 ' , ' 151 ' ,
2013-08-22 10:45:24 -07:00
# 3D
' 85 ' , ' 84 ' , ' 102 ' , ' 83 ' , ' 101 ' , ' 82 ' , ' 100 ' ,
# Dash video
' 138 ' , ' 137 ' , ' 248 ' , ' 136 ' , ' 247 ' , ' 135 ' , ' 246 ' ,
' 245 ' , ' 244 ' , ' 134 ' , ' 243 ' , ' 133 ' , ' 242 ' , ' 160 ' ,
# Dash audio
' 141 ' , ' 172 ' , ' 140 ' , ' 171 ' , ' 139 ' ,
2013-07-20 03:46:02 -07:00
]
2013-08-30 17:51:50 -07:00
# Same codes as _available_formats, but within each group the webm codes
# are moved ahead of the neighbouring mp4/flv ones (e.g. '46' before '37').
# NOTE(review): the option that selects this list is outside this section.
_available_formats_prefer_free = [ ' 38 ' , ' 46 ' , ' 37 ' , ' 45 ' , ' 22 ' , ' 44 ' , ' 35 ' , ' 43 ' , ' 34 ' , ' 18 ' , ' 6 ' , ' 5 ' , ' 36 ' , ' 17 ' , ' 13 ' ,
2013-09-03 18:49:35 -07:00
# Apple HTTP Live Streaming
2013-08-30 17:51:50 -07:00
' 96 ' , ' 95 ' , ' 94 ' , ' 93 ' , ' 92 ' , ' 132 ' , ' 151 ' ,
# 3D
2013-08-01 10:47:48 -07:00
' 85 ' , ' 102 ' , ' 84 ' , ' 101 ' , ' 83 ' , ' 100 ' , ' 82 ' ,
2013-08-22 10:45:24 -07:00
# Dash video
' 138 ' , ' 248 ' , ' 137 ' , ' 247 ' , ' 136 ' , ' 246 ' , ' 245 ' ,
' 244 ' , ' 135 ' , ' 243 ' , ' 134 ' , ' 242 ' , ' 133 ' , ' 160 ' ,
# Dash audio
' 172 ' , ' 141 ' , ' 171 ' , ' 140 ' , ' 139 ' ,
2013-07-20 03:46:02 -07:00
]
2013-08-30 17:51:50 -07:00
# Container name -> format codes of that container, best quality first.
# Only the first group of _available_formats is covered here.
_video_formats_map = {
' flv ' : [ ' 35 ' , ' 34 ' , ' 6 ' , ' 5 ' ] ,
' 3gp ' : [ ' 36 ' , ' 17 ' , ' 13 ' ] ,
' mp4 ' : [ ' 38 ' , ' 37 ' , ' 22 ' , ' 18 ' ] ,
' webm ' : [ ' 46 ' , ' 45 ' , ' 44 ' , ' 43 ' ] ,
}
2013-06-23 10:58:33 -07:00
# Format code -> file extension.  Codes '5', '6', '34' and '35' (the flv
# group of _video_formats_map) have no entry here.
# NOTE(review): how callers handle a missing key is outside this section.
_video_extensions = {
' 13 ' : ' 3gp ' ,
2013-08-30 17:51:50 -07:00
' 17 ' : ' 3gp ' ,
2013-06-23 10:58:33 -07:00
' 18 ' : ' mp4 ' ,
' 22 ' : ' mp4 ' ,
2013-08-30 17:51:50 -07:00
' 36 ' : ' 3gp ' ,
2013-06-23 10:58:33 -07:00
' 37 ' : ' mp4 ' ,
2013-06-23 16:20:29 -07:00
' 38 ' : ' mp4 ' ,
2013-06-23 10:58:33 -07:00
' 43 ' : ' webm ' ,
' 44 ' : ' webm ' ,
' 45 ' : ' webm ' ,
' 46 ' : ' webm ' ,
2013-07-20 03:46:02 -07:00
2013-08-01 10:47:48 -07:00
# 3d videos
' 82 ' : ' mp4 ' ,
' 83 ' : ' mp4 ' ,
' 84 ' : ' mp4 ' ,
' 85 ' : ' mp4 ' ,
' 100 ' : ' webm ' ,
' 101 ' : ' webm ' ,
' 102 ' : ' webm ' ,
2013-08-19 18:22:25 -07:00
2013-09-03 18:49:35 -07:00
# Apple HTTP Live Streaming
2013-07-20 03:46:02 -07:00
' 92 ' : ' mp4 ' ,
' 93 ' : ' mp4 ' ,
' 94 ' : ' mp4 ' ,
' 95 ' : ' mp4 ' ,
' 96 ' : ' mp4 ' ,
' 132 ' : ' mp4 ' ,
' 151 ' : ' mp4 ' ,
2013-08-19 18:22:25 -07:00
# Dash mp4
' 133 ' : ' mp4 ' ,
' 134 ' : ' mp4 ' ,
' 135 ' : ' mp4 ' ,
' 136 ' : ' mp4 ' ,
' 137 ' : ' mp4 ' ,
' 138 ' : ' mp4 ' ,
' 139 ' : ' mp4 ' ,
' 140 ' : ' mp4 ' ,
' 141 ' : ' mp4 ' ,
' 160 ' : ' mp4 ' ,
# Dash webm
' 171 ' : ' webm ' ,
' 172 ' : ' webm ' ,
' 242 ' : ' webm ' ,
' 243 ' : ' webm ' ,
' 244 ' : ' webm ' ,
' 245 ' : ' webm ' ,
' 246 ' : ' webm ' ,
' 247 ' : ' webm ' ,
' 248 ' : ' webm ' ,
2013-06-23 10:58:33 -07:00
}
# Format code -> display label.  Three label shapes occur: 'AxB' pixel
# sizes, 'Np' line counts, and 'Nk' for the codes that _special_itags marks
# as DASH audio (presumably a bitrate -- TODO confirm).  '???' marks
# unknown sizes.
_video_dimensions = {
' 5 ' : ' 240x400 ' ,
' 6 ' : ' ??? ' ,
' 13 ' : ' ??? ' ,
' 17 ' : ' 144x176 ' ,
' 18 ' : ' 360x640 ' ,
' 22 ' : ' 720x1280 ' ,
' 34 ' : ' 360x640 ' ,
' 35 ' : ' 480x854 ' ,
2013-08-30 17:51:50 -07:00
' 36 ' : ' 240x320 ' ,
2013-06-23 10:58:33 -07:00
' 37 ' : ' 1080x1920 ' ,
' 38 ' : ' 3072x4096 ' ,
' 43 ' : ' 360x640 ' ,
' 44 ' : ' 480x854 ' ,
' 45 ' : ' 720x1280 ' ,
' 46 ' : ' 1080x1920 ' ,
2013-08-01 10:47:48 -07:00
' 82 ' : ' 360p ' ,
' 83 ' : ' 480p ' ,
' 84 ' : ' 720p ' ,
' 85 ' : ' 1080p ' ,
2013-07-20 03:46:02 -07:00
' 92 ' : ' 240p ' ,
' 93 ' : ' 360p ' ,
' 94 ' : ' 480p ' ,
' 95 ' : ' 720p ' ,
' 96 ' : ' 1080p ' ,
2013-08-01 10:47:48 -07:00
' 100 ' : ' 360p ' ,
' 101 ' : ' 480p ' ,
2013-08-19 18:22:25 -07:00
' 102 ' : ' 720p ' ,
2013-07-20 03:46:02 -07:00
' 132 ' : ' 240p ' ,
' 151 ' : ' 72p ' ,
2013-08-19 18:22:25 -07:00
' 133 ' : ' 240p ' ,
' 134 ' : ' 360p ' ,
' 135 ' : ' 480p ' ,
' 136 ' : ' 720p ' ,
' 137 ' : ' 1080p ' ,
' 138 ' : ' >1080p ' ,
' 139 ' : ' 48k ' ,
' 140 ' : ' 128k ' ,
' 141 ' : ' 256k ' ,
' 160 ' : ' 192p ' ,
' 171 ' : ' 128k ' ,
' 172 ' : ' 256k ' ,
' 242 ' : ' 240p ' ,
' 243 ' : ' 360p ' ,
' 244 ' : ' 480p ' ,
' 245 ' : ' 480p ' ,
' 246 ' : ' 480p ' ,
' 247 ' : ' 720p ' ,
' 248 ' : ' 1080p ' ,
2013-06-23 10:58:33 -07:00
}
2013-08-19 18:22:25 -07:00
# Format code -> category label for the codes outside the first group of
# _available_formats: '3D', 'DASH Video' or 'DASH Audio'.  The Apple HTTP
# Live Streaming codes (92-96, 132, 151) have no entry.
_special_itags = {
' 82 ' : ' 3D ' ,
' 83 ' : ' 3D ' ,
' 84 ' : ' 3D ' ,
' 85 ' : ' 3D ' ,
' 100 ' : ' 3D ' ,
' 101 ' : ' 3D ' ,
' 102 ' : ' 3D ' ,
' 133 ' : ' DASH Video ' ,
' 134 ' : ' DASH Video ' ,
' 135 ' : ' DASH Video ' ,
' 136 ' : ' DASH Video ' ,
' 137 ' : ' DASH Video ' ,
' 138 ' : ' DASH Video ' ,
' 139 ' : ' DASH Audio ' ,
' 140 ' : ' DASH Audio ' ,
' 141 ' : ' DASH Audio ' ,
' 160 ' : ' DASH Video ' ,
' 171 ' : ' DASH Audio ' ,
' 172 ' : ' DASH Audio ' ,
' 242 ' : ' DASH Video ' ,
' 243 ' : ' DASH Video ' ,
' 244 ' : ' DASH Video ' ,
' 245 ' : ' DASH Video ' ,
' 246 ' : ' DASH Video ' ,
' 247 ' : ' DASH Video ' ,
' 248 ' : ' DASH Video ' ,
2013-06-23 10:58:33 -07:00
}
2013-08-19 18:22:25 -07:00
2013-06-23 10:58:33 -07:00
# Short identifier of this extractor (used e.g. in the login error message
# built by YoutubeBaseInfoExtractor._login).
IE_NAME = u ' youtube '
2013-06-27 10:13:11 -07:00
# Test fixtures.  Each entry gives the URL to extract, the expected output
# file name, an optional note, the expected metadata ("info_dict") and
# optional downloader params.
# NOTE(review): "description" values starting with "md5:" look like a hash
# of the expected text rather than the text itself -- confirm against the
# test harness.
_TESTS = [
{
2013-06-27 10:55:39 -07:00
u " url " : u " http://www.youtube.com/watch?v=BaW_jenozKc " ,
u " file " : u " BaW_jenozKc.mp4 " ,
u " info_dict " : {
u " title " : u " youtube-dl test video \" ' / \\ ä↭𝕐 " ,
u " uploader " : u " Philipp Hagemeister " ,
u " uploader_id " : u " phihag " ,
u " upload_date " : u " 20121002 " ,
u " description " : u " test chars: \" ' / \\ ä↭𝕐 \n \n This is a test video for youtube-dl. \n \n For more information, contact phihag@phihag.de . "
2013-06-27 10:13:11 -07:00
}
2013-06-27 10:55:39 -07:00
} ,
{
u " url " : u " http://www.youtube.com/watch?v=1ltcDfZMA3U " ,
u " file " : u " 1ltcDfZMA3U.flv " ,
u " note " : u " Test VEVO video (#897) " ,
u " info_dict " : {
u " upload_date " : u " 20070518 " ,
u " title " : u " Maps - It Will Find You " ,
u " description " : u " Music video by Maps performing It Will Find You. " ,
u " uploader " : u " MuteUSA " ,
u " uploader_id " : u " MuteUSA "
2013-06-27 10:13:11 -07:00
}
2013-06-27 10:55:39 -07:00
} ,
{
u " url " : u " http://www.youtube.com/watch?v=UxxajLWwzqY " ,
u " file " : u " UxxajLWwzqY.mp4 " ,
u " note " : u " Test generic use_cipher_signature video (#897) " ,
u " info_dict " : {
u " upload_date " : u " 20120506 " ,
u " title " : u " Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO] " ,
2013-08-29 04:41:59 -07:00
u " description " : u " md5:3e2666e0a55044490499ea45fe9037b7 " ,
2013-08-21 10:25:54 -07:00
u " uploader " : u " Icona Pop " ,
2013-06-27 10:55:39 -07:00
u " uploader_id " : u " IconaPop "
2013-06-27 10:13:11 -07:00
}
2013-07-09 05:38:24 -07:00
} ,
{
u " url " : u " https://www.youtube.com/watch?v=07FYdnEawAQ " ,
u " file " : u " 07FYdnEawAQ.mp4 " ,
u " note " : u " Test VEVO video with age protection (#956) " ,
u " info_dict " : {
u " upload_date " : u " 20130703 " ,
u " title " : u " Justin Timberlake - Tunnel Vision (Explicit) " ,
u " description " : u " md5:64249768eec3bc4276236606ea996373 " ,
u " uploader " : u " justintimberlakeVEVO " ,
u " uploader_id " : u " justintimberlakeVEVO "
}
} ,
2013-07-20 03:46:02 -07:00
{
u ' url ' : u ' https://www.youtube.com/watch?v=TGi3HqYrWHE ' ,
u ' file ' : u ' TGi3HqYrWHE.mp4 ' ,
u ' note ' : u ' m3u8 video ' ,
u ' info_dict ' : {
u ' title ' : u ' Triathlon - Men - London 2012 Olympic Games ' ,
u ' description ' : u ' - Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games ' ,
u ' uploader ' : u ' olympic ' ,
u ' upload_date ' : u ' 20120807 ' ,
u ' uploader_id ' : u ' olympic ' ,
} ,
u ' params ' : {
u ' skip_download ' : True ,
} ,
} ,
2013-06-27 10:13:11 -07:00
]
2013-06-23 10:58:33 -07:00
@classmethod
def suitable(cls, url):
    """Return True if this extractor should handle ``url``.

    URLs claimed by YoutubePlaylistIE are rejected; everything else is
    accepted when it matches ``_VALID_URL`` (verbose mode).
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    if not claimed_by_playlist:
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    return False
2013-09-21 05:19:30 -07:00
def __init__(self, *args, **kwargs):
    """Initialise the extractor and start with an empty player cache.

    All arguments are forwarded unchanged to the parent initialiser.
    """
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # NOTE(review): the code that fills and reads this cache is outside this section
    self._player_cache = dict()
2013-09-21 05:19:30 -07:00
2013-06-23 10:58:33 -07:00
def report_video_webpage_download(self, video_id):
    """Tell the user that the video webpage for ``video_id`` is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage for ``video_id`` is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Tell the user that information for ``video_id`` is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Tell the user that ``format`` is not available for ``video_id``."""
    details = (video_id, format)
    self.to_screen(u'%s: Format %s not available' % details)
def report_rtmp_download(self):
    """Tell the user that the download will go over RTMP."""
    message = u'RTMP download detected'
    self.to_screen(message)
2013-09-21 15:35:03 -07:00
def _extract_signature_function ( self , video_id , player_url , slen ) :
id_m = re . match ( r ' .*-(?P<id>[a-zA-Z0-9_-]+) \ .(?P<ext>[a-z]+)$ ' ,
2013-09-21 06:19:48 -07:00
player_url )
2013-09-21 05:19:30 -07:00
player_type = id_m . group ( ' ext ' )
player_id = id_m . group ( ' id ' )
2013-09-21 15:35:03 -07:00
# Read from filesystem cache
func_id = ' %s _ %s _ %d ' % ( player_type , player_id , slen )
assert os . path . basename ( func_id ) == func_id
2013-09-22 01:30:02 -07:00
cache_dir = self . _downloader . params . get ( ' cachedir ' ,
u ' ~/.youtube-dl/cache ' )
2013-09-21 15:35:03 -07:00
2013-09-22 01:50:12 -07:00
cache_enabled = cache_dir != u ' NONE '
if cache_enabled :
2013-09-21 15:35:03 -07:00
cache_fn = os . path . join ( os . path . expanduser ( cache_dir ) ,
u ' youtube-sigfuncs ' ,
func_id + ' .json ' )
try :
2013-09-22 01:30:02 -07:00
with io . open ( cache_fn , ' r ' , encoding = ' utf-8 ' ) as cachef :
2013-09-21 15:35:03 -07:00
cache_spec = json . load ( cachef )
return lambda s : u ' ' . join ( s [ i ] for i in cache_spec )
2013-09-22 01:30:02 -07:00
except IOError :
2013-09-21 15:35:03 -07:00
pass # No cache available
2013-09-21 06:19:48 -07:00
2013-09-21 05:19:30 -07:00
if player_type == ' js ' :
code = self . _download_webpage (
player_url , video_id ,
2013-09-21 06:19:48 -07:00
note = u ' Downloading %s player %s ' % ( player_type , player_id ) ,
2013-09-21 05:19:30 -07:00
errnote = u ' Download of %s failed ' % player_url )
2013-09-21 06:19:48 -07:00
res = self . _parse_sig_js ( code )
2013-09-21 15:35:03 -07:00
elif player_type == ' swf ' :
2013-09-21 05:19:30 -07:00
urlh = self . _request_webpage (
player_url , video_id ,
2013-09-21 06:19:48 -07:00
note = u ' Downloading %s player %s ' % ( player_type , player_id ) ,
2013-09-21 05:19:30 -07:00
errnote = u ' Download of %s failed ' % player_url )
code = urlh . read ( )
2013-09-21 06:19:48 -07:00
res = self . _parse_sig_swf ( code )
2013-09-21 05:19:30 -07:00
else :
assert False , ' Invalid player type %r ' % player_type
2013-09-22 01:50:12 -07:00
if cache_enabled :
2013-09-22 01:30:02 -07:00
try :
cache_res = res ( map ( compat_chr , range ( slen ) ) )
cache_spec = [ ord ( c ) for c in cache_res ]
try :
os . makedirs ( os . path . dirname ( cache_fn ) )
except OSError as ose :
if ose . errno != errno . EEXIST :
raise
write_json_file ( cache_spec , cache_fn )
2013-09-22 01:37:23 -07:00
except Exception :
2013-09-22 01:30:02 -07:00
tb = traceback . format_exc ( )
self . _downloader . report_warning (
u ' Writing cache to %r failed: %s ' % ( cache_fn , tb ) )
2013-09-21 06:19:48 -07:00
return res
2013-09-22 01:30:02 -07:00
def _print_sig_code ( self , func , slen ) :
def gen_sig_code ( idxs ) :
def _genslice ( start , end , step ) :
starts = u ' ' if start == 0 else str ( start )
ends = u ' : %d ' % ( end + step )
steps = u ' ' if step == 1 else ( ' : %d ' % step )
return u ' s[ %s %s %s ] ' % ( starts , ends , steps )
step = None
2013-09-22 01:37:23 -07:00
start = ' (Never used) ' # Quelch pyflakes warnings - start will be
# set as soon as step is set
2013-09-22 01:30:02 -07:00
for i , prev in zip ( idxs [ 1 : ] , idxs [ : - 1 ] ) :
if step is not None :
if i - prev == step :
continue
yield _genslice ( start , prev , step )
step = None
continue
if i - prev in [ - 1 , 1 ] :
step = i - prev
start = prev
continue
else :
yield u ' s[ %d ] ' % prev
if step is None :
yield u ' s[ %d ] ' % i
else :
yield _genslice ( start , i , step )
cache_res = func ( map ( compat_chr , range ( slen ) ) )
cache_spec = [ ord ( c ) for c in cache_res ]
expr_code = u ' + ' . join ( gen_sig_code ( cache_spec ) )
code = u ' if len(s) == %d : \n return %s \n ' % ( slen , expr_code )
2013-09-22 01:50:12 -07:00
self . to_screen ( u ' Extracted signature function: \n ' + code )
2013-09-22 01:30:02 -07:00
2013-09-21 05:19:30 -07:00
def _parse_sig_js ( self , jscode ) :
funcname = self . _search_regex (
r ' signature=([a-zA-Z]+) ' , jscode ,
u ' Initial JS player signature function name ' )
functions = { }
def argidx ( varname ) :
return string . lowercase . index ( varname )
def interpret_statement ( stmt , local_vars , allow_recursion = 20 ) :
if allow_recursion < 0 :
2013-09-22 01:37:23 -07:00
raise ExtractorError ( u ' Recursion limit reached ' )
2013-09-21 05:19:30 -07:00
if stmt . startswith ( u ' var ' ) :
stmt = stmt [ len ( u ' var ' ) : ]
ass_m = re . match ( r ' ^(?P<out>[a-z]+)(?: \ [(?P<index>[^ \ ]]+) \ ])? ' +
r ' =(?P<expr>.*)$ ' , stmt )
if ass_m :
if ass_m . groupdict ( ) . get ( ' index ' ) :
def assign ( val ) :
lvar = local_vars [ ass_m . group ( ' out ' ) ]
idx = interpret_expression ( ass_m . group ( ' index ' ) ,
local_vars , allow_recursion )
assert isinstance ( idx , int )
lvar [ idx ] = val
return val
expr = ass_m . group ( ' expr ' )
else :
def assign ( val ) :
local_vars [ ass_m . group ( ' out ' ) ] = val
return val
expr = ass_m . group ( ' expr ' )
elif stmt . startswith ( u ' return ' ) :
assign = lambda v : v
expr = stmt [ len ( u ' return ' ) : ]
else :
raise ExtractorError (
u ' Cannot determine left side of statement in %r ' % stmt )
v = interpret_expression ( expr , local_vars , allow_recursion )
return assign ( v )
def interpret_expression ( expr , local_vars , allow_recursion ) :
if expr . isdigit ( ) :
return int ( expr )
if expr . isalpha ( ) :
return local_vars [ expr ]
m = re . match ( r ' ^(?P<in>[a-z]+) \ .(?P<member>.*)$ ' , expr )
if m :
member = m . group ( ' member ' )
val = local_vars [ m . group ( ' in ' ) ]
if member == ' split( " " ) ' :
return list ( val )
if member == ' join( " " ) ' :
return u ' ' . join ( val )
if member == ' length ' :
return len ( val )
if member == ' reverse() ' :
return val [ : : - 1 ]
slice_m = re . match ( r ' slice \ ((?P<idx>.*) \ ) ' , member )
if slice_m :
idx = interpret_expression (
slice_m . group ( ' idx ' ) , local_vars , allow_recursion - 1 )
return val [ idx : ]
m = re . match (
r ' ^(?P<in>[a-z]+) \ [(?P<idx>.+) \ ]$ ' , expr )
if m :
val = local_vars [ m . group ( ' in ' ) ]
idx = interpret_expression ( m . group ( ' idx ' ) , local_vars ,
allow_recursion - 1 )
return val [ idx ]
m = re . match ( r ' ^(?P<a>.+?)(?P<op>[ % ])(?P<b>.+?)$ ' , expr )
if m :
a = interpret_expression ( m . group ( ' a ' ) ,
local_vars , allow_recursion )
b = interpret_expression ( m . group ( ' b ' ) ,
local_vars , allow_recursion )
return a % b
m = re . match (
r ' ^(?P<func>[a-zA-Z]+) \ ((?P<args>[a-z0-9,]+) \ )$ ' , expr )
if m :
fname = m . group ( ' func ' )
if fname not in functions :
functions [ fname ] = extract_function ( fname )
argvals = [ int ( v ) if v . isdigit ( ) else local_vars [ v ]
for v in m . group ( ' args ' ) . split ( ' , ' ) ]
return functions [ fname ] ( argvals )
raise ExtractorError ( u ' Unsupported JS expression %r ' % expr )
def extract_function ( funcname ) :
func_m = re . search (
r ' function ' + re . escape ( funcname ) +
r ' \ ((?P<args>[a-z,]+) \ ) { (?P<code>[^}]+)} ' ,
jscode )
argnames = func_m . group ( ' args ' ) . split ( ' , ' )
def resf ( args ) :
local_vars = dict ( zip ( argnames , args ) )
for stmt in func_m . group ( ' code ' ) . split ( ' ; ' ) :
res = interpret_statement ( stmt , local_vars )
return res
return resf
initial_function = extract_function ( funcname )
return lambda s : initial_function ( [ s ] )
def _parse_sig_swf ( self , file_contents ) :
if file_contents [ 1 : 3 ] != b ' WS ' :
raise ExtractorError (
u ' Not an SWF file; header is %r ' % file_contents [ : 3 ] )
if file_contents [ : 1 ] == b ' C ' :
content = zlib . decompress ( file_contents [ 8 : ] )
else :
raise NotImplementedError ( u ' Unsupported compression format %r ' %
file_contents [ : 1 ] )
def extract_tags ( content ) :
pos = 0
while pos < len ( content ) :
header16 = struct . unpack ( ' <H ' , content [ pos : pos + 2 ] ) [ 0 ]
pos + = 2
tag_code = header16 >> 6
tag_len = header16 & 0x3f
if tag_len == 0x3f :
tag_len = struct . unpack ( ' <I ' , content [ pos : pos + 4 ] ) [ 0 ]
pos + = 4
assert pos + tag_len < = len ( content )
yield ( tag_code , content [ pos : pos + tag_len ] )
pos + = tag_len
code_tag = next ( tag
for tag_code , tag in extract_tags ( content )
if tag_code == 82 )
p = code_tag . index ( b ' \0 ' , 4 ) + 1
2013-09-21 06:32:37 -07:00
code_reader = io . BytesIO ( code_tag [ p : ] )
2013-09-21 05:19:30 -07:00
# Parse ABC (AVM2 ByteCode)
2013-09-21 06:32:37 -07:00
def read_int ( reader = None ) :
if reader is None :
reader = code_reader
2013-09-21 05:19:30 -07:00
res = 0
shift = 0
for _ in range ( 5 ) :
2013-09-21 06:32:37 -07:00
buf = reader . read ( 1 )
assert len ( buf ) == 1
b = struct . unpack ( ' <B ' , buf ) [ 0 ]
2013-09-21 05:19:30 -07:00
res = res | ( ( b & 0x7f ) << shift )
if b & 0x80 == 0 :
break
shift + = 7
2013-09-21 06:32:37 -07:00
return res
def u30 ( reader = None ) :
res = read_int ( reader )
assert res & 0xf0000000 == 0
2013-09-21 05:19:30 -07:00
return res
u32 = read_int
2013-09-21 06:32:37 -07:00
def s32 ( reader = None ) :
v = read_int ( reader )
2013-09-21 05:19:30 -07:00
if v & 0x80000000 != 0 :
v = - ( ( v ^ 0xffffffff ) + 1 )
2013-09-21 06:32:37 -07:00
return v
2013-09-22 01:37:23 -07:00
def read_string ( reader = None ) :
2013-09-21 06:32:37 -07:00
if reader is None :
reader = code_reader
slen = u30 ( reader )
resb = reader . read ( slen )
assert len ( resb ) == slen
return resb . decode ( ' utf-8 ' )
def read_bytes ( count , reader = None ) :
if reader is None :
reader = code_reader
resb = reader . read ( count )
assert len ( resb ) == count
return resb
def read_byte ( reader = None ) :
resb = read_bytes ( 1 , reader = reader )
res = struct . unpack ( ' <B ' , resb ) [ 0 ]
return res
2013-09-21 05:19:30 -07:00
# minor_version + major_version
2013-09-22 01:37:23 -07:00
read_bytes ( 2 + 2 )
2013-09-21 05:19:30 -07:00
# Constant pool
2013-09-21 06:32:37 -07:00
int_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c in range ( 1 , int_count ) :
2013-09-22 01:37:23 -07:00
s32 ( )
2013-09-21 06:32:37 -07:00
uint_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c in range ( 1 , uint_count ) :
2013-09-22 01:37:23 -07:00
u32 ( )
2013-09-21 06:32:37 -07:00
double_count = u30 ( )
2013-09-22 01:37:23 -07:00
read_bytes ( ( double_count - 1 ) * 8 )
2013-09-21 06:32:37 -07:00
string_count = u30 ( )
2013-09-21 05:19:30 -07:00
constant_strings = [ u ' ' ]
for _c in range ( 1 , string_count ) :
2013-09-22 01:37:23 -07:00
s = read_string ( )
2013-09-21 05:19:30 -07:00
constant_strings . append ( s )
2013-09-21 06:32:37 -07:00
namespace_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c in range ( 1 , namespace_count ) :
2013-09-22 01:37:23 -07:00
read_bytes ( 1 ) # kind
u30 ( ) # name
2013-09-21 06:32:37 -07:00
ns_set_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c in range ( 1 , ns_set_count ) :
2013-09-21 06:32:37 -07:00
count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c2 in range ( count ) :
2013-09-22 01:37:23 -07:00
u30 ( )
2013-09-21 06:32:37 -07:00
multiname_count = u30 ( )
2013-09-21 05:19:30 -07:00
MULTINAME_SIZES = {
0x07 : 2 , # QName
0x0d : 2 , # QNameA
0x0f : 1 , # RTQName
0x10 : 1 , # RTQNameA
0x11 : 0 , # RTQNameL
0x12 : 0 , # RTQNameLA
0x09 : 2 , # Multiname
0x0e : 2 , # MultinameA
0x1b : 1 , # MultinameL
0x1c : 1 , # MultinameLA
}
multinames = [ u ' ' ]
for _c in range ( 1 , multiname_count ) :
2013-09-21 06:32:37 -07:00
kind = u30 ( )
2013-09-21 05:19:30 -07:00
assert kind in MULTINAME_SIZES , u ' Invalid multiname kind %r ' % kind
if kind == 0x07 :
2013-09-22 01:37:23 -07:00
u30 ( ) # namespace_idx
2013-09-21 06:32:37 -07:00
name_idx = u30 ( )
2013-09-21 05:19:30 -07:00
multinames . append ( constant_strings [ name_idx ] )
else :
multinames . append ( ' [MULTINAME kind: %d ] ' % kind )
for _c2 in range ( MULTINAME_SIZES [ kind ] ) :
2013-09-22 01:37:23 -07:00
u30 ( )
2013-09-21 05:19:30 -07:00
# Methods
2013-09-21 06:32:37 -07:00
method_count = u30 ( )
2013-09-21 05:19:30 -07:00
MethodInfo = collections . namedtuple (
' MethodInfo ' ,
[ ' NEED_ARGUMENTS ' , ' NEED_REST ' ] )
method_infos = [ ]
for method_id in range ( method_count ) :
2013-09-21 06:32:37 -07:00
param_count = u30 ( )
2013-09-22 01:37:23 -07:00
u30 ( ) # return type
2013-09-21 05:19:30 -07:00
for _ in range ( param_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # param type
u30 ( ) # name index (always 0 for youtube)
2013-09-21 06:32:37 -07:00
flags = read_byte ( )
2013-09-21 05:19:30 -07:00
if flags & 0x08 != 0 :
# Options present
2013-09-21 06:32:37 -07:00
option_count = u30 ( )
2013-09-21 05:19:30 -07:00
for c in range ( option_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # val
read_bytes ( 1 ) # kind
2013-09-21 05:19:30 -07:00
if flags & 0x80 != 0 :
# Param names present
for _ in range ( param_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # param name
2013-09-21 05:19:30 -07:00
mi = MethodInfo ( flags & 0x01 != 0 , flags & 0x04 != 0 )
method_infos . append ( mi )
# Metadata
2013-09-21 06:32:37 -07:00
metadata_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c in range ( metadata_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # name
2013-09-21 06:32:37 -07:00
item_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c2 in range ( item_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # key
u30 ( ) # value
2013-09-21 06:32:37 -07:00
def parse_traits_info ( ) :
trait_name_idx = u30 ( )
kind_full = read_byte ( )
2013-09-21 05:19:30 -07:00
kind = kind_full & 0x0f
attrs = kind_full >> 4
methods = { }
if kind in [ 0x00 , 0x06 ] : # Slot or Const
2013-09-22 01:37:23 -07:00
u30 ( ) # Slot id
u30 ( ) # type_name_idx
2013-09-21 06:32:37 -07:00
vindex = u30 ( )
2013-09-21 05:19:30 -07:00
if vindex != 0 :
2013-09-22 01:37:23 -07:00
read_byte ( ) # vkind
2013-09-21 05:19:30 -07:00
elif kind in [ 0x01 , 0x02 , 0x03 ] : # Method / Getter / Setter
2013-09-22 01:37:23 -07:00
u30 ( ) # disp_id
2013-09-21 06:32:37 -07:00
method_idx = u30 ( )
2013-09-21 05:19:30 -07:00
methods [ multinames [ trait_name_idx ] ] = method_idx
elif kind == 0x04 : # Class
2013-09-22 01:37:23 -07:00
u30 ( ) # slot_id
u30 ( ) # classi
2013-09-21 05:19:30 -07:00
elif kind == 0x05 : # Function
2013-09-22 01:37:23 -07:00
u30 ( ) # slot_id
2013-09-21 06:32:37 -07:00
function_idx = u30 ( )
2013-09-21 05:19:30 -07:00
methods [ function_idx ] = multinames [ trait_name_idx ]
else :
raise ExtractorError ( u ' Unsupported trait kind %d ' % kind )
if attrs & 0x4 != 0 : # Metadata present
2013-09-21 06:32:37 -07:00
metadata_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c3 in range ( metadata_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # metadata index
2013-09-21 05:19:30 -07:00
2013-09-21 06:32:37 -07:00
return methods
2013-09-21 05:19:30 -07:00
# Classes
TARGET_CLASSNAME = u ' SignatureDecipher '
searched_idx = multinames . index ( TARGET_CLASSNAME )
searched_class_id = None
2013-09-21 06:32:37 -07:00
class_count = u30 ( )
2013-09-21 05:19:30 -07:00
for class_id in range ( class_count ) :
2013-09-21 06:32:37 -07:00
name_idx = u30 ( )
2013-09-21 05:19:30 -07:00
if name_idx == searched_idx :
# We found the class we're looking for!
searched_class_id = class_id
2013-09-22 01:37:23 -07:00
u30 ( ) # super_name idx
2013-09-21 06:32:37 -07:00
flags = read_byte ( )
2013-09-21 05:19:30 -07:00
if flags & 0x08 != 0 : # Protected namespace is present
2013-09-22 01:37:23 -07:00
u30 ( ) # protected_ns_idx
2013-09-21 06:32:37 -07:00
intrf_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c2 in range ( intrf_count ) :
2013-09-22 01:37:23 -07:00
u30 ( )
u30 ( ) # iinit
2013-09-21 06:32:37 -07:00
trait_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c2 in range ( trait_count ) :
2013-09-22 01:37:23 -07:00
parse_traits_info ( )
2013-09-21 05:19:30 -07:00
if searched_class_id is None :
raise ExtractorError ( u ' Target class %r not found ' %
TARGET_CLASSNAME )
method_names = { }
method_idxs = { }
for class_id in range ( class_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # cinit
2013-09-21 06:32:37 -07:00
trait_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c2 in range ( trait_count ) :
2013-09-21 06:32:37 -07:00
trait_methods = parse_traits_info ( )
2013-09-21 05:19:30 -07:00
if class_id == searched_class_id :
method_names . update ( trait_methods . items ( ) )
method_idxs . update ( dict (
( idx , name )
for name , idx in trait_methods . items ( ) ) )
# Scripts
2013-09-21 06:32:37 -07:00
script_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c in range ( script_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # init
2013-09-21 06:32:37 -07:00
trait_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c2 in range ( trait_count ) :
2013-09-22 01:37:23 -07:00
parse_traits_info ( )
2013-09-21 05:19:30 -07:00
# Method bodies
2013-09-21 06:32:37 -07:00
method_body_count = u30 ( )
2013-09-21 05:19:30 -07:00
Method = collections . namedtuple ( ' Method ' , [ ' code ' , ' local_count ' ] )
methods = { }
for _c in range ( method_body_count ) :
2013-09-21 06:32:37 -07:00
method_idx = u30 ( )
2013-09-22 01:37:23 -07:00
u30 ( ) # max_stack
2013-09-21 06:32:37 -07:00
local_count = u30 ( )
2013-09-22 01:37:23 -07:00
u30 ( ) # init_scope_depth
u30 ( ) # max_scope_depth
2013-09-21 06:32:37 -07:00
code_length = u30 ( )
code = read_bytes ( code_length )
2013-09-21 05:19:30 -07:00
if method_idx in method_idxs :
2013-09-21 06:32:37 -07:00
m = Method ( code , local_count )
2013-09-21 05:19:30 -07:00
methods [ method_idxs [ method_idx ] ] = m
2013-09-21 06:32:37 -07:00
exception_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c2 in range ( exception_count ) :
2013-09-22 01:37:23 -07:00
u30 ( ) # from
u30 ( ) # to
u30 ( ) # target
u30 ( ) # exc_type
u30 ( ) # var_name
2013-09-21 06:32:37 -07:00
trait_count = u30 ( )
2013-09-21 05:19:30 -07:00
for _c2 in range ( trait_count ) :
2013-09-22 01:37:23 -07:00
parse_traits_info ( )
2013-09-21 05:19:30 -07:00
2013-09-21 06:32:37 -07:00
assert p + code_reader . tell ( ) == len ( code_tag )
2013-09-21 05:19:30 -07:00
assert len ( methods ) == len ( method_idxs )
method_pyfunctions = { }
def extract_function ( func_name ) :
if func_name in method_pyfunctions :
return method_pyfunctions [ func_name ]
if func_name not in methods :
raise ExtractorError ( u ' Cannot find function %r ' % func_name )
m = methods [ func_name ]
def resfunc ( args ) :
registers = [ ' (this) ' ] + list ( args ) + [ None ] * m . local_count
stack = [ ]
coder = io . BytesIO ( m . code )
while True :
opcode = struct . unpack ( ' !B ' , coder . read ( 1 ) ) [ 0 ]
2013-09-21 05:48:12 -07:00
if opcode == 36 : # pushbyte
2013-09-21 05:19:30 -07:00
v = struct . unpack ( ' !B ' , coder . read ( 1 ) ) [ 0 ]
stack . append ( v )
elif opcode == 44 : # pushstring
idx = u30 ( coder )
stack . append ( constant_strings [ idx ] )
elif opcode == 48 : # pushscope
# We don't implement the scope register, so we'll just
# ignore the popped value
stack . pop ( )
elif opcode == 70 : # callproperty
index = u30 ( coder )
mname = multinames [ index ]
arg_count = u30 ( coder )
args = list ( reversed (
[ stack . pop ( ) for _ in range ( arg_count ) ] ) )
obj = stack . pop ( )
if mname == u ' split ' :
assert len ( args ) == 1
assert isinstance ( args [ 0 ] , compat_str )
assert isinstance ( obj , compat_str )
if args [ 0 ] == u ' ' :
res = list ( obj )
else :
res = obj . split ( args [ 0 ] )
stack . append ( res )
2013-09-21 05:48:12 -07:00
elif mname == u ' slice ' :
assert len ( args ) == 1
assert isinstance ( args [ 0 ] , int )
assert isinstance ( obj , list )
res = obj [ args [ 0 ] : ]
stack . append ( res )
elif mname == u ' join ' :
assert len ( args ) == 1
assert isinstance ( args [ 0 ] , compat_str )
assert isinstance ( obj , list )
res = args [ 0 ] . join ( obj )
stack . append ( res )
2013-09-21 05:19:30 -07:00
elif mname in method_pyfunctions :
stack . append ( method_pyfunctions [ mname ] ( args ) )
else :
raise NotImplementedError (
u ' Unsupported property %r on %r '
% ( mname , obj ) )
2013-09-21 05:48:12 -07:00
elif opcode == 72 : # returnvalue
res = stack . pop ( )
return res
elif opcode == 79 : # callpropvoid
index = u30 ( coder )
mname = multinames [ index ]
arg_count = u30 ( coder )
args = list ( reversed (
[ stack . pop ( ) for _ in range ( arg_count ) ] ) )
obj = stack . pop ( )
if mname == u ' reverse ' :
assert isinstance ( obj , list )
obj . reverse ( )
else :
raise NotImplementedError (
u ' Unsupported (void) property %r on %r '
% ( mname , obj ) )
2013-09-21 05:19:30 -07:00
elif opcode == 93 : # findpropstrict
index = u30 ( coder )
mname = multinames [ index ]
res = extract_function ( mname )
stack . append ( res )
elif opcode == 97 : # setproperty
index = u30 ( coder )
value = stack . pop ( )
idx = stack . pop ( )
obj = stack . pop ( )
assert isinstance ( obj , list )
assert isinstance ( idx , int )
obj [ idx ] = value
elif opcode == 98 : # getlocal
index = u30 ( coder )
stack . append ( registers [ index ] )
elif opcode == 99 : # setlocal
index = u30 ( coder )
value = stack . pop ( )
registers [ index ] = value
elif opcode == 102 : # getproperty
index = u30 ( coder )
pname = multinames [ index ]
if pname == u ' length ' :
obj = stack . pop ( )
assert isinstance ( obj , list )
stack . append ( len ( obj ) )
else : # Assume attribute access
idx = stack . pop ( )
assert isinstance ( idx , int )
obj = stack . pop ( )
assert isinstance ( obj , list )
stack . append ( obj [ idx ] )
elif opcode == 128 : # coerce
2013-09-22 01:37:23 -07:00
u30 ( coder )
2013-09-21 05:19:30 -07:00
elif opcode == 133 : # coerce_s
assert isinstance ( stack [ - 1 ] , ( type ( None ) , compat_str ) )
elif opcode == 164 : # modulo
value2 = stack . pop ( )
value1 = stack . pop ( )
res = value1 % value2
stack . append ( res )
2013-09-21 05:48:12 -07:00
elif opcode == 208 : # getlocal_0
stack . append ( registers [ 0 ] )
elif opcode == 209 : # getlocal_1
stack . append ( registers [ 1 ] )
elif opcode == 210 : # getlocal_2
stack . append ( registers [ 2 ] )
elif opcode == 211 : # getlocal_3
stack . append ( registers [ 3 ] )
2013-09-21 05:19:30 -07:00
elif opcode == 214 : # setlocal_2
registers [ 2 ] = stack . pop ( )
elif opcode == 215 : # setlocal_3
registers [ 3 ] = stack . pop ( )
else :
raise NotImplementedError (
u ' Unsupported opcode %d ' % opcode )
method_pyfunctions [ func_name ] = resfunc
return resfunc
initial_function = extract_function ( u ' decipher ' )
return lambda s : initial_function ( [ s ] )
2013-09-21 06:19:48 -07:00
def _decrypt_signature ( self , s , video_id , player_url , age_gate = False ) :
2013-06-27 13:20:50 -07:00
""" Turn the encrypted s field into a working signature """
2013-06-26 16:51:10 -07:00
2013-09-21 06:19:48 -07:00
if player_url is not None :
2013-09-21 05:19:30 -07:00
try :
2013-09-21 06:19:48 -07:00
if player_url not in self . _player_cache :
func = self . _extract_signature_function (
2013-09-21 15:35:03 -07:00
video_id , player_url , len ( s )
2013-09-21 05:19:30 -07:00
)
2013-09-21 06:19:48 -07:00
self . _player_cache [ player_url ] = func
2013-09-22 01:30:02 -07:00
func = self . _player_cache [ player_url ]
if self . _downloader . params . get ( ' youtube_print_sig_code ' ) :
self . _print_sig_code ( func , len ( s ) )
return func ( s )
2013-09-22 01:37:23 -07:00
except Exception :
2013-09-21 05:19:30 -07:00
tb = traceback . format_exc ( )
2013-09-21 06:19:48 -07:00
self . _downloader . report_warning (
u ' Automatic signature extraction failed: ' + tb )
2013-09-21 05:19:30 -07:00
2013-09-22 03:18:10 -07:00
self . _downloader . report_warning (
u ' Warning: Falling back to static signature algorithm ' )
2013-09-21 06:34:29 -07:00
return self . _static_decrypt_signature (
s , video_id , player_url , age_gate )
2013-09-21 05:19:30 -07:00
2013-09-21 06:34:29 -07:00
def _static_decrypt_signature ( self , s , video_id , player_url , age_gate ) :
2013-09-21 05:19:30 -07:00
if age_gate :
# The videos with age protection use another player, so the
# algorithms can be different.
if len ( s ) == 86 :
return s [ 2 : 63 ] + s [ 82 ] + s [ 64 : 82 ] + s [ 63 ]
2013-09-22 01:31:25 -07:00
if len ( s ) == 93 :
return s [ 86 : 29 : - 1 ] + s [ 88 ] + s [ 28 : 5 : - 1 ]
elif len ( s ) == 92 :
2013-07-18 03:24:16 -07:00
return s [ 25 ] + s [ 3 : 25 ] + s [ 0 ] + s [ 26 : 42 ] + s [ 79 ] + s [ 43 : 79 ] + s [ 91 ] + s [ 80 : 83 ]
2013-09-22 01:31:25 -07:00
elif len ( s ) == 91 :
return s [ 84 : 27 : - 1 ] + s [ 86 ] + s [ 26 : 5 : - 1 ]
2013-07-18 03:24:16 -07:00
elif len ( s ) == 90 :
return s [ 25 ] + s [ 3 : 25 ] + s [ 2 ] + s [ 26 : 40 ] + s [ 77 ] + s [ 41 : 77 ] + s [ 89 ] + s [ 78 : 81 ]
2013-08-08 21:48:12 -07:00
elif len ( s ) == 89 :
return s [ 84 : 78 : - 1 ] + s [ 87 ] + s [ 77 : 60 : - 1 ] + s [ 0 ] + s [ 59 : 3 : - 1 ]
2013-07-18 03:24:16 -07:00
elif len ( s ) == 88 :
2013-08-28 01:26:44 -07:00
return s [ 7 : 28 ] + s [ 87 ] + s [ 29 : 45 ] + s [ 55 ] + s [ 46 : 55 ] + s [ 2 ] + s [ 56 : 87 ] + s [ 28 ]
2013-07-17 01:58:07 -07:00
elif len ( s ) == 87 :
2013-08-15 13:00:20 -07:00
return s [ 6 : 27 ] + s [ 4 ] + s [ 28 : 39 ] + s [ 27 ] + s [ 40 : 59 ] + s [ 2 ] + s [ 60 : ]
2013-07-17 01:58:07 -07:00
elif len ( s ) == 86 :
2013-09-05 17:38:03 -07:00
return s [ 5 : 34 ] + s [ 0 ] + s [ 35 : 38 ] + s [ 3 ] + s [ 39 : 45 ] + s [ 38 ] + s [ 46 : 53 ] + s [ 73 ] + s [ 54 : 73 ] + s [ 85 ] + s [ 74 : 85 ] + s [ 53 ]
2013-07-17 01:58:07 -07:00
elif len ( s ) == 85 :
2013-09-17 07:59:12 -07:00
return s [ 3 : 11 ] + s [ 0 ] + s [ 12 : 55 ] + s [ 84 ] + s [ 56 : 84 ]
2013-07-17 01:58:07 -07:00
elif len ( s ) == 84 :
2013-08-29 13:44:29 -07:00
return s [ 81 : 36 : - 1 ] + s [ 0 ] + s [ 35 : 2 : - 1 ]
2013-07-17 01:58:07 -07:00
elif len ( s ) == 83 :
2013-08-13 17:57:35 -07:00
return s [ 81 : 64 : - 1 ] + s [ 82 ] + s [ 63 : 52 : - 1 ] + s [ 45 ] + s [ 51 : 45 : - 1 ] + s [ 1 ] + s [ 44 : 1 : - 1 ] + s [ 0 ]
2013-07-17 01:58:07 -07:00
elif len ( s ) == 82 :
2013-09-12 13:04:09 -07:00
return s [ 80 : 73 : - 1 ] + s [ 81 ] + s [ 72 : 54 : - 1 ] + s [ 2 ] + s [ 53 : 43 : - 1 ] + s [ 0 ] + s [ 42 : 2 : - 1 ] + s [ 43 ] + s [ 1 ] + s [ 54 ]
2013-07-17 01:58:07 -07:00
elif len ( s ) == 81 :
2013-07-25 13:06:53 -07:00
return s [ 56 ] + s [ 79 : 56 : - 1 ] + s [ 41 ] + s [ 55 : 41 : - 1 ] + s [ 80 ] + s [ 40 : 34 : - 1 ] + s [ 0 ] + s [ 33 : 29 : - 1 ] + s [ 34 ] + s [ 28 : 9 : - 1 ] + s [ 29 ] + s [ 8 : 0 : - 1 ] + s [ 9 ]
2013-08-23 02:33:56 -07:00
elif len ( s ) == 80 :
return s [ 1 : 19 ] + s [ 0 ] + s [ 20 : 68 ] + s [ 19 ] + s [ 69 : 80 ]
2013-07-25 13:50:24 -07:00
elif len ( s ) == 79 :
return s [ 54 ] + s [ 77 : 54 : - 1 ] + s [ 39 ] + s [ 53 : 39 : - 1 ] + s [ 78 ] + s [ 38 : 34 : - 1 ] + s [ 0 ] + s [ 33 : 29 : - 1 ] + s [ 34 ] + s [ 28 : 9 : - 1 ] + s [ 29 ] + s [ 8 : 0 : - 1 ] + s [ 9 ]
2013-07-17 01:58:07 -07:00
else :
raise ExtractorError ( u ' Unable to decrypt signature, key length %d not supported; retrying might work ' % ( len ( s ) ) )
2013-06-23 10:58:33 -07:00
2013-09-11 06:48:23 -07:00
def _get_available_subtitles(self, video_id):
    """Return a dict mapping language code to subtitle download URL.

    The language list is fetched from the timedtext listing service.  A
    warning is reported and an empty dict returned when the list cannot be
    downloaded or contains no languages.
    """
    list_url = 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        listing = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    urls_by_lang = {}
    # Each match is (track name, language code); only the code is needed
    for _track_name, lang in re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', listing):
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat'),
        })
        urls_by_lang[lang] = u'http://www.youtube.com/api/timedtext?' + query

    if urls_by_lang:
        return urls_by_lang
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
2013-09-11 10:02:01 -07:00
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping language code to automatic caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    video_id -- id of the video, used for messages and the download
    webpage  -- already downloaded watch page holding ytplayer.config

    Returns an empty dict (after reporting a warning) when the player
    config, the caption URL or an automatic ('asr') track cannot be found.
    """
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    player_config = json.loads(mobj.group(1))
    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        list_page = self._download_webpage(list_url, video_id)
        caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
        original_lang_node = caption_list.find('track')
        # find() returns None when the list has no <track> element at all;
        # treat that like a list without an automatic track instead of
        # crashing with an AttributeError on .attrib
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
2013-06-23 10:58:33 -07:00
def _print_formats ( self , formats ) :
print ( ' Available formats: ' )
for x in formats :
2013-08-02 03:21:28 -07:00
print ( ' %s \t : \t %s \t [ %s ] %s ' % ( x , self . _video_extensions . get ( x , ' flv ' ) ,
self . _video_dimensions . get ( x , ' ??? ' ) ,
2013-08-19 18:22:25 -07:00
' ( ' + self . _special_itags [ x ] + ' ) ' if x in self . _special_itags else ' ' ) )
2013-06-23 10:58:33 -07:00
def _extract_id ( self , url ) :
mobj = re . match ( self . _VALID_URL , url , re . VERBOSE )
if mobj is None :
raise ExtractorError ( u ' Invalid URL: %s ' % url )
video_id = mobj . group ( 2 )
return video_id
2013-07-20 03:46:02 -07:00
def _get_video_url_list ( self , url_map ) :
"""
Transform a dictionary in the format { itag : url } to a list of ( itag , url )
with the requested formats .
"""
req_format = self . _downloader . params . get ( ' format ' , None )
format_limit = self . _downloader . params . get ( ' format_limit ' , None )
available_formats = self . _available_formats_prefer_free if self . _downloader . params . get ( ' prefer_free_formats ' , False ) else self . _available_formats
if format_limit is not None and format_limit in available_formats :
format_list = available_formats [ available_formats . index ( format_limit ) : ]
else :
format_list = available_formats
existing_formats = [ x for x in format_list if x in url_map ]
if len ( existing_formats ) == 0 :
raise ExtractorError ( u ' no known formats available for video ' )
if self . _downloader . params . get ( ' listformats ' , None ) :
self . _print_formats ( existing_formats )
return
if req_format is None or req_format == ' best ' :
video_url_list = [ ( existing_formats [ 0 ] , url_map [ existing_formats [ 0 ] ] ) ] # Best quality
elif req_format == ' worst ' :
video_url_list = [ ( existing_formats [ - 1 ] , url_map [ existing_formats [ - 1 ] ] ) ] # worst quality
elif req_format in ( ' -1 ' , ' all ' ) :
video_url_list = [ ( f , url_map [ f ] ) for f in existing_formats ] # All formats
else :
# Specific formats. We pick the first in a slash-delimeted sequence.
2013-08-30 17:51:50 -07:00
# Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
# available in the specified format. For example,
# if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
# if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
# if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
2013-07-20 03:46:02 -07:00
req_formats = req_format . split ( ' / ' )
video_url_list = None
for rf in req_formats :
if rf in url_map :
video_url_list = [ ( rf , url_map [ rf ] ) ]
break
2013-08-30 17:51:50 -07:00
if rf in self . _video_formats_map :
for srf in self . _video_formats_map [ rf ] :
if srf in url_map :
video_url_list = [ ( srf , url_map [ srf ] ) ]
break
else :
continue
break
2013-07-20 03:46:02 -07:00
if video_url_list is None :
raise ExtractorError ( u ' requested format not available ' )
return video_url_list
def _extract_from_m3u8 ( self , manifest_url , video_id ) :
url_map = { }
def _get_urls ( _manifest ) :
lines = _manifest . split ( ' \n ' )
urls = filter ( lambda l : l and not l . startswith ( ' # ' ) ,
lines )
return urls
manifest = self . _download_webpage ( manifest_url , video_id , u ' Downloading formats manifest ' )
formats_urls = _get_urls ( manifest )
for format_url in formats_urls :
2013-09-08 09:49:10 -07:00
itag = self . _search_regex ( r ' itag/( \ d+?)/ ' , format_url , ' itag ' )
2013-07-20 03:46:02 -07:00
url_map [ itag ] = format_url
return url_map
2013-06-23 10:58:33 -07:00
def _real_extract(self, url):
    """Extract the information of a single YouTube video.

    url -- any URL accepted by self._VALID_URL, possibly wrapped in a
           redirection URL matched by self._NEXT_URL_RE

    Returns a list with one info dictionary per selected format, or None
    when only a listing was requested ('listsubtitles' / 'listformats').

    Raises ExtractorError when the pages cannot be downloaded, mandatory
    fields are missing, or no usable format information is found.
    """
    if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
        self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        # The URL is embedded with backslash-escaped characters; unescape it
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = 'https://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                    note=False,
                                                    errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different 'el' variants until one answers with a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
        else:
            raise ExtractorError(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        raise ExtractorError(u'Unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        # don't panic if we can't find it
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else:
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Decide which formats to download
    # Best effort: the stream maps embedded in the watch page are merged into
    # video_info where they apply.  Every "not usable" condition is signalled
    # with ValueError so that we silently keep the get_video_info data.
    try:
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find vevo ID')
        info = json.loads(mobj.group(1))
        # A config without 'args' or without a stream map used to escape as
        # an uncaught KeyError; treat it like any other unusable config.
        if 'args' not in info:
            raise ValueError('No args present')  # caught below
        args = info['args']
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError('No stream_map present')  # caught below
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
        elif 'adaptive_fmts' in video_info:
            if 'url_encoded_fmt_stream_map' in video_info:
                video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
            else:
                video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
    except ValueError:
        pass

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    # Plain signature, usable as is
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature, has to be deciphered first
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        # Without age gate the JS player is the one in use
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        video_url_list = self._get_video_url_list(url_map)
        if not video_url_list:
            return
    else:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                             self._video_dimensions.get(format_param, '???'),
                                             ' (' + self._special_itags[format_param] + ')' if format_param in self._special_itags else '')

        results.append({
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
        })
    return results
class YoutubePlaylistIE(InfoExtractor):
    """Information extractor for YouTube playlists (gdata playlist feed)."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Page through the playlist feed and return a one-element list
        holding the playlist result with the videos in playlist order."""
        # Extract playlist id
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Download playlist videos from API, one page at a time
        positioned_urls = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')

            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                position = entry['yt$position']['$t']
                # Entries lacking a video id are left out
                if 'media$group' not in entry:
                    continue
                group = entry['media$group']
                if 'yt$videoid' not in group:
                    continue
                positioned_urls.append((
                    position,
                    'https://www.youtube.com/watch?v=' + group['yt$videoid']['$t']
                ))

        # Sorting the (position, url) pairs restores the playlist order
        positioned_urls.sort()
        url_results = [self.url_result(video_url, 'Youtube')
                       for _position, video_url in positioned_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information extractor for the videos of a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order of first
        appearance and without repetitions."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id in found:
                continue
            found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect the video ids of all channel pages and return a
        one-element list holding the resulting playlist."""
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page and extract video identifiers
        pagenum = 1
        first_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(first_url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        # NOTE(review): paging restarts at 1 although page 1 was fetched
        # above; ids are not deduplicated across pages - confirm intended.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                more_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw_page = self._download_webpage(more_url, channel_id,
                                                  u'Downloading page #%s' % pagenum)
                ajax_data = json.loads(raw_page)
                video_ids.extend(
                    self.extract_videos_from_page(ajax_data['content_html']))
                if self._MORE_PAGES_INDICATOR not in ajax_data['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        video_urls = ['http://www.youtube.com/watch?v=%s' % video_id
                      for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'Youtube')
                       for video_url in video_urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a YouTube user through the GData uploads feed."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Return True only if no other ``*IE`` class in this module claims *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match those too.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a one-element list with a playlist of the user's uploads.

        The username is group 1 of ``_VALID_URL``.
        Raises ExtractorError if *url* does not match or if the API
        response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        for pagenum in itertools.count(0):
            # The API's start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index requested on this page (inclusive)
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers: the id is the last path component
            # of each entry's id URL
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
2013-06-23 11:28:15 -07:00
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed (50 results per page)."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
        self._downloader.to_screen(message)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages of 50 results are requested until either *n* ids have been
        covered or the total reported by the API is exhausted.  Returns a
        playlist result titled with *query*.  Raises ExtractorError when a
        page cannot be downloaded or carries no 'items'.
        """
        collected = []
        wanted = n
        page_index = 0

        while 50 * page_index < wanted:
            self.report_download_page(query, page_index + 1)
            first_result = 50 * page_index + 1
            api_url = self._API_URL % (compat_urllib_parse.quote_plus(query), first_result)
            request = compat_urllib_request.Request(api_url)
            try:
                raw = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))

            payload = json.loads(raw)['data']
            if 'items' not in payload:
                raise ExtractorError(u'[youtube] No video results')

            collected.extend([item['id'] for item in payload['items']])
            # Never ask for more than the API says exists
            wanted = min(n, payload['totalItems'])
            page_index += 1

        # The last page may have pushed us past the requested amount
        entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                   for video_id in collected[:n]]
        return self.playlist_result(entries, query)
2013-07-01 08:59:28 -07:00
class YoutubeShowIE(InfoExtractor):
    """Resolve a show page into one playlist URL result per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of 'YoutubePlaylist' URL results found on the show page."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))

        results = []
        for path in season_paths:
            results.append(self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
2013-07-07 04:58:23 -07:00
2013-07-24 11:40:12 -07:00
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for the feed; the remaining '%s' takes the paging offset."""
        action = ('action_load_personal_feed' if self._PERSONAL_FEED
                  else 'action_load_system_feed')
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before extraction."""
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return its videos as a playlist result."""
        feed_entries = []
        page_index = 0
        while True:
            # Each request advances the paging offset by _PAGING_STEP
            offset = page_index * self._PAGING_STEP
            raw = self._download_webpage(self._FEED_TEMPLATE % offset,
                                         u'%s feed' % self._FEED_NAME,
                                         u'Downloading page %s' % page_index)
            info = json.loads(raw)
            found = re.findall(r'"/watch\?v=(.*?)["&]', info['feed_html'])
            for video_id in orderedSet(found):
                feed_entries.append(self.url_result(video_id, 'Youtube'))
            # A null 'paging' value marks the last page
            if info['paging'] is None:
                break
            page_index += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
2013-07-24 11:45:19 -07:00
2013-07-24 13:13:39 -07:00
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # This feed is loaded through the personal-feed action, 100 entries per page
    _PERSONAL_FEED = True
    _PAGING_STEP = 100
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
2013-07-24 11:45:19 -07:00
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Redirect to the playlist that backs the logged-in user's favourites."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a 'YoutubePlaylist' URL result for the favourites playlist id."""
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(favourites_url, 'Youtube Favourites videos')
        # The playlist id is the first 'list=' value found in the page
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')