2015-10-18 02:07:48 -07:00
# coding: utf-8
2015-06-27 12:22:25 -07:00
from __future__ import unicode_literals
import re
from . common import InfoExtractor
from . . utils import (
float_or_none ,
2015-10-18 02:11:55 -07:00
xpath_text ,
2015-10-18 03:04:13 -07:00
remove_end ,
2015-11-11 10:13:42 -08:00
int_or_none ,
ExtractorError ,
2015-11-21 08:18:17 -08:00
sanitized_Request ,
2015-06-27 12:22:25 -07:00
)
2016-02-21 00:41:24 -08:00
class TwitterBaseIE(InfoExtractor):
    """Base class holding helpers shared by the Twitter extractors."""

    def _get_vmap_video_url(self, vmap_url, video_id):
        """Return the media URL referenced by a VMAP document.

        Downloads the XML found at ``vmap_url`` (``video_id`` is passed
        through to the download helper) and returns the whitespace-stripped
        text found under ``.//MediaFile``.

        Raises ExtractorError when that lookup yields no usable text,
        instead of failing with an AttributeError on ``None.strip()``.
        """
        vmap_data = self._download_xml(vmap_url, video_id)
        media_url = xpath_text(vmap_data, './/MediaFile')
        # The lookup may come back empty; fail with a meaningful message.
        media_url = media_url.strip() if media_url else None
        if not media_url:
            raise ExtractorError('Unable to find MediaFile in VMAP document')
        return media_url
class TwitterCardIE(TwitterBaseIE):
    """Extractor for Twitter card and web player pages.

    Handles ``/i/cards/tfw/v1/<id>`` and ``/i/videos/tweet/<id>`` URLs.
    Cards that merely embed a YouTube or Vine iframe are delegated via
    ``url_result``; otherwise formats are collected from the player config
    embedded in the page.
    """

    IE_NAME = 'twitter:card'
    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
            # MD5 checksums are different in different places
            'info_dict': {
                'id': '560070183650213889',
                'ext': 'mp4',
                'title': 'Twitter Card',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 30.033,
            }
        },
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
            'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
            'info_dict': {
                'id': '623160978427936768',
                'ext': 'mp4',
                'title': 'Twitter Card',
                'thumbnail': r're:^https?://.*\.jpg',
                'duration': 80.155,
            },
        },
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
            'md5': 'd4724ffe6d2437886d004fa5de1043b3',
            'info_dict': {
                'id': 'dq4Oj5quskI',
                'ext': 'mp4',
                'title': 'Ubuntu 11.10 Overview',
                'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
                'upload_date': '20111013',
                'uploader': 'OMG! Ubuntu!',
                'uploader_id': 'omgubuntu',
            },
            'add_ie': ['Youtube'],
        },
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568',
            'md5': 'ab2745d0b0ce53319a534fccaa986439',
            'info_dict': {
                'id': 'iBb2x00UVlv',
                'ext': 'mp4',
                'upload_date': '20151113',
                'uploader_id': '1189339351084113920',
                'uploader': 'ArsenalTerje',
                'title': 'Vine by ArsenalTerje',
            },
            'add_ie': ['Vine'],
        }, {
            'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
            'md5': '3846d0a07109b5ab622425449b59049d',
            'info_dict': {
                'id': '705235433198714880',
                'ext': 'mp4',
                'title': 'Twitter web player',
                'thumbnail': r're:^https?://.*\.jpg',
            },
        },
    ]

    def _real_extract(self, url):
        """Extract a card video, or delegate to the embedded provider.

        The page is requested once per entry in ``USER_AGENTS`` and one
        format is collected per response. Returns either a ``url_result``
        for an embedded YouTube/Vine iframe, or an info dict with ``id``,
        ``title``, ``thumbnail``, ``duration`` and ``formats``.
        """
        video_id = self._match_id(url)

        # Different formats served for different User-Agents
        USER_AGENTS = [
            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4
            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm
        ]

        config = None
        formats = []
        for user_agent in USER_AGENTS:
            request = sanitized_Request(url)
            request.add_header('User-Agent', user_agent)
            webpage = self._download_webpage(request, video_id)

            # Cards wrapping a third-party player are handed off untouched.
            # Host dots are escaped so only the literal hosts match.
            iframe_url = self._html_search_regex(
                r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
                webpage, 'video iframe', default=None)
            if iframe_url:
                return self.url_result(iframe_url)

            config = self._parse_json(self._html_search_regex(
                r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'),
                video_id)

            if 'playlist' not in config:
                # No inline playlist: fall back to the VMAP document, whose
                # key is spelled either 'vmapUrl' or 'vmap_url'.
                vmap_url = config.get('vmapUrl') or config.get('vmap_url')
                if vmap_url:
                    formats.append({
                        'url': self._get_vmap_video_url(vmap_url, video_id),
                    })
                    break   # same video regardless of UA
                continue

            video_url = config['playlist'][0]['source']

            f = {
                'url': video_url,
            }

            # Dimensions, when present, appear as a "/<width>x<height>/"
            # path component of the media URL.
            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
            if m:
                f.update({
                    'width': int(m.group('width')),
                    'height': int(m.group('height')),
                })
            formats.append(f)
        self._sort_formats(formats)

        # Metadata comes from the last page/config that was fetched.
        title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
        thumbnail = config.get('posterImageUrl') or config.get('image_src')
        duration = float_or_none(config.get('duration'))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
2015-07-21 14:38:40 -07:00
2015-10-18 02:16:57 -07:00
class TwitterIE(InfoExtractor):
    """Extractor for individual tweets (``twitter.com/<user>/status/<id>``).

    Tweets carrying a card or the web player are delegated to
    ``TwitterCard`` as ``url_transparent`` results; animated GIF tweets are
    resolved directly from the ``<video>`` markup in the page.
    """

    IE_NAME = 'twitter'
    _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'
    _TEMPLATE_URL = 'https://twitter.com/%s/status/%s'

    _TESTS = [{
        'url': 'https://twitter.com/freethenipple/status/643211948184596480',
        # MD5 checksums are different in different places
        'info_dict': {
            'id': '643211948184596480',
            'ext': 'mp4',
            'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 12.922,
            'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
            'uploader': 'FREE THE NIPPLE',
            'uploader_id': 'freethenipple',
        },
    }, {
        'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
        'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
        'info_dict': {
            'id': '657991469417025536',
            'ext': 'mp4',
            'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai',
            'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"',
            'thumbnail': r're:^https?://.*\.png',
            'uploader': 'Gifs',
            'uploader_id': 'giphz',
        },
        'expected_warnings': ['height', 'width'],
    }, {
        'url': 'https://twitter.com/starwars/status/665052190608723968',
        'md5': '39b7199856dee6cd4432e72c74bc69d4',
        'info_dict': {
            'id': '665052190608723968',
            'ext': 'mp4',
            'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.',
            'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."',
            'uploader_id': 'starwars',
            'uploader': 'Star Wars',
        },
    }, {
        'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
        'info_dict': {
            'id': '705235433198714880',
            'ext': 'mp4',
            'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.',
            'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."',
            'uploader_id': 'BTNBrentYarina',
            'uploader': 'Brent Yarina',
        },
        'params': {
            # The same video as https://twitter.com/i/videos/tweet/705235433198714880
            # Test case of TwitterCardIE
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Resolve the video attached to a tweet.

        Builds the common metadata (uploader, description, title) from the
        page's Open Graph tags, then tries in order: a Twitter card
        reference, an inline animated GIF ``<video>``, and finally the
        ``PlayableMedia`` web player.

        Raises ExtractorError when none of these is found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        user_id = mobj.group('user_id')
        twid = mobj.group('id')

        # Always fetch the canonical status URL, whatever host/suffix was given.
        webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)

        username = remove_end(self._og_search_title(webpage), ' on Twitter')

        # Trim surrounding whitespace, flatten newlines and drop the curly
        # quotes that wrap the tweet text.
        title = description = self._og_search_description(webpage).strip().replace('\n', ' ').strip('“”')

        # strip 'https -_t.co_BJYgOjSeGA' junk from filenames
        title = re.sub(r'\s+(https?://[^ ]+)', '', title)

        info = {
            'uploader_id': user_id,
            'uploader': username,
            'webpage_url': url,
            'description': '%s on Twitter: "%s"' % (username, description),
            'title': username + ' - ' + title,
        }

        # 1) Tweet references a card: let TwitterCard do the extraction.
        card_id = self._search_regex(
            r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None)
        if card_id:
            card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id
            info.update({
                '_type': 'url_transparent',
                'ie_key': 'TwitterCard',
                'url': card_url,
            })
            return info

        # 2) Animated GIF rendered as an inline <video> element.
        mobj = re.search(r'''(?x)
            <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
                <source[^>]+video-src="(?P<url>[^"]+)"
        ''', webpage)

        if mobj:
            # The remaining <video> attributes carry the optional
            # dimensions and poster image; all three are best-effort.
            more_info = mobj.group('more_info')
            height = int_or_none(self._search_regex(
                r'data-height="(\d+)"', more_info, 'height', fatal=False))
            width = int_or_none(self._search_regex(
                r'data-width="(\d+)"', more_info, 'width', fatal=False))
            thumbnail = self._search_regex(
                r'poster="([^"]+)"', more_info, 'poster', fatal=False)
            info.update({
                'id': twid,
                'url': mobj.group('url'),
                'height': height,
                'width': width,
                'thumbnail': thumbnail,
            })
            return info

        # 3) Web player: delegate to the /i/videos/tweet/<id> page.
        if 'class="PlayableMedia' in webpage:
            info.update({
                '_type': 'url_transparent',
                'ie_key': 'TwitterCard',
                'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid),
            })
            return info

        raise ExtractorError('There\'s no video in this tweet.')
2016-02-21 00:41:24 -08:00
class TwitterAmplifyIE(TwitterBaseIE):
    """Extractor for Twitter Amplify videos hosted on ``amp.twimg.com``."""

    IE_NAME = 'twitter:amplify'
    # Raw string: the pattern contains regex escapes (\. and \-) that are
    # not valid string escape sequences.
    _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})'

    _TEST = {
        'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
        'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
        'info_dict': {
            'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
            'ext': 'mp4',
            'title': 'Twitter Video',
            'thumbnail': 're:^https?://.*',
        },
    }

    def _real_extract(self, url):
        """Extract the single video described by an Amplify page.

        The media URL is resolved through the VMAP document named by the
        ``twitter:amplify:vmap`` meta tag; thumbnail and dimensions are read
        from the other ``twitter:*`` meta tags on a best-effort basis.
        Returns an info dict with ``id``, ``title``, ``formats`` and
        ``thumbnails``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        vmap_url = self._html_search_meta(
            'twitter:amplify:vmap', webpage, 'vmap url')
        video_url = self._get_vmap_video_url(vmap_url, video_id)

        thumbnails = []
        thumbnail = self._html_search_meta(
            'twitter:image:src', webpage, 'thumbnail', fatal=False)

        def _find_dimension(target):
            """Return ``(width, height)`` from ``twitter:<target>:*`` meta tags."""
            w = int_or_none(self._html_search_meta(
                'twitter:%s:width' % target, webpage, fatal=False))
            h = int_or_none(self._html_search_meta(
                'twitter:%s:height' % target, webpage, fatal=False))
            return w, h

        # Only report a thumbnail entry when the page actually names one.
        if thumbnail:
            thumbnail_w, thumbnail_h = _find_dimension('image')
            thumbnails.append({
                'url': thumbnail,
                'width': thumbnail_w,
                'height': thumbnail_h,
            })

        video_w, video_h = _find_dimension('player')
        formats = [{
            'url': video_url,
            'width': video_w,
            'height': video_h,
        }]

        return {
            'id': video_id,
            'title': 'Twitter Video',
            'formats': formats,
            'thumbnails': thumbnails,
        }