2015-07-13 05:41:38 -07:00
# coding: utf-8
from __future__ import unicode_literals
import re
from . common import InfoExtractor
class DemocracynowIE(InfoExtractor):
    """Extractor for democracynow.org pages (daily shows and single stories).

    The page embeds a JSON object describing the player; media files,
    captions, the title and an optional start offset are read from it.
    """

    # The dot before "org" is escaped so that only the real host name matches.
    _VALID_URL = r'https?://(?:www\.)?democracynow\.org/?(?P<id>[^\?]*)'
    IE_NAME = 'democracynow'
    _TESTS = [{
        'url': 'http://www.democracynow.org/shows/2015/7/3',
        'info_dict': {
            'id': '2015-0703-001',
            'ext': 'mp4',
            'title': 'July 03, 2015 - Democracy Now!',
            'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs',
            'uploader': 'Democracy Now',
            'upload_date': None,
        },
    }, {
        'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
        'info_dict': {
            'id': '2015-0703-001',
            'ext': 'mp4',
            'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
            'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
            'uploader': 'Democracy Now',
            'upload_date': None,
        },
    }]

    # Player JSON keys that point at subtitle files.  'chapter_file' is left
    # out on purpose: it is pending vtt support that doesn't clobber the srt
    # entry stored under the same 'eng' language key.
    _SUBTITLE_KEYS = ('caption_file',)
    # Player JSON keys that point at downloadable media.
    _MEDIA_KEYS = ('file', 'audio')

    @staticmethod
    def _absolute_url(base_host, media_url):
        """Return media_url unchanged if it is http(s), else prefix base_host."""
        if re.match(r'^https?://', media_url):
            return media_url
        return base_host + media_url

    def _real_extract(self, url):
        """Download the page at url and build the info dict from its player JSON.

        Returns a dict with 'id', 'title', 'description', 'uploader',
        'subtitles' and 'formats'.
        """
        # An empty id means the site root was requested.
        display_id = self._match_id(url) or 'home'
        # scheme://host of the page, used to resolve host-relative media paths.
        base_host = re.search(r'^(.+?://[^/]+)', url).group(1)
        webpage = self._download_webpage(url, display_id)

        desc_match = re.search(
            r'<meta property=.og:description. content=(["\'])(.+?)\1',
            webpage, re.DOTALL)
        description = desc_match.group(2) if desc_match else ''

        # No default here: a page without player data should be reported by
        # _search_regex itself instead of passing None on to the JSON parser.
        player_json = self._search_regex(
            r'({.+?"related_video_xml".+?})', webpage, 'json')
        js = self._parse_json(player_json, display_id)

        subtitles = {}
        for key in self._SUBTITLE_KEYS:
            sub_url = js.get(key)
            if not sub_url:
                continue
            sub_url = self._absolute_url(base_host, sub_url)
            ext_match = re.search(r'\.([^\.]+)$', sub_url)
            if not ext_match:
                # No file extension to report; skip rather than crash.
                continue
            subtitles['eng'] = [{
                'ext': ext_match.group(1),
                'url': sub_url,
            }]

        video_id = None
        formats = []
        start = js.get('start')
        for key in self._MEDIA_KEYS:
            media_url = js.get(key)
            if not media_url:
                continue
            media_url = self._absolute_url(base_host, media_url)
            parts = re.search(
                r'/(?P<dir>[^/]+)/(?:dn)?(?P<fn>[^/]+?)\.(?P<ext>[^\.\?]+)(?P<hasparams>\?|$)',
                media_url)
            if not parts:
                # URL does not look like .../<dir>/<name>.<ext>; ignore it.
                continue
            # The first usable media file name doubles as the video id.
            if video_id is None:
                video_id = parts.group('fn')
            if start is not None:
                # Append to an existing query string, or start a new one.
                separator = '&' if parts.group('hasparams') == '?' else '?'
                media_url = '%s%sstart=%s' % (media_url, separator, start)
            formats.append({
                'format_id': parts.group('dir'),
                'ext': parts.group('ext'),
                'url': media_url,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': js.get('title'),
            'description': description,
            'uploader': 'Democracy Now',
            'subtitles': subtitles,
            'formats': formats,
        }