youtube-dl/youtube_dl/extractor/ninegag.py

from __future__ import unicode_literals

import re
import json

from .common import InfoExtractor
from ..utils import str_to_int


class NineGagIE(InfoExtractor):
    """Extractor for 9gag TV post pages.

    The post page embeds a JSON description of the post.  From it this
    extractor obtains a source URL (given directly, or built from an
    external provider code plus video id) and returns it as a
    ``url_transparent`` result together with the metadata found on the
    9gag page.
    """

    IE_NAME = '9gag'
    # Accepts both 9gag.com/tv/p/<id> and 9gag.tv/p/<id>; an optional
    # trailing path segment is captured as the display id.
    _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/p/(?P<id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^?#/]+))?'

    _TESTS = [{
        'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome',
        'info_dict': {
            'id': 'Kk2X5',
            'ext': 'mp4',
            'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)',
            'title': '"People Are Awesome 2013" Is Absolutely Awesome',
            'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',
            'uploader': 'CompilationChannel',
            'upload_date': '20131110',
            'view_count': int,
            'thumbnail': 're:^https?://',
        },
        'add_ie': ['Youtube'],
    }, {
        'url': 'http://9gag.com/tv/p/KklwM',
        'only_matching': True,
    }, {
        'url': 'http://9gag.tv/p/Kk2X5',
        'only_matching': True,
    }]

    # Indexed by the post's 'videoExternalProvider' value.  Each entry holds
    # a %-style URL template that receives the external video id, and the
    # key of the extractor the resulting URL is handed to.
    _EXTERNAL_VIDEO_PROVIDER = {
        '1': {'url': '%s', 'ie_key': 'Youtube'},
        '2': {'url': 'http://player.vimeo.com/video/%s', 'ie_key': 'Vimeo'},
        '3': {'url': 'http://instagram.com/p/%s', 'ie_key': 'Instagram'},
        '4': {'url': 'http://vine.co/v/%s', 'ie_key': 'Vine'},
    }

    def _real_extract(self, url):
        """Build the ``url_transparent`` result for the 9gag post at *url*.

        The post's JSON is pulled out of the ``app.PostView`` initialiser in
        the page.  When the post carries a ``sourceUrl`` it is used as-is
        with no extractor key; otherwise the URL and extractor key come from
        ``_EXTERNAL_VIDEO_PROVIDER``.

        Raises ``KeyError`` when the post lacks ``title``, or (without a
        ``sourceUrl``) lacks the external id/provider or names a provider
        that is not in the table.
        """
        video_id, slug = re.match(self._VALID_URL, url).group('id', 'display_id')
        # The slug is optional in the URL; fall back to the bare id.
        display_id = slug if slug else video_id

        page = self._download_webpage(url, display_id)

        raw_post = self._html_search_regex(
            r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost',
            page, 'post view')
        post = json.loads(raw_post)

        direct_url = post.get('sourceUrl')
        if direct_url:
            target_url, extractor_key = direct_url, None
        else:
            external_id = post['videoExternalId']
            provider = self._EXTERNAL_VIDEO_PROVIDER[post['videoExternalProvider']]
            target_url = provider['url'] % external_id
            extractor_key = provider['ie_key']

        # First truthy candidate wins; if none is truthy the value of the
        # last lookup is kept.
        thumbnail = None
        for field in ('thumbnail_700w', 'ogImageUrl', 'thumbnail_300w'):
            thumbnail = post.get(field)
            if thumbnail:
                break

        return {
            '_type': 'url_transparent',
            'url': target_url,
            'ie_key': extractor_key,
            'id': video_id,
            'display_id': display_id,
            'title': post['title'],
            'description': post.get('description'),
            'view_count': str_to_int(post.get('externalView')),
            'thumbnail': thumbnail,
        }
[ninegag] Use unicode_literals 2014-01-28 09:55:06 -08:00			`from __future__ import unicode_literals`

[9gag] Add extractor 2013-12-05 05:29:08 -08:00			`import re`
[9gag] Fix and improve extraction 2014-04-15 05:49:38 -07:00			`import json`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00
			`from .common import InfoExtractor`
[9gag] Fix and improve extraction 2014-04-15 05:49:38 -07:00			`from ..utils import str_to_int`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00

			`class NineGagIE(InfoExtractor):`
			`IE_NAME = '9gag'`
[9gag] Allow old .tv domain There are still references to it in webpage's source 2015-09-22 12:43:26 -07:00			`_VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv\|\.tv)/p/(?P<id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^?#/]+))?'`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00
[ninegag] Add support for p/ URLs 2014-04-10 16:25:24 -07:00			`_TESTS = [{`
[ninegag] fix _VALID_URL regex and handle the use of other external providers 2015-09-21 06:01:12 -07:00			`"url": "http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome",`
[ninegag] Use unicode_literals 2014-01-28 09:55:06 -08:00			`"info_dict": {`
[ninegag] fix _VALID_URL regex and handle the use of other external providers 2015-09-21 06:01:12 -07:00			`"id": "Kk2X5",`
[ninegag] Modernize and remove unused import 2014-03-20 16:25:04 -07:00			`"ext": "mp4",`
[ninegag] Use unicode_literals 2014-01-28 09:55:06 -08:00			`"description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",`
[ninegag] Fix extraction 2014-03-13 08:40:53 -07:00			`"title": "\"People Are Awesome 2013\" Is Absolutely Awesome",`
[ninegag] Test for additional properties 2014-12-12 11:10:15 -08:00			`'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',`
			`'uploader': 'CompilationChannel',`
			`'upload_date': '20131110',`
[ninegag] Fix extraction 2014-03-13 08:40:53 -07:00			`"view_count": int,`
			`"thumbnail": "re:^https?://",`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00			`},`
[ninegag] Use unicode_literals 2014-01-28 09:55:06 -08:00			`'add_ie': ['Youtube']`
PEP8: applied even more rules 2014-11-23 12:39:15 -08:00			`}, {`
[9gag] Remove redundant test 2015-09-22 12:41:44 -07:00			`'url': 'http://9gag.com/tv/p/KklwM',`
			`'only_matching': True,`
[9gag] Allow old .tv domain There are still references to it in webpage's source 2015-09-22 12:43:26 -07:00			`}, {`
			`'url': 'http://9gag.tv/p/Kk2X5',`
			`'only_matching': True,`
[ninegag] Add support for p/ URLs 2014-04-10 16:25:24 -07:00			`}]`
[ninegag] fix _VALID_URL regex and handle the use of other external providers 2015-09-21 06:01:12 -07:00			`_EXTERNAL_VIDEO_PROVIDER = {`
			`'1': {`
			`'url': '%s',`
			`'ie_key': 'Youtube',`
			`},`
			`'2': {`
			`'url': 'http://player.vimeo.com/video/%s',`
			`'ie_key': 'Vimeo',`
			`},`
			`'3': {`
			`'url': 'http://instagram.com/p/%s',`
			`'ie_key': 'Instagram',`
			`},`
			`'4': {`
			`'url': 'http://vine.co/v/%s',`
			`'ie_key': 'Vine',`
			`},`
			`}`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00
			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
[ninegag] fix _VALID_URL regex and handle the use of other external providers 2015-09-21 06:01:12 -07:00			`video_id = mobj.group('id')`
[9gag] Make display_id optional 2015-09-22 12:40:06 -07:00			`display_id = mobj.group('display_id') or video_id`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00
[ninegag] Add support for p/ URLs 2014-04-10 16:25:24 -07:00			`webpage = self._download_webpage(url, display_id)`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00
[9gag] Fix and improve extraction 2014-04-15 05:49:38 -07:00			`post_view = json.loads(self._html_search_regex(`
[9gag] Make post view regex more robust 2015-09-22 12:44:38 -07:00			`r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost',`
			`webpage, 'post view'))`
[9gag] Fix and improve extraction 2014-04-15 05:49:38 -07:00
[ninegag] extract source url 2015-09-22 11:20:18 -07:00			`ie_key = None`
			`source_url = post_view.get('sourceUrl')`
[ninegag] remove unnecessary condition 2015-09-22 12:28:00 -07:00			`if not source_url:`
[ninegag] extract source url 2015-09-22 11:20:18 -07:00			`external_video_id = post_view['videoExternalId']`
			`external_video_provider = post_view['videoExternalProvider']`
			`source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id`
			`ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key']`
[9gag] Fix and improve extraction 2014-04-15 05:49:38 -07:00			`title = post_view['title']`
[9gag] Relax optional fields 2015-09-22 12:46:40 -07:00			`description = post_view.get('description')`
			`view_count = str_to_int(post_view.get('externalView'))`
[9gag] Fix and improve extraction 2014-04-15 05:49:38 -07:00			`thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00
			`return {`
			`'_type': 'url_transparent',`
[ninegag] extract source url 2015-09-22 11:20:18 -07:00			`'url': source_url,`
			`'ie_key': ie_key,`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00			`'id': video_id,`
[ninegag] Add support for p/ URLs 2014-04-10 16:25:24 -07:00			`'display_id': display_id,`
			`'title': title,`
[ninegag] Fix extraction 2014-03-13 08:40:53 -07:00			`'description': description,`
			`'view_count': view_count,`
[9gag] Fix and improve extraction 2014-04-15 05:49:38 -07:00			`'thumbnail': thumbnail,`
[9gag] Add extractor 2013-12-05 05:29:08 -08:00			`}`