Compare commits
175 Commits
2016.06.18
...
2016.07.03
Author | SHA1 | Date | |
---|---|---|---|
|
bff98341d5 | ||
|
2644e911be | ||
|
a5f67895d3 | ||
|
15e4b6b758 | ||
|
2b28b892d8 | ||
|
7507fc98cb | ||
|
477b7a8474 | ||
|
034a884957 | ||
|
64436cb1a4 | ||
|
f138873900 | ||
|
e793338c88 | ||
|
369bb06206 | ||
|
2cb31d288e | ||
|
c723d1cd8d | ||
|
1f55234057 | ||
|
04006fae8d | ||
|
4cb13d0d6a | ||
|
a1f6f5c768 | ||
|
05c7feec77 | ||
|
bf83024826 | ||
|
a0cfd82dda | ||
|
1b734adb2d | ||
|
9b724d7277 | ||
|
c3a5dd3b5d | ||
|
e3755a624b | ||
|
95cf60e826 | ||
|
6b03e1e25d | ||
|
712b0b5b70 | ||
|
6a424391d9 | ||
|
dbf0157a26 | ||
|
7deef1ba67 | ||
|
fd6ca38262 | ||
|
bdafd88da0 | ||
|
7a1e71575e | ||
|
ac2d8f54d1 | ||
|
14ff6baa0e | ||
|
bb08101ec4 | ||
|
bc4b2d75ba | ||
|
35fc3021ba | ||
|
347227237b | ||
|
564dc3c6e8 | ||
|
9f4576a7eb | ||
|
f11315e8d4 | ||
|
0c2ac64bb8 | ||
|
a9eede3913 | ||
|
9e29ef13a3 | ||
|
eaaaaec042 | ||
|
3cb3b60064 | ||
|
044e3d91b5 | ||
|
c9e538a3b1 | ||
|
76dad392f5 | ||
|
9617b557aa | ||
|
bf4fa24414 | ||
|
20361b4f25 | ||
|
05a0068a76 | ||
|
66a42309fa | ||
|
fd94e2671a | ||
|
8ff6697861 | ||
|
eafa643715 | ||
|
049da7cb6c | ||
|
7dbeee7e22 | ||
|
93ad6c6bfa | ||
|
329179073b | ||
|
4d86d2008e | ||
|
ab47b6e881 | ||
|
df43389ade | ||
|
397b305cfe | ||
|
e496fa50cd | ||
|
06a96da15b | ||
|
70157c2c43 | ||
|
c58ed8563d | ||
|
4c7821227c | ||
|
42362fdb5e | ||
|
97124e572d | ||
|
32616c14cc | ||
|
8174d0fe95 | ||
|
8704778d95 | ||
|
c287f2bc60 | ||
|
9ea5c04c0d | ||
|
fd7a7498a4 | ||
|
e3a6747d8f | ||
|
f41ffc00d1 | ||
|
81fda15369 | ||
|
427cd050a3 | ||
|
b0c200f1ec | ||
|
92747e664a | ||
|
f1f336322d | ||
|
bf8dd79045 | ||
|
c6781156aa | ||
|
f484c5fa25 | ||
|
88d9f6c0c4 | ||
|
3c9c088f9c | ||
|
fc3996bfe1 | ||
|
5b6ad8630c | ||
|
30105f4ac0 | ||
|
1143535d76 | ||
|
7d52c052ef | ||
|
a2406fce3c | ||
|
3b34ab538c | ||
|
ac782306f1 | ||
|
0c00e889f3 | ||
|
ce96ed05f4 | ||
|
0463b77a1f | ||
|
2d185706ea | ||
|
b72b44318c | ||
|
46f59e89ea | ||
|
b4241e308e | ||
|
3d4b08dfc7 | ||
|
be49068d65 | ||
|
525cedb971 | ||
|
de3c7fe0d4 | ||
|
896cc72750 | ||
|
c1ff6e1ad0 | ||
|
fee70322d7 | ||
|
8065d6c55f | ||
|
494172d2e5 | ||
|
6e3c2047f8 | ||
|
011bd3221b | ||
|
b46eabecd3 | ||
|
0437307a41 | ||
|
22b7ac13ef | ||
|
96f88e91b7 | ||
|
3331a4644d | ||
|
adf1921dc1 | ||
|
97674f0419 | ||
|
73843ae8ac | ||
|
f2bb8c036a | ||
|
75ca6bcee2 | ||
|
089657ed1f | ||
|
b5eab86c24 | ||
|
c8e3e0974b | ||
|
dfc8f46e1c | ||
|
c143ddce5d | ||
|
169d836feb | ||
|
6ae938b295 | ||
|
cf40fdf5c1 | ||
|
23bdae0955 | ||
|
ca74c90bf5 | ||
|
7cfc1e2a10 | ||
|
1ac5705f62 | ||
|
e4f90ea0a7 | ||
|
cdfc187cd5 | ||
|
feef925f49 | ||
|
19e2d1cdea | ||
|
8369a4fe76 | ||
|
1f749b6658 | ||
|
819707920a | ||
|
43518503a6 | ||
|
5839d556e4 | ||
|
6c83e583b3 | ||
|
6aeb64b673 | ||
|
6cd64b6806 | ||
|
e154c65128 | ||
|
a50fd6e026 | ||
|
6a55bb66ee | ||
|
7c05097633 | ||
|
589568789f | ||
|
7577d849a6 | ||
|
cb23192bc4 | ||
|
41c1023300 | ||
|
90b6288cce | ||
|
c1823c8ad9 | ||
|
d7c6c656c5 | ||
|
b0b128049a | ||
|
e8f13f2637 | ||
|
b5aad37f6b | ||
|
6d0d4fc26d | ||
|
0278aa443f | ||
|
1f35745758 | ||
|
573c35272f | ||
|
09e3f91e40 | ||
|
1b6cf16be7 | ||
|
26264cb056 | ||
|
a72df5f36f | ||
|
c878e635de |
6
.github/ISSUE_TEMPLATE.md
vendored
6
.github/ISSUE_TEMPLATE.md
vendored
@@ -6,8 +6,8 @@
|
||||
|
||||
---
|
||||
|
||||
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.18.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
|
||||
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.18.1**
|
||||
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.03.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
|
||||
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.03.1**
|
||||
|
||||
### Before submitting an *issue* make sure you have:
|
||||
- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
|
||||
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
|
||||
[debug] User config: []
|
||||
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
|
||||
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
|
||||
[debug] youtube-dl version 2016.06.18.1
|
||||
[debug] youtube-dl version 2016.07.03.1
|
||||
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
|
||||
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
|
||||
[debug] Proxy map: {}
|
||||
|
22
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
22
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
## Please follow the guide below
|
||||
|
||||
- You will be asked some questions, please read them **carefully** and answer honestly
|
||||
- Put an `x` into all the boxes [ ] relevant to your *pull request* (like that [x])
|
||||
- Use *Preview* tab to see how your *pull request* will actually look like
|
||||
|
||||
---
|
||||
|
||||
### Before submitting a *pull request* make sure you have:
|
||||
- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/rg3/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/rg3/youtube-dl#youtube-dl-coding-conventions) sections
|
||||
- [ ] [Searched](https://github.com/rg3/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests
|
||||
|
||||
### What is the purpose of your *pull request*?
|
||||
- [ ] Bug fix
|
||||
- [ ] New extractor
|
||||
- [ ] New feature
|
||||
|
||||
---
|
||||
|
||||
### Description of your *pull request* and other information
|
||||
|
||||
Explanation of your *pull request* in arbitrary form goes here. Please make sure the description explains the purpose and effect of your *pull request* and is worded well enough to be understood. Provide as much context and examples as possible.
|
2
AUTHORS
2
AUTHORS
@@ -175,3 +175,5 @@ Tomáš Čech
|
||||
Déstin Reed
|
||||
Roman Tsiupa
|
||||
Artur Krysiak
|
||||
Jakub Adam Wieczorek
|
||||
Aleksandar Topuzović
|
||||
|
152
CONTRIBUTING.md
152
CONTRIBUTING.md
@@ -97,9 +97,17 @@ If you want to add support for a new site, first of all **make sure** this site
|
||||
After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`):
|
||||
|
||||
1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
|
||||
2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
|
||||
3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
|
||||
2. Check out the source code with:
|
||||
|
||||
git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git
|
||||
|
||||
3. Start a new git branch with
|
||||
|
||||
cd youtube-dl
|
||||
git checkout -b yourextractor
|
||||
|
||||
4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
|
||||
|
||||
```python
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
@@ -143,16 +151,148 @@ After you have ensured this site is distributing it's content legally, you can f
|
||||
5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
|
||||
6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
|
||||
7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
|
||||
8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`.
|
||||
9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
|
||||
10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
|
||||
8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
|
||||
9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
|
||||
|
||||
$ git add youtube_dl/extractor/extractors.py
|
||||
$ git add youtube_dl/extractor/yourextractor.py
|
||||
$ git commit -m '[yourextractor] Add new extractor'
|
||||
$ git push origin yourextractor
|
||||
|
||||
11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
|
||||
10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
|
||||
|
||||
In any case, thank you very much for your contributions!
|
||||
|
||||
## youtube-dl coding conventions
|
||||
|
||||
This section introduces guidelines for writing idiomatic, robust and future-proof extractor code.
|
||||
|
||||
Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control, and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize the code's dependency on the source's layout and even to make the code foresee potential future changes and be ready for them. This is important because it will allow the extractor not to break on minor layout changes, thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with the fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some may never receive an update at all, which is possible for non rolling release distros.
|
||||
|
||||
### Mandatory and optional metafields
|
||||
|
||||
For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl:
|
||||
|
||||
- `id` (media identifier)
|
||||
- `title` (media title)
|
||||
- `url` (media download URL) or `formats`
|
||||
|
||||
In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory. Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken.
|
||||
|
||||
[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones is considered **optional**. That means that extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields.
|
||||
|
||||
#### Example
|
||||
|
||||
Say you have some source dictionary `meta` that you've fetched as JSON with HTTP request and it has a key `summary`:
|
||||
|
||||
```python
|
||||
meta = self._download_json(url, video_id)
|
||||
```
|
||||
|
||||
Assume at this point `meta`'s layout is:
|
||||
|
||||
```python
|
||||
{
|
||||
...
|
||||
"summary": "some fancy summary text",
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
Assume you want to extract `summary` and put into resulting info dict as `description`. Since `description` is optional metafield you should be ready that this key may be missing from the `meta` dict, so that you should extract it like:
|
||||
|
||||
```python
|
||||
description = meta.get('summary') # correct
|
||||
```
|
||||
|
||||
and not like:
|
||||
|
||||
```python
|
||||
description = meta['summary'] # incorrect
|
||||
```
|
||||
|
||||
The latter will break the extraction process with `KeyError` if `summary` disappears from `meta` at some later time, but with the former approach extraction will just go ahead with `description` set to `None`, which is perfectly fine (remember `None` is equivalent to the absence of data).
|
||||
|
||||
Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance:
|
||||
|
||||
```python
|
||||
description = self._search_regex(
|
||||
r'<span[^>]+id="title"[^>]*>([^<]+)<',
|
||||
webpage, 'description', fatal=False)
|
||||
```
|
||||
|
||||
With `fatal` set to `False` if `_search_regex` fails to extract `description` it will emit a warning and continue extraction.
|
||||
|
||||
You can also pass `default=<some fallback value>`, for example:
|
||||
|
||||
```python
|
||||
description = self._search_regex(
|
||||
r'<span[^>]+id="title"[^>]*>([^<]+)<',
|
||||
webpage, 'description', default=None)
|
||||
```
|
||||
|
||||
On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present.
|
||||
|
||||
### Provide fallbacks
|
||||
|
||||
When extracting metadata try to provide several scenarios for that. For example if `title` is present in several places/sources try extracting from at least some of them. This would make it more future-proof in case some of the sources became unavailable.
|
||||
|
||||
#### Example
|
||||
|
||||
Say `meta` from previous example has a `title` and you are about to extract it. Since `title` is mandatory meta field you should end up with something like:
|
||||
|
||||
```python
|
||||
title = meta['title']
|
||||
```
|
||||
|
||||
If `title` disappears from `meta` in future due to some changes on the hoster's side the extraction would fail since `title` is mandatory. That's expected.
|
||||
|
||||
Assume that you have another source you can extract `title` from, for example `og:title` HTML meta of a `webpage`. In this case you can provide a fallback scenario:
|
||||
|
||||
```python
|
||||
title = meta.get('title') or self._og_search_title(webpage)
|
||||
```
|
||||
|
||||
This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`.
|
||||
|
||||
### Make regular expressions flexible
|
||||
|
||||
When using regular expressions try to write them fuzzy and flexible.
|
||||
|
||||
#### Example
|
||||
|
||||
Say you need to extract `title` from the following HTML code:
|
||||
|
||||
```html
|
||||
<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span>
|
||||
```
|
||||
|
||||
The code for that task should look similar to:
|
||||
|
||||
```python
|
||||
title = self._search_regex(
|
||||
r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
|
||||
```
|
||||
|
||||
Or even better:
|
||||
|
||||
```python
|
||||
title = self._search_regex(
|
||||
r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)',
|
||||
webpage, 'title', group='title')
|
||||
```
|
||||
|
||||
Note how you tolerate potential changes in the `style` attribute's value or a switch from using double quotes to single quotes for the `class` attribute.
|
||||
|
||||
The code definitely should not look like:
|
||||
|
||||
```python
|
||||
title = self._search_regex(
|
||||
r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>',
|
||||
webpage, 'title', group='title')
|
||||
```
|
||||
|
||||
### Use safe conversion functions
|
||||
|
||||
Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
|
||||
|
||||
|
154
README.md
154
README.md
@@ -44,7 +44,7 @@ Or with [MacPorts](https://www.macports.org/):
|
||||
Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html).
|
||||
|
||||
# DESCRIPTION
|
||||
**youtube-dl** is a small command-line program to download videos from
|
||||
**youtube-dl** is a command-line program to download videos from
|
||||
YouTube.com and a few more sites. It requires the Python interpreter, version
|
||||
2.6, 2.7, or 3.2+, and it is not platform specific. It should work on
|
||||
your Unix box, on Windows or on Mac OS X. It is released to the public domain,
|
||||
@@ -890,9 +890,17 @@ If you want to add support for a new site, first of all **make sure** this site
|
||||
After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`):
|
||||
|
||||
1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
|
||||
2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
|
||||
3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
|
||||
2. Check out the source code with:
|
||||
|
||||
git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git
|
||||
|
||||
3. Start a new git branch with
|
||||
|
||||
cd youtube-dl
|
||||
git checkout -b yourextractor
|
||||
|
||||
4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
|
||||
|
||||
```python
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
@@ -936,19 +944,151 @@ After you have ensured this site is distributing it's content legally, you can f
|
||||
5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
|
||||
6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
|
||||
7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
|
||||
8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`.
|
||||
9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
|
||||
10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
|
||||
8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
|
||||
9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
|
||||
|
||||
$ git add youtube_dl/extractor/extractors.py
|
||||
$ git add youtube_dl/extractor/yourextractor.py
|
||||
$ git commit -m '[yourextractor] Add new extractor'
|
||||
$ git push origin yourextractor
|
||||
|
||||
11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
|
||||
10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
|
||||
|
||||
In any case, thank you very much for your contributions!
|
||||
|
||||
## youtube-dl coding conventions
|
||||
|
||||
This section introduces guidelines for writing idiomatic, robust and future-proof extractor code.
|
||||
|
||||
Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control, and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize the code's dependency on the source's layout and even to make the code foresee potential future changes and be ready for them. This is important because it will allow the extractor not to break on minor layout changes, thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with the fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some may never receive an update at all, which is possible for non rolling release distros.
|
||||
|
||||
### Mandatory and optional metafields
|
||||
|
||||
For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl:
|
||||
|
||||
- `id` (media identifier)
|
||||
- `title` (media title)
|
||||
- `url` (media download URL) or `formats`
|
||||
|
||||
In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory. Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken.
|
||||
|
||||
[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones is considered **optional**. That means that extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields.
|
||||
|
||||
#### Example
|
||||
|
||||
Say you have some source dictionary `meta` that you've fetched as JSON with HTTP request and it has a key `summary`:
|
||||
|
||||
```python
|
||||
meta = self._download_json(url, video_id)
|
||||
```
|
||||
|
||||
Assume at this point `meta`'s layout is:
|
||||
|
||||
```python
|
||||
{
|
||||
...
|
||||
"summary": "some fancy summary text",
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
Assume you want to extract `summary` and put into resulting info dict as `description`. Since `description` is optional metafield you should be ready that this key may be missing from the `meta` dict, so that you should extract it like:
|
||||
|
||||
```python
|
||||
description = meta.get('summary') # correct
|
||||
```
|
||||
|
||||
and not like:
|
||||
|
||||
```python
|
||||
description = meta['summary'] # incorrect
|
||||
```
|
||||
|
||||
The latter will break the extraction process with `KeyError` if `summary` disappears from `meta` at some later time, but with the former approach extraction will just go ahead with `description` set to `None`, which is perfectly fine (remember `None` is equivalent to the absence of data).
|
||||
|
||||
Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance:
|
||||
|
||||
```python
|
||||
description = self._search_regex(
|
||||
r'<span[^>]+id="title"[^>]*>([^<]+)<',
|
||||
webpage, 'description', fatal=False)
|
||||
```
|
||||
|
||||
With `fatal` set to `False` if `_search_regex` fails to extract `description` it will emit a warning and continue extraction.
|
||||
|
||||
You can also pass `default=<some fallback value>`, for example:
|
||||
|
||||
```python
|
||||
description = self._search_regex(
|
||||
r'<span[^>]+id="title"[^>]*>([^<]+)<',
|
||||
webpage, 'description', default=None)
|
||||
```
|
||||
|
||||
On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present.
|
||||
|
||||
### Provide fallbacks
|
||||
|
||||
When extracting metadata try to provide several scenarios for that. For example if `title` is present in several places/sources try extracting from at least some of them. This would make it more future-proof in case some of the sources became unavailable.
|
||||
|
||||
#### Example
|
||||
|
||||
Say `meta` from previous example has a `title` and you are about to extract it. Since `title` is mandatory meta field you should end up with something like:
|
||||
|
||||
```python
|
||||
title = meta['title']
|
||||
```
|
||||
|
||||
If `title` disappears from `meta` in future due to some changes on the hoster's side the extraction would fail since `title` is mandatory. That's expected.
|
||||
|
||||
Assume that you have another source you can extract `title` from, for example `og:title` HTML meta of a `webpage`. In this case you can provide a fallback scenario:
|
||||
|
||||
```python
|
||||
title = meta.get('title') or self._og_search_title(webpage)
|
||||
```
|
||||
|
||||
This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`.
|
||||
|
||||
### Make regular expressions flexible
|
||||
|
||||
When using regular expressions try to write them fuzzy and flexible.
|
||||
|
||||
#### Example
|
||||
|
||||
Say you need to extract `title` from the following HTML code:
|
||||
|
||||
```html
|
||||
<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span>
|
||||
```
|
||||
|
||||
The code for that task should look similar to:
|
||||
|
||||
```python
|
||||
title = self._search_regex(
|
||||
r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
|
||||
```
|
||||
|
||||
Or even better:
|
||||
|
||||
```python
|
||||
title = self._search_regex(
|
||||
r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)',
|
||||
webpage, 'title', group='title')
|
||||
```
|
||||
|
||||
Note how you tolerate potential changes in the `style` attribute's value or a switch from using double quotes to single quotes for the `class` attribute.
|
||||
|
||||
The code definitely should not look like:
|
||||
|
||||
```python
|
||||
title = self._search_regex(
|
||||
r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>',
|
||||
webpage, 'title', group='title')
|
||||
```
|
||||
|
||||
### Use safe conversion functions
|
||||
|
||||
Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
|
||||
|
||||
# EMBEDDING YOUTUBE-DL
|
||||
|
||||
youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/rg3/youtube-dl/issues/new).
|
||||
|
@@ -14,15 +14,17 @@ if os.path.exists(lazy_extractors_filename):
|
||||
os.remove(lazy_extractors_filename)
|
||||
|
||||
from youtube_dl.extractor import _ALL_CLASSES
|
||||
from youtube_dl.extractor.common import InfoExtractor
|
||||
from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
|
||||
|
||||
with open('devscripts/lazy_load_template.py', 'rt') as f:
|
||||
module_template = f.read()
|
||||
|
||||
module_contents = [module_template + '\n' + getsource(InfoExtractor.suitable)]
|
||||
module_contents = [
|
||||
module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
|
||||
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n']
|
||||
|
||||
ie_template = '''
|
||||
class {name}(LazyLoadExtractor):
|
||||
class {name}({bases}):
|
||||
_VALID_URL = {valid_url!r}
|
||||
_module = '{module}'
|
||||
'''
|
||||
@@ -34,10 +36,20 @@ make_valid_template = '''
|
||||
'''
|
||||
|
||||
|
||||
def get_base_name(base):
|
||||
if base is InfoExtractor:
|
||||
return 'LazyLoadExtractor'
|
||||
elif base is SearchInfoExtractor:
|
||||
return 'LazyLoadSearchExtractor'
|
||||
else:
|
||||
return base.__name__
|
||||
|
||||
|
||||
def build_lazy_ie(ie, name):
|
||||
valid_url = getattr(ie, '_VALID_URL', None)
|
||||
s = ie_template.format(
|
||||
name=name,
|
||||
bases=', '.join(map(get_base_name, ie.__bases__)),
|
||||
valid_url=valid_url,
|
||||
module=ie.__module__)
|
||||
if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
|
||||
@@ -47,12 +59,35 @@ def build_lazy_ie(ie, name):
|
||||
s += make_valid_template.format(valid_url=ie._make_valid_url())
|
||||
return s
|
||||
|
||||
# find the correct sorting and add the required base classes so that subclasses
|
||||
# can be correctly created
|
||||
classes = _ALL_CLASSES[:-1]
|
||||
ordered_cls = []
|
||||
while classes:
|
||||
for c in classes[:]:
|
||||
bases = set(c.__bases__) - set((object, InfoExtractor, SearchInfoExtractor))
|
||||
stop = False
|
||||
for b in bases:
|
||||
if b not in classes and b not in ordered_cls:
|
||||
if b.__name__ == 'GenericIE':
|
||||
exit()
|
||||
classes.insert(0, b)
|
||||
stop = True
|
||||
if stop:
|
||||
break
|
||||
if all(b in ordered_cls for b in bases):
|
||||
ordered_cls.append(c)
|
||||
classes.remove(c)
|
||||
break
|
||||
ordered_cls.append(_ALL_CLASSES[-1])
|
||||
|
||||
names = []
|
||||
for ie in list(sorted(_ALL_CLASSES[:-1], key=lambda cls: cls.ie_key())) + _ALL_CLASSES[-1:]:
|
||||
name = ie.ie_key() + 'IE'
|
||||
for ie in ordered_cls:
|
||||
name = ie.__name__
|
||||
src = build_lazy_ie(ie, name)
|
||||
module_contents.append(src)
|
||||
names.append(name)
|
||||
if ie in _ALL_CLASSES:
|
||||
names.append(name)
|
||||
|
||||
module_contents.append(
|
||||
'_ALL_CLASSES = [{0}]'.format(', '.join(names)))
|
||||
|
@@ -44,7 +44,6 @@
|
||||
- **appletrailers:section**
|
||||
- **archive.org**: archive.org videos
|
||||
- **ARD**
|
||||
- **ARD:mediathek**: Saarländischer Rundfunk
|
||||
- **ARD:mediathek**
|
||||
- **arte.tv**
|
||||
- **arte.tv:+7**
|
||||
@@ -128,6 +127,7 @@
|
||||
- **cliphunter**
|
||||
- **ClipRs**
|
||||
- **Clipsyndicate**
|
||||
- **CloserToTruth**
|
||||
- **cloudtime**: CloudTime
|
||||
- **Cloudy**
|
||||
- **Clubic**
|
||||
@@ -152,6 +152,8 @@
|
||||
- **CSNNE**
|
||||
- **CSpan**: C-SPAN
|
||||
- **CtsNews**: 華視新聞
|
||||
- **CTV**
|
||||
- **CTVNews**
|
||||
- **culturebox.francetvinfo.fr**
|
||||
- **CultureUnplugged**
|
||||
- **CWTV**
|
||||
@@ -240,6 +242,7 @@
|
||||
- **FreeVideo**
|
||||
- **Funimation**
|
||||
- **FunnyOrDie**
|
||||
- **Fusion**
|
||||
- **GameInformer**
|
||||
- **Gamekings**
|
||||
- **GameOne**
|
||||
@@ -247,7 +250,6 @@
|
||||
- **Gamersyde**
|
||||
- **GameSpot**
|
||||
- **GameStar**
|
||||
- **Gametrailers**
|
||||
- **Gazeta**
|
||||
- **GDCVault**
|
||||
- **generic**: Generic downloader that works on some sites
|
||||
@@ -273,6 +275,7 @@
|
||||
- **Helsinki**: helsinki.fi
|
||||
- **HentaiStigma**
|
||||
- **HistoricFilms**
|
||||
- **history:topic**: History.com Topic
|
||||
- **hitbox**
|
||||
- **hitbox:live**
|
||||
- **HornBunny**
|
||||
@@ -280,6 +283,8 @@
|
||||
- **HotStar**
|
||||
- **Howcast**
|
||||
- **HowStuffWorks**
|
||||
- **HRTi**
|
||||
- **HRTiPlaylist**
|
||||
- **HuffPost**: Huffington Post
|
||||
- **Hypem**
|
||||
- **Iconosquare**
|
||||
@@ -326,7 +331,7 @@
|
||||
- **kuwo:mv**: 酷我音乐 - MV
|
||||
- **kuwo:singer**: 酷我音乐 - 歌手
|
||||
- **kuwo:song**: 酷我音乐
|
||||
- **la7.tv**
|
||||
- **la7.it**
|
||||
- **Laola1Tv**
|
||||
- **Le**: 乐视网
|
||||
- **Learnr**
|
||||
@@ -359,6 +364,7 @@
|
||||
- **MatchTV**
|
||||
- **MDR**: MDR.DE and KiKA
|
||||
- **media.ccc.de**
|
||||
- **META**
|
||||
- **metacafe**
|
||||
- **Metacritic**
|
||||
- **Mgoon**
|
||||
@@ -385,7 +391,7 @@
|
||||
- **MovieFap**
|
||||
- **Moviezine**
|
||||
- **MPORA**
|
||||
- **MSNBC**
|
||||
- **MSN**
|
||||
- **MTV**
|
||||
- **mtv.de**
|
||||
- **mtviggy.com**
|
||||
@@ -439,6 +445,7 @@
|
||||
- **nick.de**
|
||||
- **niconico**: ニコニコ動画
|
||||
- **NiconicoPlaylist**
|
||||
- **NineCNineMedia**
|
||||
- **njoy**: N-JOY
|
||||
- **njoy:embed**
|
||||
- **Noco**
|
||||
@@ -502,8 +509,9 @@
|
||||
- **plus.google**: Google Plus
|
||||
- **pluzz.francetv.fr**
|
||||
- **podomatic**
|
||||
- **PolskieRadio**
|
||||
- **PornHd**
|
||||
- **PornHub**
|
||||
- **PornHub**: PornHub and Thumbzilla
|
||||
- **PornHubPlaylist**
|
||||
- **PornHubUserVideos**
|
||||
- **Pornotube**
|
||||
@@ -521,6 +529,7 @@
|
||||
- **qqmusic:singer**: QQ音乐 - 歌手
|
||||
- **qqmusic:toplist**: QQ音乐 - 排行榜
|
||||
- **R7**
|
||||
- **R7Article**
|
||||
- **radio.de**
|
||||
- **radiobremen**
|
||||
- **radiocanada**
|
||||
@@ -586,8 +595,10 @@
|
||||
- **Shared**: shared.sx and vivo.sx
|
||||
- **ShareSix**
|
||||
- **Sina**
|
||||
- **SixPlay**
|
||||
- **skynewsarabia:article**
|
||||
- **skynewsarabia:video**
|
||||
- **skynewsarabia:video**
|
||||
- **SkySports**
|
||||
- **Slideshare**
|
||||
- **Slutload**
|
||||
- **smotri**: Smotri.com
|
||||
@@ -619,6 +630,7 @@
|
||||
- **SportBoxEmbed**
|
||||
- **SportDeutschland**
|
||||
- **Sportschau**
|
||||
- **sr:mediathek**: Saarländischer Rundfunk
|
||||
- **SRGSSR**
|
||||
- **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites
|
||||
- **SSA**
|
||||
@@ -719,6 +731,7 @@
|
||||
- **UDNEmbed**: 聯合影音
|
||||
- **Unistra**
|
||||
- **Urort**: NRK P3 Urørt
|
||||
- **URPlay**
|
||||
- **USAToday**
|
||||
- **ustream**
|
||||
- **ustream:channel**
|
||||
@@ -736,6 +749,7 @@
|
||||
- **vh1.com**
|
||||
- **Vice**
|
||||
- **ViceShow**
|
||||
- **Vidbit**
|
||||
- **Viddler**
|
||||
- **video.google:search**: Google Video search
|
||||
- **video.mit.edu**
|
||||
|
63
setup.py
63
setup.py
@@ -21,25 +21,37 @@ try:
|
||||
import py2exe
|
||||
except ImportError:
|
||||
if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe':
|
||||
print("Cannot import py2exe", file=sys.stderr)
|
||||
print('Cannot import py2exe', file=sys.stderr)
|
||||
exit(1)
|
||||
|
||||
py2exe_options = {
|
||||
"bundle_files": 1,
|
||||
"compressed": 1,
|
||||
"optimize": 2,
|
||||
"dist_dir": '.',
|
||||
"dll_excludes": ['w9xpopen.exe', 'crypt32.dll'],
|
||||
'bundle_files': 1,
|
||||
'compressed': 1,
|
||||
'optimize': 2,
|
||||
'dist_dir': '.',
|
||||
'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
|
||||
}
|
||||
|
||||
# Get the version from youtube_dl/version.py without importing the package
|
||||
exec(compile(open('youtube_dl/version.py').read(),
|
||||
'youtube_dl/version.py', 'exec'))
|
||||
|
||||
DESCRIPTION = 'YouTube video downloader'
|
||||
LONG_DESCRIPTION = 'Command-line program to download videos from YouTube.com and other video sites'
|
||||
|
||||
py2exe_console = [{
|
||||
"script": "./youtube_dl/__main__.py",
|
||||
"dest_base": "youtube-dl",
|
||||
'script': './youtube_dl/__main__.py',
|
||||
'dest_base': 'youtube-dl',
|
||||
'version': __version__,
|
||||
'description': DESCRIPTION,
|
||||
'comments': LONG_DESCRIPTION,
|
||||
'product_name': 'youtube-dl',
|
||||
'product_version': __version__,
|
||||
}]
|
||||
|
||||
py2exe_params = {
|
||||
'console': py2exe_console,
|
||||
'options': {"py2exe": py2exe_options},
|
||||
'options': {'py2exe': py2exe_options},
|
||||
'zipfile': None
|
||||
}
|
||||
|
||||
@@ -72,7 +84,7 @@ else:
|
||||
params['scripts'] = ['bin/youtube-dl']
|
||||
|
||||
class build_lazy_extractors(Command):
|
||||
description = "Build the extractor lazy loading module"
|
||||
description = 'Build the extractor lazy loading module'
|
||||
user_options = []
|
||||
|
||||
def initialize_options(self):
|
||||
@@ -87,16 +99,11 @@ class build_lazy_extractors(Command):
|
||||
dry_run=self.dry_run,
|
||||
)
|
||||
|
||||
# Get the version from youtube_dl/version.py without importing the package
|
||||
exec(compile(open('youtube_dl/version.py').read(),
|
||||
'youtube_dl/version.py', 'exec'))
|
||||
|
||||
setup(
|
||||
name='youtube_dl',
|
||||
version=__version__,
|
||||
description='YouTube video downloader',
|
||||
long_description='Small command-line program to download videos from'
|
||||
' YouTube.com and other video sites.',
|
||||
description=DESCRIPTION,
|
||||
long_description=LONG_DESCRIPTION,
|
||||
url='https://github.com/rg3/youtube-dl',
|
||||
author='Ricardo Garcia',
|
||||
author_email='ytdl@yt-dl.org',
|
||||
@@ -112,17 +119,17 @@ setup(
|
||||
# test_requires = ['nosetest'],
|
||||
|
||||
classifiers=[
|
||||
"Topic :: Multimedia :: Video",
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Console",
|
||||
"License :: Public Domain",
|
||||
"Programming Language :: Python :: 2.6",
|
||||
"Programming Language :: Python :: 2.7",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.2",
|
||||
"Programming Language :: Python :: 3.3",
|
||||
"Programming Language :: Python :: 3.4",
|
||||
"Programming Language :: Python :: 3.5",
|
||||
'Topic :: Multimedia :: Video',
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Environment :: Console',
|
||||
'License :: Public Domain',
|
||||
'Programming Language :: Python :: 2.6',
|
||||
'Programming Language :: Python :: 2.7',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.2',
|
||||
'Programming Language :: Python :: 3.3',
|
||||
'Programming Language :: Python :: 3.4',
|
||||
'Programming Language :: Python :: 3.5',
|
||||
],
|
||||
|
||||
cmdclass={'build_lazy_extractors': build_lazy_extractors},
|
||||
|
@@ -11,7 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from test.helper import FakeYDL
|
||||
from youtube_dl.extractor.common import InfoExtractor
|
||||
from youtube_dl.extractor import YoutubeIE, get_info_extractor
|
||||
from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError
|
||||
from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
|
||||
|
||||
|
||||
class TestIE(InfoExtractor):
|
||||
@@ -66,6 +66,11 @@ class TestInfoExtractor(unittest.TestCase):
|
||||
self.assertEqual(ie._html_search_meta('d', html), '4')
|
||||
self.assertEqual(ie._html_search_meta('e', html), '5')
|
||||
self.assertEqual(ie._html_search_meta('f', html), '6')
|
||||
self.assertEqual(ie._html_search_meta(('a', 'b', 'c'), html), '1')
|
||||
self.assertEqual(ie._html_search_meta(('c', 'b', 'a'), html), '3')
|
||||
self.assertEqual(ie._html_search_meta(('z', 'x', 'c'), html), '3')
|
||||
self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
|
||||
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
|
||||
|
||||
def test_download_json(self):
|
||||
uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
|
||||
|
@@ -6,6 +6,7 @@ from __future__ import unicode_literals
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
import collections
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
||||
@@ -130,6 +131,15 @@ class TestAllURLsMatching(unittest.TestCase):
|
||||
'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html',
|
||||
['Yahoo'])
|
||||
|
||||
def test_no_duplicated_ie_names(self):
|
||||
name_accu = collections.defaultdict(list)
|
||||
for ie in self.ies:
|
||||
name_accu[ie.IE_NAME.lower()].append(type(ie).__name__)
|
||||
for (ie_name, ie_list) in name_accu.items():
|
||||
self.assertEqual(
|
||||
len(ie_list), 1,
|
||||
'Multiple extractors with the same IE_NAME "%s" (%s)' % (ie_name, ', '.join(ie_list)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
@@ -87,6 +87,7 @@ class TestCompat(unittest.TestCase):
|
||||
|
||||
def test_compat_shlex_split(self):
|
||||
self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
|
||||
self.assertEqual(compat_shlex_split('-option "one\ntwo" \n -flag'), ['-option', 'one\ntwo', '-flag'])
|
||||
|
||||
def test_compat_etree_fromstring(self):
|
||||
xml = '''
|
||||
|
@@ -60,11 +60,13 @@ from youtube_dl.utils import (
|
||||
timeconvert,
|
||||
unescapeHTML,
|
||||
unified_strdate,
|
||||
unified_timestamp,
|
||||
unsmuggle_url,
|
||||
uppercase_escape,
|
||||
lowercase_escape,
|
||||
url_basename,
|
||||
urlencode_postdata,
|
||||
urshift,
|
||||
update_url_query,
|
||||
version_tuple,
|
||||
xpath_with_ns,
|
||||
@@ -283,8 +285,28 @@ class TestUtil(unittest.TestCase):
|
||||
'20150202')
|
||||
self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214')
|
||||
self.assertEqual(unified_strdate('25-09-2014'), '20140925')
|
||||
self.assertEqual(unified_strdate('27.02.2016 17:30'), '20160227')
|
||||
self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None)
|
||||
|
||||
def test_unified_timestamps(self):
|
||||
self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600)
|
||||
self.assertEqual(unified_timestamp('8/7/2009'), 1247011200)
|
||||
self.assertEqual(unified_timestamp('Dec 14, 2012'), 1355443200)
|
||||
self.assertEqual(unified_timestamp('2012/10/11 01:56:38 +0000'), 1349920598)
|
||||
self.assertEqual(unified_timestamp('1968 12 10'), -33436800)
|
||||
self.assertEqual(unified_timestamp('1968-12-10'), -33436800)
|
||||
self.assertEqual(unified_timestamp('28/01/2014 21:00:00 +0100'), 1390939200)
|
||||
self.assertEqual(
|
||||
unified_timestamp('11/26/2014 11:30:00 AM PST', day_first=False),
|
||||
1417001400)
|
||||
self.assertEqual(
|
||||
unified_timestamp('2/2/2015 6:47:40 PM', day_first=False),
|
||||
1422902860)
|
||||
self.assertEqual(unified_timestamp('Feb 14th 2016 5:45PM'), 1455471900)
|
||||
self.assertEqual(unified_timestamp('25-09-2014'), 1411603200)
|
||||
self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200)
|
||||
self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None)
|
||||
|
||||
def test_determine_ext(self):
|
||||
self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
|
||||
self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None)
|
||||
@@ -959,5 +981,9 @@ The first line
|
||||
self.assertRaises(ValueError, encode_base_n, 0, 70)
|
||||
self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table)
|
||||
|
||||
def test_urshift(self):
|
||||
self.assertEqual(urshift(3, 1), 1)
|
||||
self.assertEqual(urshift(-3, 1), 2147483646)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
@@ -196,6 +196,11 @@ def build_fragments_list(boot_info):
|
||||
first_frag_number = fragment_run_entry_table[0]['first']
|
||||
fragments_counter = itertools.count(first_frag_number)
|
||||
for segment, fragments_count in segment_run_table['segment_run']:
|
||||
# In some live HDS streams (for example Rai), `fragments_count` is
|
||||
# abnormal and causing out-of-memory errors. It's OK to change the
|
||||
# number of fragments for live streams as they are updated periodically
|
||||
if fragments_count == 4294967295 and boot_info['live']:
|
||||
fragments_count = 2
|
||||
for _ in range(fragments_count):
|
||||
res.append((segment, next(fragments_counter)))
|
||||
|
||||
@@ -329,7 +334,11 @@ class F4mFD(FragmentFD):
|
||||
|
||||
base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])
|
||||
bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
|
||||
boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url)
|
||||
# From Adobe F4M 3.0 spec:
|
||||
# The <baseURL> element SHALL be the base URL for all relative
|
||||
# (HTTP-based) URLs in the manifest. If <baseURL> is not present, said
|
||||
# URLs should be relative to the location of the containing document.
|
||||
boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url)
|
||||
live = boot_info['live']
|
||||
metadata_node = media.find(_add_ns('metadata'))
|
||||
if metadata_node is not None:
|
||||
|
@@ -2,14 +2,24 @@ from __future__ import unicode_literals
|
||||
|
||||
import os.path
|
||||
import re
|
||||
import binascii
|
||||
try:
|
||||
from Crypto.Cipher import AES
|
||||
can_decrypt_frag = True
|
||||
except ImportError:
|
||||
can_decrypt_frag = False
|
||||
|
||||
from .fragment import FragmentFD
|
||||
from .external import FFmpegFD
|
||||
|
||||
from ..compat import compat_urlparse
|
||||
from ..compat import (
|
||||
compat_urlparse,
|
||||
compat_struct_pack,
|
||||
)
|
||||
from ..utils import (
|
||||
encodeFilename,
|
||||
sanitize_open,
|
||||
parse_m3u8_attributes,
|
||||
)
|
||||
|
||||
|
||||
@@ -21,7 +31,7 @@ class HlsFD(FragmentFD):
|
||||
@staticmethod
|
||||
def can_download(manifest):
|
||||
UNSUPPORTED_FEATURES = (
|
||||
r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1]
|
||||
r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1]
|
||||
r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
|
||||
|
||||
# Live streams heuristic does not always work (e.g. geo restricted to Germany
|
||||
@@ -39,7 +49,9 @@ class HlsFD(FragmentFD):
|
||||
# 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
|
||||
# 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
|
||||
)
|
||||
return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES)
|
||||
check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES]
|
||||
check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest)
|
||||
return all(check_results)
|
||||
|
||||
def real_download(self, filename, info_dict):
|
||||
man_url = info_dict['url']
|
||||
@@ -57,36 +69,60 @@ class HlsFD(FragmentFD):
|
||||
fd.add_progress_hook(ph)
|
||||
return fd.real_download(filename, info_dict)
|
||||
|
||||
fragment_urls = []
|
||||
total_frags = 0
|
||||
for line in s.splitlines():
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
segment_url = (
|
||||
line
|
||||
if re.match(r'^https?://', line)
|
||||
else compat_urlparse.urljoin(man_url, line))
|
||||
fragment_urls.append(segment_url)
|
||||
# We only download the first fragment during the test
|
||||
if self.params.get('test', False):
|
||||
break
|
||||
total_frags += 1
|
||||
|
||||
ctx = {
|
||||
'filename': filename,
|
||||
'total_frags': len(fragment_urls),
|
||||
'total_frags': total_frags,
|
||||
}
|
||||
|
||||
self._prepare_and_start_frag_download(ctx)
|
||||
|
||||
i = 0
|
||||
media_sequence = 0
|
||||
decrypt_info = {'METHOD': 'NONE'}
|
||||
frags_filenames = []
|
||||
for i, frag_url in enumerate(fragment_urls):
|
||||
frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
|
||||
success = ctx['dl'].download(frag_filename, {'url': frag_url})
|
||||
if not success:
|
||||
return False
|
||||
down, frag_sanitized = sanitize_open(frag_filename, 'rb')
|
||||
ctx['dest_stream'].write(down.read())
|
||||
down.close()
|
||||
frags_filenames.append(frag_sanitized)
|
||||
for line in s.splitlines():
|
||||
line = line.strip()
|
||||
if line:
|
||||
if not line.startswith('#'):
|
||||
frag_url = (
|
||||
line
|
||||
if re.match(r'^https?://', line)
|
||||
else compat_urlparse.urljoin(man_url, line))
|
||||
frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
|
||||
success = ctx['dl'].download(frag_filename, {'url': frag_url})
|
||||
if not success:
|
||||
return False
|
||||
down, frag_sanitized = sanitize_open(frag_filename, 'rb')
|
||||
frag_content = down.read()
|
||||
down.close()
|
||||
if decrypt_info['METHOD'] == 'AES-128':
|
||||
iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
|
||||
frag_content = AES.new(
|
||||
decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
|
||||
ctx['dest_stream'].write(frag_content)
|
||||
frags_filenames.append(frag_sanitized)
|
||||
# We only download the first fragment during the test
|
||||
if self.params.get('test', False):
|
||||
break
|
||||
i += 1
|
||||
media_sequence += 1
|
||||
elif line.startswith('#EXT-X-KEY'):
|
||||
decrypt_info = parse_m3u8_attributes(line[11:])
|
||||
if decrypt_info['METHOD'] == 'AES-128':
|
||||
if 'IV' in decrypt_info:
|
||||
decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:])
|
||||
if not re.match(r'^https?://', decrypt_info['URI']):
|
||||
decrypt_info['URI'] = compat_urlparse.urljoin(
|
||||
man_url, decrypt_info['URI'])
|
||||
decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read()
|
||||
elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
|
||||
media_sequence = int(line[22:])
|
||||
|
||||
self._finish_frag_download(ctx)
|
||||
|
||||
|
@@ -156,7 +156,10 @@ class AdobeTVVideoIE(InfoExtractor):
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
video_data = self._download_json(url + '?format=json', video_id)
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
|
||||
video_data = self._parse_json(self._search_regex(
|
||||
r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id)
|
||||
|
||||
formats = [{
|
||||
'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')),
|
||||
|
@@ -2,23 +2,137 @@ from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .theplatform import ThePlatformIE
|
||||
from ..utils import (
|
||||
smuggle_url,
|
||||
update_url_query,
|
||||
unescapeHTML,
|
||||
extract_attributes,
|
||||
get_element_by_attribute,
|
||||
)
|
||||
from ..compat import (
|
||||
compat_urlparse,
|
||||
)
|
||||
|
||||
|
||||
class AENetworksIE(InfoExtractor):
|
||||
class AENetworksBaseIE(ThePlatformIE):
|
||||
_THEPLATFORM_KEY = 'crazyjava'
|
||||
_THEPLATFORM_SECRET = 's3cr3t'
|
||||
|
||||
|
||||
class AENetworksIE(AENetworksBaseIE):
|
||||
IE_NAME = 'aenetworks'
|
||||
IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
|
||||
_VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
|
||||
_VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)'
|
||||
_TESTS = [{
|
||||
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
|
||||
'md5': '8ff93eb073449f151d6b90c0ae1ef0c7',
|
||||
'info_dict': {
|
||||
'id': '22253814',
|
||||
'ext': 'mp4',
|
||||
'title': 'Winter Is Coming',
|
||||
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
|
||||
'timestamp': 1338306241,
|
||||
'upload_date': '20120529',
|
||||
'uploader': 'AENE-NEW',
|
||||
},
|
||||
'add_ie': ['ThePlatform'],
|
||||
}, {
|
||||
'url': 'http://www.history.com/shows/ancient-aliens/season-1',
|
||||
'info_dict': {
|
||||
'id': '71889446852',
|
||||
},
|
||||
'playlist_mincount': 5,
|
||||
}, {
|
||||
'url': 'http://www.mylifetime.com/shows/atlanta-plastic',
|
||||
'info_dict': {
|
||||
'id': 'SERIES4317',
|
||||
'title': 'Atlanta Plastic',
|
||||
},
|
||||
'playlist_mincount': 2,
|
||||
}, {
|
||||
'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
|
||||
'only_matching': True
|
||||
}, {
|
||||
'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
|
||||
'only_matching': True
|
||||
}, {
|
||||
'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6',
|
||||
'only_matching': True
|
||||
}, {
|
||||
'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',
|
||||
'only_matching': True
|
||||
}]
|
||||
_DOMAIN_TO_REQUESTOR_ID = {
|
||||
'history.com': 'HISTORY',
|
||||
'aetv.com': 'AETV',
|
||||
'mylifetime.com': 'LIFETIME',
|
||||
'fyi.tv': 'FYI',
|
||||
}
|
||||
|
||||
def _real_extract(self, url):
|
||||
domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups()
|
||||
display_id = show_path or movie_display_id
|
||||
webpage = self._download_webpage(url, display_id)
|
||||
if show_path:
|
||||
url_parts = show_path.split('/')
|
||||
url_parts_len = len(url_parts)
|
||||
if url_parts_len == 1:
|
||||
entries = []
|
||||
for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
|
||||
entries.append(self.url_result(
|
||||
compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
|
||||
return self.playlist_result(
|
||||
entries, self._html_search_meta('aetn:SeriesId', webpage),
|
||||
self._html_search_meta('aetn:SeriesTitle', webpage))
|
||||
elif url_parts_len == 2:
|
||||
entries = []
|
||||
for episode_item in re.findall(r'(?s)<div[^>]+class="[^"]*episode-item[^"]*"[^>]*>', webpage):
|
||||
episode_attributes = extract_attributes(episode_item)
|
||||
episode_url = compat_urlparse.urljoin(
|
||||
url, episode_attributes['data-canonical'])
|
||||
entries.append(self.url_result(
|
||||
episode_url, 'AENetworks',
|
||||
episode_attributes['data-videoid']))
|
||||
return self.playlist_result(
|
||||
entries, self._html_search_meta('aetn:SeasonId', webpage))
|
||||
|
||||
query = {
|
||||
'mbr': 'true',
|
||||
'assetTypes': 'medium_video_s3'
|
||||
}
|
||||
video_id = self._html_search_meta('aetn:VideoID', webpage)
|
||||
media_url = self._search_regex(
|
||||
r"media_url\s*=\s*'([^']+)'", webpage, 'video url')
|
||||
theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
|
||||
r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
|
||||
info = self._parse_theplatform_metadata(theplatform_metadata)
|
||||
if theplatform_metadata.get('AETN$isBehindWall'):
|
||||
requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
|
||||
resource = '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"><channel><title>%s</title><item><title>%s</title><guid>%s</guid><media:rating scheme="urn:v-chip">%s</media:rating></item></channel></rss>' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating'])
|
||||
query['auth'] = self._extract_mvpd_auth(
|
||||
url, video_id, requestor_id, resource)
|
||||
info.update(self._search_json_ld(webpage, video_id, fatal=False))
|
||||
media_url = update_url_query(media_url, query)
|
||||
media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET)
|
||||
formats, subtitles = self._extract_theplatform_smil(media_url, video_id)
|
||||
self._sort_formats(formats)
|
||||
info.update({
|
||||
'id': video_id,
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
})
|
||||
return info
|
||||
|
||||
|
||||
class HistoryTopicIE(AENetworksBaseIE):
|
||||
IE_NAME = 'history:topic'
|
||||
IE_DESC = 'History.com Topic'
|
||||
_VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?'
|
||||
_TESTS = [{
|
||||
'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
|
||||
'info_dict': {
|
||||
'id': 'g12m5Gyt3fdR',
|
||||
'id': '40700995724',
|
||||
'ext': 'mp4',
|
||||
'title': "Bet You Didn't Know: Valentine's Day",
|
||||
'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
|
||||
@@ -31,57 +145,61 @@ class AENetworksIE(InfoExtractor):
|
||||
'skip_download': True,
|
||||
},
|
||||
'add_ie': ['ThePlatform'],
|
||||
'expected_warnings': ['JSON-LD'],
|
||||
}, {
|
||||
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
|
||||
'md5': '8ff93eb073449f151d6b90c0ae1ef0c7',
|
||||
'info_dict': {
|
||||
'id': 'eg47EERs_JsZ',
|
||||
'ext': 'mp4',
|
||||
'title': 'Winter Is Coming',
|
||||
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
|
||||
'timestamp': 1338306241,
|
||||
'upload_date': '20120529',
|
||||
'uploader': 'AENE-NEW',
|
||||
'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos',
|
||||
'info_dict':
|
||||
{
|
||||
'id': 'world-war-i-history',
|
||||
'title': 'World War I History',
|
||||
},
|
||||
'add_ie': ['ThePlatform'],
|
||||
'playlist_mincount': 24,
|
||||
}, {
|
||||
'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry',
|
||||
'only_matching': True
|
||||
'url': 'http://www.history.com/topics/world-war-i-history/videos',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage',
|
||||
'only_matching': True
|
||||
'url': 'http://www.history.com/topics/world-war-i/world-war-i-history',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients',
|
||||
'only_matching': True
|
||||
'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
page_type, video_id = re.match(self._VALID_URL, url).groups()
|
||||
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
|
||||
video_url_re = [
|
||||
r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id,
|
||||
r"media_url\s*=\s*'([^']+)'"
|
||||
]
|
||||
video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url'))
|
||||
query = {'mbr': 'true'}
|
||||
if page_type == 'shows':
|
||||
query['assetTypes'] = 'medium_video_s3'
|
||||
if 'switch=hds' in video_url:
|
||||
query['switch'] = 'hls'
|
||||
|
||||
info = self._search_json_ld(webpage, video_id, fatal=False)
|
||||
info.update({
|
||||
def theplatform_url_result(self, theplatform_url, video_id, query):
|
||||
return {
|
||||
'_type': 'url_transparent',
|
||||
'id': video_id,
|
||||
'url': smuggle_url(
|
||||
update_url_query(video_url, query),
|
||||
update_url_query(theplatform_url, query),
|
||||
{
|
||||
'sig': {
|
||||
'key': 'crazyjava',
|
||||
'secret': 's3cr3t'},
|
||||
'key': self._THEPLATFORM_KEY,
|
||||
'secret': self._THEPLATFORM_SECRET,
|
||||
},
|
||||
'force_smil_url': True
|
||||
}),
|
||||
})
|
||||
return info
|
||||
'ie_key': 'ThePlatform',
|
||||
}
|
||||
|
||||
def _real_extract(self, url):
|
||||
topic_id, video_display_id = re.match(self._VALID_URL, url).groups()
|
||||
if video_display_id:
|
||||
webpage = self._download_webpage(url, video_display_id)
|
||||
release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups()
|
||||
release_url = unescapeHTML(release_url)
|
||||
|
||||
return self.theplatform_url_result(
|
||||
release_url, video_id, {
|
||||
'mbr': 'true',
|
||||
'switch': 'hls'
|
||||
})
|
||||
else:
|
||||
webpage = self._download_webpage(url, topic_id)
|
||||
entries = []
|
||||
for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage):
|
||||
video_attributes = extract_attributes(episode_item)
|
||||
entries.append(self.theplatform_url_result(
|
||||
video_attributes['data-release-url'], video_attributes['data-id'], {
|
||||
'mbr': 'true',
|
||||
'switch': 'hls'
|
||||
}))
|
||||
return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage))
|
||||
|
@@ -24,10 +24,10 @@ class AftonbladetIE(InfoExtractor):
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
|
||||
# find internal video meta data
|
||||
meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
|
||||
meta_url = 'http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no/video/%s.json'
|
||||
player_config = self._parse_json(self._html_search_regex(
|
||||
r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
|
||||
internal_meta_id = player_config['videoId']
|
||||
internal_meta_id = player_config['aptomaVideoId']
|
||||
internal_meta_url = meta_url % internal_meta_id
|
||||
internal_meta_json = self._download_json(
|
||||
internal_meta_url, video_id, 'Downloading video meta data')
|
||||
|
@@ -7,6 +7,8 @@ from .common import InfoExtractor
|
||||
from ..compat import compat_urlparse
|
||||
from ..utils import (
|
||||
int_or_none,
|
||||
parse_duration,
|
||||
unified_strdate,
|
||||
)
|
||||
|
||||
|
||||
@@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor):
|
||||
_TESTS = [{
|
||||
'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
|
||||
'info_dict': {
|
||||
'id': 'manofsteel',
|
||||
'id': '5111',
|
||||
'title': 'Man of Steel',
|
||||
},
|
||||
'playlist': [
|
||||
{
|
||||
@@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor):
|
||||
'id': 'blackthorn',
|
||||
},
|
||||
'playlist_mincount': 2,
|
||||
'expected_warnings': ['Unable to download JSON metadata'],
|
||||
}, {
|
||||
# json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
|
||||
'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
|
||||
'info_dict': {
|
||||
'id': '15881',
|
||||
'title': 'Kung Fu Panda 3',
|
||||
},
|
||||
'playlist_mincount': 4,
|
||||
}, {
|
||||
'url': 'http://trailers.apple.com/ca/metropole/autrui/',
|
||||
'only_matching': True,
|
||||
@@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor):
|
||||
movie = mobj.group('movie')
|
||||
uploader_id = mobj.group('company')
|
||||
|
||||
webpage = self._download_webpage(url, movie)
|
||||
film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')
|
||||
film_data = self._download_json(
|
||||
'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id,
|
||||
film_id, fatal=False)
|
||||
|
||||
if film_data:
|
||||
entries = []
|
||||
for clip in film_data.get('clips', []):
|
||||
clip_title = clip['title']
|
||||
|
||||
formats = []
|
||||
for version, version_data in clip.get('versions', {}).items():
|
||||
for size, size_data in version_data.get('sizes', {}).items():
|
||||
src = size_data.get('src')
|
||||
if not src:
|
||||
continue
|
||||
formats.append({
|
||||
'format_id': '%s-%s' % (version, size),
|
||||
'url': re.sub(r'_(\d+p.mov)', r'_h\1', src),
|
||||
'width': int_or_none(size_data.get('width')),
|
||||
'height': int_or_none(size_data.get('height')),
|
||||
'language': version[:2],
|
||||
})
|
||||
self._sort_formats(formats)
|
||||
|
||||
entries.append({
|
||||
'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
|
||||
'formats': formats,
|
||||
'title': clip_title,
|
||||
'thumbnail': clip.get('screen') or clip.get('thumb'),
|
||||
'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
|
||||
'upload_date': unified_strdate(clip.get('posted')),
|
||||
'uploader_id': uploader_id,
|
||||
})
|
||||
|
||||
page_data = film_data.get('page', {})
|
||||
return self.playlist_result(entries, film_id, page_data.get('movie_title'))
|
||||
|
||||
playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
|
||||
|
||||
def fix_html(s):
|
||||
|
@@ -8,7 +8,6 @@ from .generic import GenericIE
|
||||
from ..utils import (
|
||||
determine_ext,
|
||||
ExtractorError,
|
||||
get_element_by_attribute,
|
||||
qualities,
|
||||
int_or_none,
|
||||
parse_duration,
|
||||
@@ -274,41 +273,3 @@ class ARDIE(InfoExtractor):
|
||||
'upload_date': upload_date,
|
||||
'thumbnail': thumbnail,
|
||||
}
|
||||
|
||||
|
||||
class SportschauIE(ARDMediathekIE):
    # Extractor for sportschau.de video pages; media info comes from the
    # inherited _extract_media_info helper.
    IE_NAME = 'Sportschau'
    _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
    _TESTS = [{
        'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
        'info_dict': {
            'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
            'ext': 'mp4',
            'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
            'thumbnail': 're:^https?://.*\.jpg$',
            'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for a sportschau.de video page.

        The page is downloaded once; the headline element and the
        ``description`` meta tag supply title and description, and the
        media info is requested from ``<baseurl>-mc_defaultQuality-h.json``.
        """
        # One match yields both named groups of _VALID_URL.
        base_url, video_id = re.match(self._VALID_URL, url).group('baseurl', 'id')

        webpage = self._download_webpage(url, video_id)
        headline = get_element_by_attribute('class', 'headline', webpage)
        summary = self._html_search_meta('description', webpage, 'description')

        media_json_url = base_url + '-mc_defaultQuality-h.json'
        info = self._extract_media_info(media_json_url, webpage, video_id)

        # Page-level metadata overrides whatever the media info carried.
        info.update({
            'title': headline,
            'description': summary,
        })
        return info
|
||||
|
@@ -180,11 +180,14 @@ class ArteTVBaseIE(InfoExtractor):
|
||||
|
||||
class ArteTVPlus7IE(ArteTVBaseIE):
|
||||
IE_NAME = 'arte.tv:+7'
|
||||
_VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)'
|
||||
_VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
@classmethod
|
||||
@@ -240,10 +243,10 @@ class ArteTVPlus7IE(ArteTVBaseIE):
|
||||
return self._extract_from_json_url(json_url, video_id, lang, title=title)
|
||||
# Different kind of embed URL (e.g.
|
||||
# http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
|
||||
embed_url = self._search_regex(
|
||||
r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
|
||||
webpage, 'embed url', group='url')
|
||||
return self.url_result(embed_url)
|
||||
entries = [
|
||||
self.url_result(url)
|
||||
for _, url in re.findall(r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage)]
|
||||
return self.playlist_result(entries)
|
||||
|
||||
|
||||
# It also uses the arte_vp_url url from the webpage to extract the information
|
||||
@@ -252,22 +255,17 @@ class ArteTVCreativeIE(ArteTVPlus7IE):
|
||||
_VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
|
||||
'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1',
|
||||
'info_dict': {
|
||||
'id': '72176',
|
||||
'id': '057405-001-A',
|
||||
'ext': 'mp4',
|
||||
'title': 'Folge 2 - Corporate Design',
|
||||
'upload_date': '20131004',
|
||||
'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)',
|
||||
'upload_date': '20150716',
|
||||
},
|
||||
}, {
|
||||
'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion',
|
||||
'info_dict': {
|
||||
'id': '160676',
|
||||
'ext': 'mp4',
|
||||
'title': 'Monty Python live (mostly)',
|
||||
'description': 'Événement ! Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n',
|
||||
'upload_date': '20140805',
|
||||
}
|
||||
'playlist_count': 11,
|
||||
'add_ie': ['Youtube'],
|
||||
}, {
|
||||
'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde',
|
||||
'only_matching': True,
|
||||
@@ -349,14 +347,13 @@ class ArteTVCinemaIE(ArteTVPlus7IE):
|
||||
_VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'http://cinema.arte.tv/de/node/38291',
|
||||
'md5': '6b275511a5107c60bacbeeda368c3aa1',
|
||||
'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck',
|
||||
'md5': 'a5b9dd5575a11d93daf0e3f404f45438',
|
||||
'info_dict': {
|
||||
'id': '055876-000_PWA12025-D',
|
||||
'id': '062494-000-A',
|
||||
'ext': 'mp4',
|
||||
'title': 'Tod auf dem Nil',
|
||||
'upload_date': '20160122',
|
||||
'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',
|
||||
'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck',
|
||||
'upload_date': '20150807',
|
||||
},
|
||||
}]
|
||||
|
||||
@@ -422,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
|
||||
'info_dict': {
|
||||
'id': 'PL-013263',
|
||||
'title': 'Areva & Uramin',
|
||||
'description': 'md5:a1dc0312ce357c262259139cfd48c9bf',
|
||||
},
|
||||
'playlist_mincount': 6,
|
||||
}, {
|
||||
|
@@ -46,6 +46,7 @@ class AzubuIE(InfoExtractor):
|
||||
'uploader_id': 272749,
|
||||
'view_count': int,
|
||||
},
|
||||
'skip': 'Channel offline',
|
||||
},
|
||||
]
|
||||
|
||||
@@ -56,22 +57,26 @@ class AzubuIE(InfoExtractor):
|
||||
'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data']
|
||||
|
||||
title = data['title'].strip()
|
||||
description = data['description']
|
||||
thumbnail = data['thumbnail']
|
||||
view_count = data['view_count']
|
||||
uploader = data['user']['username']
|
||||
uploader_id = data['user']['id']
|
||||
description = data.get('description')
|
||||
thumbnail = data.get('thumbnail')
|
||||
view_count = data.get('view_count')
|
||||
user = data.get('user', {})
|
||||
uploader = user.get('username')
|
||||
uploader_id = user.get('id')
|
||||
|
||||
stream_params = json.loads(data['stream_params'])
|
||||
|
||||
timestamp = float_or_none(stream_params['creationDate'], 1000)
|
||||
duration = float_or_none(stream_params['length'], 1000)
|
||||
timestamp = float_or_none(stream_params.get('creationDate'), 1000)
|
||||
duration = float_or_none(stream_params.get('length'), 1000)
|
||||
|
||||
renditions = stream_params.get('renditions') or []
|
||||
video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength')
|
||||
if video:
|
||||
renditions.append(video)
|
||||
|
||||
if not renditions and not user.get('channel', {}).get('is_live', True):
|
||||
raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True)
|
||||
|
||||
formats = [{
|
||||
'url': fmt['url'],
|
||||
'width': fmt['frameWidth'],
|
||||
|
@@ -192,6 +192,7 @@ class BBCCoUkIE(InfoExtractor):
|
||||
# rtmp download
|
||||
'skip_download': True,
|
||||
},
|
||||
'skip': 'Now it\'s really geo-restricted',
|
||||
}, {
|
||||
# compact player (https://github.com/rg3/youtube-dl/issues/8147)
|
||||
'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
|
||||
|
@@ -1,31 +1,27 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import compat_urllib_parse_unquote
|
||||
from ..utils import (
|
||||
xpath_text,
|
||||
xpath_with_ns,
|
||||
int_or_none,
|
||||
parse_iso8601,
|
||||
)
|
||||
from .mtv import MTVServicesInfoExtractor
|
||||
from ..utils import unified_strdate
|
||||
from ..compat import compat_urllib_parse_urlencode
|
||||
|
||||
|
||||
class BetIE(InfoExtractor):
|
||||
class BetIE(MTVServicesInfoExtractor):
|
||||
_VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
|
||||
_TESTS = [
|
||||
{
|
||||
'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
|
||||
'info_dict': {
|
||||
'id': 'news/national/2014/a-conversation-with-president-obama',
|
||||
'id': '07e96bd3-8850-3051-b856-271b457f0ab8',
|
||||
'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
|
||||
'ext': 'flv',
|
||||
'title': 'A Conversation With President Obama',
|
||||
'description': 'md5:699d0652a350cf3e491cd15cc745b5da',
|
||||
'description': 'President Obama urges persistence in confronting racism and bias.',
|
||||
'duration': 1534,
|
||||
'timestamp': 1418075340,
|
||||
'upload_date': '20141208',
|
||||
'uploader': 'admin',
|
||||
'thumbnail': 're:(?i)^https?://.*\.jpg$',
|
||||
'subtitles': {
|
||||
'en': 'mincount:2',
|
||||
}
|
||||
},
|
||||
'params': {
|
||||
# rtmp download
|
||||
@@ -35,16 +31,17 @@ class BetIE(InfoExtractor):
|
||||
{
|
||||
'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
|
||||
'info_dict': {
|
||||
'id': 'news/national/2014/justice-for-ferguson-a-community-reacts',
|
||||
'id': '9f516bf1-7543-39c4-8076-dd441b459ba9',
|
||||
'display_id': 'justice-for-ferguson-a-community-reacts',
|
||||
'ext': 'flv',
|
||||
'title': 'Justice for Ferguson: A Community Reacts',
|
||||
'description': 'A BET News special.',
|
||||
'duration': 1696,
|
||||
'timestamp': 1416942360,
|
||||
'upload_date': '20141125',
|
||||
'uploader': 'admin',
|
||||
'thumbnail': 're:(?i)^https?://.*\.jpg$',
|
||||
'subtitles': {
|
||||
'en': 'mincount:2',
|
||||
}
|
||||
},
|
||||
'params': {
|
||||
# rtmp download
|
||||
@@ -53,57 +50,32 @@ class BetIE(InfoExtractor):
|
||||
}
|
||||
]
|
||||
|
||||
_FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player"
|
||||
|
||||
def _get_feed_query(self, uri):
    """Return the URL-encoded feed query string carrying *uri* as ``uuid``."""
    feed_params = {
        'uuid': uri,
    }
    return compat_urllib_parse_urlencode(feed_params)
|
||||
|
||||
def _extract_mgid(self, webpage):
    """Return the value of the first ``data-uri="..."`` attribute in *webpage*."""
    mgid = self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid')
    return mgid
|
||||
|
||||
def _real_extract(self, url):
|
||||
display_id = self._match_id(url)
|
||||
|
||||
webpage = self._download_webpage(url, display_id)
|
||||
mgid = self._extract_mgid(webpage)
|
||||
videos_info = self._get_videos_info(mgid)
|
||||
|
||||
media_url = compat_urllib_parse_unquote(self._search_regex(
|
||||
[r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
|
||||
webpage, 'media URL'))
|
||||
info_dict = videos_info['entries'][0]
|
||||
|
||||
video_id = self._search_regex(
|
||||
r'/video/(.*)/_jcr_content/', media_url, 'video id')
|
||||
upload_date = unified_strdate(self._html_search_meta('date', webpage))
|
||||
description = self._html_search_meta('description', webpage)
|
||||
|
||||
mrss = self._download_xml(media_url, display_id)
|
||||
|
||||
item = mrss.find('./channel/item')
|
||||
|
||||
NS_MAP = {
|
||||
'dc': 'http://purl.org/dc/elements/1.1/',
|
||||
'media': 'http://search.yahoo.com/mrss/',
|
||||
'ka': 'http://kickapps.com/karss',
|
||||
}
|
||||
|
||||
title = xpath_text(item, './title', 'title')
|
||||
description = xpath_text(
|
||||
item, './description', 'description', fatal=False)
|
||||
|
||||
timestamp = parse_iso8601(xpath_text(
|
||||
item, xpath_with_ns('./dc:date', NS_MAP),
|
||||
'upload date', fatal=False))
|
||||
uploader = xpath_text(
|
||||
item, xpath_with_ns('./dc:creator', NS_MAP),
|
||||
'uploader', fatal=False)
|
||||
|
||||
media_content = item.find(
|
||||
xpath_with_ns('./media:content', NS_MAP))
|
||||
duration = int_or_none(media_content.get('duration'))
|
||||
smil_url = media_content.get('url')
|
||||
|
||||
thumbnail = media_content.find(
|
||||
xpath_with_ns('./media:thumbnail', NS_MAP)).get('url')
|
||||
|
||||
formats = self._extract_smil_formats(smil_url, display_id)
|
||||
self._sort_formats(formats)
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
info_dict.update({
|
||||
'display_id': display_id,
|
||||
'title': title,
|
||||
'description': description,
|
||||
'thumbnail': thumbnail,
|
||||
'timestamp': timestamp,
|
||||
'uploader': uploader,
|
||||
'duration': duration,
|
||||
'formats': formats,
|
||||
}
|
||||
'upload_date': upload_date,
|
||||
})
|
||||
|
||||
return info_dict
|
||||
|
@@ -29,7 +29,8 @@ class BRIE(InfoExtractor):
|
||||
'duration': 180,
|
||||
'uploader': 'Reinhard Weber',
|
||||
'upload_date': '20150422',
|
||||
}
|
||||
},
|
||||
'skip': '404 not found',
|
||||
},
|
||||
{
|
||||
'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
|
||||
@@ -40,7 +41,8 @@ class BRIE(InfoExtractor):
|
||||
'title': 'Manfred Schreiber ist tot',
|
||||
'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',
|
||||
'duration': 26,
|
||||
}
|
||||
},
|
||||
'skip': '404 not found',
|
||||
},
|
||||
{
|
||||
'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',
|
||||
@@ -51,7 +53,8 @@ class BRIE(InfoExtractor):
|
||||
'title': 'Kurzweilig und sehr bewegend',
|
||||
'description': 'md5:0351996e3283d64adeb38ede91fac54e',
|
||||
'duration': 296,
|
||||
}
|
||||
},
|
||||
'skip': '404 not found',
|
||||
},
|
||||
{
|
||||
'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
|
||||
|
@@ -5,6 +5,7 @@ import json
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .facebook import FacebookIE
|
||||
|
||||
|
||||
class BuzzFeedIE(InfoExtractor):
|
||||
@@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor):
|
||||
'info_dict': {
|
||||
'id': 'aVCR29aE_OQ',
|
||||
'ext': 'mp4',
|
||||
'title': 'Angry Ram destroys a punching bag..',
|
||||
'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
|
||||
'upload_date': '20141024',
|
||||
'uploader_id': 'Buddhanz1',
|
||||
'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl',
|
||||
'uploader': 'Buddhanz',
|
||||
'title': 'Angry Ram destroys a punching bag',
|
||||
'uploader': 'Angry Ram',
|
||||
}
|
||||
}]
|
||||
}, {
|
||||
@@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor):
|
||||
'info_dict': {
|
||||
'id': 'mVmBL8B-In0',
|
||||
'ext': 'mp4',
|
||||
'title': 're:Munchkin the Teddy Bear gets her exercise',
|
||||
'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
|
||||
'upload_date': '20141124',
|
||||
'uploader_id': 'CindysMunchkin',
|
||||
'description': 're:© 2014 Munchkin the',
|
||||
'uploader': 're:^Munchkin the',
|
||||
'title': 're:Munchkin the Teddy Bear gets her exercise',
|
||||
},
|
||||
}]
|
||||
}, {
|
||||
'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
|
||||
'info_dict': {
|
||||
'id': 'the-most-adorable-crash-landing-ever',
|
||||
'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
|
||||
'description': 'This gosling knows how to stick a landing.',
|
||||
},
|
||||
'playlist': [{
|
||||
'md5': '763ca415512f91ca62e4621086900a23',
|
||||
'info_dict': {
|
||||
'id': '971793786185728',
|
||||
'ext': 'mp4',
|
||||
'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
|
||||
'uploader': 'Calgary Outdoor Centre-University of Calgary',
|
||||
},
|
||||
}],
|
||||
'add_ie': ['Facebook'],
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
@@ -66,6 +84,10 @@ class BuzzFeedIE(InfoExtractor):
|
||||
continue
|
||||
entries.append(self.url_result(video['url']))
|
||||
|
||||
facebook_url = FacebookIE._extract_url(webpage)
|
||||
if facebook_url:
|
||||
entries.append(self.url_result(facebook_url))
|
||||
|
||||
return {
|
||||
'_type': 'playlist',
|
||||
'id': playlist_id,
|
||||
|
@@ -1,17 +1,13 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .theplatform import ThePlatformIE
|
||||
from .theplatform import ThePlatformFeedIE
|
||||
from ..utils import (
|
||||
xpath_text,
|
||||
xpath_element,
|
||||
int_or_none,
|
||||
find_xpath_attr,
|
||||
)
|
||||
|
||||
|
||||
class CBSBaseIE(ThePlatformIE):
|
||||
class CBSBaseIE(ThePlatformFeedIE):
|
||||
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
|
||||
closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL')
|
||||
return {
|
||||
@@ -21,9 +17,22 @@ class CBSBaseIE(ThePlatformIE):
|
||||
}]
|
||||
} if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
|
||||
|
||||
def _extract_video_info(self, filter_query, video_id):
    """Delegate to ``_extract_feed_info`` with the fixed CBS arguments.

    *filter_query* and *video_id* are passed through unchanged. The two
    leading string constants are presumably the ThePlatform provider and
    feed identifiers (TODO confirm against ``_extract_feed_info``).
    """
    def series_fields(entry):
        # Map the cbs$-prefixed feed entry keys onto info-dict fields.
        return {
            'series': entry.get('cbs$SeriesTitle'),
            'season_number': int_or_none(entry.get('cbs$SeasonNumber')),
            'episode': entry.get('cbs$EpisodeTitle'),
            'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')),
        }

    stream_pack_options = {
        'StreamPack': {
            'manifest': 'm3u',
        }
    }
    return self._extract_feed_info(
        'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id,
        series_fields, stream_pack_options)
|
||||
|
||||
|
||||
class CBSIE(CBSBaseIE):
|
||||
_VALID_URL = r'(?:cbs:(?P<content_id>\w+)|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<display_id>[^/]+))'
|
||||
_VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
|
||||
@@ -38,25 +47,7 @@ class CBSIE(CBSBaseIE):
|
||||
'upload_date': '20131127',
|
||||
'uploader': 'CBSI-NEW',
|
||||
},
|
||||
'params': {
|
||||
# rtmp download
|
||||
'skip_download': True,
|
||||
},
|
||||
'_skip': 'Blocked outside the US',
|
||||
}, {
|
||||
'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
|
||||
'info_dict': {
|
||||
'id': 'WWF_5KqY3PK1',
|
||||
'display_id': 'st-vincent',
|
||||
'ext': 'flv',
|
||||
'title': 'Live on Letterman - St. Vincent',
|
||||
'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
|
||||
'duration': 3221,
|
||||
},
|
||||
'params': {
|
||||
# rtmp download
|
||||
'skip_download': True,
|
||||
},
|
||||
'expected_warnings': ['Failed to download m3u8 information'],
|
||||
'_skip': 'Blocked outside the US',
|
||||
}, {
|
||||
'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
|
||||
@@ -68,44 +59,5 @@ class CBSIE(CBSBaseIE):
|
||||
TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'
|
||||
|
||||
def _real_extract(self, url):
|
||||
content_id, display_id = re.match(self._VALID_URL, url).groups()
|
||||
if not content_id:
|
||||
webpage = self._download_webpage(url, display_id)
|
||||
content_id = self._search_regex(
|
||||
[r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"],
|
||||
webpage, 'content id')
|
||||
items_data = self._download_xml(
|
||||
'http://can.cbs.com/thunder/player/videoPlayerService.php',
|
||||
content_id, query={'partner': 'cbs', 'contentId': content_id})
|
||||
video_data = xpath_element(items_data, './/item')
|
||||
title = xpath_text(video_data, 'videoTitle', 'title', True)
|
||||
|
||||
subtitles = {}
|
||||
formats = []
|
||||
for item in items_data.findall('.//item'):
|
||||
pid = xpath_text(item, 'pid')
|
||||
if not pid:
|
||||
continue
|
||||
tp_release_url = self.TP_RELEASE_URL_TEMPLATE % pid
|
||||
if '.m3u8' in xpath_text(item, 'contentUrl', default=''):
|
||||
tp_release_url += '&manifest=m3u'
|
||||
tp_formats, tp_subtitles = self._extract_theplatform_smil(
|
||||
tp_release_url, content_id, 'Downloading %s SMIL data' % pid)
|
||||
formats.extend(tp_formats)
|
||||
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
|
||||
self._sort_formats(formats)
|
||||
|
||||
info = self.get_metadata('dJ5BDC/media/guid/2198311517/%s' % content_id, content_id)
|
||||
info.update({
|
||||
'id': content_id,
|
||||
'display_id': display_id,
|
||||
'title': title,
|
||||
'series': xpath_text(video_data, 'seriesTitle'),
|
||||
'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
|
||||
'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
|
||||
'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
|
||||
'thumbnail': xpath_text(video_data, 'previewImageURL'),
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
})
|
||||
return info
|
||||
content_id = self._match_id(url)
|
||||
return self._extract_video_info('byGuid=%s' % content_id, content_id)
|
||||
|
@@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE):
|
||||
|
||||
media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId'])
|
||||
formats, subtitles = [], {}
|
||||
if site == 'cnet':
|
||||
formats, subtitles = self._extract_theplatform_smil(
|
||||
self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id)
|
||||
for (fkey, vid) in vdata['files'].items():
|
||||
if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
|
||||
continue
|
||||
@@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE):
|
||||
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
|
||||
self._sort_formats(formats)
|
||||
|
||||
info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id)
|
||||
info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id)
|
||||
info.update({
|
||||
'id': video_id,
|
||||
'display_id': display_id,
|
||||
|
@@ -30,9 +30,12 @@ class CBSNewsIE(CBSBaseIE):
|
||||
{
|
||||
'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
|
||||
'info_dict': {
|
||||
'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
|
||||
'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y',
|
||||
'ext': 'mp4',
|
||||
'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
|
||||
'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
|
||||
'upload_date': '19700101',
|
||||
'uploader': 'CBSI-NEW',
|
||||
'thumbnail': 're:^https?://.*\.jpg$',
|
||||
'duration': 205,
|
||||
'subtitles': {
|
||||
@@ -58,30 +61,8 @@ class CBSNewsIE(CBSBaseIE):
|
||||
webpage, 'video JSON info'), video_id)
|
||||
|
||||
item = video_info['item'] if 'item' in video_info else video_info
|
||||
title = item.get('articleTitle') or item.get('hed')
|
||||
duration = item.get('duration')
|
||||
thumbnail = item.get('mediaImage') or item.get('thumbnail')
|
||||
|
||||
subtitles = {}
|
||||
formats = []
|
||||
for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
|
||||
pid = item.get('media' + format_id)
|
||||
if not pid:
|
||||
continue
|
||||
release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid
|
||||
tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid)
|
||||
formats.extend(tp_formats)
|
||||
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
|
||||
self._sort_formats(formats)
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'thumbnail': thumbnail,
|
||||
'duration': duration,
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
guid = item['mpxRefId']
|
||||
return self._extract_video_info('byGuid=%s' % guid, guid)
|
||||
|
||||
|
||||
class CBSNewsLiveVideoIE(InfoExtractor):
|
||||
|
@@ -1,30 +1,28 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .cbs import CBSBaseIE
|
||||
|
||||
|
||||
class CBSSportsIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)'
|
||||
class CBSSportsIE(CBSBaseIE):
|
||||
_VALID_URL = r'https?://www\.cbssports\.com/video/player/[^/]+/(?P<id>\d+)'
|
||||
|
||||
_TEST = {
|
||||
'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s',
|
||||
_TESTS = [{
|
||||
'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast',
|
||||
'info_dict': {
|
||||
'id': '_d5_GbO8p1sT',
|
||||
'ext': 'flv',
|
||||
'title': 'US Open flashbacks: 1990s',
|
||||
'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.',
|
||||
'id': '708337219968',
|
||||
'ext': 'mp4',
|
||||
'title': 'Ben Simmons the next LeBron? Not so fast',
|
||||
'description': 'md5:854294f627921baba1f4b9a990d87197',
|
||||
'timestamp': 1466293740,
|
||||
'upload_date': '20160618',
|
||||
'uploader': 'CBSI-NEW',
|
||||
},
|
||||
}
|
||||
'params': {
|
||||
# m3u8 download
|
||||
'skip_download': True,
|
||||
}
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
mobj = re.match(self._VALID_URL, url)
|
||||
section = mobj.group('section')
|
||||
video_id = mobj.group('id')
|
||||
all_videos = self._download_json(
|
||||
'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section,
|
||||
video_id)
|
||||
# The json file contains the info of all the videos in the section
|
||||
video_info = next(v for v in all_videos if v['pcid'] == video_id)
|
||||
return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform')
|
||||
video_id = self._match_id(url)
|
||||
return self._extract_video_info('byId=%s' % video_id, video_id)
|
||||
|
92
youtube_dl/extractor/closertotruth.py
Normal file
92
youtube_dl/extractor/closertotruth.py
Normal file
@@ -0,0 +1,92 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
|
||||
|
||||
class CloserToTruthIE(InfoExtractor):
    # Extractor for closertotruth.com pages; the actual media is delegated
    # to the Kaltura extractor through url_transparent results.
    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
        'info_dict': {
            'id': '0_zof1ktre',
            'display_id': 'solutions-the-mind-body-problem',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem?',
            'upload_date': '20140221',
            'timestamp': 1392956007,
            'uploader_id': 'CTTXML'
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/episodes/how-do-brains-work',
        'info_dict': {
            'id': '0_iuxai6g6',
            'display_id': 'how-do-brains-work',
            'ext': 'mov',
            'title': 'How do Brains Work?',
            'upload_date': '20140221',
            'timestamp': 1392956024,
            'uploader_id': 'CTTXML'
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/interviews/1725',
        'info_dict': {
            'id': '1725',
            'title': 'AyaFr-002',
        },
        'playlist_mincount': 2,
    }]

    def _real_extract(self, url):
        """Return a Kaltura-backed result for a closertotruth.com page.

        When the page has a ``select-version`` drop-down and at least one
        ``<option>`` matches, a playlist with one entry per distinct option
        id is returned. Otherwise a single url_transparent entry is built
        from the ``embed-kaltura`` anchor.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        partner_id = self._search_regex(
            r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
            webpage, 'kaltura partner_id')
        title = self._search_regex(
            r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')

        version_select = self._search_regex(
            r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
            webpage, 'select version', default=None)
        if version_select:
            seen_ids = set()
            versions = []
            # NOTE(review): options are matched against the whole page rather
            # than only the captured <select> contents -- confirm intended.
            option_re = r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)'
            for option in re.finditer(option_re, webpage):
                version_id = option.group('id')
                # Keep only the first occurrence of each entry id.
                if version_id not in seen_ids:
                    seen_ids.add(version_id)
                    versions.append({
                        '_type': 'url_transparent',
                        'url': 'kaltura:%s:%s' % (partner_id, version_id),
                        'ie_key': 'Kaltura',
                        'title': option.group('title'),
                    })
            if versions:
                return self.playlist_result(versions, display_id, title)

        # No usable version list: fall back to the single embedded entry.
        entry_id = self._search_regex(
            r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
            webpage, 'kaltura entry_id', group='id')

        return {
            '_type': 'url_transparent',
            'display_id': display_id,
            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
            'ie_key': 'Kaltura',
            'title': title
        }
|
@@ -53,6 +53,7 @@ from ..utils import (
|
||||
mimetype2ext,
|
||||
update_Request,
|
||||
update_url_query,
|
||||
parse_m3u8_attributes,
|
||||
)
|
||||
|
||||
|
||||
@@ -748,10 +749,12 @@ class InfoExtractor(object):
|
||||
return self._og_search_property('url', html, **kargs)
|
||||
|
||||
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
|
||||
if not isinstance(name, (list, tuple)):
|
||||
name = [name]
|
||||
if display_name is None:
|
||||
display_name = name
|
||||
display_name = name[0]
|
||||
return self._html_search_regex(
|
||||
self._meta_regex(name),
|
||||
[self._meta_regex(n) for n in name],
|
||||
html, display_name, fatal=fatal, group='content', **kwargs)
|
||||
|
||||
def _dc_search_uploader(self, html):
|
||||
@@ -875,7 +878,11 @@ class InfoExtractor(object):
|
||||
f['ext'] = determine_ext(f['url'])
|
||||
|
||||
if isinstance(field_preference, (list, tuple)):
|
||||
return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
|
||||
return tuple(
|
||||
f.get(field)
|
||||
if f.get(field) is not None
|
||||
else ('' if field == 'format_id' else -1)
|
||||
for field in field_preference)
|
||||
|
||||
preference = f.get('preference')
|
||||
if preference is None:
|
||||
@@ -1150,23 +1157,11 @@ class InfoExtractor(object):
|
||||
}]
|
||||
last_info = None
|
||||
last_media = None
|
||||
kv_rex = re.compile(
|
||||
r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
|
||||
for line in m3u8_doc.splitlines():
|
||||
if line.startswith('#EXT-X-STREAM-INF:'):
|
||||
last_info = {}
|
||||
for m in kv_rex.finditer(line):
|
||||
v = m.group('val')
|
||||
if v.startswith('"'):
|
||||
v = v[1:-1]
|
||||
last_info[m.group('key')] = v
|
||||
last_info = parse_m3u8_attributes(line)
|
||||
elif line.startswith('#EXT-X-MEDIA:'):
|
||||
last_media = {}
|
||||
for m in kv_rex.finditer(line):
|
||||
v = m.group('val')
|
||||
if v.startswith('"'):
|
||||
v = v[1:-1]
|
||||
last_media[m.group('key')] = v
|
||||
last_media = parse_m3u8_attributes(line)
|
||||
elif line.startswith('#') or not line.strip():
|
||||
continue
|
||||
else:
|
||||
|
30
youtube_dl/extractor/ctv.py
Normal file
30
youtube_dl/extractor/ctv.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
|
||||
|
||||
class CTVIE(InfoExtractor):
    """Extractor for ctv.ca player pages.

    No page is downloaded here: the numeric ``vid`` from the URL is handed
    to the NineCNineMedia extractor through a url_transparent result.
    """

    _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)'
    _TESTS = [{
        'url': 'http://www.ctv.ca/video/player?vid=706966',
        'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0',
        'info_dict': {
            'id': '706966',
            'ext': 'mp4',
            'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'',
            'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.',
            'upload_date': '20150919',
            'timestamp': 1442624700,
        },
        'expected_warnings': ['HTTP Error 404'],
    }]

    def _real_extract(self, url):
        """Return a url_transparent result for the "ctv_web" 9c9media entry."""
        clip_id = self._match_id(url)
        # The "9c9media:<hub>:<id>" pseudo URL is resolved by NineCNineMedia.
        target_url = '9c9media:ctv_web:%s' % clip_id
        return {
            '_type': 'url_transparent',
            'id': clip_id,
            'url': target_url,
            'ie_key': 'NineCNineMedia',
        }
|
65
youtube_dl/extractor/ctvnews.py
Normal file
65
youtube_dl/extractor/ctvnews.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import orderedSet
|
||||
|
||||
|
||||
class CTVNewsIE(InfoExtractor):
    """Extractor for ctvnews.ca clips, playlists, bins and article pages.

    A purely numeric id is treated as a single clip; any other id is
    fetched as a page and turned into a playlist of the clip ids found in
    it. Every clip is delegated to the NineCNineMedia extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)'
    _TESTS = [{
        'url': 'http://www.ctvnews.ca/video?clipId=901995',
        'md5': '10deb320dc0ccb8d01d34d12fc2ea672',
        'info_dict': {
            'id': '901995',
            'ext': 'mp4',
            'title': 'Extended: \'That person cannot be me\' Johnson says',
            'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285',
            'timestamp': 1467286284,
            'upload_date': '20160630',
        }
    }, {
        'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224',
        'info_dict':
        {
            'id': '1.2966224',
        },
        'playlist_mincount': 19,
    }, {
        'url': 'http://www.ctvnews.ca/video?binId=1.2876780',
        'info_dict':
        {
            'id': '1.2876780',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'http://www.ctvnews.ca/1.810401',
        'only_matching': True,
    }, {
        'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return one clip result, or a playlist of the clips listed on the page."""
        page_id = self._match_id(url)

        def clip_result(clip_id):
            # url_transparent hand-off to the NineCNineMedia extractor.
            return {
                '_type': 'url_transparent',
                'id': clip_id,
                'url': '9c9media:ctvnews_web:%s' % clip_id,
                'ie_key': 'NineCNineMedia',
            }

        # Digits only (no dot): the id already names a single clip.
        if page_id.isdigit():
            return clip_result(page_id)

        page_url = 'http://www.ctvnews.ca/%s' % page_id
        webpage = self._download_webpage(page_url, page_id, query={
            'ot': 'example.AjaxPageLayout.ot',
            'maxItemsPerPage': 1000000,
        })

        # Collect "clip.id = N;" assignments; orderedSet is applied to the
        # matches before the entries are built.
        clip_ids = orderedSet(re.findall(r'clip\.id\s*=\s*(\d+);', webpage))
        entries = [clip_result(clip_id) for clip_id in clip_ids]
        return self.playlist_result(entries, page_id)
|
@@ -20,7 +20,7 @@ from ..utils import (
|
||||
|
||||
|
||||
class DCNIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
|
||||
_VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
|
||||
|
||||
def _real_extract(self, url):
|
||||
show_id, video_id, season_id = re.match(self._VALID_URL, url).groups()
|
||||
@@ -55,30 +55,32 @@ class DCNBaseIE(InfoExtractor):
|
||||
'is_live': is_live,
|
||||
}
|
||||
|
||||
def _extract_video_formats(self, webpage, video_id, entry_protocol):
|
||||
def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol):
|
||||
formats = []
|
||||
m3u8_url = self._html_search_regex(
|
||||
r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False)
|
||||
if m3u8_url:
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None))
|
||||
|
||||
rtsp_url = self._search_regex(
|
||||
r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False)
|
||||
if rtsp_url:
|
||||
formats.append({
|
||||
'url': rtsp_url,
|
||||
'format_id': 'rtsp',
|
||||
})
|
||||
|
||||
format_url_base = 'http' + self._html_search_regex(
|
||||
[
|
||||
r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8',
|
||||
r'<a[^>]+href="rtsp(://[^"]+)"'
|
||||
], webpage, 'format url')
|
||||
# TODO: Current DASH formats are broken - $Time$ pattern in
|
||||
# <SegmentTemplate> not implemented yet
|
||||
# formats.extend(self._extract_mpd_formats(
|
||||
# format_url_base + '/manifest.mpd',
|
||||
# video_id, mpd_id='dash', fatal=False))
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
format_url_base + '/playlist.m3u8', video_id, 'mp4',
|
||||
m3u8_entry_protocol, m3u8_id='hls', fatal=False))
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
format_url_base + '/manifest.f4m',
|
||||
video_id, f4m_id='hds', fatal=False))
|
||||
self._sort_formats(formats)
|
||||
return formats
|
||||
|
||||
|
||||
class DCNVideoIE(DCNBaseIE):
|
||||
IE_NAME = 'dcn:video'
|
||||
_VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
|
||||
_TEST = {
|
||||
_VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
|
||||
_TESTS = [{
|
||||
'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',
|
||||
'info_dict':
|
||||
{
|
||||
@@ -94,7 +96,10 @@ class DCNVideoIE(DCNBaseIE):
|
||||
# m3u8 download
|
||||
'skip_download': True,
|
||||
},
|
||||
}
|
||||
}, {
|
||||
'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
@@ -120,7 +125,7 @@ class DCNVideoIE(DCNBaseIE):
|
||||
|
||||
class DCNLiveIE(DCNBaseIE):
|
||||
IE_NAME = 'dcn:live'
|
||||
_VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)'
|
||||
_VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)'
|
||||
|
||||
def _real_extract(self, url):
|
||||
channel_id = self._match_id(url)
|
||||
@@ -147,7 +152,7 @@ class DCNLiveIE(DCNBaseIE):
|
||||
|
||||
class DCNSeasonIE(InfoExtractor):
|
||||
IE_NAME = 'dcn:season'
|
||||
_VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
|
||||
_VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
|
||||
_TEST = {
|
||||
'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A',
|
||||
'info_dict':
|
||||
|
@@ -50,6 +50,14 @@ class EaglePlatformIE(InfoExtractor):
|
||||
'skip': 'Georestricted',
|
||||
}]
|
||||
|
||||
@staticmethod
def _extract_url(webpage):
    """Return the first Eagle.Platform player <iframe> URL in webpage.

    The src may be quoted with either quote character and may be
    protocol-relative. Returns None when no such iframe is present.
    """
    embed = re.search(
        r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1',
        webpage)
    return embed.group('url') if embed is not None else None
|
||||
|
||||
@staticmethod
|
||||
def _handle_error(response):
|
||||
status = int_or_none(response.get('status', 200))
|
||||
|
@@ -20,7 +20,10 @@ from .adobetv import (
|
||||
AdobeTVVideoIE,
|
||||
)
|
||||
from .adultswim import AdultSwimIE
|
||||
from .aenetworks import AENetworksIE
|
||||
from .aenetworks import (
|
||||
AENetworksIE,
|
||||
HistoryTopicIE,
|
||||
)
|
||||
from .afreecatv import AfreecaTVIE
|
||||
from .aftonbladet import AftonbladetIE
|
||||
from .airmozilla import AirMozillaIE
|
||||
@@ -44,7 +47,6 @@ from .archiveorg import ArchiveOrgIE
|
||||
from .ard import (
|
||||
ARDIE,
|
||||
ARDMediathekIE,
|
||||
SportschauIE,
|
||||
)
|
||||
from .arte import (
|
||||
ArteTvIE,
|
||||
@@ -141,6 +143,7 @@ from .cliprs import ClipRsIE
|
||||
from .clipfish import ClipfishIE
|
||||
from .cliphunter import CliphunterIE
|
||||
from .clipsyndicate import ClipsyndicateIE
|
||||
from .closertotruth import CloserToTruthIE
|
||||
from .cloudy import CloudyIE
|
||||
from .clubic import ClubicIE
|
||||
from .clyp import ClypIE
|
||||
@@ -168,6 +171,8 @@ from .crunchyroll import (
|
||||
)
|
||||
from .cspan import CSpanIE
|
||||
from .ctsnews import CtsNewsIE
|
||||
from .ctv import CTVIE
|
||||
from .ctvnews import CTVNewsIE
|
||||
from .cultureunplugged import CultureUnpluggedIE
|
||||
from .cwtv import CWTVIE
|
||||
from .dailymail import DailyMailIE
|
||||
@@ -276,6 +281,7 @@ from .freespeech import FreespeechIE
|
||||
from .freevideo import FreeVideoIE
|
||||
from .funimation import FunimationIE
|
||||
from .funnyordie import FunnyOrDieIE
|
||||
from .fusion import FusionIE
|
||||
from .gameinformer import GameInformerIE
|
||||
from .gamekings import GamekingsIE
|
||||
from .gameone import (
|
||||
@@ -285,7 +291,6 @@ from .gameone import (
|
||||
from .gamersyde import GamersydeIE
|
||||
from .gamespot import GameSpotIE
|
||||
from .gamestar import GameStarIE
|
||||
from .gametrailers import GametrailersIE
|
||||
from .gazeta import GazetaIE
|
||||
from .gdcvault import GDCVaultIE
|
||||
from .generic import GenericIE
|
||||
@@ -321,6 +326,10 @@ from .hotnewhiphop import HotNewHipHopIE
|
||||
from .hotstar import HotStarIE
|
||||
from .howcast import HowcastIE
|
||||
from .howstuffworks import HowStuffWorksIE
|
||||
from .hrti import (
|
||||
HRTiIE,
|
||||
HRTiPlaylistIE,
|
||||
)
|
||||
from .huffpost import HuffPostIE
|
||||
from .hypem import HypemIE
|
||||
from .iconosquare import IconosquareIE
|
||||
@@ -423,6 +432,7 @@ from .makerschannel import MakersChannelIE
|
||||
from .makertv import MakerTVIE
|
||||
from .matchtv import MatchTVIE
|
||||
from .mdr import MDRIE
|
||||
from .meta import METAIE
|
||||
from .metacafe import MetacafeIE
|
||||
from .metacritic import MetacriticIE
|
||||
from .mgoon import MgoonIE
|
||||
@@ -455,6 +465,7 @@ from .motherless import MotherlessIE
|
||||
from .motorsport import MotorsportIE
|
||||
from .movieclips import MovieClipsIE
|
||||
from .moviezine import MoviezineIE
|
||||
from .msn import MSNIE
|
||||
from .mtv import (
|
||||
MTVIE,
|
||||
MTVServicesEmbeddedIE,
|
||||
@@ -481,7 +492,6 @@ from .nbc import (
|
||||
NBCNewsIE,
|
||||
NBCSportsIE,
|
||||
NBCSportsVPlayerIE,
|
||||
MSNBCIE,
|
||||
)
|
||||
from .ndr import (
|
||||
NDRIE,
|
||||
@@ -523,6 +533,7 @@ from .nick import (
|
||||
NickDeIE,
|
||||
)
|
||||
from .niconico import NiconicoIE, NiconicoPlaylistIE
|
||||
from .ninecninemedia import NineCNineMediaIE
|
||||
from .ninegag import NineGagIE
|
||||
from .noco import NocoIE
|
||||
from .normalboots import NormalbootsIE
|
||||
@@ -608,6 +619,7 @@ from .pluralsight import (
|
||||
PluralsightCourseIE,
|
||||
)
|
||||
from .podomatic import PodomaticIE
|
||||
from .polskieradio import PolskieRadioIE
|
||||
from .porn91 import Porn91IE
|
||||
from .pornhd import PornHdIE
|
||||
from .pornhub import (
|
||||
@@ -631,7 +643,10 @@ from .qqmusic import (
|
||||
QQMusicToplistIE,
|
||||
QQMusicPlaylistIE,
|
||||
)
|
||||
from .r7 import R7IE
|
||||
from .r7 import (
|
||||
R7IE,
|
||||
R7ArticleIE,
|
||||
)
|
||||
from .radiocanada import (
|
||||
RadioCanadaIE,
|
||||
RadioCanadaAudioVideoIE,
|
||||
@@ -703,10 +718,12 @@ from .shahid import ShahidIE
|
||||
from .shared import SharedIE
|
||||
from .sharesix import ShareSixIE
|
||||
from .sina import SinaIE
|
||||
from .sixplay import SixPlayIE
|
||||
from .skynewsarabia import (
|
||||
SkyNewsArabiaIE,
|
||||
SkyNewsArabiaArticleIE,
|
||||
)
|
||||
from .skysports import SkySportsIE
|
||||
from .slideshare import SlideshareIE
|
||||
from .slutload import SlutloadIE
|
||||
from .smotri import (
|
||||
@@ -747,6 +764,7 @@ from .sportbox import (
|
||||
SportBoxEmbedIE,
|
||||
)
|
||||
from .sportdeutschland import SportDeutschlandIE
|
||||
from .sportschau import SportschauIE
|
||||
from .srgssr import (
|
||||
SRGSSRIE,
|
||||
SRGSSRPlayIE,
|
||||
@@ -887,6 +905,7 @@ from .udn import UDNEmbedIE
|
||||
from .digiteka import DigitekaIE
|
||||
from .unistra import UnistraIE
|
||||
from .urort import UrortIE
|
||||
from .urplay import URPlayIE
|
||||
from .usatoday import USATodayIE
|
||||
from .ustream import UstreamIE, UstreamChannelIE
|
||||
from .ustudio import (
|
||||
@@ -913,6 +932,7 @@ from .vice import (
|
||||
ViceIE,
|
||||
ViceShowIE,
|
||||
)
|
||||
from .vidbit import VidbitIE
|
||||
from .viddler import ViddlerIE
|
||||
from .videodetective import VideoDetectiveIE
|
||||
from .videofyme import VideofyMeIE
|
||||
|
@@ -129,6 +129,21 @@ class FacebookIE(InfoExtractor):
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
@staticmethod
def _extract_url(webpage):
    """Return the URL of the first Facebook video embed in webpage, or None.

    Two embed styles are recognised, tried in this order:

    * an <iframe> whose src starts with
      https://www.facebook.com/video/embed
    * a Facebook API <div> with class "fb-video" or "fb-post" followed by
      a data-href attribute pointing at facebook.com
    """
    # Plain <iframe> embed
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
    if mobj is not None:
        return mobj.group('url')

    # Facebook API embed
    # see https://developers.facebook.com/docs/plugins/embedded-video-player
    # The dot in "facebook\.com" is escaped so that look-alike hosts such
    # as "facebookxcom" are not accepted.
    mobj = re.search(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
        data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook\.com/.+?)(?P=q2)''', webpage)
    if mobj is not None:
        return mobj.group('url')

    return None
|
||||
|
||||
def _login(self):
|
||||
(useremail, password) = self._get_login_info()
|
||||
if useremail is None:
|
||||
@@ -239,6 +254,8 @@ class FacebookIE(InfoExtractor):
|
||||
|
||||
formats = []
|
||||
for format_id, f in video_data.items():
|
||||
if f and isinstance(f, dict):
|
||||
f = [f]
|
||||
if not f or not isinstance(f, list):
|
||||
continue
|
||||
for quality in ('sd', 'hd'):
|
||||
|
@@ -1,7 +1,10 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import smuggle_url
|
||||
from ..utils import (
|
||||
smuggle_url,
|
||||
update_url_query,
|
||||
)
|
||||
|
||||
|
||||
class FoxSportsIE(InfoExtractor):
|
||||
@@ -9,11 +12,15 @@ class FoxSportsIE(InfoExtractor):
|
||||
|
||||
_TEST = {
|
||||
'url': 'http://www.foxsports.com/video?vid=432609859715',
|
||||
'md5': 'b49050e955bebe32c301972e4012ac17',
|
||||
'info_dict': {
|
||||
'id': 'gA0bHB3Ladz3',
|
||||
'ext': 'flv',
|
||||
'id': 'i0qKWsk3qJaM',
|
||||
'ext': 'mp4',
|
||||
'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
|
||||
'description': 'Courtney Lee talks about Memphis being focused.',
|
||||
'upload_date': '20150423',
|
||||
'timestamp': 1429761109,
|
||||
'uploader': 'NEWA-FNG-FOXSPORTS',
|
||||
},
|
||||
'add_ie': ['ThePlatform'],
|
||||
}
|
||||
@@ -28,5 +35,8 @@ class FoxSportsIE(InfoExtractor):
|
||||
r"data-player-config='([^']+)'", webpage, 'data player config'),
|
||||
video_id)
|
||||
|
||||
return self.url_result(smuggle_url(
|
||||
config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True}))
|
||||
return self.url_result(smuggle_url(update_url_query(
|
||||
config['releaseURL'], {
|
||||
'mbr': 'true',
|
||||
'switch': 'http',
|
||||
}), {'force_smil_url': True}))
|
||||
|
35
youtube_dl/extractor/fusion.py
Normal file
35
youtube_dl/extractor/fusion.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .ooyala import OoyalaIE
|
||||
|
||||
|
||||
class FusionIE(InfoExtractor):
    """Extractor for fusion.net video pages, which embed an Ooyala player."""

    _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/',
        'info_dict': {
            'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P',
            'ext': 'mp4',
            'title': 'U.S. and Panamanian forces work together to stop a vessel smuggling drugs',
            'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7',
            'duration': 140.0,
        },
        'params': {
            'skip_download': True,
        },
        'add_ie': ['Ooyala'],
    }, {
        'url': 'http://fusion.net/video/201781',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Find the page's data-video-id and delegate to OoyalaIE."""
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)

        # The value of the data-video-id attribute is passed to Ooyala as
        # the embed code.
        embed_code = self._search_regex(
            r'data-video-id=(["\'])(?P<code>.+?)\1',
            page, 'ooyala code', group='code')

        return OoyalaIE._build_url_result(embed_code)
|
@@ -1,19 +1,19 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .once import OnceIE
|
||||
from ..compat import (
|
||||
compat_urllib_parse_unquote,
|
||||
compat_urlparse,
|
||||
)
|
||||
from ..utils import (
|
||||
unescapeHTML,
|
||||
url_basename,
|
||||
dict_get,
|
||||
)
|
||||
|
||||
|
||||
class GameSpotIE(InfoExtractor):
|
||||
class GameSpotIE(OnceIE):
|
||||
_VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
|
||||
_TESTS = [{
|
||||
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
|
||||
@@ -39,29 +39,73 @@ class GameSpotIE(InfoExtractor):
|
||||
webpage = self._download_webpage(url, page_id)
|
||||
data_video_json = self._search_regex(
|
||||
r'data-video=["\'](.*?)["\']', webpage, 'data video')
|
||||
data_video = json.loads(unescapeHTML(data_video_json))
|
||||
data_video = self._parse_json(unescapeHTML(data_video_json), page_id)
|
||||
streams = data_video['videoStreams']
|
||||
|
||||
manifest_url = None
|
||||
formats = []
|
||||
f4m_url = streams.get('f4m_stream')
|
||||
if f4m_url is not None:
|
||||
# Transform the manifest url to a link to the mp4 files
|
||||
# they are used in mobile devices.
|
||||
f4m_path = compat_urlparse.urlparse(f4m_url).path
|
||||
QUALITIES_RE = r'((,\d+)+,?)'
|
||||
qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
|
||||
http_path = f4m_path[1:].split('/', 1)[1]
|
||||
http_template = re.sub(QUALITIES_RE, r'%s', http_path)
|
||||
http_template = http_template.replace('.csmil/manifest.f4m', '')
|
||||
http_template = compat_urlparse.urljoin(
|
||||
'http://video.gamespotcdn.com/', http_template)
|
||||
for q in qualities:
|
||||
formats.append({
|
||||
'url': http_template % q,
|
||||
'ext': 'mp4',
|
||||
'format_id': q,
|
||||
})
|
||||
else:
|
||||
if f4m_url:
|
||||
manifest_url = f4m_url
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False))
|
||||
m3u8_url = streams.get('m3u8_stream')
|
||||
if m3u8_url:
|
||||
manifest_url = m3u8_url
|
||||
m3u8_formats = self._extract_m3u8_formats(
|
||||
m3u8_url, page_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id='hls', fatal=False)
|
||||
formats.extend(m3u8_formats)
|
||||
progressive_url = dict_get(
|
||||
streams, ('progressive_hd', 'progressive_high', 'progressive_low'))
|
||||
if progressive_url and manifest_url:
|
||||
qualities_basename = self._search_regex(
|
||||
'/([^/]+)\.csmil/',
|
||||
manifest_url, 'qualities basename', default=None)
|
||||
if qualities_basename:
|
||||
QUALITIES_RE = r'((,\d+)+,?)'
|
||||
qualities = self._search_regex(
|
||||
QUALITIES_RE, qualities_basename,
|
||||
'qualities', default=None)
|
||||
if qualities:
|
||||
qualities = list(map(lambda q: int(q), qualities.strip(',').split(',')))
|
||||
qualities.sort()
|
||||
http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename)
|
||||
http_url_basename = url_basename(progressive_url)
|
||||
if m3u8_formats:
|
||||
self._sort_formats(m3u8_formats)
|
||||
m3u8_formats = list(filter(
|
||||
lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
|
||||
m3u8_formats))
|
||||
if len(qualities) == len(m3u8_formats):
|
||||
for q, m3u8_format in zip(qualities, m3u8_formats):
|
||||
f = m3u8_format.copy()
|
||||
f.update({
|
||||
'url': progressive_url.replace(
|
||||
http_url_basename, http_template % q),
|
||||
'format_id': f['format_id'].replace('hls', 'http'),
|
||||
'protocol': 'http',
|
||||
})
|
||||
formats.append(f)
|
||||
else:
|
||||
for q in qualities:
|
||||
formats.append({
|
||||
'url': progressive_url.replace(
|
||||
http_url_basename, http_template % q),
|
||||
'ext': 'mp4',
|
||||
'format_id': 'http-%d' % q,
|
||||
'tbr': q,
|
||||
})
|
||||
|
||||
onceux_json = self._search_regex(
|
||||
r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None)
|
||||
if onceux_json:
|
||||
onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri')
|
||||
if onceux_url:
|
||||
formats.extend(self._extract_once_formats(re.sub(
|
||||
r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', '')))
|
||||
|
||||
if not formats:
|
||||
for quality in ['sd', 'hd']:
|
||||
# It's actually a link to a flv file
|
||||
flv_url = streams.get('f4m_{0}'.format(quality))
|
||||
@@ -71,6 +115,7 @@ class GameSpotIE(InfoExtractor):
|
||||
'ext': 'flv',
|
||||
'format_id': quality,
|
||||
})
|
||||
self._sort_formats(formats)
|
||||
|
||||
return {
|
||||
'id': data_video['guid'],
|
||||
|
@@ -1,62 +0,0 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
int_or_none,
|
||||
parse_age_limit,
|
||||
url_basename,
|
||||
)
|
||||
|
||||
|
||||
class GametrailersIE(InfoExtractor):
    """Extractor for gametrailers.com video pages.

    The video page references an embed page on embed.gametrailers.com; the
    embed page carries an ``embedVars`` JSON object with the media list and
    metadata.
    """

    _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)'

    _TEST = {
        'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review',
        'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a',
        'info_dict': {
            'id': '2983958',
            'ext': 'mp4',
            'display_id': '116437-Just-Cause-3-Review',
            'title': 'Just Cause 3 - Review',
            'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?',
        },
    }

    def _real_extract(self, url):
        """Download the video page and its embed page and build the info dict.

        The video id is the basename of the embed URL. Formats are the
        entries of ``embedVars['media']`` whose ``mediaPurpose`` is 'play'.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        # Keep only the part of <title> that precedes the '|' separator.
        title = self._html_search_regex(
            r'<title>(.+?)\|', webpage, 'title').strip()
        # The embed src is protocol-relative; force plain http.
        embed_url = self._proto_relative_url(
            self._search_regex(
                r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage,
                'embed url'),
            scheme='http:')
        video_id = url_basename(embed_url)
        embed_page = self._download_webpage(embed_url, video_id)
        embed_vars_json = self._search_regex(
            r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page,
            'embed vars')
        info = self._parse_json(embed_vars_json, video_id)

        formats = []
        for media in info['media']:
            if media['mediaPurpose'] == 'play':
                formats.append({
                    'url': media['uri'],
                    'height': media['height'],
                    # The key used to be 'width:' (stray colon), so the
                    # width never reached the 'width' format field.
                    'width': media['width'],
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'formats': formats,
            'thumbnail': info.get('thumbUri'),
            'description': self._og_search_description(webpage),
            'duration': int_or_none(info.get('videoLengthInSeconds')),
            'age_limit': parse_age_limit(info.get('audienceRating')),
        }
|
@@ -64,6 +64,9 @@ from .liveleak import LiveLeakIE
|
||||
from .threeqsdn import ThreeQSDNIE
|
||||
from .theplatform import ThePlatformIE
|
||||
from .vessel import VesselIE
|
||||
from .kaltura import KalturaIE
|
||||
from .eagleplatform import EaglePlatformIE
|
||||
from .facebook import FacebookIE
|
||||
|
||||
|
||||
class GenericIE(InfoExtractor):
|
||||
@@ -920,6 +923,24 @@ class GenericIE(InfoExtractor):
|
||||
},
|
||||
'add_ie': ['Kaltura'],
|
||||
},
|
||||
{
|
||||
# Kaltura embedded via quoted entry_id
|
||||
'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures',
|
||||
'info_dict': {
|
||||
'id': '0_utuok90b',
|
||||
'ext': 'mp4',
|
||||
'title': '06_matthew_brender_raj_dutt',
|
||||
'timestamp': 1466638791,
|
||||
'upload_date': '20160622',
|
||||
},
|
||||
'add_ie': ['Kaltura'],
|
||||
'expected_warnings': [
|
||||
'Could not send HEAD request'
|
||||
],
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
}
|
||||
},
|
||||
# Eagle.Platform embed (generic URL)
|
||||
{
|
||||
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
|
||||
@@ -1091,12 +1112,17 @@ class GenericIE(InfoExtractor):
|
||||
# Dailymotion Cloud video
|
||||
{
|
||||
'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
|
||||
'md5': '49444254273501a64675a7e68c502681',
|
||||
'md5': 'dcaf23ad0c67a256f4278bce6e0bae38',
|
||||
'info_dict': {
|
||||
'id': '5585de919473990de4bee11b',
|
||||
'id': 'x2uy8t3',
|
||||
'ext': 'mp4',
|
||||
'title': 'Le débat',
|
||||
'title': 'Sauvons les abeilles ! - Le débat',
|
||||
'description': 'md5:d9082128b1c5277987825d684939ca26',
|
||||
'thumbnail': 're:^https?://.*\.jpe?g$',
|
||||
'timestamp': 1434970506,
|
||||
'upload_date': '20150622',
|
||||
'uploader': 'Public Sénat',
|
||||
'uploader_id': 'xa9gza',
|
||||
}
|
||||
},
|
||||
# OnionStudios embed
|
||||
@@ -1220,6 +1246,55 @@ class GenericIE(InfoExtractor):
|
||||
'uploader': 'www.hudl.com',
|
||||
},
|
||||
},
|
||||
# twitter:player embed
|
||||
{
|
||||
'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/',
|
||||
'md5': 'a3e0df96369831de324f0778e126653c',
|
||||
'info_dict': {
|
||||
'id': '4909620399001',
|
||||
'ext': 'mp4',
|
||||
'title': 'What Do Black Holes Sound Like?',
|
||||
'description': 'what do black holes sound like',
|
||||
'upload_date': '20160524',
|
||||
'uploader_id': '29913724001',
|
||||
'timestamp': 1464107587,
|
||||
'uploader': 'TheAtlantic',
|
||||
},
|
||||
'add_ie': ['BrightcoveLegacy'],
|
||||
},
|
||||
# Facebook <iframe> embed
|
||||
{
|
||||
'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
|
||||
'md5': 'fbcde74f534176ecb015849146dd3aee',
|
||||
'info_dict': {
|
||||
'id': '599637780109885',
|
||||
'ext': 'mp4',
|
||||
'title': 'Facebook video #599637780109885',
|
||||
},
|
||||
},
|
||||
# Facebook API embed
|
||||
{
|
||||
'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
|
||||
'md5': 'a47372ee61b39a7b90287094d447d94e',
|
||||
'info_dict': {
|
||||
'id': '10153467542406923',
|
||||
'ext': 'mp4',
|
||||
'title': 'Facebook video #10153467542406923',
|
||||
},
|
||||
},
|
||||
# Wordpress "YouTube Video Importer" plugin
|
||||
{
|
||||
'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/',
|
||||
'md5': 'd16797741b560b485194eddda8121b48',
|
||||
'info_dict': {
|
||||
'id': 'HNTXWDXV9Is',
|
||||
'ext': 'mp4',
|
||||
'title': 'Blue Devils Drumline Stanford lot 2016',
|
||||
'upload_date': '20160627',
|
||||
'uploader_id': 'GENOCIDE8GENERAL10',
|
||||
'uploader': 'cylus cyrus',
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
def report_following_redirect(self, new_url):
|
||||
@@ -1576,6 +1651,13 @@ class GenericIE(InfoExtractor):
|
||||
if matches:
|
||||
return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
|
||||
|
||||
# Look for Wordpress "YouTube Video Importer" plugin
|
||||
matches = re.findall(r'''(?x)<div[^>]+
|
||||
class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
|
||||
data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
|
||||
if matches:
|
||||
return _playlist_from_matches(matches, lambda m: m[-1])
|
||||
|
||||
# Look for embedded Dailymotion player
|
||||
matches = re.findall(
|
||||
r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
|
||||
@@ -1718,10 +1800,9 @@ class GenericIE(InfoExtractor):
|
||||
return self.url_result(mobj.group('url'))
|
||||
|
||||
# Look for embedded Facebook player
|
||||
mobj = re.search(
|
||||
r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
|
||||
if mobj is not None:
|
||||
return self.url_result(mobj.group('url'), 'Facebook')
|
||||
facebook_url = FacebookIE._extract_url(webpage)
|
||||
if facebook_url is not None:
|
||||
return self.url_result(facebook_url, 'Facebook')
|
||||
|
||||
# Look for embedded VK player
|
||||
mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
|
||||
@@ -1903,18 +1984,14 @@ class GenericIE(InfoExtractor):
|
||||
return self.url_result(mobj.group('url'), 'Zapiks')
|
||||
|
||||
# Look for Kaltura embeds
|
||||
mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or
|
||||
re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
|
||||
if mobj is not None:
|
||||
return self.url_result(smuggle_url(
|
||||
'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(),
|
||||
{'source_url': url}), 'Kaltura')
|
||||
kaltura_url = KalturaIE._extract_url(webpage)
|
||||
if kaltura_url:
|
||||
return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
|
||||
|
||||
# Look for Eagle.Platform embeds
|
||||
mobj = re.search(
|
||||
r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
|
||||
if mobj is not None:
|
||||
return self.url_result(mobj.group('url'), 'EaglePlatform')
|
||||
eagleplatform_url = EaglePlatformIE._extract_url(webpage)
|
||||
if eagleplatform_url:
|
||||
return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
|
||||
|
||||
# Look for ClipYou (uses Eagle.Platform) embeds
|
||||
mobj = re.search(
|
||||
@@ -2060,6 +2137,11 @@ class GenericIE(InfoExtractor):
|
||||
'uploader': video_uploader,
|
||||
}
|
||||
|
||||
# https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser
|
||||
embed_url = self._html_search_meta('twitter:player', webpage, default=None)
|
||||
if embed_url:
|
||||
return self.url_result(embed_url)
|
||||
|
||||
def check_video(vurl):
|
||||
if YoutubeIE.suitable(vurl):
|
||||
return True
|
||||
|
202
youtube_dl/extractor/hrti.py
Normal file
202
youtube_dl/extractor/hrti.py
Normal file
@@ -0,0 +1,202 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import compat_HTTPError
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
ExtractorError,
|
||||
int_or_none,
|
||||
parse_age_limit,
|
||||
sanitized_Request,
|
||||
try_get,
|
||||
)
|
||||
|
||||
|
||||
class HRTiBaseIE(InfoExtractor):
    """
    Base Information Extractor for Croatian Radiotelevision
    video on demand site https://hrti.hrt.hr
    Reverse engineered from the JavaScript app in app.min.js
    """
    _NETRC_MACHINE = 'hrti'

    _APP_LANGUAGE = 'hr'
    _APP_VERSION = '1.1'
    _APP_PUBLICATION_ID = 'all_in_one'
    _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'

    def _initialize_api(self):
        """Open an API session and remember the endpoints used later.

        Sends the publication id to ``_API_URL`` to obtain a uuid, then
        issues a PUT with that uuid to fetch the session description.
        Sets ``_session_id``, ``_search_url``, ``_login_url`` and
        ``_logout_url`` on the instance.
        """
        init_data = {
            'application_publication_id': self._APP_PUBLICATION_ID
        }

        uuid = self._download_json(
            self._API_URL, None, note='Downloading uuid',
            errnote='Unable to download uuid',
            data=json.dumps(init_data).encode('utf-8'))['uuid']

        app_data = {
            'uuid': uuid,
            'application_publication_id': self._APP_PUBLICATION_ID,
            'application_version': self._APP_VERSION
        }

        req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
        # The request object has no method argument here, so force PUT
        req.get_method = lambda: 'PUT'

        resources = self._download_json(
            req, None, note='Downloading session information',
            errnote='Unable to download session information')

        self._session_id = resources['session_id']

        modules = resources['modules']

        # The URIs returned by the API are str.format templates
        self._search_url = modules['vod_catalog']['resources']['search']['uri'].format(
            language=self._APP_LANGUAGE,
            application_id=self._APP_PUBLICATION_ID)

        self._login_url = (modules['user']['resources']['login']['uri'] +
                           '/format/json').format(session_id=self._session_id)

        self._logout_url = modules['user']['resources']['logout']['uri']

    def _login(self):
        """Log in with the configured credentials and store the streaming token.

        Raises a login-required error when no username/password is
        available, and an expected ExtractorError carrying the server's
        message when the API reports an error. On success sets
        ``_token`` from ``secure_streaming_token``.
        """
        (username, password) = self._get_login_info()
        # TODO: figure out authentication with cookies
        if username is None or password is None:
            self.raise_login_required()

        auth_data = {
            'username': username,
            'password': password,
        }

        try:
            auth_info = self._download_json(
                self._login_url, None, note='Logging in', errnote='Unable to log in',
                data=json.dumps(auth_data).encode('utf-8'))
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406:
                # The 406 response body is a JSON error document; read()
                # yields bytes, which must be *decoded* (bytes has no
                # .encode() on Python 3) before being parsed.
                auth_info = self._parse_json(e.cause.read().decode('utf-8'), None)
            else:
                raise

        error_message = auth_info.get('error', {}).get('message')
        if error_message:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, error_message),
                expected=True)

        self._token = auth_info['secure_streaming_token']

    def _real_initialize(self):
        """Set up the API session, then authenticate."""
        self._initialize_api()
        self._login()
|
||||
|
||||
|
||||
class HRTiIE(HRTiBaseIE):
    # Accepts either the internal "hrti:<id>" shorthand or a full
    # hrti.hrt.hr "#/video/show/<id>/<slug>" page URL.
    _VALID_URL = r'''(?x)
                        (?:
                            hrti:(?P<short_id>[0-9]+)|
                            https?://
                                hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?
                        )
                    '''
    _TESTS = [{
        'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd',
        'info_dict': {
            'id': '2181385',
            'display_id': 'republika-dokumentarna-serija-16-hd',
            'ext': 'mp4',
            'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)',
            'description': 'md5:48af85f620e8e0e1df4096270568544f',
            'duration': 2922,
            'view_count': int,
            'average_rating': int,
            'episode_number': int,
            'season_number': int,
            'age_limit': 12,
        },
        'skip': 'Requires account credentials',
    }, {
        'url': 'https://hrti.hrt.hr/#/video/show/2181385/',
        'only_matching': True,
    }, {
        'url': 'hrti:2181385',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the metadata JSON for one video and build its info dict."""
        match = re.match(self._VALID_URL, url)
        video_id = match.group('short_id') or match.group('id')
        # Fall back to the numeric id when the URL carries no slug
        display_id = match.group('display_id') or video_id

        metadata_url = '%s/video_id/%s/format/json' % (self._search_url, video_id)
        metadata = self._download_json(
            metadata_url, display_id, 'Downloading video metadata JSON')
        video = metadata['video'][0]

        title_info = video['title']
        info = {
            'id': video_id,
            'display_id': display_id,
            'title': title_info['title_long'],
        }

        movie = video['video_assets']['movie'][0]
        # The stream URL is a template expecting the login token
        stream_url = movie['url'].format(TOKEN=self._token)
        formats = self._extract_m3u8_formats(
            stream_url, display_id, 'mp4', entry_protocol='m3u8_native',
            m3u8_id='hls')
        self._sort_formats(formats)

        description = clean_html(title_info.get('summary_long'))
        parental_control = video.get('parental_control', {})
        age_limit = parse_age_limit(parental_control.get('rating'))
        view_count = int_or_none(video.get('views'))
        average_rating = int_or_none(video.get('user_rating'))
        duration = int_or_none(movie.get('duration'))

        info.update({
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'average_rating': average_rating,
            'age_limit': age_limit,
            'formats': formats,
        })
        return info
|
||||
|
||||
|
||||
class HRTiPlaylistIE(HRTiBaseIE):
    # Dots in the host name are escaped so they match only a literal '.',
    # not any character.
    _VALID_URL = r'https?://hrti\.hrt\.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?'
    _TESTS = [{
        'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena',
        'info_dict': {
            'id': '212',
            'title': 'ekumena',
        },
        'playlist_mincount': 8,
        'skip': 'Requires account credentials',
    }, {
        'url': 'https://hrti.hrt.hr/#/video/list/category/212/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist of ``hrti:<id>`` entries for one category.

        The category id is used as the playlist id and the URL slug (or
        the id when there is no slug) as the playlist title.
        """
        mobj = re.match(self._VALID_URL, url)
        category_id = mobj.group('id')
        display_id = mobj.group('display_id') or category_id

        response = self._download_json(
            '%s/category_id/%s/format/json' % (self._search_url, category_id),
            display_id, 'Downloading video metadata JSON')

        # Prefer the ready-made id list; otherwise collect ids from the
        # 'videos' entries, skipping any without an id.
        video_ids = try_get(
            response, lambda x: x['video_listings'][0]['alternatives'][0]['list'],
            list) or [video['id'] for video in response.get('videos', []) if video.get('id')]

        entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids]

        return self.playlist_result(entries, category_id, display_id)
|
@@ -3,28 +3,22 @@ from __future__ import unicode_literals
|
||||
|
||||
import hashlib
|
||||
import itertools
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import (
|
||||
compat_parse_qs,
|
||||
compat_str,
|
||||
compat_urllib_parse_urlencode,
|
||||
compat_urllib_parse_urlparse,
|
||||
)
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
decode_packed_codes,
|
||||
get_element_by_id,
|
||||
get_element_by_attribute,
|
||||
ExtractorError,
|
||||
ohdave_rsa_encrypt,
|
||||
remove_start,
|
||||
sanitized_Request,
|
||||
urlencode_postdata,
|
||||
url_basename,
|
||||
)
|
||||
|
||||
|
||||
@@ -171,70 +165,21 @@ class IqiyiIE(InfoExtractor):
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
|
||||
'md5': '2cb594dc2781e6c941a110d8f358118b',
|
||||
'md5': '5b0591f55961117155430b5d544fdb01',
|
||||
'info_dict': {
|
||||
'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
|
||||
'ext': 'mp4',
|
||||
'title': '美国德州空中惊现奇异云团 酷似UFO',
|
||||
'ext': 'f4v',
|
||||
}
|
||||
}, {
|
||||
'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
|
||||
'md5': '667171934041350c5de3f5015f7f1152',
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
'playlist': [{
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
|
||||
'ext': 'f4v',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
|
||||
'ext': 'f4v',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
|
||||
'ext': 'f4v',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
|
||||
'ext': 'f4v',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
|
||||
'ext': 'f4v',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
|
||||
'ext': 'f4v',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
|
||||
'ext': 'f4v',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
|
||||
'ext': 'f4v',
|
||||
'title': '名侦探柯南第752集',
|
||||
},
|
||||
}],
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
'ext': 'mp4',
|
||||
'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇',
|
||||
},
|
||||
'skip': 'Geo-restricted to China',
|
||||
}, {
|
||||
'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',
|
||||
'only_matching': True,
|
||||
@@ -250,22 +195,10 @@ class IqiyiIE(InfoExtractor):
|
||||
'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
|
||||
'info_dict': {
|
||||
'id': 'f3cf468b39dddb30d676f89a91200dc1',
|
||||
'ext': 'mp4',
|
||||
'title': '泰坦尼克号',
|
||||
},
|
||||
'playlist': [{
|
||||
'info_dict': {
|
||||
'id': 'f3cf468b39dddb30d676f89a91200dc1_part1',
|
||||
'ext': 'f4v',
|
||||
'title': '泰坦尼克号',
|
||||
},
|
||||
}, {
|
||||
'info_dict': {
|
||||
'id': 'f3cf468b39dddb30d676f89a91200dc1_part2',
|
||||
'ext': 'f4v',
|
||||
'title': '泰坦尼克号',
|
||||
},
|
||||
}],
|
||||
'expected_warnings': ['Needs a VIP account for full video'],
|
||||
'skip': 'Geo-restricted to China',
|
||||
}, {
|
||||
'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
|
||||
'info_dict': {
|
||||
@@ -278,20 +211,15 @@ class IqiyiIE(InfoExtractor):
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
_FORMATS_MAP = [
|
||||
('1', 'h6'),
|
||||
('2', 'h5'),
|
||||
('3', 'h4'),
|
||||
('4', 'h3'),
|
||||
('5', 'h2'),
|
||||
('10', 'h1'),
|
||||
]
|
||||
|
||||
AUTH_API_ERRORS = {
|
||||
# No preview available (不允许试看鉴权失败)
|
||||
'Q00505': 'This video requires a VIP account',
|
||||
# End of preview time (试看结束鉴权失败)
|
||||
'Q00506': 'Needs a VIP account for full video',
|
||||
_FORMATS_MAP = {
|
||||
'96': 1, # 216p, 240p
|
||||
'1': 2, # 336p, 360p
|
||||
'2': 3, # 480p, 504p
|
||||
'21': 4, # 504p
|
||||
'4': 5, # 720p
|
||||
'17': 5, # 720p
|
||||
'5': 6, # 1072p, 1080p
|
||||
'18': 7, # 1080p
|
||||
}
|
||||
|
||||
def _real_initialize(self):
|
||||
@@ -352,177 +280,27 @@ class IqiyiIE(InfoExtractor):
|
||||
|
||||
return True
|
||||
|
||||
def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
|
||||
auth_params = {
|
||||
# version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as
|
||||
'version': '2.0',
|
||||
'platform': 'b6c13e26323c537d',
|
||||
'aid': tvid,
|
||||
def get_raw_data(self, tvid, video_id):
|
||||
tm = int(time.time() * 1000)
|
||||
|
||||
key = 'd5fb4bd9d50c4be6948c97edd7254b0e'
|
||||
sc = md5_text(compat_str(tm) + key + tvid)
|
||||
params = {
|
||||
'tvid': tvid,
|
||||
'uid': '',
|
||||
'deviceId': _uuid,
|
||||
'playType': 'main', # XXX: always main?
|
||||
'filename': os.path.splitext(url_basename(api_video_url))[0],
|
||||
}
|
||||
|
||||
qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query)
|
||||
for key, val in qd_items.items():
|
||||
auth_params[key] = val[0]
|
||||
|
||||
auth_req = sanitized_Request(
|
||||
'http://api.vip.iqiyi.com/services/ckn.action',
|
||||
urlencode_postdata(auth_params))
|
||||
# iQiyi server throws HTTP 405 error without the following header
|
||||
auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
|
||||
auth_result = self._download_json(
|
||||
auth_req, video_id,
|
||||
note='Downloading video authentication JSON',
|
||||
errnote='Unable to download video authentication JSON')
|
||||
|
||||
code = auth_result.get('code')
|
||||
msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code
|
||||
if code == 'Q00506':
|
||||
if do_report_warning:
|
||||
self.report_warning(msg)
|
||||
return False
|
||||
if 'data' not in auth_result:
|
||||
if msg is not None:
|
||||
raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True)
|
||||
raise ExtractorError('Unexpected error from Iqiyi auth API')
|
||||
|
||||
return auth_result['data']
|
||||
|
||||
def construct_video_urls(self, data, video_id, _uuid, tvid):
|
||||
def do_xor(x, y):
|
||||
a = y % 3
|
||||
if a == 1:
|
||||
return x ^ 121
|
||||
if a == 2:
|
||||
return x ^ 72
|
||||
return x ^ 103
|
||||
|
||||
def get_encode_code(l):
|
||||
a = 0
|
||||
b = l.split('-')
|
||||
c = len(b)
|
||||
s = ''
|
||||
for i in range(c - 1, -1, -1):
|
||||
a = do_xor(int(b[c - i - 1], 16), i)
|
||||
s += chr(a)
|
||||
return s[::-1]
|
||||
|
||||
def get_path_key(x, format_id, segment_index):
|
||||
mg = ')(*&^flash@#$%a'
|
||||
tm = self._download_json(
|
||||
'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
|
||||
note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
|
||||
)['t']
|
||||
t = str(int(math.floor(int(tm) / (600.0))))
|
||||
return md5_text(t + mg + x)
|
||||
|
||||
video_urls_dict = {}
|
||||
need_vip_warning_report = True
|
||||
for format_item in data['vp']['tkl'][0]['vs']:
|
||||
if 0 < int(format_item['bid']) <= 10:
|
||||
format_id = self.get_format(format_item['bid'])
|
||||
else:
|
||||
continue
|
||||
|
||||
video_urls = []
|
||||
|
||||
video_urls_info = format_item['fs']
|
||||
if not format_item['fs'][0]['l'].startswith('/'):
|
||||
t = get_encode_code(format_item['fs'][0]['l'])
|
||||
if t.endswith('mp4'):
|
||||
video_urls_info = format_item['flvs']
|
||||
|
||||
for segment_index, segment in enumerate(video_urls_info):
|
||||
vl = segment['l']
|
||||
if not vl.startswith('/'):
|
||||
vl = get_encode_code(vl)
|
||||
is_vip_video = '/vip/' in vl
|
||||
filesize = segment['b']
|
||||
base_url = data['vp']['du'].split('/')
|
||||
if not is_vip_video:
|
||||
key = get_path_key(
|
||||
vl.split('/')[-1].split('.')[0], format_id, segment_index)
|
||||
base_url.insert(-1, key)
|
||||
base_url = '/'.join(base_url)
|
||||
param = {
|
||||
'su': _uuid,
|
||||
'qyid': uuid.uuid4().hex,
|
||||
'client': '',
|
||||
'z': '',
|
||||
'bt': '',
|
||||
'ct': '',
|
||||
'tn': str(int(time.time()))
|
||||
}
|
||||
api_video_url = base_url + vl
|
||||
if is_vip_video:
|
||||
api_video_url = api_video_url.replace('.f4v', '.hml')
|
||||
auth_result = self._authenticate_vip_video(
|
||||
api_video_url, video_id, tvid, _uuid, need_vip_warning_report)
|
||||
if auth_result is False:
|
||||
need_vip_warning_report = False
|
||||
break
|
||||
param.update({
|
||||
't': auth_result['t'],
|
||||
# cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
|
||||
'cid': 'afbe8fd3d73448c9',
|
||||
'vid': video_id,
|
||||
'QY00001': auth_result['u'],
|
||||
})
|
||||
api_video_url += '?' if '?' not in api_video_url else '&'
|
||||
api_video_url += compat_urllib_parse_urlencode(param)
|
||||
js = self._download_json(
|
||||
api_video_url, video_id,
|
||||
note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
|
||||
video_url = js['l']
|
||||
video_urls.append(
|
||||
(video_url, filesize))
|
||||
|
||||
video_urls_dict[format_id] = video_urls
|
||||
return video_urls_dict
|
||||
|
||||
def get_format(self, bid):
|
||||
matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
|
||||
return matched_format_ids[0] if len(matched_format_ids) else None
|
||||
|
||||
def get_bid(self, format_id):
|
||||
matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
|
||||
return matched_bids[0] if len(matched_bids) else None
|
||||
|
||||
def get_raw_data(self, tvid, video_id, enc_key, _uuid):
|
||||
tm = str(int(time.time()))
|
||||
tail = tm + tvid
|
||||
param = {
|
||||
'key': 'fvip',
|
||||
'src': md5_text('youtube-dl'),
|
||||
'tvId': tvid,
|
||||
'vid': video_id,
|
||||
'vinfo': 1,
|
||||
'tm': tm,
|
||||
'enc': md5_text(enc_key + tail),
|
||||
'qyid': _uuid,
|
||||
'tn': random.random(),
|
||||
# In iQiyi's flash player, um is set to 1 if there's a logged user
|
||||
# Some 1080P formats are only available with a logged user.
|
||||
# Here force um=1 to trick the iQiyi server
|
||||
'um': 1,
|
||||
'authkey': md5_text(md5_text('') + tail),
|
||||
'k_tag': 1,
|
||||
'src': '76f90cbd92f94a2e925d83e8ccd22cb7',
|
||||
'sc': sc,
|
||||
't': tm,
|
||||
}
|
||||
|
||||
api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
|
||||
compat_urllib_parse_urlencode(param)
|
||||
raw_data = self._download_json(api_url, video_id)
|
||||
return raw_data
|
||||
|
||||
def get_enc_key(self, video_id):
|
||||
# TODO: automatic key extraction
|
||||
# last update at 2016-01-22 for Zombie::bite
|
||||
enc_key = '4a1caba4b4465345366f28da7c117d20'
|
||||
return enc_key
|
||||
headers = {}
|
||||
cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
|
||||
if cn_verification_proxy:
|
||||
headers['Ytdl-request-proxy'] = cn_verification_proxy
|
||||
return self._download_json(
|
||||
'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id),
|
||||
video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='),
|
||||
query=params, headers=headers)
|
||||
|
||||
def _extract_playlist(self, webpage):
|
||||
PAGE_SIZE = 50
|
||||
@@ -571,58 +349,41 @@ class IqiyiIE(InfoExtractor):
|
||||
r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
|
||||
video_id = self._search_regex(
|
||||
r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
|
||||
_uuid = uuid.uuid4().hex
|
||||
|
||||
enc_key = self.get_enc_key(video_id)
|
||||
formats = []
|
||||
for _ in range(5):
|
||||
raw_data = self.get_raw_data(tvid, video_id)
|
||||
|
||||
raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
|
||||
if raw_data['code'] != 'A00000':
|
||||
if raw_data['code'] == 'A00111':
|
||||
self.raise_geo_restricted()
|
||||
raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
|
||||
|
||||
if raw_data['code'] != 'A000000':
|
||||
raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
|
||||
data = raw_data['data']
|
||||
|
||||
data = raw_data['data']
|
||||
for stream in data['vidl']:
|
||||
if 'm3utx' not in stream:
|
||||
continue
|
||||
vd = compat_str(stream['vd'])
|
||||
formats.append({
|
||||
'url': stream['m3utx'],
|
||||
'format_id': vd,
|
||||
'ext': 'mp4',
|
||||
'preference': self._FORMATS_MAP.get(vd, -1),
|
||||
'protocol': 'm3u8_native',
|
||||
})
|
||||
|
||||
title = data['vi']['vn']
|
||||
if formats:
|
||||
break
|
||||
|
||||
# generate video_urls_dict
|
||||
video_urls_dict = self.construct_video_urls(
|
||||
data, video_id, _uuid, tvid)
|
||||
self._sleep(5, video_id)
|
||||
|
||||
# construct info
|
||||
entries = []
|
||||
for format_id in video_urls_dict:
|
||||
video_urls = video_urls_dict[format_id]
|
||||
for i, video_url_info in enumerate(video_urls):
|
||||
if len(entries) < i + 1:
|
||||
entries.append({'formats': []})
|
||||
entries[i]['formats'].append(
|
||||
{
|
||||
'url': video_url_info[0],
|
||||
'filesize': video_url_info[-1],
|
||||
'format_id': format_id,
|
||||
'preference': int(self.get_bid(format_id))
|
||||
}
|
||||
)
|
||||
self._sort_formats(formats)
|
||||
title = (get_element_by_id('widget-videotitle', webpage) or
|
||||
clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)))
|
||||
|
||||
for i in range(len(entries)):
|
||||
self._sort_formats(entries[i]['formats'])
|
||||
entries[i].update(
|
||||
{
|
||||
'id': '%s_part%d' % (video_id, i + 1),
|
||||
'title': title,
|
||||
}
|
||||
)
|
||||
|
||||
if len(entries) > 1:
|
||||
info = {
|
||||
'_type': 'multi_video',
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'entries': entries,
|
||||
}
|
||||
else:
|
||||
info = entries[0]
|
||||
info['id'] = video_id
|
||||
info['title'] = title
|
||||
|
||||
return info
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'formats': formats,
|
||||
}
|
||||
|
@@ -64,6 +64,32 @@ class KalturaIE(InfoExtractor):
|
||||
}
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def _extract_url(webpage):
|
||||
mobj = (
|
||||
re.search(
|
||||
r"""(?xs)
|
||||
kWidget\.(?:thumb)?[Ee]mbed\(
|
||||
\{.*?
|
||||
(?P<q1>['\"])wid(?P=q1)\s*:\s*
|
||||
(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?
|
||||
(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*
|
||||
(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),
|
||||
""", webpage) or
|
||||
re.search(
|
||||
r'''(?xs)
|
||||
(?P<q1>["\'])
|
||||
(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?
|
||||
(?P=q1).*?
|
||||
(?:
|
||||
entry_?[Ii]d|
|
||||
(?P<q2>["\'])entry_?[Ii]d(?P=q2)
|
||||
)\s*:\s*
|
||||
(?P<q3>["\'])(?P<id>.+?)(?P=q3)
|
||||
''', webpage))
|
||||
if mobj:
|
||||
return 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict()
|
||||
|
||||
def _kaltura_api_call(self, video_id, actions, *args, **kwargs):
|
||||
params = actions[0]
|
||||
if len(actions) > 1:
|
||||
|
@@ -1,60 +1,74 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
parse_duration,
|
||||
determine_ext,
|
||||
js_to_json,
|
||||
)
|
||||
|
||||
|
||||
class LA7IE(InfoExtractor):
|
||||
IE_NAME = 'la7.tv'
|
||||
_VALID_URL = r'''(?x)
|
||||
https?://(?:www\.)?la7\.tv/
|
||||
(?:
|
||||
richplayer/\?assetid=|
|
||||
\?contentId=
|
||||
)
|
||||
(?P<id>[0-9]+)'''
|
||||
IE_NAME = 'la7.it'
|
||||
_VALID_URL = r'''(?x)(https?://)?(?:
|
||||
(?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/|
|
||||
tg\.la7\.it/repliche-tgla7\?id=
|
||||
)(?P<id>.+)'''
|
||||
|
||||
_TEST = {
|
||||
'url': 'http://www.la7.tv/richplayer/?assetid=50355319',
|
||||
'md5': 'ec7d1f0224d20ba293ab56cf2259651f',
|
||||
_TESTS = [{
|
||||
# 'src' is a plain URL
|
||||
'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
|
||||
'md5': '6054674766e7988d3e02f2148ff92180',
|
||||
'info_dict': {
|
||||
'id': '50355319',
|
||||
'id': 'inccool8-02-10-2015-163722',
|
||||
'ext': 'mp4',
|
||||
'title': 'IL DIVO',
|
||||
'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti e Flavio Bucci',
|
||||
'duration': 6254,
|
||||
'title': 'Inc.Cool8',
|
||||
'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
|
||||
'thumbnail': 're:^https?://.*',
|
||||
},
|
||||
'skip': 'Blocked in the US',
|
||||
}
|
||||
}, {
|
||||
# 'src' is a dictionary
|
||||
'url': 'http://tg.la7.it/repliche-tgla7?id=189080',
|
||||
'md5': '6b0d8888d286e39870208dfeceaf456b',
|
||||
'info_dict': {
|
||||
'id': '189080',
|
||||
'ext': 'mp4',
|
||||
'title': 'TG LA7',
|
||||
},
|
||||
}, {
|
||||
'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id
|
||||
doc = self._download_xml(xml_url, video_id)
|
||||
|
||||
video_title = doc.find('title').text
|
||||
description = doc.find('description').text
|
||||
duration = parse_duration(doc.find('duration').text)
|
||||
thumbnail = doc.find('img').text
|
||||
view_count = int(doc.find('views').text)
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
|
||||
prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:')
|
||||
player_data = self._parse_json(
|
||||
self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'),
|
||||
video_id, transform_source=js_to_json)
|
||||
|
||||
formats = [{
|
||||
'format': vnode.find('quality').text,
|
||||
'tbr': int(vnode.find('quality').text),
|
||||
'url': vnode.find('fms').text.strip().replace('mp4:', prefix),
|
||||
} for vnode in doc.findall('.//videos/video')]
|
||||
source = player_data['src']
|
||||
source_urls = source.values() if isinstance(source, dict) else [source]
|
||||
|
||||
formats = []
|
||||
for source_url in source_urls:
|
||||
ext = determine_ext(source_url)
|
||||
if ext == 'm3u8':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
source_url, video_id, ext='mp4',
|
||||
entry_protocol='m3u8_native', m3u8_id='hls'))
|
||||
else:
|
||||
formats.append({
|
||||
'url': source_url,
|
||||
})
|
||||
self._sort_formats(formats)
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': video_title,
|
||||
'description': description,
|
||||
'thumbnail': thumbnail,
|
||||
'duration': duration,
|
||||
'title': player_data['title'],
|
||||
'description': self._og_search_description(webpage, default=None),
|
||||
'thumbnail': player_data.get('poster'),
|
||||
'formats': formats,
|
||||
'view_count': view_count,
|
||||
}
|
||||
|
@@ -23,6 +23,7 @@ from ..utils import (
|
||||
sanitized_Request,
|
||||
str_or_none,
|
||||
url_basename,
|
||||
urshift,
|
||||
)
|
||||
|
||||
|
||||
@@ -74,15 +75,11 @@ class LeIE(InfoExtractor):
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
@staticmethod
|
||||
def urshift(val, n):
|
||||
return val >> n if val >= 0 else (val + 0x100000000) >> n
|
||||
|
||||
# ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
|
||||
def ror(self, param1, param2):
|
||||
_loc3_ = 0
|
||||
while _loc3_ < param2:
|
||||
param1 = self.urshift(param1, 1) + ((param1 & 1) << 31)
|
||||
param1 = urshift(param1, 1) + ((param1 & 1) << 31)
|
||||
_loc3_ += 1
|
||||
return param1
|
||||
|
||||
|
@@ -1,8 +1,6 @@
|
||||
# encoding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
|
||||
|
||||
@@ -23,34 +21,5 @@ class M6IE(InfoExtractor):
|
||||
}
|
||||
|
||||
def _real_extract(self, url):
|
||||
mobj = re.match(self._VALID_URL, url)
|
||||
video_id = mobj.group('id')
|
||||
|
||||
rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
|
||||
'Downloading video RSS')
|
||||
|
||||
title = rss.find('./channel/item/title').text
|
||||
description = rss.find('./channel/item/description').text
|
||||
thumbnail = rss.find('./channel/item/visuel_clip_big').text
|
||||
duration = int(rss.find('./channel/item/duration').text)
|
||||
view_count = int(rss.find('./channel/item/nombre_vues').text)
|
||||
|
||||
formats = []
|
||||
for format_id in ['lq', 'sd', 'hq', 'hd']:
|
||||
video_url = rss.find('./channel/item/url_video_%s' % format_id)
|
||||
if video_url is None:
|
||||
continue
|
||||
formats.append({
|
||||
'url': video_url.text,
|
||||
'format_id': format_id,
|
||||
})
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'description': description,
|
||||
'thumbnail': thumbnail,
|
||||
'duration': duration,
|
||||
'view_count': view_count,
|
||||
'formats': formats,
|
||||
}
|
||||
video_id = self._match_id(url)
|
||||
return self.url_result('6play:%s' % video_id, 'SixPlay', video_id)
|
||||
|
73
youtube_dl/extractor/meta.py
Normal file
73
youtube_dl/extractor/meta.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .pladform import PladformIE
|
||||
from ..utils import (
|
||||
unescapeHTML,
|
||||
int_or_none,
|
||||
ExtractorError,
|
||||
)
|
||||
|
||||
|
||||
class METAIE(InfoExtractor):
    _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://video.meta.ua/5502115.video',
        'md5': '71b6f3ee274bef16f1ab410f7f56b476',
        'info_dict': {
            'id': '5502115',
            'ext': 'mp4',
            'title': 'Sony Xperia Z camera test [HQ]',
            'description': 'Xperia Z shoots video in FullHD HDR.',
            'uploader_id': 'nomobile',
            'uploader': 'CHЁZA.TV',
            'upload_date': '20130211',
        },
        'add_ie': ['Youtube'],
    }, {
        'url': 'http://video.meta.ua/iframe/5502115',
        'only_matching': True,
    }, {
        # pladform embed
        'url': 'http://video.meta.ua/7121015.video',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract a video.meta.ua video.

        Decodes the uppod player's ``st_html5`` payload when the page
        has one; otherwise hands over to a Pladform embed if one is
        found. Raises an expected ExtractorError when the decoded
        payload carries a 'customnotfound' message.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        st_html5 = self._search_regex(
            r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None)

        if st_html5:
            # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js
            # Each 3-character chunk is a hex code point: wrap it as an
            # HTML numeric character reference and let unescapeHTML
            # turn the sequence back into the JSON text.
            json_str = ''.join(
                '&#x0%s;' % st_html5[i:i + 3]
                for i in range(0, len(st_html5), 3))
            uppod_data = self._parse_json(unescapeHTML(json_str), video_id)
            error = uppod_data.get('customnotfound')
            if error:
                raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)

            video_url = uppod_data['file']
            info = {
                'id': video_id,
                'url': video_url,
                'title': uppod_data.get('comment') or self._og_search_title(webpage),
                'description': self._og_search_description(webpage, default=None),
                'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage),
                'duration': int_or_none(self._og_search_property(
                    'video:duration', webpage, default=None)),
            }
            if 'youtube.com/' in video_url:
                # Delegate to the Youtube extractor while keeping the
                # metadata collected from this page
                info.update({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                })
            return info

        pladform_url = PladformIE._extract_url(webpage)
        if pladform_url:
            return self.url_result(pladform_url)
|
@@ -102,11 +102,11 @@ class MixcloudIE(InfoExtractor):
|
||||
description = self._og_search_description(webpage)
|
||||
like_count = parse_count(self._search_regex(
|
||||
r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
|
||||
webpage, 'like count', fatal=False))
|
||||
webpage, 'like count', default=None))
|
||||
view_count = str_to_int(self._search_regex(
|
||||
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
|
||||
r'/listeners/?">([0-9,.]+)</a>'],
|
||||
webpage, 'play count', fatal=False))
|
||||
webpage, 'play count', default=None))
|
||||
|
||||
return {
|
||||
'id': track_id,
|
||||
|
122
youtube_dl/extractor/msn.py
Normal file
122
youtube_dl/extractor/msn.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import compat_str
|
||||
from ..utils import (
|
||||
determine_ext,
|
||||
ExtractorError,
|
||||
int_or_none,
|
||||
unescapeHTML,
|
||||
)
|
||||
|
||||
|
||||
class MSNIE(InfoExtractor):
    """Extractor for videos on msn.com article and video pages.

    The video description is read from HTML-escaped JSON stored in a
    ``data-metadata`` attribute of the page. When no such metadata is found,
    the text of the ``data-error`` attribute is reported as an expected
    error.
    """

    _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
    _TESTS = [{
        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE',
        'md5': '8442f66c116cbab1ff7098f986983458',
        'info_dict': {
            'id': 'BBqQYNE',
            'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message',
            'ext': 'mp4',
            'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message',
            'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25',
            'duration': 104,
            'uploader': 'CBS Entertainment',
            'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v',
        },
    }, {
        'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
        'only_matching': True,
    }, {
        'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH',
        'only_matching': True,
    }, {
        # geo restricted
        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU',
        'only_matching': True,
    }, {
        'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict (formats, subtitles, metadata) for *url*.

        Raises ExtractorError (expected) with the site's own message when
        the page carries no video metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id, display_id = mobj.group('id', 'display_id')

        webpage = self._download_webpage(url, display_id)

        # A missing attribute falls back to '{}', i.e. an empty (falsy) dict.
        video = self._parse_json(
            self._search_regex(
                r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1',
                webpage, 'video data', default='{}', group='data'),
            display_id, transform_source=unescapeHTML)

        if not video:
            error = unescapeHTML(self._search_regex(
                r'data-error=(["\'])(?P<error>.+?)\1',
                webpage, 'error', group='error'))
            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)

        title = video['title']

        formats = []
        # `or []` also covers a key that is present but null in the JSON.
        for file_ in video.get('videoFiles') or []:
            format_url = file_.get('url')
            if not format_url:
                continue
            ext = determine_ext(format_url)
            # .ism is not yet supported (see
            # https://github.com/rg3/youtube-dl/issues/8118)
            if ext == 'ism':
                continue
            if 'm3u8' in format_url:
                # m3u8_native should not be used here until
                # https://github.com/rg3/youtube-dl/issues/9913 is fixed
                m3u8_formats = self._extract_m3u8_formats(
                    format_url, display_id, 'mp4',
                    m3u8_id='hls', fatal=False)
                # Despite metadata in m3u8 all video+audio formats are
                # actually video-only (no audio)
                for f in m3u8_formats:
                    if f.get('acodec') != 'none' and f.get('vcodec') != 'none':
                        f['acodec'] = 'none'
                formats.extend(m3u8_formats)
            else:
                formats.append({
                    'url': format_url,
                    'ext': 'mp4',
                    'format_id': 'http',
                    'width': int_or_none(file_.get('width')),
                    'height': int_or_none(file_.get('height')),
                })
        self._sort_formats(formats)

        subtitles = {}
        for file_ in video.get('files') or []:
            format_url = file_.get('url')
            format_code = file_.get('formatCode')
            if not format_url or not format_code:
                continue
            # Entries with format code 3100 are collected as subtitles.
            if compat_str(format_code) == '3100':
                subtitles.setdefault(file_.get('culture', 'en'), []).append({
                    'ext': determine_ext(format_url, 'ttml'),
                    'url': format_url,
                })

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': video.get('description'),
            # headlineImage may be absent or null; neither should crash.
            'thumbnail': (video.get('headlineImage') or {}).get('url'),
            'duration': int_or_none(video.get('durationSecs')),
            'uploader': video.get('sourceFriendly'),
            'uploader_id': video.get('providerId'),
            'creator': video.get('creator'),
            'subtitles': subtitles,
            'formats': formats,
        }
|
@@ -85,9 +85,10 @@ class MTVServicesInfoExtractor(InfoExtractor):
|
||||
rtmp_video_url = rendition.find('./src').text
|
||||
if rtmp_video_url.endswith('siteunavail.png'):
|
||||
continue
|
||||
new_url = self._transform_rtmp_url(rtmp_video_url)
|
||||
formats.append({
|
||||
'ext': ext,
|
||||
'url': self._transform_rtmp_url(rtmp_video_url),
|
||||
'ext': 'flv' if new_url.startswith('rtmp') else ext,
|
||||
'url': new_url,
|
||||
'format_id': rendition.get('bitrate'),
|
||||
'width': int(rendition.get('width')),
|
||||
'height': int(rendition.get('height')),
|
||||
|
@@ -1,6 +1,7 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from .theplatform import ThePlatformIE
|
||||
from ..utils import (
|
||||
smuggle_url,
|
||||
url_basename,
|
||||
@@ -61,7 +62,7 @@ class NationalGeographicIE(InfoExtractor):
|
||||
}
|
||||
|
||||
|
||||
class NationalGeographicChannelIE(InfoExtractor):
|
||||
class NationalGeographicChannelIE(ThePlatformIE):
|
||||
IE_NAME = 'natgeo:channel'
|
||||
_VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)'
|
||||
|
||||
@@ -102,12 +103,22 @@ class NationalGeographicChannelIE(InfoExtractor):
|
||||
release_url = self._search_regex(
|
||||
r'video_auth_playlist_url\s*=\s*"([^"]+)"',
|
||||
webpage, 'release url')
|
||||
query = {
|
||||
'mbr': 'true',
|
||||
'switch': 'http',
|
||||
}
|
||||
is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False)
|
||||
if is_auth == 'auth':
|
||||
auth_resource_id = self._search_regex(
|
||||
r"video_auth_resourceId\s*=\s*'([^']+)'",
|
||||
webpage, 'auth resource id')
|
||||
query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or ''
|
||||
|
||||
return {
|
||||
'_type': 'url_transparent',
|
||||
'ie_key': 'ThePlatform',
|
||||
'url': smuggle_url(
|
||||
update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}),
|
||||
update_url_query(release_url, query),
|
||||
{'force_smil_url': True}),
|
||||
'display_id': display_id,
|
||||
}
|
||||
|
@@ -9,10 +9,6 @@ from ..utils import (
|
||||
lowercase_escape,
|
||||
smuggle_url,
|
||||
unescapeHTML,
|
||||
update_url_query,
|
||||
int_or_none,
|
||||
HEADRequest,
|
||||
parse_iso8601,
|
||||
)
|
||||
|
||||
|
||||
@@ -192,9 +188,9 @@ class CSNNEIE(InfoExtractor):
|
||||
|
||||
|
||||
class NBCNewsIE(ThePlatformIE):
|
||||
_VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/
|
||||
_VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/
|
||||
(?:video/.+?/(?P<id>\d+)|
|
||||
([^/]+/)*(?P<display_id>[^/?]+))
|
||||
([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+))
|
||||
'''
|
||||
|
||||
_TESTS = [
|
||||
@@ -216,13 +212,16 @@ class NBCNewsIE(ThePlatformIE):
|
||||
'ext': 'mp4',
|
||||
'title': 'How Twitter Reacted To The Snowden Interview',
|
||||
'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
|
||||
'uploader': 'NBCU-NEWS',
|
||||
'timestamp': 1401363060,
|
||||
'upload_date': '20140529',
|
||||
},
|
||||
},
|
||||
{
|
||||
'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
|
||||
'md5': 'fdbf39ab73a72df5896b6234ff98518a',
|
||||
'info_dict': {
|
||||
'id': 'Wjf9EDR3A_60',
|
||||
'id': '529953347624',
|
||||
'ext': 'mp4',
|
||||
'title': 'FULL EPISODE: Family Business',
|
||||
'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
|
||||
@@ -237,6 +236,9 @@ class NBCNewsIE(ThePlatformIE):
|
||||
'ext': 'mp4',
|
||||
'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
|
||||
'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
|
||||
'timestamp': 1423104900,
|
||||
'uploader': 'NBCU-NEWS',
|
||||
'upload_date': '20150205',
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -245,10 +247,12 @@ class NBCNewsIE(ThePlatformIE):
|
||||
'info_dict': {
|
||||
'id': '529953347624',
|
||||
'ext': 'mp4',
|
||||
'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'',
|
||||
'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
|
||||
'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up',
|
||||
'description': 'md5:c8be487b2d80ff0594c005add88d8351',
|
||||
'upload_date': '20150922',
|
||||
'timestamp': 1442917800,
|
||||
'uploader': 'NBCU-NEWS',
|
||||
},
|
||||
'expected_warnings': ['http-6000 is not available']
|
||||
},
|
||||
{
|
||||
'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
|
||||
@@ -260,6 +264,22 @@ class NBCNewsIE(ThePlatformIE):
|
||||
'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
|
||||
'upload_date': '20160420',
|
||||
'timestamp': 1461152093,
|
||||
'uploader': 'NBCU-NEWS',
|
||||
},
|
||||
},
|
||||
{
|
||||
'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
|
||||
'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
|
||||
'info_dict': {
|
||||
'id': '314487875924',
|
||||
'ext': 'mp4',
|
||||
'title': 'The chaotic GOP immigration vote',
|
||||
'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
|
||||
'thumbnail': 're:^https?://.*\.jpg$',
|
||||
'timestamp': 1406937606,
|
||||
'upload_date': '20140802',
|
||||
'uploader': 'NBCU-NEWS',
|
||||
'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -290,105 +310,28 @@ class NBCNewsIE(ThePlatformIE):
|
||||
}
|
||||
else:
|
||||
# "feature" and "nightly-news" pages use theplatform.com
|
||||
display_id = mobj.group('display_id')
|
||||
webpage = self._download_webpage(url, display_id)
|
||||
info = None
|
||||
bootstrap_json = self._search_regex(
|
||||
[r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
|
||||
r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
|
||||
webpage, 'bootstrap json', default=None)
|
||||
bootstrap = self._parse_json(
|
||||
bootstrap_json, display_id, transform_source=unescapeHTML)
|
||||
if 'results' in bootstrap:
|
||||
info = bootstrap['results'][0]['video']
|
||||
elif 'video' in bootstrap:
|
||||
info = bootstrap['video']
|
||||
else:
|
||||
info = bootstrap
|
||||
video_id = info['mpxId']
|
||||
title = info['title']
|
||||
|
||||
subtitles = {}
|
||||
caption_links = info.get('captionLinks')
|
||||
if caption_links:
|
||||
for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')):
|
||||
sub_url = caption_links.get(sub_key)
|
||||
if sub_url:
|
||||
subtitles.setdefault('en', []).append({
|
||||
'url': sub_url,
|
||||
'ext': sub_ext,
|
||||
})
|
||||
|
||||
formats = []
|
||||
for video_asset in info['videoAssets']:
|
||||
video_url = video_asset.get('publicUrl')
|
||||
if not video_url:
|
||||
continue
|
||||
container = video_asset.get('format')
|
||||
asset_type = video_asset.get('assetType') or ''
|
||||
if container == 'ISM' or asset_type == 'FireTV-Once':
|
||||
continue
|
||||
elif asset_type == 'OnceURL':
|
||||
tp_formats, tp_subtitles = self._extract_theplatform_smil(
|
||||
video_url, video_id)
|
||||
formats.extend(tp_formats)
|
||||
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
|
||||
video_id = mobj.group('mpx_id')
|
||||
if not video_id.isdigit():
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
info = None
|
||||
bootstrap_json = self._search_regex(
|
||||
[r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
|
||||
r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
|
||||
webpage, 'bootstrap json', default=None)
|
||||
bootstrap = self._parse_json(
|
||||
bootstrap_json, video_id, transform_source=unescapeHTML)
|
||||
if 'results' in bootstrap:
|
||||
info = bootstrap['results'][0]['video']
|
||||
elif 'video' in bootstrap:
|
||||
info = bootstrap['video']
|
||||
else:
|
||||
tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000)
|
||||
format_id = 'http%s' % ('-%d' % tbr if tbr else '')
|
||||
video_url = update_url_query(
|
||||
video_url, {'format': 'redirect'})
|
||||
# resolve the url so that we can check availability and detect the correct extension
|
||||
head = self._request_webpage(
|
||||
HEADRequest(video_url), video_id,
|
||||
'Checking %s url' % format_id,
|
||||
'%s is not available' % format_id,
|
||||
fatal=False)
|
||||
if head:
|
||||
video_url = head.geturl()
|
||||
formats.append({
|
||||
'format_id': format_id,
|
||||
'url': video_url,
|
||||
'width': int_or_none(video_asset.get('width')),
|
||||
'height': int_or_none(video_asset.get('height')),
|
||||
'tbr': tbr,
|
||||
'container': video_asset.get('format'),
|
||||
})
|
||||
self._sort_formats(formats)
|
||||
info = bootstrap
|
||||
video_id = info['mpxId']
|
||||
|
||||
return {
|
||||
'_type': 'url_transparent',
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'description': info.get('description'),
|
||||
'thumbnail': info.get('thumbnail'),
|
||||
'duration': int_or_none(info.get('duration')),
|
||||
'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')),
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
# http://feed.theplatform.com/f/2E2eJC/nbcnews also works
|
||||
'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id,
|
||||
'ie_key': 'ThePlatformFeed',
|
||||
}
|
||||
|
||||
|
||||
class MSNBCIE(InfoExtractor):
    """Extractor for msnbc.com ``/watch/`` pages.

    The page's ``embedURL`` meta value is handed on as a URL result, so the
    actual extraction is performed by whichever extractor matches it.
    """

    # https URLs redirect to corresponding http ones
    _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
        'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
        'info_dict': {
            'id': 'n_hayes_Aimm_140801_272214',
            'ext': 'mp4',
            'title': 'The chaotic GOP immigration vote',
            'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1406937606,
            'upload_date': '20140802',
            'uploader': 'NBCU-NEWS',
            'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
        },
    }

    def _real_extract(self, url):
        """Download the page and return a URL result for its ``embedURL``."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        embed_url = self._html_search_meta('embedURL', webpage)
        return self.url_result(embed_url)
|
||||
|
55
youtube_dl/extractor/ninecninemedia.py
Normal file
55
youtube_dl/extractor/ninecninemedia.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
parse_iso8601,
|
||||
parse_duration,
|
||||
ExtractorError
|
||||
)
|
||||
|
||||
|
||||
class NineCNineMediaIE(InfoExtractor):
    """Extractor for ``9c9media:<destination_code>:<id>`` pseudo-URLs.

    Content metadata, its single content package and that package's single
    stack are fetched from the capi.9c9media.com API; HLS, HDS and
    progressive-download manifests of the stack are then collected as
    formats.
    """

    _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)'

    def _real_extract(self, url):
        """Return the info dict for the content addressed by *url*.

        Raises ExtractorError when the content has no content package or
        stack, or more than one of either.
        """
        destination_code, video_id = re.match(self._VALID_URL, url).groups()
        api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id)
        content = self._download_json(api_base_url, video_id, query={
            '$include': '[contentpackages]',
        })
        title = content['Name']

        content_packages = content['ContentPackages']
        # An empty list would otherwise surface as a bare IndexError below.
        if not content_packages:
            raise ExtractorError('no content packages')
        if len(content_packages) > 1:
            raise ExtractorError('multiple content packages')
        content_package = content_packages[0]

        stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id']
        stacks = self._download_json(stacks_base_url, video_id)['Items']
        if not stacks:
            raise ExtractorError('no stacks')
        if len(stacks) > 1:
            raise ExtractorError('multiple stacks')
        stack = stacks[0]

        # Each manifest flavour is requested by appending its suffix.
        stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id'])
        formats = []
        formats.extend(self._extract_m3u8_formats(
            stack_base_url + 'm3u8', video_id, 'mp4',
            'm3u8_native', m3u8_id='hls', fatal=False))
        formats.extend(self._extract_f4m_formats(
            stack_base_url + 'f4m', video_id,
            f4m_id='hds', fatal=False))
        # The 'pd' response body is used as the direct media URL.
        mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False)
        if mp4_url:
            formats.append({
                'url': mp4_url,
                'format_id': 'mp4',
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': content.get('Desc') or content.get('ShortDesc'),
            'timestamp': parse_iso8601(content.get('BroadcastDateTime')),
            'duration': parse_duration(content.get('BroadcastTime')),
            'formats': formats,
        }
|
@@ -516,9 +516,14 @@ class PBSIE(InfoExtractor):
|
||||
# https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
|
||||
if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):
|
||||
continue
|
||||
f_url = re.sub(r'\d+k|baseline', bitrate, http_url)
|
||||
# This may produce invalid links sometimes (e.g.
|
||||
# http://www.pbs.org/wgbh/frontline/film/suicide-plan)
|
||||
if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate):
|
||||
continue
|
||||
f = m3u8_format.copy()
|
||||
f.update({
|
||||
'url': re.sub(r'\d+k|baseline', bitrate, http_url),
|
||||
'url': f_url,
|
||||
'format_id': m3u8_format['format_id'].replace('hls', 'http'),
|
||||
'protocol': 'http',
|
||||
})
|
||||
|
@@ -120,9 +120,12 @@ class PeriscopeUserIE(InfoExtractor):
|
||||
title = user.get('display_name') or user.get('username')
|
||||
description = user.get('description')
|
||||
|
||||
broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or
|
||||
data_store.get('BroadcastCache', {}).get('broadcastIds', []))
|
||||
|
||||
entries = [
|
||||
self.url_result(
|
||||
'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id']))
|
||||
for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])]
|
||||
'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id))
|
||||
for broadcast_id in broadcast_ids]
|
||||
|
||||
return self.playlist_result(entries, user_id, title, description)
|
||||
|
@@ -49,7 +49,7 @@ class PladformIE(InfoExtractor):
|
||||
@staticmethod
|
||||
def _extract_url(webpage):
|
||||
mobj = re.search(
|
||||
r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage)
|
||||
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)
|
||||
if mobj:
|
||||
return mobj.group('url')
|
||||
|
||||
|
95
youtube_dl/extractor/polskieradio.py
Normal file
95
youtube_dl/extractor/polskieradio.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import (
|
||||
compat_str,
|
||||
compat_urllib_parse_unquote,
|
||||
)
|
||||
from ..utils import (
|
||||
int_or_none,
|
||||
strip_or_none,
|
||||
unified_timestamp,
|
||||
)
|
||||
|
||||
|
||||
class PolskieRadioIE(InfoExtractor):
    """Extractor for polskieradio.pl article pages.

    Every distinct media item attached to the article (described by
    ``data-media`` JSON attributes) becomes one playlist entry.
    """

    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
        'info_dict': {
            'id': '1587943',
            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
        },
        'playlist': [{
            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
            'info_dict': {
                'id': '1540576',
                'ext': 'mp3',
                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
                'timestamp': 1456594200,
                'upload_date': '20160227',
                'duration': 2364,
            },
        }],
    }, {
        'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
        'info_dict': {
            'id': '1635803',
            'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
            'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
        },
        'playlist_mincount': 12,
    }, {
        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
        'only_matching': True,
    }, {
        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
        'only_matching': True,
    }, {
        # with mp4 video
        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist of the media items found in the article."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        content = self._search_regex(
            r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>',
            webpage, 'content')

        timestamp = unified_timestamp(self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
            webpage, 'timestamp', fatal=False))

        entries = []

        # URLs already emitted, so the same media is not listed twice.
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
            media = self._parse_json(data_media, playlist_id, fatal=False)
            # With fatal=False a malformed attribute yields no data at all;
            # skip it instead of failing on the attribute access below.
            if not media:
                continue
            if not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'], 'http:')
            if media_url in media_urls:
                continue
            media_urls.add(media_url)
            entries.append({
                'id': compat_str(media['id']),
                'url': media_url,
                'title': compat_urllib_parse_unquote(media['desc']),
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
                'timestamp': timestamp,
            })

        title = self._og_search_title(webpage).strip()
        description = strip_or_none(self._og_search_description(webpage))

        return self.playlist_result(entries, playlist_id, title, description)
|
@@ -25,7 +25,15 @@ from ..aes import (
|
||||
|
||||
|
||||
class PornHubIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
|
||||
IE_DESC = 'PornHub and Thumbzilla'
|
||||
_VALID_URL = r'''(?x)
|
||||
https?://
|
||||
(?:
|
||||
(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
|
||||
(?:www\.)?thumbzilla\.com/video/
|
||||
)
|
||||
(?P<id>[0-9a-z]+)
|
||||
'''
|
||||
_TESTS = [{
|
||||
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
|
||||
'md5': '1e19b41231a02eba417839222ac9d58e',
|
||||
@@ -63,8 +71,20 @@ class PornHubIE(InfoExtractor):
|
||||
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
# removed at the request of cam4.com
|
||||
'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
# removed at the request of the copyright owner
|
||||
'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
# removed by uploader
|
||||
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
@classmethod
|
||||
@@ -87,8 +107,8 @@ class PornHubIE(InfoExtractor):
|
||||
webpage = self._download_webpage(req, video_id)
|
||||
|
||||
error_msg = self._html_search_regex(
|
||||
r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
|
||||
webpage, 'error message', default=None)
|
||||
r'(?s)<div[^>]+class=(["\']).*?\bremoved\b.*?\1[^>]*>(?P<error>.+?)</div>',
|
||||
webpage, 'error message', default=None, group='error')
|
||||
if error_msg:
|
||||
error_msg = re.sub(r'\s+', ' ', error_msg)
|
||||
raise ExtractorError(
|
||||
|
@@ -2,22 +2,19 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
js_to_json,
|
||||
unescapeHTML,
|
||||
int_or_none,
|
||||
)
|
||||
from ..utils import int_or_none
|
||||
|
||||
|
||||
class R7IE(InfoExtractor):
|
||||
_VALID_URL = r'''(?x)https?://
|
||||
_VALID_URL = r'''(?x)
|
||||
https?://
|
||||
(?:
|
||||
(?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
|
||||
noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
|
||||
player\.r7\.com/video/i/
|
||||
)
|
||||
(?P<id>[\da-f]{24})
|
||||
'''
|
||||
'''
|
||||
_TESTS = [{
|
||||
'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
|
||||
'md5': '403c4e393617e8e8ddc748978ee8efde',
|
||||
@@ -25,6 +22,7 @@ class R7IE(InfoExtractor):
|
||||
'id': '54e7050b0cf2ff57e0279389',
|
||||
'ext': 'mp4',
|
||||
'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
|
||||
'description': 'md5:01812008664be76a6479aa58ec865b72',
|
||||
'thumbnail': 're:^https?://.*\.jpg$',
|
||||
'duration': 98,
|
||||
'like_count': int,
|
||||
@@ -44,45 +42,72 @@ class R7IE(InfoExtractor):
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
|
||||
webpage = self._download_webpage(
|
||||
'http://player.r7.com/video/i/%s' % video_id, video_id)
|
||||
video = self._download_json(
|
||||
'http://player-api.r7.com/video/i/%s' % video_id, video_id)
|
||||
|
||||
item = self._parse_json(js_to_json(self._search_regex(
|
||||
r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id)
|
||||
|
||||
title = unescapeHTML(item['title'])
|
||||
thumbnail = item.get('init', {}).get('thumbUri')
|
||||
duration = None
|
||||
|
||||
statistics = item.get('statistics', {})
|
||||
like_count = int_or_none(statistics.get('likes'))
|
||||
view_count = int_or_none(statistics.get('views'))
|
||||
title = video['title']
|
||||
|
||||
formats = []
|
||||
for format_key, format_dict in item['playlist'][0].items():
|
||||
src = format_dict.get('src')
|
||||
if not src:
|
||||
continue
|
||||
format_id = format_dict.get('format') or format_key
|
||||
if duration is None:
|
||||
duration = format_dict.get('duration')
|
||||
if '.f4m' in src:
|
||||
formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
|
||||
elif src.endswith('.m3u8'):
|
||||
formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
|
||||
else:
|
||||
formats.append({
|
||||
'url': src,
|
||||
'format_id': format_id,
|
||||
})
|
||||
media_url_hls = video.get('media_url_hls')
|
||||
if media_url_hls:
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
|
||||
m3u8_id='hls', fatal=False))
|
||||
media_url = video.get('media_url')
|
||||
if media_url:
|
||||
f = {
|
||||
'url': media_url,
|
||||
'format_id': 'http',
|
||||
}
|
||||
# m3u8 format always matches the http format, let's copy metadata from
|
||||
# one to another
|
||||
m3u8_formats = list(filter(
|
||||
lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
|
||||
formats))
|
||||
if len(m3u8_formats) == 1:
|
||||
f_copy = m3u8_formats[0].copy()
|
||||
f_copy.update(f)
|
||||
f_copy['protocol'] = 'http'
|
||||
f = f_copy
|
||||
formats.append(f)
|
||||
self._sort_formats(formats)
|
||||
|
||||
description = video.get('description')
|
||||
thumbnail = video.get('thumb')
|
||||
duration = int_or_none(video.get('media_duration'))
|
||||
like_count = int_or_none(video.get('likes'))
|
||||
view_count = int_or_none(video.get('views'))
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'description': description,
|
||||
'thumbnail': thumbnail,
|
||||
'duration': duration,
|
||||
'like_count': like_count,
|
||||
'view_count': view_count,
|
||||
'formats': formats,
|
||||
}
|
||||
|
||||
|
||||
class R7ArticleIE(InfoExtractor):
    """Extractor for r7.com article pages that embed an R7 player.

    The 24-hex-digit player id is located in the page markup and extraction
    is delegated to R7IE through the player URL.
    """

    _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
    _TEST = {
        'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
        'only_matching': True,
    }

    @classmethod
    def suitable(cls, url):
        """Decline any URL that R7IE itself accepts."""
        if R7IE.suitable(url):
            return False
        return super(R7ArticleIE, cls).suitable(url)

    def _real_extract(self, url):
        """Find the embedded player id and return a URL result for R7IE."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        player_id_re = r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})'
        video_id = self._search_regex(player_id_re, webpage, 'video id')

        player_url = 'http://player.r7.com/video/i/%s' % video_id
        return self.url_result(player_url, R7IE.ie_key())
|
||||
|
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import(
|
||||
from ..utils import (
|
||||
unified_strdate,
|
||||
str_to_int,
|
||||
)
|
||||
|
@@ -1,47 +1,146 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import (
|
||||
compat_urllib_parse,
|
||||
compat_urlparse,
|
||||
)
|
||||
from ..compat import compat_urlparse
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
determine_ext,
|
||||
ExtractorError,
|
||||
find_xpath_attr,
|
||||
fix_xml_ampersands,
|
||||
int_or_none,
|
||||
parse_duration,
|
||||
unified_strdate,
|
||||
int_or_none,
|
||||
update_url_query,
|
||||
xpath_text,
|
||||
)
|
||||
|
||||
|
||||
class RaiTVIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
|
||||
class RaiBaseIE(InfoExtractor):
    # Helpers shared by the Rai extractors: resolving a relinker URL into
    # formats and building an info dict from a ContentItem JSON document.

    def _extract_relinker_formats(self, relinker_url, video_id):
        # Query the relinker once per platform and collect every format it
        # yields. Returns an unsorted list of format dicts; calls
        # raise_geo_restricted() when the relinker answers with the
        # "video not available" placeholder URL.
        formats = []

        for platform in ('mon', 'flash', 'native'):
            headers = {}
            # TODO: rename --cn-verification-proxy
            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
            if cn_verification_proxy:
                headers['Ytdl-request-proxy'] = cn_verification_proxy

            relinker = self._download_xml(
                relinker_url, video_id,
                note='Downloading XML metadata for platform %s' % platform,
                transform_source=fix_xml_ampersands,
                query={'output': 45, 'pl': platform}, headers=headers)

            # find_xpath_attr() returns None when no matching element
            # exists; skip this platform instead of failing on `.text`.
            url_el = find_xpath_attr(relinker, './url', 'type', 'content')
            media_url = url_el.text if url_el is not None else None
            if not media_url:
                continue
            if media_url == 'http://download.rai.it/video_no_available.mp4':
                self.raise_geo_restricted()

            ext = determine_ext(media_url)
            # Manifests are only taken from their matching platform
            # (m3u8 from 'mon', f4m from 'flash').
            if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
                continue

            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False))
            elif ext == 'f4m':
                manifest_url = update_url_query(
                    media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
                    {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
                formats.extend(self._extract_f4m_formats(
                    manifest_url, video_id, f4m_id='hds', fatal=False))
            else:
                bitrate = int_or_none(xpath_text(relinker, 'bitrate'))
                # int_or_none() may return None; comparing None with an int
                # raises TypeError on Python 3, so test for it explicitly.
                has_bitrate = bitrate is not None and bitrate > 0
                formats.append({
                    'url': media_url,
                    'tbr': bitrate if has_bitrate else None,
                    'format_id': 'http-%d' % bitrate if has_bitrate else 'http',
                })

        return formats

    def _extract_from_content_id(self, content_id, base_url):
        # Download the ContentItem JSON for content_id and build the info
        # dict. base_url is used to resolve relative thumbnail URLs.
        # Raises ExtractorError when the item is neither audio nor video.
        media = self._download_json(
            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
            content_id, 'Downloading video JSON')

        thumbnails = []
        for image_type in ('image', 'image_medium', 'image_300'):
            thumbnail_url = media.get(image_type)
            if thumbnail_url:
                thumbnails.append({
                    'url': compat_urlparse.urljoin(base_url, thumbnail_url),
                })

        formats = []
        media_type = media['type']
        if 'Audio' in media_type:
            formats.append({
                'format_id': media.get('formatoAudio'),
                'url': media['audioUrl'],
                'ext': media.get('formatoAudio'),
            })
        elif 'Video' in media_type:
            formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id))
            self._sort_formats(formats)
        else:
            raise ExtractorError('not a media file')

        subtitles = {}
        captions = media.get('subtitlesUrl')
        if captions:
            STL_EXT = '.stl'
            SRT_EXT = '.srt'
            # A .stl caption URL is rewritten to the same path with .srt.
            if captions.endswith(STL_EXT):
                captions = captions[:-len(STL_EXT)] + SRT_EXT
            subtitles['it'] = [{
                'ext': 'srt',
                'url': captions,
            }]

        return {
            'id': content_id,
            'title': media['name'],
            'description': media.get('desc'),
            'thumbnails': thumbnails,
            'uploader': media.get('author'),
            'upload_date': unified_strdate(media.get('date')),
            'duration': parse_duration(media.get('length')),
            'formats': formats,
            'subtitles': subtitles,
        }
|
||||
|
||||
|
||||
class RaiTVIE(RaiBaseIE):
|
||||
_VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
|
||||
_TESTS = [
|
||||
{
|
||||
'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
|
||||
'md5': '96382709b61dd64a6b88e0f791e6df4c',
|
||||
'md5': '8970abf8caf8aef4696e7b1f2adfc696',
|
||||
'info_dict': {
|
||||
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
|
||||
'ext': 'flv',
|
||||
'ext': 'mp4',
|
||||
'title': 'Report del 07/04/2014',
|
||||
'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
|
||||
'upload_date': '20140407',
|
||||
'duration': 6160,
|
||||
'thumbnail': 're:^https?://.*\.jpg$',
|
||||
}
|
||||
},
|
||||
{
|
||||
# no m3u8 stream
|
||||
'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
|
||||
'md5': 'd9751b78eac9710d62c2447b224dea39',
|
||||
# HDS download, MD5 is unstable
|
||||
'info_dict': {
|
||||
'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
|
||||
'ext': 'flv',
|
||||
'title': 'TG PRIMO TEMPO',
|
||||
'upload_date': '20140612',
|
||||
'duration': 1758,
|
||||
'thumbnail': 're:^https?://.*\.jpg$',
|
||||
},
|
||||
'skip': 'Geo-restricted to Italy',
|
||||
},
|
||||
{
|
||||
'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html',
|
||||
@@ -67,127 +166,70 @@ class RaiTVIE(InfoExtractor):
|
||||
},
|
||||
{
|
||||
'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html',
|
||||
'md5': '496ab63e420574447f70d02578333437',
|
||||
'md5': 'e57493e1cb8bc7c564663f363b171847',
|
||||
'info_dict': {
|
||||
'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',
|
||||
'ext': 'flv',
|
||||
'ext': 'mp4',
|
||||
'title': 'Il Candidato - Primo episodio: "Le Primarie"',
|
||||
'description': 'md5:364b604f7db50594678f483353164fb8',
|
||||
'upload_date': '20140923',
|
||||
'duration': 386,
|
||||
'thumbnail': 're:^https?://.*\.jpg$',
|
||||
}
|
||||
},
|
||||
]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
media = self._download_json(
|
||||
'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id,
|
||||
video_id, 'Downloading video JSON')
|
||||
|
||||
thumbnails = []
|
||||
for image_type in ('image', 'image_medium', 'image_300'):
|
||||
thumbnail_url = media.get(image_type)
|
||||
if thumbnail_url:
|
||||
thumbnails.append({
|
||||
'url': thumbnail_url,
|
||||
})
|
||||
|
||||
subtitles = []
|
||||
formats = []
|
||||
media_type = media['type']
|
||||
if 'Audio' in media_type:
|
||||
formats.append({
|
||||
'format_id': media.get('formatoAudio'),
|
||||
'url': media['audioUrl'],
|
||||
'ext': media.get('formatoAudio'),
|
||||
})
|
||||
elif 'Video' in media_type:
|
||||
def fix_xml(xml):
|
||||
return xml.replace(' tag elementi', '').replace('>/', '</')
|
||||
|
||||
relinker = self._download_xml(
|
||||
media['mediaUri'] + '&output=43',
|
||||
video_id, transform_source=fix_xml)
|
||||
|
||||
has_subtitle = False
|
||||
|
||||
for element in relinker.findall('element'):
|
||||
media_url = xpath_text(element, 'url')
|
||||
ext = determine_ext(media_url)
|
||||
content_type = xpath_text(element, 'content-type')
|
||||
if ext == 'm3u8':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
media_url, video_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id='hls', fatal=False))
|
||||
elif ext == 'f4m':
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
|
||||
video_id, f4m_id='hds', fatal=False))
|
||||
elif ext == 'stl':
|
||||
has_subtitle = True
|
||||
elif content_type.startswith('video/'):
|
||||
bitrate = int_or_none(xpath_text(element, 'bitrate'))
|
||||
formats.append({
|
||||
'url': media_url,
|
||||
'tbr': bitrate if bitrate > 0 else None,
|
||||
'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
|
||||
})
|
||||
elif content_type.startswith('image/'):
|
||||
thumbnails.append({
|
||||
'url': media_url,
|
||||
})
|
||||
|
||||
self._sort_formats(formats)
|
||||
|
||||
if has_subtitle:
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
subtitles = self._get_subtitles(video_id, webpage)
|
||||
else:
|
||||
raise ExtractorError('not a media file')
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': media['name'],
|
||||
'description': media.get('desc'),
|
||||
'thumbnails': thumbnails,
|
||||
'uploader': media.get('author'),
|
||||
'upload_date': unified_strdate(media.get('date')),
|
||||
'duration': parse_duration(media.get('length')),
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
|
||||
def _get_subtitles(self, video_id, webpage):
|
||||
subtitles = {}
|
||||
m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage)
|
||||
if m:
|
||||
captions = m.group('captions')
|
||||
STL_EXT = '.stl'
|
||||
SRT_EXT = '.srt'
|
||||
if captions.endswith(STL_EXT):
|
||||
captions = captions[:-len(STL_EXT)] + SRT_EXT
|
||||
subtitles['it'] = [{
|
||||
'ext': 'srt',
|
||||
'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions),
|
||||
}]
|
||||
return subtitles
|
||||
return self._extract_from_content_id(video_id, url)
|
||||
|
||||
|
||||
class RaiIE(InfoExtractor):
|
||||
class RaiIE(RaiBaseIE):
|
||||
_VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
|
||||
_TESTS = [
|
||||
{
|
||||
'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
|
||||
'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7',
|
||||
'md5': '2dd727e61114e1ee9c47f0da6914e178',
|
||||
'info_dict': {
|
||||
'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
|
||||
'ext': 'flv',
|
||||
'ext': 'mp4',
|
||||
'title': 'Il pacco',
|
||||
'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
|
||||
'upload_date': '20141221',
|
||||
},
|
||||
}
|
||||
},
|
||||
{
|
||||
# Direct relinker URL
|
||||
'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
|
||||
# HDS live stream, MD5 is unstable
|
||||
'info_dict': {
|
||||
'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
|
||||
'ext': 'flv',
|
||||
'title': 'EuroNews',
|
||||
},
|
||||
'skip': 'Geo-restricted to Italy',
|
||||
},
|
||||
{
|
||||
# Embedded content item ID
|
||||
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
|
||||
'md5': '84c1135ce960e8822ae63cec34441d63',
|
||||
'info_dict': {
|
||||
'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8',
|
||||
'ext': 'mp4',
|
||||
'title': 'TG1 ore 20:00 del 02/07/2016',
|
||||
'upload_date': '20160702',
|
||||
},
|
||||
},
|
||||
{
|
||||
'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
|
||||
# HDS live stream, MD5 is unstable
|
||||
'info_dict': {
|
||||
'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
|
||||
'ext': 'flv',
|
||||
'title': 'La diretta di Rainews24',
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
@classmethod
|
||||
@@ -201,7 +243,30 @@ class RaiIE(InfoExtractor):
|
||||
iframe_url = self._search_regex(
|
||||
[r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',
|
||||
r'drawMediaRaiTV\(["\'](.+?)["\']'],
|
||||
webpage, 'iframe')
|
||||
if not iframe_url.startswith('http'):
|
||||
iframe_url = compat_urlparse.urljoin(url, iframe_url)
|
||||
return self.url_result(iframe_url)
|
||||
webpage, 'iframe', default=None)
|
||||
if iframe_url:
|
||||
if not iframe_url.startswith('http'):
|
||||
iframe_url = compat_urlparse.urljoin(url, iframe_url)
|
||||
return self.url_result(iframe_url)
|
||||
|
||||
content_item_id = self._search_regex(
|
||||
r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)',
|
||||
webpage, 'content item ID', group='content_id', default=None)
|
||||
if content_item_id:
|
||||
return self._extract_from_content_id(content_item_id, url)
|
||||
|
||||
relinker_url = compat_urlparse.urljoin(url, self._search_regex(
|
||||
r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)',
|
||||
webpage, 'relinker URL', group='url'))
|
||||
formats = self._extract_relinker_formats(relinker_url, video_id)
|
||||
self._sort_formats(formats)
|
||||
|
||||
title = self._search_regex(
|
||||
r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1',
|
||||
webpage, 'title', group='title', default=None) or self._og_search_title(webpage)
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'formats': formats,
|
||||
}
|
||||
|
@@ -1,23 +1,23 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
parse_duration,
|
||||
parse_iso8601,
|
||||
js_to_json,
|
||||
)
|
||||
from ..compat import compat_str
|
||||
|
||||
|
||||
class RDSIE(InfoExtractor):
|
||||
IE_DESC = 'RDS.ca'
|
||||
_VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'
|
||||
_VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+'
|
||||
|
||||
_TESTS = [{
|
||||
'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
|
||||
'info_dict': {
|
||||
'id': '3.1132799',
|
||||
'id': '604333',
|
||||
'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
|
||||
'ext': 'mp4',
|
||||
'title': 'Fowler Jr. prend la direction de Jacksonville',
|
||||
@@ -33,22 +33,17 @@ class RDSIE(InfoExtractor):
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
mobj = re.match(self._VALID_URL, url)
|
||||
video_id = mobj.group('id')
|
||||
display_id = mobj.group('display_id')
|
||||
display_id = self._match_id(url)
|
||||
|
||||
webpage = self._download_webpage(url, display_id)
|
||||
|
||||
# TODO: extract f4m from 9c9media.com
|
||||
video_url = self._search_regex(
|
||||
r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
|
||||
webpage, 'video url')
|
||||
|
||||
title = self._og_search_title(webpage) or self._html_search_meta(
|
||||
item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json)
|
||||
video_id = compat_str(item['id'])
|
||||
title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta(
|
||||
'title', webpage, 'title', fatal=True)
|
||||
description = self._og_search_description(webpage) or self._html_search_meta(
|
||||
'description', webpage, 'description')
|
||||
thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
|
||||
thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex(
|
||||
[r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
|
||||
r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
|
||||
webpage, 'thumbnail', fatal=False)
|
||||
@@ -61,13 +56,15 @@ class RDSIE(InfoExtractor):
|
||||
age_limit = self._family_friendly_search(webpage)
|
||||
|
||||
return {
|
||||
'_type': 'url_transparent',
|
||||
'id': video_id,
|
||||
'display_id': display_id,
|
||||
'url': video_url,
|
||||
'url': '9c9media:rds_web:%s' % video_id,
|
||||
'title': title,
|
||||
'description': description,
|
||||
'thumbnail': thumbnail,
|
||||
'timestamp': timestamp,
|
||||
'duration': duration,
|
||||
'age_limit': age_limit,
|
||||
'ie_key': 'NineCNineMedia',
|
||||
}
|
||||
|
60
youtube_dl/extractor/sixplay.py
Normal file
60
youtube_dl/extractor/sixplay.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
qualities,
|
||||
int_or_none,
|
||||
)
|
||||
|
||||
|
||||
class SixPlayIE(InfoExtractor):
    # Extractor for 6play.fr clips, driven by the m6web player config JSON.
    _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320',
        'md5': '42310bffe4ba3982db112b9cd3467328',
        'info_dict': {
            'id': '11495320',
            'ext': 'mp4',
            'title': 'Jamel et ses amis au Marrakech du rire 2015',
            'description': 'md5:ba2149d5c321d5201b78070ee839d872',
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        config = self._download_json(
            'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id,
            video_id)
        info = config['videoInfo']

        # Ranks the progressive MP4 variants from lowest to highest quality.
        rank_quality = qualities(['lq', 'sd', 'hq', 'hd'])

        formats = []
        for entry in config['sources']:
            kind = entry.get('type')
            src = entry.get('src')
            # Entries without a URL and 'hls/primetime' entries are ignored.
            if kind == 'hls/primetime' or not src:
                continue
            if kind == 'application/vnd.apple.mpegURL':
                hls_formats = self._extract_m3u8_formats(
                    src, video_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False)
                formats.extend(hls_formats)
                # An HDS manifest is tried at the same URL with .f4m
                # substituted for .m3u8.
                hds_formats = self._extract_f4m_formats(
                    src.replace('.m3u8', '.f4m'),
                    video_id, f4m_id='hds', fatal=False)
                formats.extend(hds_formats)
            elif kind == 'video/mp4':
                label = entry.get('quality')
                formats.append({
                    'url': src,
                    'format_id': label,
                    'quality': rank_quality(label),
                })
        self._sort_formats(formats)

        result = {
            'id': video_id,
            'title': info['title'].strip(),
            'description': info.get('description'),
            'duration': int_or_none(info.get('duration')),
            'series': info.get('titlePgm'),
            'formats': formats,
        }
        return result
|
@@ -67,7 +67,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE):
|
||||
|
||||
|
||||
class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE):
|
||||
IE_NAME = 'skynewsarabia:video'
|
||||
IE_NAME = 'skynewsarabia:article'
|
||||
_VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)'
|
||||
_TESTS = [{
|
||||
'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9',
|
||||
|
33
youtube_dl/extractor/skysports.py
Normal file
33
youtube_dl/extractor/skysports.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
|
||||
|
||||
class SkySportsIE(InfoExtractor):
    # Extractor for skysports.com video pages; the media itself is handed
    # to the Ooyala extractor through a url_transparent result.
    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine',
        'md5': 'c44a1db29f27daf9a0003e010af82100',
        'info_dict': {
            'id': '10328419',
            'ext': 'flv',
            'title': 'Bale: Its our time to shine',
            'description': 'md5:9fd1de3614d525f5addda32ac3c482c9',
        },
        'add_ie': ['Ooyala'],
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        # The Ooyala embed code is read from the data-video-id attribute.
        ooyala_id = self._search_regex(
            r'data-video-id="([^"]+)"', page, 'ooyala id')
        title = self._og_search_title(page)
        description = self._og_search_description(page)

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'url': 'ooyala:%s' % ooyala_id,
            'title': title,
            'description': description,
            'ie_key': 'Ooyala',
        }
|
38
youtube_dl/extractor/sportschau.py
Normal file
38
youtube_dl/extractor/sportschau.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .wdr import WDRBaseIE
|
||||
from ..utils import get_element_by_attribute
|
||||
|
||||
|
||||
class SportschauIE(WDRBaseIE):
    # Extractor for sportschau.de video pages; media extraction is done by
    # WDRBaseIE._extract_wdr_video, title and description come from the page.
    IE_NAME = 'Sportschau'
    _VALID_URL = r'https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video-?(?P<id>[^/#?]+)\.html'
    _TEST = {
        'url': 'http://www.sportschau.de/uefaeuro2016/videos/video-dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100.html',
        'info_dict': {
            'id': 'mdb-1140188',
            'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100',
            'ext': 'mp4',
            'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen',
            'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.',
            'upload_date': '20160615',
        },
        'skip': 'Geo-restricted to Germany',
    }

    def _real_extract(self, url):
        page_id = self._match_id(url)
        page = self._download_webpage(url, page_id)

        # Page-level metadata is read before the WDR media extraction runs.
        headline = get_element_by_attribute('class', 'headline', page)
        summary = self._html_search_meta('description', page, 'description')

        result = self._extract_wdr_video(page, page_id)
        # The page headline and meta description replace whatever the WDR
        # helper provided for these two fields.
        result.update({'title': headline, 'description': summary})
        return result
|
@@ -9,6 +9,7 @@ from ..utils import (
|
||||
|
||||
|
||||
class SRMediathekIE(ARDMediathekIE):
|
||||
IE_NAME = 'sr:mediathek'
|
||||
IE_DESC = 'Saarländischer Rundfunk'
|
||||
_VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
|
||||
|
||||
|
@@ -6,7 +6,6 @@ import re
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
sanitized_Request,
|
||||
urlencode_postdata,
|
||||
)
|
||||
|
||||
@@ -45,20 +44,26 @@ class StreamcloudIE(InfoExtractor):
|
||||
(?:id="[^"]+"\s+)?
|
||||
value="([^"]*)"
|
||||
''', orig_webpage)
|
||||
post = urlencode_postdata(fields)
|
||||
|
||||
self._sleep(12, video_id)
|
||||
headers = {
|
||||
b'Content-Type': b'application/x-www-form-urlencoded',
|
||||
}
|
||||
req = sanitized_Request(url, post, headers)
|
||||
|
||||
webpage = self._download_webpage(
|
||||
req, video_id, note='Downloading video page ...')
|
||||
title = self._html_search_regex(
|
||||
r'<h1[^>]*>([^<]+)<', webpage, 'title')
|
||||
video_url = self._search_regex(
|
||||
r'file:\s*"([^"]+)"', webpage, 'video URL')
|
||||
url, video_id, data=urlencode_postdata(fields), headers={
|
||||
b'Content-Type': b'application/x-www-form-urlencoded',
|
||||
})
|
||||
|
||||
try:
|
||||
title = self._html_search_regex(
|
||||
r'<h1[^>]*>([^<]+)<', webpage, 'title')
|
||||
video_url = self._search_regex(
|
||||
r'file:\s*"([^"]+)"', webpage, 'video URL')
|
||||
except ExtractorError:
|
||||
message = self._html_search_regex(
|
||||
r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>',
|
||||
webpage, 'message', default=None, group='message')
|
||||
if message:
|
||||
raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
|
||||
raise
|
||||
thumbnail = self._search_regex(
|
||||
r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False)
|
||||
|
||||
|
@@ -6,17 +6,14 @@ import re
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
determine_ext,
|
||||
dict_get,
|
||||
int_or_none,
|
||||
try_get,
|
||||
)
|
||||
|
||||
|
||||
class SVTBaseIE(InfoExtractor):
|
||||
def _extract_video(self, url, video_id):
|
||||
info = self._download_json(url, video_id)
|
||||
|
||||
title = info['context']['title']
|
||||
thumbnail = info['context'].get('thumbnailImage')
|
||||
|
||||
video_info = info['video']
|
||||
def _extract_video(self, video_info, video_id):
|
||||
formats = []
|
||||
for vr in video_info['videoReferences']:
|
||||
player_type = vr.get('playerType')
|
||||
@@ -40,27 +37,49 @@ class SVTBaseIE(InfoExtractor):
|
||||
'format_id': player_type,
|
||||
'url': vurl,
|
||||
})
|
||||
if not formats and video_info.get('rights', {}).get('geoBlockedSweden'):
|
||||
self.raise_geo_restricted('This video is only available in Sweden')
|
||||
self._sort_formats(formats)
|
||||
|
||||
subtitles = {}
|
||||
subtitle_references = video_info.get('subtitleReferences')
|
||||
subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
|
||||
if isinstance(subtitle_references, list):
|
||||
for sr in subtitle_references:
|
||||
subtitle_url = sr.get('url')
|
||||
subtitle_lang = sr.get('language', 'sv')
|
||||
if subtitle_url:
|
||||
subtitles.setdefault('sv', []).append({'url': subtitle_url})
|
||||
if determine_ext(subtitle_url) == 'm3u8':
|
||||
# TODO(yan12125): handle WebVTT in m3u8 manifests
|
||||
continue
|
||||
|
||||
duration = video_info.get('materialLength')
|
||||
age_limit = 18 if video_info.get('inappropriateForChildren') else 0
|
||||
subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
|
||||
|
||||
title = video_info.get('title')
|
||||
|
||||
series = video_info.get('programTitle')
|
||||
season_number = int_or_none(video_info.get('season'))
|
||||
episode = video_info.get('episodeTitle')
|
||||
episode_number = int_or_none(video_info.get('episodeNumber'))
|
||||
|
||||
duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))
|
||||
age_limit = None
|
||||
adult = dict_get(
|
||||
video_info, ('inappropriateForChildren', 'blockedForChildren'),
|
||||
skip_false_values=False)
|
||||
if adult is not None:
|
||||
age_limit = 18 if adult else 0
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
'thumbnail': thumbnail,
|
||||
'duration': duration,
|
||||
'age_limit': age_limit,
|
||||
'series': series,
|
||||
'season_number': season_number,
|
||||
'episode': episode,
|
||||
'episode_number': episode_number,
|
||||
}
|
||||
|
||||
|
||||
@@ -68,11 +87,11 @@ class SVTIE(SVTBaseIE):
|
||||
_VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
|
||||
_TEST = {
|
||||
'url': 'http://www.svt.se/wd?widgetId=23991§ionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
|
||||
'md5': '9648197555fc1b49e3dc22db4af51d46',
|
||||
'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
|
||||
'info_dict': {
|
||||
'id': '2900353',
|
||||
'ext': 'flv',
|
||||
'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
|
||||
'ext': 'mp4',
|
||||
'title': 'Stjärnorna skojar till det - under SVT-intervjun',
|
||||
'duration': 27,
|
||||
'age_limit': 0,
|
||||
},
|
||||
@@ -89,15 +108,20 @@ class SVTIE(SVTBaseIE):
|
||||
mobj = re.match(self._VALID_URL, url)
|
||||
widget_id = mobj.group('widget_id')
|
||||
article_id = mobj.group('id')
|
||||
return self._extract_video(
|
||||
|
||||
info = self._download_json(
|
||||
'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
|
||||
article_id)
|
||||
|
||||
info_dict = self._extract_video(info['video'], article_id)
|
||||
info_dict['title'] = info['context']['title']
|
||||
return info_dict
|
||||
|
||||
|
||||
class SVTPlayIE(SVTBaseIE):
|
||||
IE_DESC = 'SVT Play and Öppet arkiv'
|
||||
_VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
|
||||
_TEST = {
|
||||
_VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)'
|
||||
_TESTS = [{
|
||||
'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
|
||||
'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
|
||||
'info_dict': {
|
||||
@@ -113,12 +137,50 @@ class SVTPlayIE(SVTBaseIE):
|
||||
}]
|
||||
},
|
||||
},
|
||||
}
|
||||
}, {
|
||||
# geo restricted to Sweden
|
||||
'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
mobj = re.match(self._VALID_URL, url)
|
||||
video_id = mobj.group('id')
|
||||
host = mobj.group('host')
|
||||
return self._extract_video(
|
||||
'http://www.%s.se/video/%s?output=json' % (host, video_id),
|
||||
video_id)
|
||||
video_id = self._match_id(url)
|
||||
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
|
||||
data = self._parse_json(
|
||||
self._search_regex(
|
||||
r'root\["__svtplay"\]\s*=\s*([^;]+);',
|
||||
webpage, 'embedded data', default='{}'),
|
||||
video_id, fatal=False)
|
||||
|
||||
thumbnail = self._og_search_thumbnail(webpage)
|
||||
|
||||
if data:
|
||||
video_info = try_get(
|
||||
data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
|
||||
dict)
|
||||
if video_info:
|
||||
info_dict = self._extract_video(video_info, video_id)
|
||||
info_dict.update({
|
||||
'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
|
||||
'thumbnail': thumbnail,
|
||||
})
|
||||
return info_dict
|
||||
|
||||
video_id = self._search_regex(
|
||||
r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
|
||||
webpage, 'video id', default=None)
|
||||
|
||||
if video_id:
|
||||
data = self._download_json(
|
||||
'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id)
|
||||
info_dict = self._extract_video(data, video_id)
|
||||
if not info_dict.get('title'):
|
||||
info_dict['title'] = re.sub(
|
||||
r'\s*\|\s*.+?$', '',
|
||||
info_dict.get('episode') or self._og_search_title(webpage))
|
||||
return info_dict
|
||||
|
@@ -48,6 +48,6 @@ class TF1IE(InfoExtractor):
|
||||
video_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
wat_id = self._html_search_regex(
|
||||
r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8}).*?\1',
|
||||
r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
|
||||
webpage, 'wat id', group='id')
|
||||
return self.url_result('wat:%s' % wat_id, 'Wat')
|
||||
|
@@ -6,6 +6,7 @@ import time
|
||||
import hmac
|
||||
import binascii
|
||||
import hashlib
|
||||
import netrc
|
||||
|
||||
|
||||
from .once import OnceIE
|
||||
@@ -24,6 +25,9 @@ from ..utils import (
|
||||
xpath_with_ns,
|
||||
mimetype2ext,
|
||||
find_xpath_attr,
|
||||
unescapeHTML,
|
||||
urlencode_postdata,
|
||||
unified_timestamp,
|
||||
)
|
||||
|
||||
default_ns = 'http://www.w3.org/2005/SMIL21/Language'
|
||||
@@ -62,10 +66,11 @@ class ThePlatformBaseIE(OnceIE):
|
||||
|
||||
return formats, subtitles
|
||||
|
||||
def get_metadata(self, path, video_id):
|
||||
def _download_theplatform_metadata(self, path, video_id):
|
||||
info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
|
||||
info = self._download_json(info_url, video_id)
|
||||
return self._download_json(info_url, video_id)
|
||||
|
||||
def _parse_theplatform_metadata(self, info):
|
||||
subtitles = {}
|
||||
captions = info.get('captions')
|
||||
if isinstance(captions, list):
|
||||
@@ -86,6 +91,10 @@ class ThePlatformBaseIE(OnceIE):
|
||||
'uploader': info.get('billingCode'),
|
||||
}
|
||||
|
||||
def _extract_theplatform_metadata(self, path, video_id):
|
||||
info = self._download_theplatform_metadata(path, video_id)
|
||||
return self._parse_theplatform_metadata(info)
|
||||
|
||||
|
||||
class ThePlatformIE(ThePlatformBaseIE):
|
||||
_VALID_URL = r'''(?x)
|
||||
@@ -158,6 +167,7 @@ class ThePlatformIE(ThePlatformBaseIE):
|
||||
'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',
|
||||
'only_matching': True,
|
||||
}]
|
||||
_SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s'
|
||||
|
||||
@classmethod
|
||||
def _extract_urls(cls, webpage):
|
||||
@@ -192,6 +202,96 @@ class ThePlatformIE(ThePlatformBaseIE):
|
||||
sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
|
||||
return '%s&sig=%s' % (url, sig)
|
||||
|
||||
def _extract_mvpd_auth(self, url, video_id, requestor_id, resource):
    """Run the Adobe Pass (MVPD) flow and return the short media token.

    The authentication token and the per-resource authorization tokens are
    cached under ``('mvpd', requestor_id)`` so that the provider login only
    has to be repeated once the cached authentication token has expired.

    Parameters:
        url:          page URL, sent to the service as ``redirect_url``.
        video_id:     identifier passed to the download helpers.
        requestor_id: requestor identifier; also used as the cache key.
        resource:     string containing a ``<guid>`` element; sent as
                      ``resource_id`` when requesting authorization.

    Returns the body of the ``shortAuthorize`` response, or ``None`` when
    no netrc credentials are available for the provider.
    """
    def xml_text(xml_str, tag):
        # Pull the text content of the first <tag>...</tag> pair.
        return self._search_regex(
            '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag)

    mvpd_headers = {
        'ap_42': 'anonymous',
        'ap_11': 'Linux i686',
        'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0',
    }

    guid = xml_text(resource, 'guid')
    requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {}
    authn_token = requestor_info.get('authn_token')
    if authn_token:
        token_expires = unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', ''))
        # Drop the cached token only once its expiry time has passed.
        # (The comparison used to be inverted, which discarded valid
        # tokens and kept expired ones.)
        if token_expires and token_expires <= time.time():
            authn_token = None
    if not authn_token:
        # TODO add support for other TV Providers
        mso_id = 'DTV'
        try:
            login_info = netrc.netrc().authenticators(mso_id)
        except IOError:
            # No readable netrc file: same outcome as "no credentials".
            login_info = None
        if not login_info:
            return None

        def post_form(form_page, note, data=None):
            # Submit the first <form> of form_page; when no explicit data
            # is given, re-post the form's own hidden inputs.
            post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url')
            return self._download_webpage(
                post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={
                    'Content-Type': 'application/x-www-form-urlencoded',
                })

        provider_redirect_page = self._download_webpage(
            self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id,
            'Downloading Provider Redirect Page', query={
                'noflash': 'true',
                'mso_id': mso_id,
                'requestor_id': requestor_id,
                'no_iframe': 'false',
                'domain_name': 'adobe.com',
                'redirect_url': url,
            })
        provider_login_page = post_form(
            provider_redirect_page, 'Downloading Provider Login Page')
        mvpd_confirm_page = post_form(provider_login_page, 'Logging in', {
            'username': login_info[0],
            'password': login_info[2],
        })
        post_form(mvpd_confirm_page, 'Confirming Login')

        session = self._download_webpage(
            self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id,
            'Retrieving Session', data=urlencode_postdata({
                '_method': 'GET',
                'requestor_id': requestor_id,
            }), headers=mvpd_headers)
        authn_token = unescapeHTML(xml_text(session, 'authnToken'))
        requestor_info['authn_token'] = authn_token
        self._downloader.cache.store('mvpd', requestor_id, requestor_info)

    # Authorization tokens are cached per resource guid.
    authz_token = requestor_info.get(guid)
    if not authz_token:
        authorize = self._download_webpage(
            self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id,
            'Retrieving Authorization Token', data=urlencode_postdata({
                'resource_id': resource,
                'requestor_id': requestor_id,
                'authentication_token': authn_token,
                'mso_id': xml_text(authn_token, 'simpleTokenMsoID'),
                'userMeta': '1',
            }), headers=mvpd_headers)
        authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))
        requestor_info[guid] = authz_token
        self._downloader.cache.store('mvpd', requestor_id, requestor_info)

    mvpd_headers.update({
        'ap_19': xml_text(authn_token, 'simpleSamlNameID'),
        'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'),
    })

    return self._download_webpage(
        self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize',
        video_id, 'Retrieving Media Token', data=urlencode_postdata({
            'authz_token': authz_token,
            'requestor_id': requestor_id,
            'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'),
            'hashed_guid': 'false',
        }), headers=mvpd_headers)
|
||||
|
||||
def _real_extract(self, url):
|
||||
url, smuggled_data = unsmuggle_url(url, {})
|
||||
|
||||
@@ -265,7 +365,7 @@ class ThePlatformIE(ThePlatformBaseIE):
|
||||
formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
|
||||
self._sort_formats(formats)
|
||||
|
||||
ret = self.get_metadata(path, video_id)
|
||||
ret = self._extract_theplatform_metadata(path, video_id)
|
||||
combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
|
||||
ret.update({
|
||||
'id': video_id,
|
||||
@@ -277,9 +377,9 @@ class ThePlatformIE(ThePlatformBaseIE):
|
||||
|
||||
|
||||
class ThePlatformFeedIE(ThePlatformBaseIE):
|
||||
_URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s'
|
||||
_VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)'
|
||||
_TEST = {
|
||||
_URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
|
||||
_VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))'
|
||||
_TESTS = [{
|
||||
# From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
|
||||
'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
|
||||
'md5': '6e32495b5073ab414471b615c5ded394',
|
||||
@@ -295,32 +395,38 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
|
||||
'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
|
||||
'uploader': 'NBCU-NEWS',
|
||||
},
|
||||
}
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
mobj = re.match(self._VALID_URL, url)
|
||||
|
||||
video_id = mobj.group('id')
|
||||
provider_id = mobj.group('provider_id')
|
||||
feed_id = mobj.group('feed_id')
|
||||
|
||||
real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id)
|
||||
feed = self._download_json(real_url, video_id)
|
||||
entry = feed['entries'][0]
|
||||
def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}):
|
||||
real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
|
||||
entry = self._download_json(real_url, video_id)['entries'][0]
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
first_video_id = None
|
||||
duration = None
|
||||
asset_types = []
|
||||
for item in entry['media$content']:
|
||||
smil_url = item['plfile$url'] + '&mbr=true'
|
||||
smil_url = item['plfile$url']
|
||||
cur_video_id = ThePlatformIE._match_id(smil_url)
|
||||
if first_video_id is None:
|
||||
first_video_id = cur_video_id
|
||||
duration = float_or_none(item.get('plfile$duration'))
|
||||
cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)
|
||||
formats.extend(cur_formats)
|
||||
subtitles = self._merge_subtitles(subtitles, cur_subtitles)
|
||||
for asset_type in item['plfile$assetTypes']:
|
||||
if asset_type in asset_types:
|
||||
continue
|
||||
asset_types.append(asset_type)
|
||||
query = {
|
||||
'mbr': 'true',
|
||||
'formats': item['plfile$format'],
|
||||
'assetTypes': asset_type,
|
||||
}
|
||||
if asset_type in asset_types_query:
|
||||
query.update(asset_types_query[asset_type])
|
||||
cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
|
||||
smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
|
||||
formats.extend(cur_formats)
|
||||
subtitles = self._merge_subtitles(subtitles, cur_subtitles)
|
||||
|
||||
self._sort_formats(formats)
|
||||
|
||||
@@ -333,7 +439,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
|
||||
timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
|
||||
categories = [item['media$name'] for item in entry.get('media$categories', [])]
|
||||
|
||||
ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
|
||||
ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)
|
||||
subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
|
||||
ret.update({
|
||||
'id': video_id,
|
||||
@@ -344,5 +450,17 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
|
||||
'timestamp': timestamp,
|
||||
'categories': categories,
|
||||
})
|
||||
if custom_fields:
|
||||
ret.update(custom_fields(entry))
|
||||
|
||||
return ret
|
||||
|
||||
def _real_extract(self, url):
    """Split a feed URL into its components and delegate the extraction.

    The named groups of ``_VALID_URL`` supply the provider id, the feed id,
    the ``byGuid=``/``byId=`` filter expression and the video id, which are
    forwarded unchanged to ``_extract_feed_info``.
    """
    match = re.match(self._VALID_URL, url)
    provider_id, feed_id, filter_query, video_id = match.group(
        'provider_id', 'feed_id', 'filter', 'id')
    return self._extract_feed_info(provider_id, feed_id, filter_query, video_id)
|
||||
|
@@ -4,6 +4,12 @@ from __future__ import unicode_literals
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
determine_ext,
|
||||
clean_html,
|
||||
get_element_by_attribute,
|
||||
ExtractorError,
|
||||
)
|
||||
|
||||
|
||||
class TVPIE(InfoExtractor):
|
||||
@@ -21,7 +27,7 @@ class TVPIE(InfoExtractor):
|
||||
},
|
||||
}, {
|
||||
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
|
||||
'md5': 'c3b15ed1af288131115ff17a17c19dda',
|
||||
'md5': 'b0005b542e5b4de643a9690326ab1257',
|
||||
'info_dict': {
|
||||
'id': '17916176',
|
||||
'ext': 'mp4',
|
||||
@@ -53,6 +59,11 @@ class TVPIE(InfoExtractor):
|
||||
webpage = self._download_webpage(
|
||||
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
|
||||
|
||||
error_massage = get_element_by_attribute('class', 'msg error', webpage)
|
||||
if error_massage:
|
||||
raise ExtractorError('%s said: %s' % (
|
||||
self.IE_NAME, clean_html(error_massage)), expected=True)
|
||||
|
||||
title = self._search_regex(
|
||||
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
|
||||
webpage, 'title', group='title')
|
||||
@@ -66,24 +77,50 @@ class TVPIE(InfoExtractor):
|
||||
r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
|
||||
|
||||
video_url = self._search_regex(
|
||||
r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
|
||||
if not video_url:
|
||||
r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
|
||||
'formats', group='url', default=None)
|
||||
if not video_url or 'material_niedostepny.mp4' in video_url:
|
||||
video_url = self._download_json(
|
||||
'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
|
||||
video_id)['video_url']
|
||||
|
||||
ext = video_url.rsplit('.', 1)[-1]
|
||||
if ext != 'ism/manifest':
|
||||
if '/' in ext:
|
||||
ext = 'mp4'
|
||||
formats = []
|
||||
video_url_base = self._search_regex(
|
||||
r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)',
|
||||
video_url, 'video base url', default=None)
|
||||
if video_url_base:
|
||||
# TODO: Current DASH formats are broken - $Time$ pattern in
|
||||
# <SegmentTemplate> not implemented yet
|
||||
# formats.extend(self._extract_mpd_formats(
|
||||
# video_url_base + '.ism/video.mpd',
|
||||
# video_id, mpd_id='dash', fatal=False))
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
video_url_base + '.ism/video.f4m',
|
||||
video_id, f4m_id='hds', fatal=False))
|
||||
m3u8_formats = self._extract_m3u8_formats(
|
||||
video_url_base + '.ism/video.m3u8', video_id,
|
||||
'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
|
||||
self._sort_formats(m3u8_formats)
|
||||
m3u8_formats = list(filter(
|
||||
lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
|
||||
m3u8_formats))
|
||||
formats.extend(m3u8_formats)
|
||||
for i, m3u8_format in enumerate(m3u8_formats, 2):
|
||||
http_url = '%s-%d.mp4' % (video_url_base, i)
|
||||
if self._is_valid_url(http_url, video_id):
|
||||
f = m3u8_format.copy()
|
||||
f.update({
|
||||
'url': http_url,
|
||||
'format_id': f['format_id'].replace('hls', 'http'),
|
||||
'protocol': 'http',
|
||||
})
|
||||
formats.append(f)
|
||||
else:
|
||||
formats = [{
|
||||
'format_id': 'direct',
|
||||
'url': video_url,
|
||||
'ext': ext,
|
||||
'ext': determine_ext(video_url, 'mp4'),
|
||||
}]
|
||||
else:
|
||||
m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url)
|
||||
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
|
||||
|
||||
self._sort_formats(formats)
|
||||
|
||||
|
@@ -29,7 +29,7 @@ class TwitchBaseIE(InfoExtractor):
|
||||
_VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'
|
||||
|
||||
_API_BASE = 'https://api.twitch.tv'
|
||||
_USHER_BASE = 'http://usher.twitch.tv'
|
||||
_USHER_BASE = 'https://usher.ttvnw.net'
|
||||
_LOGIN_URL = 'http://www.twitch.tv/login'
|
||||
_NETRC_MACHINE = 'twitch'
|
||||
|
||||
|
67
youtube_dl/extractor/urplay.py
Normal file
67
youtube_dl/extractor/urplay.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
|
||||
|
||||
# Extractor for programme pages on urplay.se.
class URPlayIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde',
        'md5': '15ca67b63fd8fb320ac2bcd854bad7b6',
        'info_dict': {
            'id': '190031',
            'ext': 'mp4',
            'title': 'Tripp, Trapp, Träd : Sovkudde',
            'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
        }
    }

    def _real_extract(self, url):
        """Build the info dict from the ``urPlayer.init({...})`` page data.

        The streaming host is obtained from the load balancer; for each of
        the SD and HD variants an RTMP format plus HDS/HLS formats are
        collected when the corresponding player fields are present.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        urplayer_data = self._parse_json(self._search_regex(
            r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id)
        host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']

        formats = []
        for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)):
            file_rtmp = urplayer_data.get('file_rtmp' + quality_attr)
            if file_rtmp:
                formats.append({
                    'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp),
                    'format_id': quality + '-rtmp',
                    'ext': 'flv',
                    'preference': preference,
                })
            file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr)
            if file_http:
                file_http_base_url = 'http://%s/%s' % (host, file_http)
                formats.extend(self._extract_f4m_formats(
                    file_http_base_url + 'manifest.f4m', video_id,
                    preference, '%s-hds' % quality, fatal=False))
                formats.extend(self._extract_m3u8_formats(
                    file_http_base_url + 'playlist.m3u8', video_id, 'mp4',
                    'm3u8_native', preference, '%s-hls' % quality, fatal=False))
        self._sort_formats(formats)

        subtitles = {}
        for subtitle in urplayer_data.get('subtitles', []):
            subtitle_url = subtitle.get('file')
            kind = subtitle.get('kind')
            # Skip entries without a URL and entries whose kind is set to
            # something other than captions. (The check was previously
            # inverted: it skipped every subtitle that had a URL.)
            if not subtitle_url or (kind and kind != 'captions'):
                continue
            subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
                'url': subtitle_url,
            })

        return {
            'id': video_id,
            'title': urplayer_data['title'],
            'description': self._og_search_description(webpage),
            'thumbnail': urplayer_data.get('image'),
            'series': urplayer_data.get('series_title'),
            'subtitles': subtitles,
            'formats': formats,
        }
|
84
youtube_dl/extractor/vidbit.py
Normal file
84
youtube_dl/extractor/vidbit.py
Normal file
@@ -0,0 +1,84 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import compat_urlparse
|
||||
from ..utils import (
|
||||
int_or_none,
|
||||
js_to_json,
|
||||
remove_end,
|
||||
unified_strdate,
|
||||
)
|
||||
|
||||
|
||||
# Extractor for vidbit.co watch and embed pages.
class VidbitIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)'
    _TESTS = [{
        'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2',
        'md5': '1a34b7f14defe3b8fafca9796892924d',
        'info_dict': {
            'id': 'jkL2yDOEq2',
            'ext': 'mp4',
            'title': 'Intro to VidBit',
            'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7',
            'thumbnail': 're:https?://.*\.jpg$',
            'upload_date': '20160618',
            'view_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the media URL and metadata from the video's watch page.

        The player ``.setup({...})`` configuration is preferred as the
        source of the file URL and title; regex and markup based fallbacks
        are used when it does not provide them.
        """
        video_id = self._match_id(url)

        # Embed URLs are normalised to the corresponding watch page.
        watch_url = compat_urlparse.urljoin(url, '/watch?v=%s' % video_id)
        webpage = self._download_webpage(watch_url, video_id)

        setup_js = self._search_regex(
            r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}')
        config = self._parse_json(
            setup_js, video_id, transform_source=js_to_json)

        video_url = title = None
        if config:
            media_file = config.get('file')
            if media_file:
                video_url = compat_urlparse.urljoin(url, media_file)
            title = config.get('title')

        if not video_url:
            # Fall back to a bare `file: "..."` assignment in the page.
            fallback_file = self._search_regex(
                r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
                webpage, 'video URL', group='url')
            video_url = compat_urlparse.urljoin(url, fallback_file)

        if not title:
            page_title = self._html_search_regex(
                (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'),
                webpage, 'title', default=None)
            if not page_title:
                page_title = self._og_search_title(webpage)
            title = remove_end(page_title, ' - VidBit')

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
        }
        info['description'] = self._html_search_meta(
            ('description', 'og:description', 'twitter:description'),
            webpage, 'description')
        info['upload_date'] = unified_strdate(self._html_search_meta(
            'datePublished', webpage, 'upload date'))
        info['view_count'] = int_or_none(self._search_regex(
            r'<strong>(\d+)</strong> views',
            webpage, 'view count', fatal=False))
        info['comment_count'] = int_or_none(self._search_regex(
            r'id=["\']cmt_num["\'][^>]*>\((\d+)\)',
            webpage, 'comment count', fatal=False))
        info['thumbnail'] = self._og_search_thumbnail(webpage)
        return info
|
@@ -8,6 +8,7 @@ import itertools
|
||||
from .common import InfoExtractor
|
||||
from ..compat import (
|
||||
compat_HTTPError,
|
||||
compat_str,
|
||||
compat_urlparse,
|
||||
)
|
||||
from ..utils import (
|
||||
@@ -15,6 +16,7 @@ from ..utils import (
|
||||
ExtractorError,
|
||||
InAdvancePagedList,
|
||||
int_or_none,
|
||||
NO_DEFAULT,
|
||||
RegexNotFoundError,
|
||||
sanitized_Request,
|
||||
smuggle_url,
|
||||
@@ -24,6 +26,7 @@ from ..utils import (
|
||||
urlencode_postdata,
|
||||
unescapeHTML,
|
||||
parse_filesize,
|
||||
try_get,
|
||||
)
|
||||
|
||||
|
||||
@@ -54,6 +57,26 @@ class VimeoBaseInfoExtractor(InfoExtractor):
|
||||
self._set_vimeo_cookie('vuid', vuid)
|
||||
self._download_webpage(login_request, None, False, 'Wrong login info')
|
||||
|
||||
def _verify_video_password(self, url, video_id, webpage):
    """Submit the ``videopassword`` option to the video's password form.

    Raises ExtractorError (expected) when no video password was supplied.
    Returns the page downloaded from ``<url>/password``.
    """
    video_password = self._downloader.params.get('videopassword')
    if video_password is None:
        raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)

    xsrft, vuid = self._extract_xsrft_and_vuid(webpage)
    form = urlencode_postdata({
        'password': video_password,
        'token': xsrft,
    })

    # vimeo only supports https now, but the user can give an http url
    secure_url = url.replace('http://', 'https://') if url.startswith('http://') else url

    request = sanitized_Request(secure_url + '/password', form)
    for header_name, header_value in (
            ('Content-Type', 'application/x-www-form-urlencoded'),
            ('Referer', secure_url)):
        request.add_header(header_name, header_value)

    self._set_vimeo_cookie('vuid', vuid)
    return self._download_webpage(
        request, video_id,
        'Verifying the password', 'Wrong password')
|
||||
|
||||
def _extract_xsrft_and_vuid(self, webpage):
|
||||
xsrft = self._search_regex(
|
||||
r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
|
||||
@@ -144,7 +167,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
|
||||
\.
|
||||
)?
|
||||
vimeo(?P<pro>pro)?\.com/
|
||||
(?!channels/[^/?#]+/?(?:$|[?#])|[^/]+/review/|(?:album|ondemand)/)
|
||||
(?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
|
||||
(?:.*?/)?
|
||||
(?:
|
||||
(?:
|
||||
@@ -225,8 +248,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
|
||||
{
|
||||
'url': 'http://vimeo.com/channels/keypeele/75629013',
|
||||
'md5': '2f86a05afe9d7abc0b9126d229bbe15d',
|
||||
'note': 'Video is freely available via original URL '
|
||||
'and protected with password when accessed via http://vimeo.com/75629013',
|
||||
'info_dict': {
|
||||
'id': '75629013',
|
||||
'ext': 'mp4',
|
||||
@@ -270,7 +291,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
|
||||
{
|
||||
# contains original format
|
||||
'url': 'https://vimeo.com/33951933',
|
||||
'md5': '53c688fa95a55bf4b7293d37a89c5c53',
|
||||
'md5': '2d9f5475e0537f013d0073e812ab89e6',
|
||||
'info_dict': {
|
||||
'id': '33951933',
|
||||
'ext': 'mp4',
|
||||
@@ -282,6 +303,29 @@ class VimeoIE(VimeoBaseInfoExtractor):
|
||||
'description': 'md5:ae23671e82d05415868f7ad1aec21147',
|
||||
},
|
||||
},
|
||||
{
|
||||
# only available via https://vimeo.com/channels/tributes/6213729 and
|
||||
# not via https://vimeo.com/6213729
|
||||
'url': 'https://vimeo.com/channels/tributes/6213729',
|
||||
'info_dict': {
|
||||
'id': '6213729',
|
||||
'ext': 'mp4',
|
||||
'title': 'Vimeo Tribute: The Shining',
|
||||
'uploader': 'Casey Donahue',
|
||||
'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue',
|
||||
'uploader_id': 'caseydonahue',
|
||||
'upload_date': '20090821',
|
||||
'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
|
||||
},
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
},
|
||||
'expected_warnings': ['Unable to download JSON metadata'],
|
||||
},
|
||||
{
|
||||
'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741',
|
||||
'only_matching': True,
|
||||
},
|
||||
{
|
||||
'url': 'https://vimeo.com/109815029',
|
||||
'note': 'Video not completely processed, "failed" seed status',
|
||||
@@ -291,6 +335,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
|
||||
'url': 'https://vimeo.com/groups/travelhd/videos/22439234',
|
||||
'only_matching': True,
|
||||
},
|
||||
{
|
||||
'url': 'https://vimeo.com/album/2632481/video/79010983',
|
||||
'only_matching': True,
|
||||
},
|
||||
{
|
||||
# source file returns 403: Forbidden
|
||||
'url': 'https://vimeo.com/7809605',
|
||||
@@ -317,26 +365,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
|
||||
if mobj:
|
||||
return mobj.group(1)
|
||||
|
||||
def _verify_video_password(self, url, video_id, webpage):
|
||||
password = self._downloader.params.get('videopassword')
|
||||
if password is None:
|
||||
raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
|
||||
token, vuid = self._extract_xsrft_and_vuid(webpage)
|
||||
data = urlencode_postdata({
|
||||
'password': password,
|
||||
'token': token,
|
||||
})
|
||||
if url.startswith('http://'):
|
||||
# vimeo only supports https now, but the user can give an http url
|
||||
url = url.replace('http://', 'https://')
|
||||
password_request = sanitized_Request(url + '/password', data)
|
||||
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
|
||||
password_request.add_header('Referer', url)
|
||||
self._set_vimeo_cookie('vuid', vuid)
|
||||
return self._download_webpage(
|
||||
password_request, video_id,
|
||||
'Verifying the password', 'Wrong password')
|
||||
|
||||
def _verify_player_video_password(self, url, video_id):
|
||||
password = self._downloader.params.get('videopassword')
|
||||
if password is None:
|
||||
@@ -367,7 +395,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
|
||||
orig_url = url
|
||||
if mobj.group('pro') or mobj.group('player'):
|
||||
url = 'https://player.vimeo.com/video/' + video_id
|
||||
else:
|
||||
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
|
||||
url = 'https://vimeo.com/' + video_id
|
||||
|
||||
# Retrieve video webpage to extract further information
|
||||
@@ -445,7 +473,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
|
||||
if config.get('view') == 4:
|
||||
config = self._verify_player_video_password(url, video_id)
|
||||
|
||||
if '>You rented this title.<' in webpage:
|
||||
def is_rented():
    """Return True when the page or the player config marks the title as rented."""
    # Either the page markup or the config's user.purchased flag is enough.
    if '>You rented this title.<' in webpage or config.get('user', {}).get('purchased'):
        return True
    # Otherwise look at the label of the first purchase option.
    label = try_get(
        config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str)
    return bool(label and label.startswith('You rented this'))
|
||||
|
||||
if is_rented():
|
||||
feature_id = config.get('video', {}).get('vod', {}).get('feature_id')
|
||||
if feature_id and not data.get('force_feature_id', False):
|
||||
return self.url_result(smuggle_url(
|
||||
@@ -617,8 +656,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
|
||||
webpage = self._login_list_password(page_url, list_id, webpage)
|
||||
yield self._extract_list_title(webpage)
|
||||
|
||||
for video_id in re.findall(r'id="clip_(\d+?)"', webpage):
|
||||
yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')
|
||||
# Try extracting href first since not all videos are available via
|
||||
# short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729)
|
||||
clips = re.findall(
|
||||
r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage)
|
||||
if clips:
|
||||
for video_id, video_url in clips:
|
||||
yield self.url_result(
|
||||
compat_urlparse.urljoin(base_url, video_url),
|
||||
VimeoIE.ie_key(), video_id=video_id)
|
||||
# More relaxed fallback
|
||||
else:
|
||||
for video_id in re.findall(r'id=["\']clip_(\d+)', webpage):
|
||||
yield self.url_result(
|
||||
'https://vimeo.com/%s' % video_id,
|
||||
VimeoIE.ie_key(), video_id=video_id)
|
||||
|
||||
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
|
||||
break
|
||||
@@ -655,7 +707,7 @@ class VimeoUserIE(VimeoChannelIE):
|
||||
|
||||
class VimeoAlbumIE(VimeoChannelIE):
|
||||
IE_NAME = 'vimeo:album'
|
||||
_VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)'
|
||||
_VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))'
|
||||
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
|
||||
_TESTS = [{
|
||||
'url': 'https://vimeo.com/album/2632481',
|
||||
@@ -675,6 +727,13 @@ class VimeoAlbumIE(VimeoChannelIE):
|
||||
'params': {
|
||||
'videopassword': 'youtube-dl',
|
||||
}
|
||||
}, {
|
||||
'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
# TODO: respect page number
|
||||
'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _page_url(self, base_url, pagenum):
|
||||
@@ -733,12 +792,39 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
|
||||
'thumbnail': 're:^https?://.*\.jpg$',
|
||||
'uploader_id': 'user22258446',
|
||||
}
|
||||
}, {
|
||||
'note': 'Password protected',
|
||||
'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde',
|
||||
'info_dict': {
|
||||
'id': '138823582',
|
||||
'ext': 'mp4',
|
||||
'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1',
|
||||
'uploader': 'TMB',
|
||||
'uploader_id': 'user37284429',
|
||||
},
|
||||
'params': {
|
||||
'videopassword': 'holygrail',
|
||||
},
|
||||
}]
|
||||
|
||||
def _real_initialize(self):
    """Initialisation hook: perform the login step via ``self._login()``."""
    self._login()
|
||||
|
||||
def _get_config_url(self, webpage_url, video_id, video_password_verified=False):
    """Return the ``data-config-url`` attribute value of the review page.

    When the attribute is missing on the first attempt, the video password
    is verified and the lookup is repeated once; on that second attempt a
    missing attribute is no longer tolerated (no default is passed to the
    search).
    """
    webpage = self._download_webpage(webpage_url, video_id)
    # Only the first (unverified) attempt may come back empty-handed.
    fallback = NO_DEFAULT if video_password_verified else None
    config_url = self._html_search_regex(
        r'data-config-url="([^"]+)"', webpage, 'config URL',
        default=fallback)
    if config_url is not None:
        return config_url
    self._verify_video_password(webpage_url, video_id, webpage)
    return self._get_config_url(
        webpage_url, video_id, video_password_verified=True)
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
config = self._download_json(
|
||||
'https://player.vimeo.com/video/%s/config' % video_id, video_id)
|
||||
config_url = self._get_config_url(url, video_id)
|
||||
config = self._download_json(config_url, video_id)
|
||||
info_dict = self._parse_config(config, video_id)
|
||||
self._vimeo_sort_formats(info_dict['formats'])
|
||||
info_dict['id'] = video_id
|
||||
|
@@ -24,6 +24,7 @@ class VineIE(InfoExtractor):
|
||||
'upload_date': '20130519',
|
||||
'uploader': 'Jack Dorsey',
|
||||
'uploader_id': '76',
|
||||
'view_count': int,
|
||||
'like_count': int,
|
||||
'comment_count': int,
|
||||
'repost_count': int,
|
||||
@@ -39,6 +40,7 @@ class VineIE(InfoExtractor):
|
||||
'upload_date': '20140815',
|
||||
'uploader': 'Mars Ruiz',
|
||||
'uploader_id': '1102363502380728320',
|
||||
'view_count': int,
|
||||
'like_count': int,
|
||||
'comment_count': int,
|
||||
'repost_count': int,
|
||||
@@ -54,6 +56,7 @@ class VineIE(InfoExtractor):
|
||||
'upload_date': '20130430',
|
||||
'uploader': 'Z3k3',
|
||||
'uploader_id': '936470460173008896',
|
||||
'view_count': int,
|
||||
'like_count': int,
|
||||
'comment_count': int,
|
||||
'repost_count': int,
|
||||
@@ -71,6 +74,7 @@ class VineIE(InfoExtractor):
|
||||
'upload_date': '20150705',
|
||||
'uploader': 'Pimry_zaa',
|
||||
'uploader_id': '1135760698325307392',
|
||||
'view_count': int,
|
||||
'like_count': int,
|
||||
'comment_count': int,
|
||||
'repost_count': int,
|
||||
@@ -86,10 +90,12 @@ class VineIE(InfoExtractor):
|
||||
|
||||
data = self._parse_json(
|
||||
self._search_regex(
|
||||
r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id,
|
||||
r'window\.POST_DATA\s*=\s*({.+?});\s*</script>',
|
||||
webpage, 'vine data'),
|
||||
video_id)
|
||||
|
||||
data = data[list(data.keys())[0]]
|
||||
|
||||
formats = [{
|
||||
'format_id': '%(format)s-%(rate)s' % f,
|
||||
'vcodec': f.get('format'),
|
||||
@@ -109,6 +115,7 @@ class VineIE(InfoExtractor):
|
||||
'upload_date': unified_strdate(data.get('created')),
|
||||
'uploader': username,
|
||||
'uploader_id': data.get('userIdStr'),
|
||||
'view_count': int_or_none(data.get('loops', {}).get('count')),
|
||||
'like_count': int_or_none(data.get('likes', {}).get('count')),
|
||||
'comment_count': int_or_none(data.get('comments', {}).get('count')),
|
||||
'repost_count': int_or_none(data.get('reposts', {}).get('count')),
|
||||
|
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
import json
|
||||
import sys
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import compat_str
|
||||
@@ -10,7 +11,6 @@ from ..utils import (
|
||||
ExtractorError,
|
||||
int_or_none,
|
||||
orderedSet,
|
||||
sanitized_Request,
|
||||
str_to_int,
|
||||
unescapeHTML,
|
||||
unified_strdate,
|
||||
@@ -27,12 +27,12 @@ class VKIE(InfoExtractor):
|
||||
https?://
|
||||
(?:
|
||||
(?:
|
||||
(?:m\.)?vk\.com/video_|
|
||||
(?:(?:m|new)\.)?vk\.com/video_|
|
||||
(?:www\.)?daxab.com/
|
||||
)
|
||||
ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
|
||||
(?:
|
||||
(?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
|
||||
(?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
|
||||
(?:www\.)?daxab.com/embed/
|
||||
)
|
||||
(?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
|
||||
@@ -182,6 +182,10 @@ class VKIE(InfoExtractor):
|
||||
# pladform embed
|
||||
'url': 'https://vk.com/video-76116461_171554880',
|
||||
'only_matching': True,
|
||||
},
|
||||
{
|
||||
'url': 'http://new.vk.com/video205387401_165548505',
|
||||
'only_matching': True,
|
||||
}
|
||||
]
|
||||
|
||||
@@ -190,7 +194,7 @@ class VKIE(InfoExtractor):
|
||||
if username is None:
|
||||
return
|
||||
|
||||
login_page = self._download_webpage(
|
||||
login_page, url_handle = self._download_webpage_handle(
|
||||
'https://vk.com', None, 'Downloading login page')
|
||||
|
||||
login_form = self._hidden_inputs(login_page)
|
||||
@@ -200,11 +204,26 @@ class VKIE(InfoExtractor):
|
||||
'pass': password.encode('cp1251'),
|
||||
})
|
||||
|
||||
request = sanitized_Request(
|
||||
'https://login.vk.com/?act=login',
|
||||
urlencode_postdata(login_form))
|
||||
# https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header
|
||||
# and expects the first one to be set rather than second (see
|
||||
# https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
|
||||
# As of RFC6265 the newer one cookie should be set into cookie store
|
||||
# what actually happens.
|
||||
# We will workaround this VK issue by resetting the remixlhk cookie to
|
||||
# the first one manually.
|
||||
cookies = url_handle.headers.get('Set-Cookie')
|
||||
if sys.version_info[0] >= 3:
|
||||
cookies = cookies.encode('iso-8859-1')
|
||||
cookies = cookies.decode('utf-8')
|
||||
remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
|
||||
if remixlhk:
|
||||
value, domain = remixlhk.groups()
|
||||
self._set_cookie(domain, 'remixlhk', value)
|
||||
|
||||
login_page = self._download_webpage(
|
||||
request, None, note='Logging in as %s' % username)
|
||||
'https://login.vk.com/?act=login', None,
|
||||
note='Logging in as %s' % username,
|
||||
data=urlencode_postdata(login_form))
|
||||
|
||||
if re.search(r'onLoginFailed', login_page):
|
||||
raise ExtractorError(
|
||||
@@ -339,7 +358,7 @@ class VKIE(InfoExtractor):
|
||||
class VKUserVideosIE(InfoExtractor):
|
||||
IE_NAME = 'vk:uservideos'
|
||||
IE_DESC = "VK - User's Videos"
|
||||
_VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
|
||||
_VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
|
||||
_TEMPLATE_URL = 'https://vk.com/videos'
|
||||
_TESTS = [{
|
||||
'url': 'http://vk.com/videos205387401',
|
||||
@@ -354,6 +373,12 @@ class VKUserVideosIE(InfoExtractor):
|
||||
}, {
|
||||
'url': 'http://vk.com/videos-97664626?section=all',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
'url': 'http://m.vk.com/videos205387401',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
'url': 'http://new.vk.com/videos205387401',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
|
@@ -25,7 +25,8 @@ class VRTIE(InfoExtractor):
|
||||
'timestamp': 1414271750.949,
|
||||
'upload_date': '20141025',
|
||||
'duration': 929,
|
||||
}
|
||||
},
|
||||
'skip': 'HTTP Error 404: Not Found',
|
||||
},
|
||||
# sporza.be
|
||||
{
|
||||
@@ -39,7 +40,8 @@ class VRTIE(InfoExtractor):
|
||||
'timestamp': 1413835980.560,
|
||||
'upload_date': '20141020',
|
||||
'duration': 3238,
|
||||
}
|
||||
},
|
||||
'skip': 'HTTP Error 404: Not Found',
|
||||
},
|
||||
# cobra.be
|
||||
{
|
||||
@@ -53,16 +55,39 @@ class VRTIE(InfoExtractor):
|
||||
'timestamp': 1413967500.494,
|
||||
'upload_date': '20141022',
|
||||
'duration': 661,
|
||||
}
|
||||
},
|
||||
'skip': 'HTTP Error 404: Not Found',
|
||||
},
|
||||
{
|
||||
# YouTube video
|
||||
'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957',
|
||||
'only_matching': True,
|
||||
'md5': 'b8b93da1df1cea6c8556255a796b7d61',
|
||||
'info_dict': {
|
||||
'id': 'Wji-BZ0oCwg',
|
||||
'ext': 'mp4',
|
||||
'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer',
|
||||
'description': 'md5:8e468944dce15567a786a67f74262583',
|
||||
'uploader': 'Star Wars',
|
||||
'uploader_id': 'starwars',
|
||||
'upload_date': '20160407',
|
||||
},
|
||||
'add_ie': ['Youtube'],
|
||||
},
|
||||
{
|
||||
'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055',
|
||||
'only_matching': True,
|
||||
'md5': '',
|
||||
'info_dict': {
|
||||
'id': '2377055',
|
||||
'ext': 'mp4',
|
||||
'title': 'Cafe Derby',
|
||||
'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.',
|
||||
'upload_date': '20150626',
|
||||
'timestamp': 1435305240.769,
|
||||
},
|
||||
'params': {
|
||||
# m3u8 download
|
||||
'skip_download': True,
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
@@ -98,6 +123,32 @@ class VRTIE(InfoExtractor):
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
src, video_id, 'mp4', entry_protocol='m3u8_native',
|
||||
m3u8_id='hls', fatal=False))
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
src.replace('playlist.m3u8', 'manifest.f4m'),
|
||||
video_id, f4m_id='hds', fatal=False))
|
||||
if 'data-video-geoblocking="true"' not in webpage:
|
||||
rtmp_formats = self._extract_smil_formats(
|
||||
src.replace('playlist.m3u8', 'jwplayer.smil'),
|
||||
video_id, fatal=False)
|
||||
formats.extend(rtmp_formats)
|
||||
for rtmp_format in rtmp_formats:
|
||||
rtmp_format_c = rtmp_format.copy()
|
||||
rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
|
||||
del rtmp_format_c['play_path']
|
||||
del rtmp_format_c['ext']
|
||||
http_format = rtmp_format_c.copy()
|
||||
http_format.update({
|
||||
'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''),
|
||||
'format_id': rtmp_format['format_id'].replace('rtmp', 'http'),
|
||||
'protocol': 'http',
|
||||
})
|
||||
rtsp_format = rtmp_format_c.copy()
|
||||
rtsp_format.update({
|
||||
'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
|
||||
'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
|
||||
'protocol': 'rtsp',
|
||||
})
|
||||
formats.extend([http_format, rtsp_format])
|
||||
else:
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
'%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False))
|
||||
|
@@ -15,7 +15,87 @@ from ..utils import (
|
||||
)
|
||||
|
||||
|
||||
class WDRIE(InfoExtractor):
|
||||
class WDRBaseIE(InfoExtractor):
|
||||
def _extract_wdr_video(self, webpage, display_id):
|
||||
# for wdr.de the data-extension is in a tag with the class "mediaLink"
|
||||
# for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
|
||||
# for wdrmaus its in a link to the page in a multiline "videoLink"-tag
|
||||
json_metadata = self._html_search_regex(
|
||||
r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
|
||||
webpage, 'media link', default=None, flags=re.MULTILINE)
|
||||
|
||||
if not json_metadata:
|
||||
return
|
||||
|
||||
media_link_obj = self._parse_json(json_metadata, display_id,
|
||||
transform_source=js_to_json)
|
||||
jsonp_url = media_link_obj['mediaObj']['url']
|
||||
|
||||
metadata = self._download_json(
|
||||
jsonp_url, 'metadata', transform_source=strip_jsonp)
|
||||
|
||||
metadata_tracker_data = metadata['trackerData']
|
||||
metadata_media_resource = metadata['mediaResource']
|
||||
|
||||
formats = []
|
||||
|
||||
# check if the metadata contains a direct URL to a file
|
||||
for kind, media_resource in metadata_media_resource.items():
|
||||
if kind not in ('dflt', 'alt'):
|
||||
continue
|
||||
|
||||
for tag_name, medium_url in media_resource.items():
|
||||
if tag_name not in ('videoURL', 'audioURL'):
|
||||
continue
|
||||
|
||||
ext = determine_ext(medium_url)
|
||||
if ext == 'm3u8':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
medium_url, display_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id='hls'))
|
||||
elif ext == 'f4m':
|
||||
manifest_url = update_url_query(
|
||||
medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
manifest_url, display_id, f4m_id='hds', fatal=False))
|
||||
elif ext == 'smil':
|
||||
formats.extend(self._extract_smil_formats(
|
||||
medium_url, 'stream', fatal=False))
|
||||
else:
|
||||
a_format = {
|
||||
'url': medium_url
|
||||
}
|
||||
if ext == 'unknown_video':
|
||||
urlh = self._request_webpage(
|
||||
medium_url, display_id, note='Determining extension')
|
||||
ext = urlhandle_detect_ext(urlh)
|
||||
a_format['ext'] = ext
|
||||
formats.append(a_format)
|
||||
|
||||
self._sort_formats(formats)
|
||||
|
||||
subtitles = {}
|
||||
caption_url = metadata_media_resource.get('captionURL')
|
||||
if caption_url:
|
||||
subtitles['de'] = [{
|
||||
'url': caption_url,
|
||||
'ext': 'ttml',
|
||||
}]
|
||||
|
||||
title = metadata_tracker_data['trackerClipTitle']
|
||||
|
||||
return {
|
||||
'id': metadata_tracker_data.get('trackerClipId', display_id),
|
||||
'display_id': display_id,
|
||||
'title': title,
|
||||
'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')),
|
||||
}
|
||||
|
||||
|
||||
class WDRIE(WDRBaseIE):
|
||||
_CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
|
||||
_PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html'
|
||||
_VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
|
||||
@@ -91,10 +171,10 @@ class WDRIE(InfoExtractor):
|
||||
},
|
||||
{
|
||||
'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5',
|
||||
# HDS download, MD5 is unstable
|
||||
'md5': '803138901f6368ee497b4d195bb164f2',
|
||||
'info_dict': {
|
||||
'id': 'mdb-186083',
|
||||
'ext': 'flv',
|
||||
'ext': 'mp4',
|
||||
'upload_date': '20130919',
|
||||
'title': 'Sachgeschichte - Achterbahn ',
|
||||
'description': '- Die Sendung mit der Maus -',
|
||||
@@ -120,14 +200,9 @@ class WDRIE(InfoExtractor):
|
||||
display_id = mobj.group('display_id')
|
||||
webpage = self._download_webpage(url, display_id)
|
||||
|
||||
# for wdr.de the data-extension is in a tag with the class "mediaLink"
|
||||
# for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
|
||||
# for wdrmaus its in a link to the page in a multiline "videoLink"-tag
|
||||
json_metadata = self._html_search_regex(
|
||||
r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
|
||||
webpage, 'media link', default=None, flags=re.MULTILINE)
|
||||
info_dict = self._extract_wdr_video(webpage, display_id)
|
||||
|
||||
if not json_metadata:
|
||||
if not info_dict:
|
||||
entries = [
|
||||
self.url_result(page_url + href[0], 'WDR')
|
||||
for href in re.findall(
|
||||
@@ -140,86 +215,22 @@ class WDRIE(InfoExtractor):
|
||||
|
||||
raise ExtractorError('No downloadable streams found', expected=True)
|
||||
|
||||
media_link_obj = self._parse_json(json_metadata, display_id,
|
||||
transform_source=js_to_json)
|
||||
jsonp_url = media_link_obj['mediaObj']['url']
|
||||
|
||||
metadata = self._download_json(
|
||||
jsonp_url, 'metadata', transform_source=strip_jsonp)
|
||||
|
||||
metadata_tracker_data = metadata['trackerData']
|
||||
metadata_media_resource = metadata['mediaResource']
|
||||
|
||||
formats = []
|
||||
|
||||
# check if the metadata contains a direct URL to a file
|
||||
for kind, media_resource in metadata_media_resource.items():
|
||||
if kind not in ('dflt', 'alt'):
|
||||
continue
|
||||
|
||||
for tag_name, medium_url in media_resource.items():
|
||||
if tag_name not in ('videoURL', 'audioURL'):
|
||||
continue
|
||||
|
||||
ext = determine_ext(medium_url)
|
||||
if ext == 'm3u8':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
medium_url, display_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id='hls'))
|
||||
elif ext == 'f4m':
|
||||
manifest_url = update_url_query(
|
||||
medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
manifest_url, display_id, f4m_id='hds', fatal=False))
|
||||
elif ext == 'smil':
|
||||
formats.extend(self._extract_smil_formats(
|
||||
medium_url, 'stream', fatal=False))
|
||||
else:
|
||||
a_format = {
|
||||
'url': medium_url
|
||||
}
|
||||
if ext == 'unknown_video':
|
||||
urlh = self._request_webpage(
|
||||
medium_url, display_id, note='Determining extension')
|
||||
ext = urlhandle_detect_ext(urlh)
|
||||
a_format['ext'] = ext
|
||||
formats.append(a_format)
|
||||
|
||||
self._sort_formats(formats)
|
||||
|
||||
subtitles = {}
|
||||
caption_url = metadata_media_resource.get('captionURL')
|
||||
if caption_url:
|
||||
subtitles['de'] = [{
|
||||
'url': caption_url,
|
||||
'ext': 'ttml',
|
||||
}]
|
||||
|
||||
title = metadata_tracker_data.get('trackerClipTitle')
|
||||
is_live = url_type == 'live'
|
||||
|
||||
if is_live:
|
||||
title = self._live_title(title)
|
||||
upload_date = None
|
||||
elif 'trackerClipAirTime' in metadata_tracker_data:
|
||||
upload_date = metadata_tracker_data['trackerClipAirTime']
|
||||
else:
|
||||
upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')
|
||||
info_dict.update({
|
||||
'title': self._live_title(info_dict['title']),
|
||||
'upload_date': None,
|
||||
})
|
||||
elif 'upload_date' not in info_dict:
|
||||
info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date'))
|
||||
|
||||
if upload_date:
|
||||
upload_date = unified_strdate(upload_date)
|
||||
|
||||
return {
|
||||
'id': metadata_tracker_data.get('trackerClipId', display_id),
|
||||
'display_id': display_id,
|
||||
'title': title,
|
||||
'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
|
||||
'formats': formats,
|
||||
'upload_date': upload_date,
|
||||
info_dict.update({
|
||||
'description': self._html_search_meta('Description', webpage),
|
||||
'is_live': is_live,
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
})
|
||||
|
||||
return info_dict
|
||||
|
||||
|
||||
class WDRMobileIE(InfoExtractor):
|
||||
|
@@ -6,17 +6,23 @@ from ..compat import compat_urllib_parse_unquote
|
||||
|
||||
|
||||
class XNXXIE(InfoExtractor):
|
||||
_VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)'
|
||||
_TEST = {
|
||||
'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
|
||||
'md5': '0831677e2b4761795f68d417e0b7b445',
|
||||
_VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
|
||||
_TESTS = [{
|
||||
'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
|
||||
'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0',
|
||||
'info_dict': {
|
||||
'id': '1135332',
|
||||
'id': '55awb78',
|
||||
'ext': 'flv',
|
||||
'title': 'lida » Naked Funny Actress (5)',
|
||||
'title': 'Skyrim Test Video',
|
||||
'age_limit': 18,
|
||||
}
|
||||
}
|
||||
},
|
||||
}, {
|
||||
'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
|
||||
'only_matching': True,
|
||||
}, {
|
||||
'url': 'http://www.xnxx.com/video-55awb78/',
|
||||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
|
@@ -4,17 +4,23 @@ import itertools
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..compat import compat_urllib_parse_unquote
|
||||
from ..utils import (
|
||||
int_or_none,
|
||||
orderedSet,
|
||||
parse_duration,
|
||||
sanitized_Request,
|
||||
str_to_int,
|
||||
)
|
||||
|
||||
|
||||
class XTubeIE(InfoExtractor):
|
||||
_VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)'
|
||||
_VALID_URL = r'''(?x)
|
||||
(?:
|
||||
xtube:|
|
||||
https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-)
|
||||
)
|
||||
(?P<id>[^/?&#]+)
|
||||
'''
|
||||
|
||||
_TESTS = [{
|
||||
# old URL schema
|
||||
@@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor):
|
||||
'description': 'contains:an ET kind of thing',
|
||||
'uploader': 'greenshowers',
|
||||
'duration': 450,
|
||||
'view_count': int,
|
||||
'comment_count': int,
|
||||
'age_limit': 18,
|
||||
}
|
||||
}, {
|
||||
@@ -51,21 +59,30 @@ class XTubeIE(InfoExtractor):
|
||||
req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1')
|
||||
webpage = self._download_webpage(req, display_id)
|
||||
|
||||
flashvars = self._parse_json(
|
||||
self._search_regex(
|
||||
r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'),
|
||||
video_id)['flashvars']
|
||||
sources = self._parse_json(self._search_regex(
|
||||
r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id)
|
||||
|
||||
title = flashvars.get('title') or self._search_regex(
|
||||
r'<h1>([^<]+)</h1>', webpage, 'title')
|
||||
video_url = compat_urllib_parse_unquote(flashvars['video_url'])
|
||||
duration = int_or_none(flashvars.get('video_duration'))
|
||||
formats = []
|
||||
for format_id, format_url in sources.items():
|
||||
formats.append({
|
||||
'url': format_url,
|
||||
'format_id': format_id,
|
||||
'height': int_or_none(format_id),
|
||||
})
|
||||
self._sort_formats(formats)
|
||||
|
||||
uploader = self._search_regex(
|
||||
r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
|
||||
webpage, 'uploader', fatal=False)
|
||||
title = self._search_regex(
|
||||
(r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
|
||||
webpage, 'title', group='title')
|
||||
description = self._search_regex(
|
||||
r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
|
||||
uploader = self._search_regex(
|
||||
(r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
|
||||
r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
|
||||
webpage, 'uploader', fatal=False)
|
||||
duration = parse_duration(self._search_regex(
|
||||
r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>',
|
||||
webpage, 'duration', fatal=False))
|
||||
view_count = str_to_int(self._search_regex(
|
||||
r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>',
|
||||
webpage, 'view count', fatal=False))
|
||||
@@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor):
|
||||
return {
|
||||
'id': video_id,
|
||||
'display_id': display_id,
|
||||
'url': video_url,
|
||||
'title': title,
|
||||
'description': description,
|
||||
'uploader': uploader,
|
||||
@@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor):
|
||||
'view_count': view_count,
|
||||
'comment_count': comment_count,
|
||||
'age_limit': 18,
|
||||
'formats': formats,
|
||||
}
|
||||
|
||||
|
||||
|
@@ -501,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'youtube_include_dash_manifest': True,
|
||||
'format': '141',
|
||||
},
|
||||
'skip': 'format 141 not served anymore',
|
||||
},
|
||||
# DASH manifest with encrypted signature
|
||||
{
|
||||
@@ -517,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
},
|
||||
'params': {
|
||||
'youtube_include_dash_manifest': True,
|
||||
'format': '141',
|
||||
'format': '141/bestaudio[ext=m4a]',
|
||||
},
|
||||
},
|
||||
# JS player signature function name containing $
|
||||
@@ -537,7 +538,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
},
|
||||
'params': {
|
||||
'youtube_include_dash_manifest': True,
|
||||
'format': '141',
|
||||
'format': '141/bestaudio[ext=m4a]',
|
||||
},
|
||||
},
|
||||
# Controversy video
|
||||
@@ -618,7 +619,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
|
||||
'license': 'Standard YouTube License',
|
||||
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
|
||||
'uploader': 'Olympics',
|
||||
'uploader': 'Olympic',
|
||||
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
|
||||
},
|
||||
'params': {
|
||||
@@ -671,7 +672,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
|
||||
'uploader': 'dorappi2000',
|
||||
'license': 'Standard YouTube License',
|
||||
'formats': 'mincount:33',
|
||||
'formats': 'mincount:32',
|
||||
},
|
||||
},
|
||||
# DASH manifest with segment_list
|
||||
@@ -691,7 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'params': {
|
||||
'youtube_include_dash_manifest': True,
|
||||
'format': '135', # bestvideo
|
||||
}
|
||||
},
|
||||
'skip': 'This live event has ended.',
|
||||
},
|
||||
{
|
||||
# Multifeed videos (multiple cameras), URL is for Main Camera
|
||||
@@ -762,6 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
|
||||
},
|
||||
'playlist_count': 2,
|
||||
'skip': 'Not multifeed anymore',
|
||||
},
|
||||
{
|
||||
'url': 'http://vid.plus/FlRa-iH7PGw',
|
||||
@@ -814,6 +817,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
},
|
||||
'skip': 'This video does not exist.',
|
||||
},
|
||||
{
|
||||
# Video licensed under Creative Commons
|
||||
@@ -1331,7 +1335,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
(?:[a-zA-Z-]+="[^"]*"\s+)*?
|
||||
(?:title|href)="([^"]+)"\s+
|
||||
(?:[a-zA-Z-]+="[^"]*"\s+)*?
|
||||
class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
|
||||
class="[^"]*"[^>]*>
|
||||
[^<]+\.{3}\s*
|
||||
</a>
|
||||
''', r'\1', video_description)
|
||||
|
@@ -232,7 +232,7 @@ class JSInterpreter(object):
|
||||
def extract_function(self, funcname):
|
||||
func_m = re.search(
|
||||
r'''(?x)
|
||||
(?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
|
||||
(?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
|
||||
\((?P<args>[^)]*)\)\s*
|
||||
\{(?P<code>[^}]+)\}''' % (
|
||||
re.escape(funcname), re.escape(funcname), re.escape(funcname)),
|
||||
|
@@ -26,9 +26,7 @@ def parseOpts(overrideArguments=None):
|
||||
except IOError:
|
||||
return default # silently skip if file is not present
|
||||
try:
|
||||
res = []
|
||||
for l in optionf:
|
||||
res += compat_shlex_split(l, comments=True)
|
||||
res = compat_shlex_split(optionf.read(), comments=True)
|
||||
finally:
|
||||
optionf.close()
|
||||
return res
|
||||
|
@@ -76,7 +76,7 @@ class Socks4Error(ProxyError):
|
||||
|
||||
CODES = {
|
||||
91: 'request rejected or failed',
|
||||
92: 'request rejected becasue SOCKS server cannot connect to identd on the client',
|
||||
92: 'request rejected because SOCKS server cannot connect to identd on the client',
|
||||
93: 'request rejected because the client program and identd report different user-ids'
|
||||
}
|
||||
|
||||
|
@@ -110,6 +110,49 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙ
|
||||
itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
|
||||
'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
|
||||
|
||||
DATE_FORMATS = (
|
||||
'%d %B %Y',
|
||||
'%d %b %Y',
|
||||
'%B %d %Y',
|
||||
'%b %d %Y',
|
||||
'%b %dst %Y %I:%M',
|
||||
'%b %dnd %Y %I:%M',
|
||||
'%b %dth %Y %I:%M',
|
||||
'%Y %m %d',
|
||||
'%Y-%m-%d',
|
||||
'%Y/%m/%d',
|
||||
'%Y/%m/%d %H:%M:%S',
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%Y-%m-%d %H:%M:%S.%f',
|
||||
'%d.%m.%Y %H:%M',
|
||||
'%d.%m.%Y %H.%M',
|
||||
'%Y-%m-%dT%H:%M:%SZ',
|
||||
'%Y-%m-%dT%H:%M:%S.%fZ',
|
||||
'%Y-%m-%dT%H:%M:%S.%f0Z',
|
||||
'%Y-%m-%dT%H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%S.%f',
|
||||
'%Y-%m-%dT%H:%M',
|
||||
)
|
||||
|
||||
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
|
||||
DATE_FORMATS_DAY_FIRST.extend([
|
||||
'%d-%m-%Y',
|
||||
'%d.%m.%Y',
|
||||
'%d.%m.%y',
|
||||
'%d/%m/%Y',
|
||||
'%d/%m/%y',
|
||||
'%d/%m/%Y %H:%M:%S',
|
||||
])
|
||||
|
||||
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
|
||||
DATE_FORMATS_MONTH_FIRST.extend([
|
||||
'%m-%d-%Y',
|
||||
'%m.%d.%Y',
|
||||
'%m/%d/%Y',
|
||||
'%m/%d/%y',
|
||||
'%m/%d/%Y %H:%M:%S',
|
||||
])
|
||||
|
||||
|
||||
def preferredencoding():
|
||||
"""Get preferred encoding.
|
||||
@@ -975,6 +1018,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
|
||||
https_response = http_response
|
||||
|
||||
|
||||
def extract_timezone(date_str):
|
||||
m = re.search(
|
||||
r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
|
||||
date_str)
|
||||
if not m:
|
||||
timezone = datetime.timedelta()
|
||||
else:
|
||||
date_str = date_str[:-len(m.group('tz'))]
|
||||
if not m.group('sign'):
|
||||
timezone = datetime.timedelta()
|
||||
else:
|
||||
sign = 1 if m.group('sign') == '+' else -1
|
||||
timezone = datetime.timedelta(
|
||||
hours=sign * int(m.group('hours')),
|
||||
minutes=sign * int(m.group('minutes')))
|
||||
return timezone, date_str
|
||||
|
||||
|
||||
def parse_iso8601(date_str, delimiter='T', timezone=None):
|
||||
""" Return a UNIX timestamp from the given date """
|
||||
|
||||
@@ -984,20 +1045,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
|
||||
date_str = re.sub(r'\.[0-9]+', '', date_str)
|
||||
|
||||
if timezone is None:
|
||||
m = re.search(
|
||||
r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
|
||||
date_str)
|
||||
if not m:
|
||||
timezone = datetime.timedelta()
|
||||
else:
|
||||
date_str = date_str[:-len(m.group(0))]
|
||||
if not m.group('sign'):
|
||||
timezone = datetime.timedelta()
|
||||
else:
|
||||
sign = 1 if m.group('sign') == '+' else -1
|
||||
timezone = datetime.timedelta(
|
||||
hours=sign * int(m.group('hours')),
|
||||
minutes=sign * int(m.group('minutes')))
|
||||
timezone, date_str = extract_timezone(date_str)
|
||||
|
||||
try:
|
||||
date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
|
||||
dt = datetime.datetime.strptime(date_str, date_format) - timezone
|
||||
@@ -1006,6 +1055,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
|
||||
pass
|
||||
|
||||
|
||||
def date_formats(day_first=True):
|
||||
return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
|
||||
|
||||
|
||||
def unified_strdate(date_str, day_first=True):
|
||||
"""Return a string with the date in the format YYYYMMDD"""
|
||||
|
||||
@@ -1014,53 +1067,11 @@ def unified_strdate(date_str, day_first=True):
|
||||
upload_date = None
|
||||
# Replace commas
|
||||
date_str = date_str.replace(',', ' ')
|
||||
# %z (UTC offset) is only supported in python>=3.2
|
||||
if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
|
||||
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
|
||||
# Remove AM/PM + timezone
|
||||
date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
|
||||
_, date_str = extract_timezone(date_str)
|
||||
|
||||
format_expressions = [
|
||||
'%d %B %Y',
|
||||
'%d %b %Y',
|
||||
'%B %d %Y',
|
||||
'%b %d %Y',
|
||||
'%b %dst %Y %I:%M',
|
||||
'%b %dnd %Y %I:%M',
|
||||
'%b %dth %Y %I:%M',
|
||||
'%Y %m %d',
|
||||
'%Y-%m-%d',
|
||||
'%Y/%m/%d',
|
||||
'%Y/%m/%d %H:%M:%S',
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%Y-%m-%d %H:%M:%S.%f',
|
||||
'%d.%m.%Y %H:%M',
|
||||
'%d.%m.%Y %H.%M',
|
||||
'%Y-%m-%dT%H:%M:%SZ',
|
||||
'%Y-%m-%dT%H:%M:%S.%fZ',
|
||||
'%Y-%m-%dT%H:%M:%S.%f0Z',
|
||||
'%Y-%m-%dT%H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%S.%f',
|
||||
'%Y-%m-%dT%H:%M',
|
||||
]
|
||||
if day_first:
|
||||
format_expressions.extend([
|
||||
'%d-%m-%Y',
|
||||
'%d.%m.%Y',
|
||||
'%d.%m.%y',
|
||||
'%d/%m/%Y',
|
||||
'%d/%m/%y',
|
||||
'%d/%m/%Y %H:%M:%S',
|
||||
])
|
||||
else:
|
||||
format_expressions.extend([
|
||||
'%m-%d-%Y',
|
||||
'%m.%d.%Y',
|
||||
'%m/%d/%Y',
|
||||
'%m/%d/%y',
|
||||
'%m/%d/%Y %H:%M:%S',
|
||||
])
|
||||
for expression in format_expressions:
|
||||
for expression in date_formats(day_first):
|
||||
try:
|
||||
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
|
||||
except ValueError:
|
||||
@@ -1076,6 +1087,29 @@ def unified_strdate(date_str, day_first=True):
|
||||
return compat_str(upload_date)
|
||||
|
||||
|
||||
def unified_timestamp(date_str, day_first=True):
|
||||
if date_str is None:
|
||||
return None
|
||||
|
||||
date_str = date_str.replace(',', ' ')
|
||||
|
||||
pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
|
||||
timezone, date_str = extract_timezone(date_str)
|
||||
|
||||
# Remove AM/PM + timezone
|
||||
date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
|
||||
|
||||
for expression in date_formats(day_first):
|
||||
try:
|
||||
dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
|
||||
return calendar.timegm(dt.timetuple())
|
||||
except ValueError:
|
||||
pass
|
||||
timetuple = email.utils.parsedate_tz(date_str)
|
||||
if timetuple:
|
||||
return calendar.timegm(timetuple.timetuple())
|
||||
|
||||
|
||||
def determine_ext(url, default_ext='unknown_video'):
|
||||
if url is None:
|
||||
return default_ext
|
||||
@@ -1591,6 +1625,11 @@ class HEADRequest(compat_urllib_request.Request):
|
||||
return 'HEAD'
|
||||
|
||||
|
||||
class PUTRequest(compat_urllib_request.Request):
|
||||
def get_method(self):
|
||||
return 'PUT'
|
||||
|
||||
|
||||
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
|
||||
if get_attr:
|
||||
if v is not None:
|
||||
@@ -1626,6 +1665,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
|
||||
return default
|
||||
|
||||
|
||||
def strip_or_none(v):
|
||||
return None if v is None else v.strip()
|
||||
|
||||
|
||||
def parse_duration(s):
|
||||
if not isinstance(s, compat_basestring):
|
||||
return None
|
||||
@@ -1882,7 +1925,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
|
||||
req_headers.update(headers)
|
||||
req_data = data or req.data
|
||||
req_url = update_url_query(url or req.get_full_url(), query)
|
||||
req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
|
||||
req_get_method = req.get_method()
|
||||
if req_get_method == 'HEAD':
|
||||
req_type = HEADRequest
|
||||
elif req_get_method == 'PUT':
|
||||
req_type = PUTRequest
|
||||
else:
|
||||
req_type = compat_urllib_request.Request
|
||||
new_req = req_type(
|
||||
req_url, data=req_data, headers=req_headers,
|
||||
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
|
||||
@@ -2852,3 +2901,16 @@ def decode_packed_codes(code):
|
||||
return re.sub(
|
||||
r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
|
||||
obfucasted_code)
|
||||
|
||||
|
||||
def parse_m3u8_attributes(attrib):
|
||||
info = {}
|
||||
for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
|
||||
if val.startswith('"'):
|
||||
val = val[1:-1]
|
||||
info[key] = val
|
||||
return info
|
||||
|
||||
|
||||
def urshift(val, n):
|
||||
return val >> n if val >= 0 else (val + 0x100000000) >> n
|
||||
|
@@ -1,3 +1,3 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
__version__ = '2016.06.18.1'
|
||||
__version__ = '2016.07.03.1'
|
||||
|
Reference in New Issue
Block a user