[mixcloud] Rewrite extractor (fixes #278)
This commit is contained in:
		| @@ -5,34 +5,27 @@ import socket | ||||
| from .common import InfoExtractor | ||||
| from ..utils import ( | ||||
|     compat_http_client, | ||||
|     compat_str, | ||||
|     compat_urllib_error, | ||||
|     compat_urllib_request, | ||||
|  | ||||
|     ExtractorError, | ||||
|     unified_strdate, | ||||
| ) | ||||
|  | ||||
|  | ||||
| class MixcloudIE(InfoExtractor): | ||||
|     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/ | ||||
|     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' | ||||
|     IE_NAME = u'mixcloud' | ||||
|  | ||||
|     def report_download_json(self, file_id): | ||||
|         """Report JSON download.""" | ||||
|         self.to_screen(u'Downloading json') | ||||
|  | ||||
|     def get_urls(self, jsonData, fmt, bitrate='best'): | ||||
|         """Get urls from 'audio_formats' section in json""" | ||||
|         try: | ||||
|             bitrate_list = jsonData[fmt] | ||||
|             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: | ||||
|                 bitrate = max(bitrate_list) # select highest | ||||
|  | ||||
|             url_list = jsonData[fmt][bitrate] | ||||
|         except TypeError: # we have no bitrate info. | ||||
|             url_list = jsonData[fmt] | ||||
|         return url_list | ||||
|     _TEST = { | ||||
|         u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/', | ||||
|         u'file': u'dholbach-cryptkeeper.mp3', | ||||
|         u'info_dict': { | ||||
|             u'title': u'Cryptkeeper', | ||||
|             u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', | ||||
|             u'uploader': u'Daniel Holbach', | ||||
|             u'uploader_id': u'dholbach', | ||||
|             u'upload_date': u'20111115', | ||||
|         }, | ||||
|     } | ||||
|  | ||||
|     def check_urls(self, url_list): | ||||
|         """Returns 1st active url from list""" | ||||
| @@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor): | ||||
|  | ||||
|         return None | ||||
|  | ||||
|     def _print_formats(self, formats): | ||||
|         print('Available formats:') | ||||
|         for fmt in formats.keys(): | ||||
|             for b in formats[fmt]: | ||||
|                 try: | ||||
|                     ext = formats[fmt][b][0] | ||||
|                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) | ||||
|                 except TypeError: # we have no bitrate info | ||||
|                     ext = formats[fmt][0] | ||||
|                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) | ||||
|                     break | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|         if mobj is None: | ||||
|             raise ExtractorError(u'Invalid URL: %s' % url) | ||||
|         # extract uploader & filename from url | ||||
|         uploader = mobj.group(1).decode('utf-8') | ||||
|         file_id = uploader + "-" + mobj.group(2).decode('utf-8') | ||||
|  | ||||
|         # construct API request | ||||
|         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' | ||||
|         # retrieve .json file with links to files | ||||
|         request = compat_urllib_request.Request(file_url) | ||||
|         try: | ||||
|             self.report_download_json(file_url) | ||||
|             jsonData = compat_urllib_request.urlopen(request).read() | ||||
|         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | ||||
|             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) | ||||
|         uploader = mobj.group(1) | ||||
|         cloudcast_name = mobj.group(2) | ||||
|         track_id = '-'.join((uploader, cloudcast_name)) | ||||
|         api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) | ||||
|         webpage = self._download_webpage(url, track_id) | ||||
|         json_data = self._download_webpage(api_url, track_id, | ||||
|             u'Downloading cloudcast info') | ||||
|         info = json.loads(json_data) | ||||
|  | ||||
|         # parse JSON | ||||
|         json_data = json.loads(jsonData) | ||||
|         player_url = json_data['player_swf_url'] | ||||
|         formats = dict(json_data['audio_formats']) | ||||
|         preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') | ||||
|         song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') | ||||
|         template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) | ||||
|         final_song_url = self.check_urls(template_url % i for i in range(30)) | ||||
|  | ||||
|         req_format = self._downloader.params.get('format', None) | ||||
|  | ||||
|         if self._downloader.params.get('listformats', None): | ||||
|             self._print_formats(formats) | ||||
|             return | ||||
|  | ||||
|         if req_format is None or req_format == 'best': | ||||
|             for format_param in formats.keys(): | ||||
|                 url_list = self.get_urls(formats, format_param) | ||||
|                 # check urls | ||||
|                 file_url = self.check_urls(url_list) | ||||
|                 if file_url is not None: | ||||
|                     break # got it! | ||||
|         else: | ||||
|             if req_format not in formats: | ||||
|                 raise ExtractorError(u'Format is not available') | ||||
|  | ||||
|             url_list = self.get_urls(formats, req_format) | ||||
|             file_url = self.check_urls(url_list) | ||||
|             format_param = req_format | ||||
|  | ||||
|         return [{ | ||||
|             'id': file_id.decode('utf-8'), | ||||
|             'url': file_url.decode('utf-8'), | ||||
|             'uploader': uploader.decode('utf-8'), | ||||
|             'upload_date': None, | ||||
|             'title': json_data['name'], | ||||
|             'ext': file_url.split('.')[-1].decode('utf-8'), | ||||
|             'format': (format_param is None and u'NA' or format_param.decode('utf-8')), | ||||
|             'thumbnail': json_data['thumbnail_url'], | ||||
|             'description': json_data['description'], | ||||
|             'player_url': player_url.decode('utf-8'), | ||||
|         }] | ||||
|         return { | ||||
|             'id': track_id, | ||||
|             'title': info['name'], | ||||
|             'url': final_song_url, | ||||
|             'ext': 'mp3', | ||||
|             'description': info['description'], | ||||
|             'thumbnail': info['pictures'].get('extra_large'), | ||||
|             'uploader': info['user']['name'], | ||||
|             'uploader_id': info['user']['username'], | ||||
|             'upload_date': unified_strdate(info['created_time']), | ||||
|             'view_count': info['play_count'], | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user