From: Remita Amine Date: Tue, 19 Jan 2021 09:23:02 +0000 (+0100) Subject: [ninegag] improve extraction X-Git-Url: http://git.oshgnacknak.de/?a=commitdiff_plain;h=54856480d7bac670c9d571d4191f5f35aadc5270;p=youtube-dl [ninegag] improve extraction --- diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index 3753bc0a2..440f865bc 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,148 +1,125 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( determine_ext, - url_or_none, + ExtractorError, int_or_none, - float_or_none, - ExtractorError + try_get, + url_or_none, ) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)' - _TESTS = [{ - 'url': 'https://9gag.com/gag/an5Qz5b', - 'info_dict': { - 'id': 'an5Qz5b', - 'ext': 'webm', - 'title': 'Dogs playing tetherball', - 'upload_date': '20191108', - 'timestamp': 1573243994, - 'age_limit': 0, - 'categories': [ - 'Wholesome' - ], - 'tags': [ - 'Dog' - ] - } - }, { + _TEST = { 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', - 'ext': 'webm', + 'ext': 'mp4', 'title': 'Capybara Agility Training', 'upload_date': '20191108', 'timestamp': 1573237208, - 'age_limit': 0, - 'categories': [ - 'Awesome' - ], - 'tags': [ - 'Weimaraner', - 'American Pit Bull Terrier' - ] + 'categories': ['Awesome'], + 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'duration': 44, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, } - }] - - _EXTERNAL_VIDEO_PROVIDERS = { - 'Youtube': 'https://youtube.com/watch?v=%s' } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - rawJsonData = self._search_regex( - r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);', - webpage, - 'data') - rawJsonData = 
rawJsonData.replace('\\"', '"').replace('\\\\/', '/') - data = self._parse_json(rawJsonData, video_id)['data']['post'] - - if data['type'] == 'Video': - vid = data['video']['id'] - ie_key = data['video']['source'].capitalize() - return { - '_type': 'url_transparent', - 'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid, - 'ie_key': ie_key, - 'id': vid, - 'duration': data['video'].get('duration'), - 'start_time': data['video'].get('startTs') - } + post_id = self._match_id(url) + post = self._download_json( + 'https://9gag.com/v1/post', post_id, query={ + 'id': post_id + })['data']['post'] - if data['type'] == 'EmbedVideo': - vid = data['video']['id'] - ie_key = data['video']['source'].capitalize() - return { - '_type': 'url_transparent', - 'url': data['video']['embedUrl'], - #'ie_key': vid, - 'start_time': data['video'].get('startTs') - } - - if data['type'] != 'Animated': + if post.get('type') != 'Animated': raise ExtractorError( 'The given url does not contain a video', expected=True) + title = post['title'] + duration = None formats = [] thumbnails = [] - for key in data['images']: - image = data['images'][key] - if 'duration' in image and duration is None: - duration = int_or_none(image['duration']) - url = url_or_none(image.get('url')) - if url == None: + for key, image in (post.get('images') or {}).items(): + image_url = url_or_none(image.get('url')) + if not image_url: continue - ext = determine_ext(url) - if ext == 'jpg' or ext == 'png': - thumbnail = { - 'url': url, - 'width': float_or_none(image.get('width')), - 'height': float_or_none(image.get('height')) - } - thumbnails.append(thumbnail) - elif ext == 'webm' or ext == 'mp4': - formats.append({ - 'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url), + ext = determine_ext(image_url) + image_id = key.strip('image') + common = { + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } + if ext in ('jpg', 'png'): + webp_url = image.get('webpUrl') + 
if webp_url: + t = common.copy() + t.update({ + 'id': image_id + '-webp', + 'url': webp_url, + }) + thumbnails.append(t) + common.update({ + 'id': image_id, 'ext': ext, - 'url': url, - 'width': float_or_none(image.get('width')), - 'height': float_or_none(image.get('height')) }) - section = None - postSection = data.get('postSection') - if postSection != None and 'name' in postSection: - section = re.sub(r'\\[^\\]{5}', '', postSection['name']) - age_limit = int_or_none(data.get('nsfw')) - if age_limit != None: - age_limit = age_limit * 18 + thumbnails.append(common) + elif ext in ('webm', 'mp4'): + if not duration: + duration = int_or_none(image.get('duration')) + common['acodec'] = 'none' if image.get('hasAudio') == 0 else None + for vcodec in ('vp8', 'vp9', 'h265'): + c_url = image.get(vcodec + 'Url') + if not c_url: + continue + c_f = common.copy() + c_f.update({ + 'format_id': image_id + '-' + vcodec, + 'url': c_url, + 'vcodec': vcodec, + }) + formats.append(c_f) + common.update({ + 'ext': ext, + 'format_id': image_id, + }) + formats.append(common) + self._sort_formats(formats) + + section = try_get(post, lambda x: x['postSection']['name']) + tags = None - if 'tags' in data: + post_tags = post.get('tags') + if post_tags: tags = [] - for tag in data.get('tags') or []: - tags.append(tag.get('key')) + for tag in post_tags: + tag_key = tag.get('key') + if not tag_key: + continue + tags.append(tag_key) + + get_count = lambda x: int_or_none(post.get(x + 'Count')) return { - 'id': video_id, - 'title': data['title'], - 'timestamp': int_or_none(data.get('creationTs')), + 'id': post_id, + 'title': title, + 'timestamp': int_or_none(post.get('creationTs')), 'duration': duration, 'formats': formats, 'thumbnails': thumbnails, - 'like_count': int_or_none(data.get('upVoteCount')), - 'dislike_count': int_or_none(data.get('downVoteCount')), - 'comment_count': int_or_none(data.get('commentsCount')), - 'age_limit': age_limit, - 'categories': [section], + 'like_count': 
get_count('upVote'), + 'dislike_count': get_count('downVote'), + 'comment_count': get_count('comments'), + 'age_limit': 18 if post.get('nsfw') == 1 else None, + 'categories': [section] if section else None, 'tags': tags, - 'is_live': False }