From be96f9924f4b93ab4632b602a7e7b97518a1ddab Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 19 Jan 2020 20:15:02 +0100 Subject: [PATCH] [voicerepublic] fix extraction --- youtube_dl/extractor/voicerepublic.py | 76 +++++++-------------------- 1 file changed, 19 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 59e1359c4..a52e40afa 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,17 +1,12 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, int_or_none, - sanitized_Request, + urljoin, ) @@ -26,8 +21,7 @@ class VoiceRepublicIE(InfoExtractor): 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', 'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.', - 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', - 'duration': 1800, + 'duration': 1556, 'view_count': int, } }, { @@ -38,63 +32,31 @@ class VoiceRepublicIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - req = sanitized_Request( - compat_urlparse.urljoin(url, '/talks/%s' % display_id)) - # Older versions of Firefox get redirected to an "upgrade browser" page - req.add_header('User-Agent', 'youtube-dl') - webpage = self._download_webpage(req, display_id) + webpage = self._download_webpage(url, display_id) if '>Queued for processing, please stand by...<' in webpage: raise ExtractorError( 'Audio is still queued for processing', expected=True) - config = self._search_regex( - r'(?s)return ({.+?});\s*\n', webpage, - 'data', default=None) - data = self._parse_json(config, display_id, fatal=False) if config else None - if data: - title = data['title'] - description = data.get('teaser') - talk_id = compat_str(data.get('talk_id') or display_id) - talk = data['talk'] - duration = int_or_none(talk.get('duration')) - formats = [{ - 'url': compat_urlparse.urljoin(url, talk_url), - 'format_id': format_id, - 'ext': determine_ext(talk_url) or format_id, - 'vcodec': 'none', - } for format_id, talk_url in talk['links'].items()] - else: - title = self._og_search_title(webpage) - description = self._html_search_regex( - r"(?s)
]*>(.+?)
", - webpage, 'description', fatal=False) - talk_id = self._search_regex( - [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], - webpage, 'talk id', default=None) or display_id - duration = None - player = self._search_regex( - r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player') - formats = [{ - 'url': compat_urlparse.urljoin(url, talk_url), - 'format_id': format_id, - 'ext': determine_ext(talk_url) or format_id, - 'vcodec': 'none', - } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)] + talk = self._parse_json(self._search_regex( + r'initialSnapshot\s*=\s*({.+?});', + webpage, 'talk'), display_id)['talk'] + title = talk['title'] + formats = [{ + 'url': urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in talk['media_links'].items()] self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) - view_count = int_or_none(self._search_regex( - r"class='play-count[^']*'>\s*(\d+) plays", - webpage, 'play count', fatal=False)) - return { - 'id': talk_id, + 'id': compat_str(talk.get('id') or display_id), 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, + 'description': talk.get('teaser'), + 'thumbnail': talk.get('image_url'), + 'duration': int_or_none(talk.get('archived_duration')), + 'view_count': int_or_none(talk.get('play_count')), 'formats': formats, } -- 2.22.2