[condenast] fix extraction and extract subtitles
authorRemita Amine <remitamine@gmail.com>
Mon, 16 Nov 2020 17:57:33 +0000 (18:57 +0100)
committerRemita Amine <remitamine@gmail.com>
Mon, 16 Nov 2020 17:57:56 +0000 (18:57 +0100)
youtube_dl/extractor/condenast.py

index ed278fefc67089f44760ed8e0635cf1c5cbb1fde..d5e77af3216b215d9a11bce0fb788ef4a8c15fd9 100644 (file)
@@ -16,6 +16,8 @@ from ..utils import (
     mimetype2ext,
     orderedSet,
     parse_iso8601,
+    strip_or_none,
+    try_get,
 )
 
 
@@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):
             'uploader': 'gq',
             'upload_date': '20170321',
             'timestamp': 1490126427,
+            'description': 'How much grimmer would things be if these people were competent?',
         },
     }, {
         # JS embed
@@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):
             'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
             'uploader': 'arstechnica',
             'upload_date': '20150916',
-            'timestamp': 1442434955,
+            'timestamp': 1442434920,
         }
     }, {
         'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
@@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor):
             })
         self._sort_formats(formats)
 
+        subtitles = {}
+        for t, caption in video_info.get('captions', {}).items():
+            caption_url = caption.get('src')
+            if not (t in ('vtt', 'srt', 'tml') and caption_url):
+                continue
+            subtitles.setdefault('en', []).append({'url': caption_url})
+
         return {
             'id': video_id,
             'formats': formats,
@@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor):
             'season': video_info.get('season_title'),
             'timestamp': parse_iso8601(video_info.get('premiere_date')),
             'categories': video_info.get('categories'),
+            'subtitles': subtitles,
         }
 
     def _real_extract(self, url):
@@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor):
         if url_type == 'series':
             return self._extract_series(url, webpage)
         else:
-            params = self._extract_video_params(webpage, display_id)
-            info = self._search_json_ld(
-                webpage, display_id, fatal=False)
+            video = try_get(self._parse_json(self._search_regex(
+                r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+                'preload state', '{}'), display_id),
+                lambda x: x['transformed']['video'])
+            if video:
+                params = {'videoId': video['id']}
+                info = {'description': strip_or_none(video.get('description'))}
+            else:
+                params = self._extract_video_params(webpage, display_id)
+                info = self._search_json_ld(
+                    webpage, display_id, fatal=False)
             info.update(self._extract_video(params))
             return info