[youtube] Extract chapters from JSON (closes #24819)

author Sergey M․ <dstftw@gmail.com>

Fri, 5 Jun 2020 21:16:31 +0000 (04:16 +0700)

committer Sergey M․ <dstftw@gmail.com>

Fri, 5 Jun 2020 21:22:10 +0000 (04:22 +0700)
author Sergey M․ <dstftw@gmail.com>
Fri, 5 Jun 2020 21:16:31 +0000 (04:16 +0700)
committer Sergey M․ <dstftw@gmail.com>
Fri, 5 Jun 2020 21:22:10 +0000 (04:22 +0700)
diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py

index 324ca852578531757d9964f2c90cf6f8e1c4d3b1..e69c57377e617e2864a80e2e736bb72c87c4122e 100644 (file)
--- a/test/test_youtube_chapters.py
+++ b/test/test_youtube_chapters.py
@@ -267,7 +267,7 @@ class TestYoutubeChapters(unittest.TestCase):
          for description, duration, expected_chapters in self._TEST_CASES:
              ie = YoutubeIE()
              expect_value(
-                self, ie._extract_chapters(description, duration),
+                self, ie._extract_chapters_from_description(description, duration),
                  expected_chapters, None)
  
  
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index fec17987b8dca48f6f9fcd921ad6b65d7e59c1a0..54ec76db5657570402023259a99354b0cb8dbfdf 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1652,8 +1652,63 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          video_id = mobj.group(2)
          return video_id
  
+    def _extract_chapters_from_json(self, webpage, video_id, duration):
+        if not webpage:
+            return
+        player = self._parse_json(
+            self._search_regex(
+                r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
+                'player args', default='{}'),
+            video_id, fatal=False)
+        if not player or not isinstance(player, dict):
+            return
+        watch_next_response = player.get('watch_next_response')
+        if not isinstance(watch_next_response, compat_str):
+            return
+        response = self._parse_json(watch_next_response, video_id, fatal=False)
+        if not response or not isinstance(response, dict):
+            return
+        chapters_list = try_get(
+            response,
+            lambda x: x['playerOverlays']
+                       ['playerOverlayRenderer']
+                       ['decoratedPlayerBarRenderer']
+                       ['decoratedPlayerBarRenderer']
+                       ['playerBar']
+                       ['chapteredPlayerBarRenderer']
+                       ['chapters'],
+            list)
+        if not chapters_list:
+            return
+
+        def chapter_time(chapter):
+            return float_or_none(
+                try_get(
+                    chapter,
+                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
+                    int),
+                scale=1000)
+        chapters = []
+        for next_num, chapter in enumerate(chapters_list, start=1):
+            start_time = chapter_time(chapter)
+            if start_time is None:
+                continue
+            end_time = (chapter_time(chapters_list[next_num])
+                        if next_num < len(chapters_list) else duration)
+            if end_time is None:
+                continue
+            title = try_get(
+                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
+                compat_str)
+            chapters.append({
+                'start_time': start_time,
+                'end_time': end_time,
+                'title': title,
+            })
+        return chapters
+
      @staticmethod
-    def _extract_chapters(description, duration):
+    def _extract_chapters_from_description(description, duration):
          if not description:
              return None
          chapter_lines = re.findall(
@@ -1687,6 +1742,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              })
          return chapters
  
+    def _extract_chapters(self, webpage, description, video_id, duration):
+        return (self._extract_chapters_from_json(webpage, video_id, duration)
+                or self._extract_chapters_from_description(description, duration))
+
      def _real_extract(self, url):
          url, smuggled_data = unsmuggle_url(url, {})
  
@@ -2324,7 +2383,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      errnote='Unable to download video annotations', fatal=False,
                      data=urlencode_postdata({xsrf_field_name: xsrf_token}))
  
-        chapters = self._extract_chapters(description_original, video_duration)
+        chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
  
          # Look for the DASH manifest
          if self._downloader.params.get('youtube_include_dash_manifest', True):
author	Sergey M․ <dstftw@gmail.com>
	Fri, 5 Jun 2020 21:16:31 +0000 (04:16 +0700)
committer	Sergey M․ <dstftw@gmail.com>
	Fri, 5 Jun 2020 21:22:10 +0000 (04:22 +0700)
test/test_youtube_chapters.py		patch | blob | history
youtube_dl/extractor/youtube.py		patch | blob | history