[youtube:feed] Check each 'load more' portion for unique video ids

author Sergey M․ <dstftw@gmail.com>

Fri, 15 May 2015 15:42:34 +0000 (21:42 +0600)

committer Sergey M․ <dstftw@gmail.com>

Fri, 15 May 2015 15:42:34 +0000 (21:42 +0600)
author Sergey M․ <dstftw@gmail.com>
Fri, 15 May 2015 15:42:34 +0000 (21:42 +0600)
committer Sergey M․ <dstftw@gmail.com>
Fri, 15 May 2015 15:42:34 +0000 (21:42 +0600)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 9096a29756ca6e1a66ecd442a92977fa1b999b31..1f9940cf5c1e4c8698a8a0bed9a874d5a79f18b0 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1621,10 +1621,16 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
          # for the video ids doesn't contain an index
          ids = []
          more_widget_html = content_html = page
-
          for page_num in itertools.count(1):
              matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
+
+            # The 'recommended' feed has an infinite 'load more', and each new portion
+            # cycles through the same videos in a (sometimes) slightly different order,
+            # so we check for uniqueness and break when a portion has no new videos
+            new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+            if not new_ids:
+                break
+
              ids.extend(new_ids)
  
              mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
author	Sergey M․ <dstftw@gmail.com>
	Fri, 15 May 2015 15:42:34 +0000 (21:42 +0600)
committer	Sergey M․ <dstftw@gmail.com>
	Fri, 15 May 2015 15:42:34 +0000 (21:42 +0600)