youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit fb69240ca0934299583bf6c7a855d5c602a4a7e0
parent 830d53bfae7a665b55656dd50c9f35f0d0b0161d
Author: Sergey M․ <dstftw@gmail.com>
Date:   Sun, 12 Apr 2015 23:19:00 +0600

[youtube] Extract video titles for channel playlist if possible (Closes #4971)

Diffstat:
M youtube_dl/extractor/youtube.py | 29 ++++++++++++++++++++---------
1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
@@ -1370,10 +1370,18 @@ class YoutubeChannelIE(InfoExtractor):
 
     def extract_videos_from_page(self, page):
         ids_in_page = []
-        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
-            if mobj.group(1) not in ids_in_page:
-                ids_in_page.append(mobj.group(1))
-        return ids_in_page
+        titles_in_page = []
+        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
+            video_id = mobj.group('id')
+            video_title = unescapeHTML(mobj.group('title'))
+            try:
+                idx = ids_in_page.index(video_id)
+                if video_title and not titles_in_page[idx]:
+                    titles_in_page[idx] = video_title
+            except ValueError:
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+        return zip(ids_in_page, titles_in_page)
 
     def _real_extract(self, url):
         channel_id = self._match_id(url)
@@ -1390,10 +1398,12 @@ class YoutubeChannelIE(InfoExtractor):
         if autogenerated:
             # The videos are contained in a single page
             # the ajax pages can't be used, they are empty
-            video_ids = self.extract_videos_from_page(channel_page)
+            videos = self.extract_videos_from_page(channel_page)
             entries = [
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in video_ids]
+                self.url_result(
+                    video_id, 'Youtube', video_id=video_id,
+                    video_title=video_title)
+                for video_id, video_title in videos]
             return self.playlist_result(entries, channel_id)
 
         def _entries():
@@ -1401,9 +1411,10 @@ class YoutubeChannelIE(InfoExtractor):
             for pagenum in itertools.count(1):
 
                 ids_in_page = self.extract_videos_from_page(content_html)
-                for video_id in ids_in_page:
+                for video_id, video_title in ids_in_page:
                     yield self.url_result(
-                        video_id, 'Youtube', video_id=video_id)
+                        video_id, 'Youtube', video_id=video_id,
+                        video_title=video_title)
 
                 mobj = re.search(
                     r'data-uix-load-more-href="/?(?P<more>[^"]+)"',