youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit 0e832c2c97c62f67593ad356ea6d507778c56759
parent 8e4aa7bf18af4403bf98742270483f3b9cfbdeb6
Author: Sergey M․ <dstftw@gmail.com>
Date:   Sun, 13 Mar 2016 15:54:56 +0600

[bbc] Improve title and description extraction (Closes #8826, closes #8822)

Diffstat:
M youtube_dl/extractor/bbc.py | 27 ++++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py @@ -564,6 +564,14 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 18, }, { + # school report playlist with single video + 'url': 'http://www.bbc.co.uk/schoolreport/35744779', + 'info_dict': { + 'id': '35744779', + 'title': 'School which breaks down barriers in Jerusalem', + }, + 'playlist_count': 1, + }, { # single video embedded with data-playable containing vpid 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { @@ -734,8 +742,17 @@ class BBCIE(BBCCoUkIE): json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) timestamp = json_ld_info.get('timestamp') + playlist_title = json_ld_info.get('title') - playlist_description = json_ld_info.get('description') + if not playlist_title: + playlist_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'playlist title', default=None) + if playlist_title: + playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + + playlist_description = json_ld_info.get( + 'description') or self._og_search_description(webpage, default=None) if not timestamp: timestamp = parse_iso8601(self._search_regex( @@ -795,14 +812,6 @@ class BBCIE(BBCCoUkIE): entries.append(self._extract_from_playlist_sxml( playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) - playlist_title = self._og_search_title(webpage, default=None) - playlist_title = playlist_title or self._html_search_regex( - r'<title>(.*?)</title>', webpage, 'playlist title') - - playlist_title = self._search_regex(r'(.+)\s*-\s*BBC', playlist_title, 'title', default=playlist_title) - - playlist_description = self._og_search_description(webpage, default=None) - if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)