bpb.py (2204B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..utils import ( 8 js_to_json, 9 determine_ext, 10 ) 11 12 13 class BpbIE(InfoExtractor): 14 IE_DESC = 'Bundeszentrale für politische Bildung' 15 _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/' 16 17 _TEST = { 18 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', 19 # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 20 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', 21 'info_dict': { 22 'id': '297', 23 'ext': 'mp4', 24 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', 25 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.' 26 } 27 } 28 29 def _real_extract(self, url): 30 video_id = self._match_id(url) 31 webpage = self._download_webpage(url, video_id) 32 33 title = self._html_search_regex( 34 r'<h2 class="white">(.*?)</h2>', webpage, 'title') 35 video_info_dicts = re.findall( 36 r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) 37 38 formats = [] 39 for video_info in video_info_dicts: 40 video_info = self._parse_json( 41 video_info, video_id, transform_source=js_to_json, fatal=False) 42 if not video_info: 43 continue 44 video_url = video_info.get('src') 45 if not video_url: 46 continue 47 quality = 'high' if '_high' in video_url else 'low' 48 formats.append({ 49 'url': video_url, 50 'preference': 10 if quality == 'high' else 0, 51 'format_note': quality, 52 'format_id': '%s-%s' % (quality, determine_ext(video_url)), 53 }) 54 55 self._sort_formats(formats) 56 57 return { 58 'id': video_id, 59 'formats': formats, 60 'title': title, 61 'description': self._og_search_description(webpage), 62 }