drbonanza.py (1981B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    js_to_json,
    parse_duration,
    unescapeHTML,
)


class DRBonanzaIE(InfoExtractor):
    """Information extractor for video pages under dr.dk/bonanza."""

    # Captures the numeric asset id and the trailing slug (display id).
    _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/[^/]+/\d+/[^/]+/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
    _TEST = {
        'url': 'http://www.dr.dk/bonanza/serie/154/matador/40312/matador---0824-komme-fremmede-',
        'info_dict': {
            'id': '40312',
            'display_id': 'matador---0824-komme-fremmede-',
            'ext': 'mp4',
            'title': 'MATADOR - 08:24. "Komme fremmede".',
            'description': 'md5:77b4c1ac4d4c1b9d610ab4395212ff84',
            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
            'duration': 4613,
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the Bonanza page at *url*.

        The formats come from the first HTML5 media entry found in the
        page; the remaining metadata is read from the inline
        ``currentAsset`` JavaScript object and from labelled rows in the
        page markup.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        display_id = url_match.group('display_id')

        webpage = self._download_webpage(url, display_id)

        # Only the first media entry of the page is used.
        media_entries = self._parse_html5_media_entries(
            url, webpage, display_id, m3u8_id='hls',
            m3u8_entry_protocol='m3u8_native')
        info = media_entries[0]
        self._sort_formats(info['formats'])

        # The asset is embedded as a JavaScript object literal, so it is
        # passed through js_to_json before being parsed.
        asset_source = self._search_regex(
            r'(?s)currentAsset\s*=\s*({.+?})\s*</script', webpage, 'asset')
        asset = self._parse_json(
            asset_source, display_id, transform_source=js_to_json)

        title = unescapeHTML(asset['AssetTitle']).strip()

        def labelled_value(label):
            # Returns the text of the cell that follows a "<label>:" cell,
            # or None when no such row is present.
            # NOTE(review): the unclosed "<p>" after the label is part of
            # the pattern as found; presumably it mirrors the site's own
            # markup -- confirm before "correcting" it.
            row_pattern = (
                r'<div[^>]+>\s*<p>%s:<p>\s*</div>\s*<div[^>]+>\s*<p>([^<]+)</p>' % label)
            return self._search_regex(
                row_pattern, webpage, label, default=None)

        # Prefer the id reported by the asset; fall back to the URL's id.
        info['id'] = asset.get('AssetId') or video_id
        info['display_id'] = display_id
        info['title'] = title
        info['description'] = labelled_value('Programinfo')
        info['duration'] = parse_duration(labelled_value('Tid'))
        info['thumbnail'] = asset.get('AssetImageUrl')
        return info