dailymail.py (3138B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..compat import compat_str 8 from ..utils import ( 9 int_or_none, 10 determine_protocol, 11 try_get, 12 unescapeHTML, 13 ) 14 15 16 class DailyMailIE(InfoExtractor): 17 _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' 18 _TESTS = [{ 19 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 20 'md5': 'f6129624562251f628296c3a9ffde124', 21 'info_dict': { 22 'id': '1295863', 23 'ext': 'mp4', 24 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', 25 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', 26 } 27 }, { 28 'url': 'http://www.dailymail.co.uk/embed/video/1295863.html', 29 'only_matching': True, 30 }] 31 32 @staticmethod 33 def _extract_urls(webpage): 34 return re.findall( 35 r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', 36 webpage) 37 38 def _real_extract(self, url): 39 video_id = self._match_id(url) 40 webpage = self._download_webpage(url, video_id) 41 video_data = self._parse_json(self._search_regex( 42 r"data-opts='({.+?})'", webpage, 'video data'), video_id) 43 title = unescapeHTML(video_data['title']) 44 45 sources_url = (try_get( 46 video_data, 47 (lambda x: x['plugins']['sources']['url'], 48 lambda x: x['sources']['url']), compat_str) 49 or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) 50 51 video_sources = self._download_json(sources_url, video_id) 52 body = video_sources.get('body') 53 if body: 54 video_sources = body 55 56 formats = [] 57 for rendition in video_sources['renditions']: 58 rendition_url = rendition.get('url') 59 if not rendition_url: 60 continue 61 tbr = int_or_none(rendition.get('encodingRate'), 1000) 62 container = rendition.get('videoContainer') 63 is_hls = container == 'M2TS' 64 protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url}) 65 formats.append({ 66 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''), 67 'url': rendition_url, 68 'width': int_or_none(rendition.get('frameWidth')), 69 'height': int_or_none(rendition.get('frameHeight')), 70 'tbr': tbr, 71 'vcodec': rendition.get('videoCodec'), 72 'container': container, 73 'protocol': protocol, 74 'ext': 'mp4' if is_hls else None, 75 }) 76 self._sort_formats(formats) 77 78 return { 79 'id': video_id, 80 'title': title, 81 'description': unescapeHTML(video_data.get('descr')), 82 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), 83 'formats': formats, 84 }