hbo.py (6128B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..utils import ( 8 xpath_text, 9 xpath_element, 10 int_or_none, 11 parse_duration, 12 urljoin, 13 ) 14 15 16 class HBOBaseIE(InfoExtractor): 17 _FORMATS_INFO = { 18 'pro7': { 19 'width': 1280, 20 'height': 720, 21 }, 22 '1920': { 23 'width': 1280, 24 'height': 720, 25 }, 26 'pro6': { 27 'width': 768, 28 'height': 432, 29 }, 30 '640': { 31 'width': 768, 32 'height': 432, 33 }, 34 'pro5': { 35 'width': 640, 36 'height': 360, 37 }, 38 'highwifi': { 39 'width': 640, 40 'height': 360, 41 }, 42 'high3g': { 43 'width': 640, 44 'height': 360, 45 }, 46 'medwifi': { 47 'width': 400, 48 'height': 224, 49 }, 50 'med3g': { 51 'width': 400, 52 'height': 224, 53 }, 54 } 55 56 def _extract_info(self, url, display_id): 57 video_data = self._download_xml(url, display_id) 58 video_id = xpath_text(video_data, 'id', fatal=True) 59 episode_title = title = xpath_text(video_data, 'title', fatal=True) 60 series = xpath_text(video_data, 'program') 61 if series: 62 title = '%s - %s' % (series, title) 63 64 formats = [] 65 for source in xpath_element(video_data, 'videos', 'sources', True): 66 if source.tag == 'size': 67 path = xpath_text(source, './/path') 68 if not path: 69 continue 70 width = source.attrib.get('width') 71 format_info = self._FORMATS_INFO.get(width, {}) 72 height = format_info.get('height') 73 fmt = { 74 'url': path, 75 'format_id': 'http%s' % ('-%dp' % height if height else ''), 76 'width': format_info.get('width'), 77 'height': height, 78 } 79 rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', path) 80 if rtmp: 81 fmt.update({ 82 'url': rtmp.group('url'), 83 'play_path': rtmp.group('playpath'), 84 'app': rtmp.group('app'), 85 'ext': 'flv', 86 'format_id': fmt['format_id'].replace('http', 'rtmp'), 87 }) 88 formats.append(fmt) 89 else: 90 video_url = source.text 91 if not video_url: 92 continue 93 if source.tag == 'tarball': 94 formats.extend(self._extract_m3u8_formats( 95 video_url.replace('.tar', '/base_index_w8.m3u8'), 96 video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) 97 elif source.tag == 'hls': 98 m3u8_formats = self._extract_m3u8_formats( 99 video_url.replace('.tar', '/base_index.m3u8'), 100 video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) 101 for f in m3u8_formats: 102 if f.get('vcodec') == 'none' and not f.get('tbr'): 103 f['tbr'] = int_or_none(self._search_regex( 104 r'-(\d+)k/', f['url'], 'tbr', default=None)) 105 formats.extend(m3u8_formats) 106 elif source.tag == 'dash': 107 formats.extend(self._extract_mpd_formats( 108 video_url.replace('.tar', '/manifest.mpd'), 109 video_id, mpd_id='dash', fatal=False)) 110 else: 111 format_info = self._FORMATS_INFO.get(source.tag, {}) 112 formats.append({ 113 'format_id': 'http-%s' % source.tag, 114 'url': video_url, 115 'width': format_info.get('width'), 116 'height': format_info.get('height'), 117 }) 118 self._sort_formats(formats) 119 120 thumbnails = [] 121 card_sizes = xpath_element(video_data, 'titleCardSizes') 122 if card_sizes is not None: 123 for size in card_sizes: 124 path = xpath_text(size, 'path') 125 if not path: 126 continue 127 width = int_or_none(size.get('width')) 128 thumbnails.append({ 129 'id': width, 130 'url': path, 131 'width': width, 132 }) 133 134 subtitles = None 135 caption_url = xpath_text(video_data, 'captionUrl') 136 if caption_url: 137 subtitles = { 138 'en': [{ 139 'url': caption_url, 140 'ext': 'ttml' 141 }], 142 } 143 144 return { 145 'id': video_id, 146 'title': title, 147 'duration': parse_duration(xpath_text(video_data, 'duration/tv14')), 148 'series': series, 149 'episode': episode_title, 150 'formats': formats, 151 'thumbnails': thumbnails, 152 'subtitles': subtitles, 153 } 154 155 156 class HBOIE(HBOBaseIE): 157 IE_NAME = 'hbo' 158 _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?:video|embed)(?:/[^/]+)*/(?P<id>[^/?#]+)' 159 _TEST = { 160 'url': 'https://www.hbo.com/video/game-of-thrones/seasons/season-8/videos/trailer', 161 'md5': '8126210656f433c452a21367f9ad85b3', 162 'info_dict': { 163 'id': '22113301', 164 'ext': 'mp4', 165 'title': 'Game of Thrones - Trailer', 166 }, 167 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], 168 } 169 170 def _real_extract(self, url): 171 display_id = self._match_id(url) 172 webpage = self._download_webpage(url, display_id) 173 location_path = self._parse_json(self._html_search_regex( 174 r'data-state="({.+?})"', webpage, 'state'), display_id)['video']['locationUrl'] 175 return self._extract_info(urljoin(url, location_path), display_id)