zdf.py (14493B)
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    determine_ext,
    float_or_none,
    int_or_none,
    merge_dicts,
    NO_DEFAULT,
    orderedSet,
    parse_codecs,
    qualities,
    try_get,
    unified_timestamp,
    update_url_query,
    url_or_none,
    urljoin,
)


class ZDFBaseIE(InfoExtractor):
    """Shared helpers for the ZDF extractors defined in this module."""

    _GEO_COUNTRIES = ['DE']
    # Quality labels, passed to `qualities()` to rank plain HTTP formats.
    _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd')

    def _call_api(self, url, video_id, item, api_token=None, referrer=None):
        """Download and return the JSON document at `url`.

        `item` is only used in the progress note ("Downloading JSON <item>").
        When given, `api_token` is sent as an `Api-Auth: Bearer ...` header
        and `referrer` as the `Referer` header.
        """
        headers = {}
        if api_token:
            headers['Api-Auth'] = 'Bearer %s' % api_token
        if referrer:
            headers['Referer'] = referrer
        return self._download_json(
            url, video_id, 'Downloading JSON %s' % item, headers=headers)

    @staticmethod
    def _extract_subtitles(src):
        """Build a subtitles dict from the `captions` list of `src`.

        Returns a mapping of language code to a list of `{'url': ...}`
        entries. Entries that are not dicts or that lack a valid `uri` are
        skipped; a missing or empty `language` falls back to 'deu'.
        """
        subtitles = {}
        for caption in try_get(src, lambda x: x['captions'], list) or []:
            # Metadata comes from a remote API; ignore malformed entries
            # instead of failing on `.get`.
            if not isinstance(caption, dict):
                continue
            subtitle_url = url_or_none(caption.get('uri'))
            if not subtitle_url:
                continue
            # `or` (rather than a `.get` default) also covers an explicit
            # null language, which would otherwise become a None dict key.
            lang = caption.get('language') or 'deu'
            subtitles.setdefault(lang, []).append({
                'url': subtitle_url,
            })
        return subtitles

    def _extract_format(self, video_id, formats, format_urls, meta):
        """Append the format(s) described by `meta` to `formats`.

        `format_urls` is a set of URLs already handled; it is consulted and
        updated here so that the same URL is never processed twice. Nothing
        is returned: `formats` and `format_urls` are modified in place.
        """
        format_url = url_or_none(meta.get('url'))
        if not format_url:
            return
        if format_url in format_urls:
            return
        format_urls.add(format_url)
        mime_type = meta.get('mimeType')
        ext = determine_ext(format_url)
        if mime_type == 'application/x-mpegURL' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                format_url, video_id, 'mp4', m3u8_id='hls',
                entry_protocol='m3u8_native', fatal=False))
        elif mime_type == 'application/f4m+xml' or ext == 'f4m':
            formats.extend(self._extract_f4m_formats(
                update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False))
        else:
            # Anything else is treated as a single directly downloadable
            # file.
            f = parse_codecs(meta.get('mimeCodec'))
            # format_id is "http[-<type>][-<quality>]", using only the
            # parts that are non-empty strings.
            format_id = ['http']
            for p in (meta.get('type'), meta.get('quality')):
                if p and isinstance(p, compat_str):
                    format_id.append(p)
            f.update({
                'url': format_url,
                'format_id': '-'.join(format_id),
                'format_note': meta.get('quality'),
                'language': meta.get('language'),
                'quality': qualities(self._QUALITIES)(meta.get('quality')),
                'preference': -10,
            })
            formats.append(f)

    def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer):
        """Fetch the PTMD document at `ptmd_url` and build a partial info dict.

        The result carries `extractor_key`, `id`, `duration`, `formats` and
        `subtitles`; callers add the remaining metadata.
        """
        ptmd = self._call_api(
            ptmd_url, video_id, 'metadata', api_token, referrer)

        # Fall back to the last URL path component when no basename is given.
        content_id = ptmd.get('basename') or ptmd_url.split('/')[-1]

        formats = []
        track_uris = set()
        # Walk priorityList -> formitaeten -> qualities -> audio.tracks,
        # skipping any level that does not have the expected list type.
        for p in ptmd['priorityList']:
            formitaeten = p.get('formitaeten')
            if not isinstance(formitaeten, list):
                continue
            for f in formitaeten:
                f_qualities = f.get('qualities')
                if not isinstance(f_qualities, list):
                    continue
                for quality in f_qualities:
                    tracks = try_get(quality, lambda x: x['audio']['tracks'], list)
                    if not tracks:
                        continue
                    for track in tracks:
                        self._extract_format(
                            content_id, formats, track_uris, {
                                'url': track.get('uri'),
                                'type': f.get('type'),
                                'mimeType': f.get('mimeType'),
                                'quality': quality.get('quality'),
                                'language': track.get('language'),
                            })
        self._sort_formats(formats)

        # The raw value is divided by 1000 (presumably milliseconds to
        # seconds - TODO confirm against the API).
        duration = float_or_none(try_get(
            ptmd, lambda x: x['attributes']['duration']['value']), scale=1000)

        return {
            'extractor_key': ZDFIE.ie_key(),
            'id': content_id,
            'duration': duration,
            'formats': formats,
            'subtitles': self._extract_subtitles(ptmd),
        }

    def _extract_player(self, webpage, video_id, fatal=True):
        """Parse the JSON held in the page's `data-zdfplayer-jsb` attribute.

        With `fatal=False` a missing attribute yields an empty dict (the
        search defaults to '{}'); with `fatal=True` the search has no
        default.
        """
        return self._parse_json(
            self._search_regex(
                r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage,
                'player JSON', default='{}' if not fatal else NO_DEFAULT,
                group='json'),
            video_id)


class ZDFIE(ZDFBaseIE):
    """Extractor for single video pages (`*.html`) on www.zdf.de."""

    _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
    _TESTS = [{
        # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
        'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
        'md5': '34ec321e7eb34231fd88616c65c92db0',
        'info_dict': {
            'id': '210222_phx_nachgehakt_corona_protest',
            'ext': 'mp4',
            'title': 'Wohin führt der Protest in der Pandemie?',
            'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
            'duration': 1691,
            'timestamp': 1613948400,
            'upload_date': '20210221',
        },
    }, {
        # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
        'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
        'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
        'info_dict': {
            'id': '141007_ab18_10wochensommer_film',
            'ext': 'mp4',
            'title': 'Ab 18! - 10 Wochen Sommer',
            'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
            'duration': 2660,
            'timestamp': 1608604200,
            'upload_date': '20201222',
        },
    }, {
        'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
        'info_dict': {
            'id': '151025_magie_farben2_tex',
            'ext': 'mp4',
            'title': 'Die Magie der Farben (2/2)',
            'description': 'md5:a89da10c928c6235401066b60a6d5c1a',
            'duration': 2615,
            'timestamp': 1465021200,
            'upload_date': '20160604',
        },
    }, {
        # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
        'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html',
        'only_matching': True,
    }, {
        # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html
        'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html',
        'only_matching': True,
    }, {
        # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids
        'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html',
        'only_matching': True,
    }, {
        'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html',
        'only_matching': True,
    }, {
        'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
        'only_matching': True,
    }]

    def _extract_entry(self, url, player, content, video_id):
        """Build the info dict for one video from its `content` document.

        `url` is the base against which the PTMD path is resolved and is
        also sent as the referrer; `player` supplies the `apiToken`.
        """
        title = content.get('title') or content['teaserHeadline']

        t = content['mainVideoContent']['http://zdf.de/rels/target']

        ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')

        if not ptmd_path:
            # Only a template is available: fill in the player id.
            ptmd_path = t[
                'http://zdf.de/rels/streams/ptmd-template'].replace(
                '{playerId}', 'ngplayer_2_4')

        info = self._extract_ptmd(
            urljoin(url, ptmd_path), video_id, player['apiToken'], url)

        thumbnails = []
        layouts = try_get(
            content, lambda x: x['teaserImageRef']['layouts'], dict)
        if layouts:
            for layout_key, layout_url in layouts.items():
                layout_url = url_or_none(layout_url)
                if not layout_url:
                    continue
                # NOTE(review): _extract_mobile stores the thumbnail key
                # under 'id' while this path uses 'format_id' - confirm
                # which one is intended before unifying.
                thumbnail = {
                    'url': layout_url,
                    'format_id': layout_key,
                }
                # Layout keys may embed the size as "<width>x<height>".
                mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key)
                if mobj:
                    thumbnail.update({
                        'width': int(mobj.group('width')),
                        'height': int(mobj.group('height')),
                    })
                thumbnails.append(thumbnail)

        return merge_dicts(info, {
            'title': title,
            'description': content.get('leadParagraph') or content.get('teasertext'),
            'duration': int_or_none(t.get('duration')),
            'timestamp': unified_timestamp(content.get('editorialDate')),
            'thumbnails': thumbnails,
        })

    def _extract_regular(self, url, player, video_id):
        """Extract via the content API referenced by the page's player JSON."""
        content = self._call_api(
            player['content'], video_id, 'content', player['apiToken'], url)
        return self._extract_entry(player['content'], player, content, video_id)

    def _extract_mobile(self, video_id):
        """Extract via the mobile `mediathekV2` document API."""
        video = self._download_json(
            'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id,
            video_id)

        document = video['document']

        title = document['titel']
        content_id = document['basename']

        formats = []
        format_urls = set()
        for f in document['formitaeten']:
            self._extract_format(content_id, formats, format_urls, f)
        self._sort_formats(formats)

        thumbnails = []
        teaser_bild = document.get('teaserBild')
        if isinstance(teaser_bild, dict):
            for thumbnail_key, thumbnail in teaser_bild.items():
                thumbnail_url = try_get(
                    thumbnail, lambda x: x['url'], compat_str)
                if thumbnail_url:
                    thumbnails.append({
                        'url': thumbnail_url,
                        'id': thumbnail_key,
                        'width': int_or_none(thumbnail.get('width')),
                        'height': int_or_none(thumbnail.get('height')),
                    })

        return {
            'id': content_id,
            'title': title,
            'description': document.get('beschreibung'),
            'duration': int_or_none(document.get('length')),
            # Prefer the document's own date, then the editorial date from
            # the response's meta section.
            'timestamp': unified_timestamp(document.get('date')) or unified_timestamp(
                try_get(video, lambda x: x['meta']['editorialDate'], compat_str)),
            'thumbnails': thumbnails,
            'subtitles': self._extract_subtitles(document),
            'formats': formats,
        }

    def _real_extract(self, url):
        """Extract from the web page when possible, else from the mobile API."""
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id, fatal=False)
        if webpage:
            # Pass the video id (not the page URL) so that JSON parse errors
            # are reported against the id like every other step.
            player = self._extract_player(webpage, video_id, fatal=False)
            if player:
                return self._extract_regular(url, player, video_id)

        # No page or no player JSON: fall back to the mobile endpoint.
        return self._extract_mobile(video_id)


class ZDFChannelIE(ZDFBaseIE):
    """Extractor for www.zdf.de pages that list several videos."""

    _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio',
        'info_dict': {
            'id': 'das-aktuelle-sportstudio',
            'title': 'das aktuelle sportstudio | ZDF',
        },
        'playlist_mincount': 23,
    }, {
        'url': 'https://www.zdf.de/dokumentation/planet-e',
        'info_dict': {
            'id': 'planet-e',
            'title': 'planet e.',
        },
        'playlist_mincount': 50,
    }, {
        'url': 'https://www.zdf.de/filme/taunuskrimi/',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that ZDFIE accepts; otherwise apply the normal check."""
        # _VALID_URL here also matches "*.html" video pages, so ZDFIE must
        # take precedence for those.
        return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist of the video pages linked from the channel page."""
        channel_id = self._match_id(url)

        webpage = self._download_webpage(url, channel_id)

        # Collect the distinct "data-plusbar-url" links in page order and
        # hand each one over to ZDFIE.
        entries = [
            self.url_result(item_url, ie=ZDFIE.ie_key())
            for item_url in orderedSet(re.findall(
                r'data-plusbar-url=["\'](http.+?\.html)', webpage))]

        return self.playlist_result(
            entries, channel_id, self._og_search_title(webpage, fatal=False))

        # NOTE: everything below is unreachable (it follows the return
        # above). It is a disabled API-based implementation kept as a bare
        # string literal.
        r"""
        player = self._extract_player(webpage, channel_id)

        channel_id = self._search_regex(
            r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage,
            'channel id', group='id')

        channel = self._call_api(
            'https://api.zdf.de/content/documents/%s.json' % channel_id,
            player, url, channel_id)

        items = []
        for module in channel['module']:
            for teaser in try_get(module, lambda x: x['teaser'], list) or []:
                t = try_get(
                    teaser, lambda x: x['http://zdf.de/rels/target'], dict)
                if not t:
                    continue
                items.extend(try_get(
                    t,
                    lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'],
                    list) or [])
            items.extend(try_get(
                module,
                lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'],
                list) or [])

        entries = []
        entry_urls = set()
        for item in items:
            t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict)
            if not t:
                continue
            sharing_url = t.get('http://zdf.de/rels/sharing-url')
            if not sharing_url or not isinstance(sharing_url, compat_str):
                continue
            if sharing_url in entry_urls:
                continue
            entry_urls.add(sharing_url)
            entries.append(self.url_result(
                sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id')))

        return self.playlist_result(entries, channel_id, channel.get('title'))
        """