tagesschau.py (12208B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..utils import ( 8 determine_ext, 9 js_to_json, 10 parse_iso8601, 11 parse_filesize, 12 ) 13 14 15 class TagesschauPlayerIE(InfoExtractor): 16 IE_NAME = 'tagesschau:player' 17 _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html' 18 19 _TESTS = [{ 20 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', 21 'md5': '8d09548d5c15debad38bee3a4d15ca21', 22 'info_dict': { 23 'id': '179517', 24 'ext': 'mp4', 25 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', 26 'thumbnail': r're:^https?:.*\.jpg$', 27 'formats': 'mincount:6', 28 }, 29 }, { 30 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', 31 'md5': '76e6eec6ebd40740671cf0a2c88617e5', 32 'info_dict': { 33 'id': '29417', 34 'ext': 'mp3', 35 'title': 'Trabi - Bye, bye Rennpappe', 36 'thumbnail': r're:^https?:.*\.jpg$', 37 'formats': 'mincount:2', 38 }, 39 }, { 40 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', 41 'only_matching': True, 42 }] 43 44 _FORMATS = { 45 'xs': {'quality': 0}, 46 's': {'width': 320, 'height': 180, 'quality': 1}, 47 'm': {'width': 512, 'height': 288, 'quality': 2}, 48 'l': {'width': 960, 'height': 540, 'quality': 3}, 49 'xl': {'width': 1280, 'height': 720, 'quality': 4}, 50 'xxl': {'quality': 5}, 51 } 52 53 def _extract_via_api(self, kind, video_id): 54 info = self._download_json( 55 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), 56 video_id) 57 title = info['headline'] 58 formats = [] 59 for media in info['mediadata']: 60 for format_id, format_url in media.items(): 61 if determine_ext(format_url) == 'm3u8': 62 formats.extend(self._extract_m3u8_formats( 63 format_url, video_id, 'mp4', 64 entry_protocol='m3u8_native', m3u8_id='hls')) 65 else: 66 formats.append({ 67 'url': format_url, 68 'format_id': format_id, 69 'vcodec': 'none' if kind == 'audio' else None, 70 }) 71 self._sort_formats(formats) 72 timestamp = parse_iso8601(info.get('date')) 73 return { 74 'id': video_id, 75 'title': title, 76 'timestamp': timestamp, 77 'formats': formats, 78 } 79 80 def _real_extract(self, url): 81 mobj = re.match(self._VALID_URL, url) 82 video_id = mobj.group('id') 83 84 # kind = mobj.group('kind').lower() 85 # if kind == 'video': 86 # return self._extract_via_api(kind, video_id) 87 88 # JSON api does not provide some audio formats (e.g. ogg) thus 89 # extracting audio via webpage 90 91 webpage = self._download_webpage(url, video_id) 92 93 title = self._og_search_title(webpage).strip() 94 formats = [] 95 96 for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): 97 media = self._parse_json(js_to_json(media_json), video_id, fatal=False) 98 if not media: 99 continue 100 src = media.get('src') 101 if not src: 102 return 103 quality = media.get('quality') 104 kind = media.get('type', '').split('/')[0] 105 ext = determine_ext(src) 106 f = { 107 'url': src, 108 'format_id': '%s_%s' % (quality, ext) if quality else ext, 109 'ext': ext, 110 'vcodec': 'none' if kind == 'audio' else None, 111 } 112 f.update(self._FORMATS.get(quality, {})) 113 formats.append(f) 114 115 self._sort_formats(formats) 116 117 thumbnail = self._og_search_thumbnail(webpage) 118 119 return { 120 'id': video_id, 121 'title': title, 122 'thumbnail': thumbnail, 123 'formats': formats, 124 } 125 126 127 class TagesschauIE(InfoExtractor): 128 _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' 129 130 _TESTS = [{ 131 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', 132 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', 133 'info_dict': { 134 'id': 'video-102143', 135 'ext': 'mp4', 136 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', 137 'description': '18.07.2015 20:10 Uhr', 138 'thumbnail': r're:^https?:.*\.jpg$', 139 }, 140 }, { 141 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 142 'md5': '3c54c1f6243d279b706bde660ceec633', 143 'info_dict': { 144 'id': 'ts-5727', 145 'ext': 'mp4', 146 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 147 'description': 'md5:695c01bfd98b7e313c501386327aea59', 148 'thumbnail': r're:^https?:.*\.jpg$', 149 }, 150 }, { 151 # exclusive audio 152 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', 153 'md5': '76e6eec6ebd40740671cf0a2c88617e5', 154 'info_dict': { 155 'id': 'audio-29417', 156 'ext': 'mp3', 157 'title': 'Trabi - Bye, bye Rennpappe', 158 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', 159 'thumbnail': r're:^https?:.*\.jpg$', 160 }, 161 }, { 162 # audio in article 163 'url': 'http://www.tagesschau.de/inland/bnd-303.html', 164 'md5': 'e0916c623e85fc1d2b26b78f299d3958', 165 'info_dict': { 166 'id': 'bnd-303', 167 'ext': 'mp3', 168 'title': 'Viele Baustellen für neuen BND-Chef', 169 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', 170 'thumbnail': r're:^https?:.*\.jpg$', 171 }, 172 }, { 173 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', 174 'info_dict': { 175 'id': 'afd-parteitag-135', 176 'title': 'Möchtegern-Underdog mit Machtanspruch', 177 }, 178 'playlist_count': 2, 179 }, { 180 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 181 'only_matching': True, 182 }, { 183 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html', 184 'only_matching': True, 185 }, { 186 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html', 187 'only_matching': True, 188 }, { 189 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html', 190 'only_matching': True, 191 }, { 192 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html', 193 'only_matching': True, 194 }, { 195 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html', 196 'only_matching': True, 197 }, { 198 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 199 'only_matching': True, 200 }, { 201 'url': 'http://www.tagesschau.de/100sekunden/index.html', 202 'only_matching': True, 203 }, { 204 # playlist article with collapsing sections 205 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html', 206 'only_matching': True, 207 }] 208 209 @classmethod 210 def suitable(cls, url): 211 return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) 212 213 def _extract_formats(self, download_text, media_kind): 214 links = re.finditer( 215 r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', 216 download_text) 217 formats = [] 218 for l in links: 219 link_url = l.group('url') 220 if not link_url: 221 continue 222 format_id = self._search_regex( 223 r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', 224 default=determine_ext(link_url)) 225 format = { 226 'format_id': format_id, 227 'url': l.group('url'), 228 'format_name': l.group('name'), 229 } 230 title = l.group('title') 231 if title: 232 if media_kind.lower() == 'video': 233 m = re.match( 234 r'''(?x) 235 Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; 236 (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; 237 (?P<vbr>[0-9]+)kbps&\#10; 238 Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; 239 Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', 240 title) 241 if m: 242 format.update({ 243 'format_note': m.group('audio_desc'), 244 'vcodec': m.group('vcodec'), 245 'width': int(m.group('width')), 246 'height': int(m.group('height')), 247 'abr': int(m.group('abr')), 248 'vbr': int(m.group('vbr')), 249 'filesize_approx': parse_filesize(m.group('filesize_approx')), 250 }) 251 else: 252 m = re.match( 253 r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)', 254 title) 255 if m: 256 format.update({ 257 'format_note': '%s, %s' % (m.group('format'), m.group('note')), 258 'vcodec': 'none', 259 'abr': int(m.group('abr')), 260 }) 261 formats.append(format) 262 self._sort_formats(formats) 263 return formats 264 265 def _real_extract(self, url): 266 mobj = re.match(self._VALID_URL, url) 267 video_id = mobj.group('id') or mobj.group('path') 268 display_id = video_id.lstrip('-') 269 270 webpage = self._download_webpage(url, display_id) 271 272 title = self._html_search_regex( 273 r'<span[^>]*class="headline"[^>]*>(.+?)</span>', 274 webpage, 'title', default=None) or self._og_search_title(webpage) 275 276 DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' 277 278 webpage_type = self._og_search_property('type', webpage, default=None) 279 if webpage_type == 'website': # Article 280 entries = [] 281 for num, (entry_title, media_kind, download_text) in enumerate(re.findall( 282 r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, 283 webpage), 1): 284 entries.append({ 285 'id': '%s-%d' % (display_id, num), 286 'title': '%s' % entry_title, 287 'formats': self._extract_formats(download_text, media_kind), 288 }) 289 if len(entries) > 1: 290 return self.playlist_result(entries, display_id, title) 291 formats = entries[0]['formats'] 292 else: # Assume single video 293 download_text = self._search_regex( 294 DOWNLOAD_REGEX, webpage, 'download links', group='links') 295 media_kind = self._search_regex( 296 DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') 297 formats = self._extract_formats(download_text, media_kind) 298 thumbnail = self._og_search_thumbnail(webpage) 299 description = self._html_search_regex( 300 r'(?s)<p class="teasertext">(.*?)</p>', 301 webpage, 'description', default=None) 302 303 self._sort_formats(formats) 304 305 return { 306 'id': display_id, 307 'title': title, 308 'thumbnail': thumbnail, 309 'formats': formats, 310 'description': description, 311 }