youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

br.py (11903B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import json
      5 import re
      6 
      7 from .common import InfoExtractor
      8 from ..utils import (
      9     determine_ext,
     10     ExtractorError,
     11     int_or_none,
     12     parse_duration,
     13     parse_iso8601,
     14     xpath_element,
     15     xpath_text,
     16 )
     17 
     18 
     19 class BRIE(InfoExtractor):
     20     IE_DESC = 'Bayerischer Rundfunk'
     21     _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html'
     22 
     23     _TESTS = [
     24         {
     25             'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
     26             'md5': '83a0477cf0b8451027eb566d88b51106',
     27             'info_dict': {
     28                 'id': '48f656ef-287e-486f-be86-459122db22cc',
     29                 'ext': 'mp4',
     30                 'title': 'Die böse Überraschung',
     31                 'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9',
     32                 'duration': 180,
     33                 'uploader': 'Reinhard Weber',
     34                 'upload_date': '20150422',
     35             },
     36             'skip': '404 not found',
     37         },
     38         {
     39             'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
     40             'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef',
     41             'info_dict': {
     42                 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
     43                 'ext': 'flv',
     44                 'title': 'Manfred Schreiber ist tot',
     45                 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',
     46                 'duration': 26,
     47             },
     48             'skip': '404 not found',
     49         },
     50         {
     51             'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',
     52             'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
     53             'info_dict': {
     54                 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
     55                 'ext': 'aac',
     56                 'title': 'Kurzweilig und sehr bewegend',
     57                 'description': 'md5:0351996e3283d64adeb38ede91fac54e',
     58                 'duration': 296,
     59             },
     60             'skip': '404 not found',
     61         },
     62         {
     63             'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
     64             'md5': 'dbab0aef2e047060ea7a21fc1ce1078a',
     65             'info_dict': {
     66                 'id': '6ba73750-d405-45d3-861d-1ce8c524e059',
     67                 'ext': 'mp4',
     68                 'title': 'Umweltbewusster Häuslebauer',
     69                 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2',
     70                 'duration': 116,
     71             }
     72         },
     73         {
     74             'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html',
     75             'md5': '23bca295f1650d698f94fc570977dae3',
     76             'info_dict': {
     77                 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',
     78                 'ext': 'mp4',
     79                 'title': 'Folge 1 - Metaphysik',
     80                 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
     81                 'duration': 893,
     82                 'uploader': 'Eva Maria Steimle',
     83                 'upload_date': '20170208',
     84             }
     85         },
     86     ]
     87 
     88     def _real_extract(self, url):
     89         base_url, display_id = re.search(self._VALID_URL, url).groups()
     90         page = self._download_webpage(url, display_id)
     91         xml_url = self._search_regex(
     92             r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
     93         xml = self._download_xml(base_url + xml_url, display_id)
     94 
     95         medias = []
     96 
     97         for xml_media in xml.findall('video') + xml.findall('audio'):
     98             media_id = xml_media.get('externalId')
     99             media = {
    100                 'id': media_id,
    101                 'title': xpath_text(xml_media, 'title', 'title', True),
    102                 'duration': parse_duration(xpath_text(xml_media, 'duration')),
    103                 'formats': self._extract_formats(xpath_element(
    104                     xml_media, 'assets'), media_id),
    105                 'thumbnails': self._extract_thumbnails(xpath_element(
    106                     xml_media, 'teaserImage/variants'), base_url),
    107                 'description': xpath_text(xml_media, 'desc'),
    108                 'webpage_url': xpath_text(xml_media, 'permalink'),
    109                 'uploader': xpath_text(xml_media, 'author'),
    110             }
    111             broadcast_date = xpath_text(xml_media, 'broadcastDate')
    112             if broadcast_date:
    113                 media['upload_date'] = ''.join(reversed(broadcast_date.split('.')))
    114             medias.append(media)
    115 
    116         if len(medias) > 1:
    117             self._downloader.report_warning(
    118                 'found multiple medias; please '
    119                 'report this with the video URL to http://yt-dl.org/bug')
    120         if not medias:
    121             raise ExtractorError('No media entries found')
    122         return medias[0]
    123 
    124     def _extract_formats(self, assets, media_id):
    125         formats = []
    126         for asset in assets.findall('asset'):
    127             format_url = xpath_text(asset, ['downloadUrl', 'url'])
    128             asset_type = asset.get('type')
    129             if asset_type.startswith('HDS'):
    130                 formats.extend(self._extract_f4m_formats(
    131                     format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False))
    132             elif asset_type.startswith('HLS'):
    133                 formats.extend(self._extract_m3u8_formats(
    134                     format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False))
    135             else:
    136                 format_info = {
    137                     'ext': xpath_text(asset, 'mediaType'),
    138                     'width': int_or_none(xpath_text(asset, 'frameWidth')),
    139                     'height': int_or_none(xpath_text(asset, 'frameHeight')),
    140                     'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')),
    141                     'abr': int_or_none(xpath_text(asset, 'bitrateAudio')),
    142                     'vcodec': xpath_text(asset, 'codecVideo'),
    143                     'acodec': xpath_text(asset, 'codecAudio'),
    144                     'container': xpath_text(asset, 'mediaType'),
    145                     'filesize': int_or_none(xpath_text(asset, 'size')),
    146                 }
    147                 format_url = self._proto_relative_url(format_url)
    148                 if format_url:
    149                     http_format_info = format_info.copy()
    150                     http_format_info.update({
    151                         'url': format_url,
    152                         'format_id': 'http-%s' % asset_type,
    153                     })
    154                     formats.append(http_format_info)
    155                 server_prefix = xpath_text(asset, 'serverPrefix')
    156                 if server_prefix:
    157                     rtmp_format_info = format_info.copy()
    158                     rtmp_format_info.update({
    159                         'url': server_prefix,
    160                         'play_path': xpath_text(asset, 'fileName'),
    161                         'format_id': 'rtmp-%s' % asset_type,
    162                     })
    163                     formats.append(rtmp_format_info)
    164         self._sort_formats(formats)
    165         return formats
    166 
    167     def _extract_thumbnails(self, variants, base_url):
    168         thumbnails = [{
    169             'url': base_url + xpath_text(variant, 'url'),
    170             'width': int_or_none(xpath_text(variant, 'width')),
    171             'height': int_or_none(xpath_text(variant, 'height')),
    172         } for variant in variants.findall('variant') if xpath_text(variant, 'url')]
    173         thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True)
    174         return thumbnails
    175 
    176 
    177 class BRMediathekIE(InfoExtractor):
    178     IE_DESC = 'Bayerischer Rundfunk Mediathek'
    179     _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})'
    180 
    181     _TESTS = [{
    182         'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e',
    183         'md5': 'fdc3d485835966d1622587d08ba632ec',
    184         'info_dict': {
    185             'id': 'av:5a1e6a6e8fce6d001871cc8e',
    186             'ext': 'mp4',
    187             'title': 'Die Sendung vom 28.11.2017',
    188             'description': 'md5:6000cdca5912ab2277e5b7339f201ccc',
    189             'timestamp': 1511942766,
    190             'upload_date': '20171129',
    191         }
    192     }]
    193 
    194     def _real_extract(self, url):
    195         clip_id = self._match_id(url)
    196 
    197         clip = self._download_json(
    198             'https://proxy-base.master.mango.express/graphql',
    199             clip_id, data=json.dumps({
    200                 "query": """{
    201   viewer {
    202     clip(id: "%s") {
    203       title
    204       description
    205       duration
    206       createdAt
    207       ageRestriction
    208       videoFiles {
    209         edges {
    210           node {
    211             publicLocation
    212             fileSize
    213             videoProfile {
    214               width
    215               height
    216               bitrate
    217               encoding
    218             }
    219           }
    220         }
    221       }
    222       captionFiles {
    223         edges {
    224           node {
    225             publicLocation
    226           }
    227         }
    228       }
    229       teaserImages {
    230         edges {
    231           node {
    232             imageFiles {
    233               edges {
    234                 node {
    235                   publicLocation
    236                   width
    237                   height
    238                 }
    239               }
    240             }
    241           }
    242         }
    243       }
    244     }
    245   }
    246 }""" % clip_id}).encode(), headers={
    247                 'Content-Type': 'application/json',
    248             })['data']['viewer']['clip']
    249         title = clip['title']
    250 
    251         formats = []
    252         for edge in clip.get('videoFiles', {}).get('edges', []):
    253             node = edge.get('node', {})
    254             n_url = node.get('publicLocation')
    255             if not n_url:
    256                 continue
    257             ext = determine_ext(n_url)
    258             if ext == 'm3u8':
    259                 formats.extend(self._extract_m3u8_formats(
    260                     n_url, clip_id, 'mp4', 'm3u8_native',
    261                     m3u8_id='hls', fatal=False))
    262             else:
    263                 video_profile = node.get('videoProfile', {})
    264                 tbr = int_or_none(video_profile.get('bitrate'))
    265                 format_id = 'http'
    266                 if tbr:
    267                     format_id += '-%d' % tbr
    268                 formats.append({
    269                     'format_id': format_id,
    270                     'url': n_url,
    271                     'width': int_or_none(video_profile.get('width')),
    272                     'height': int_or_none(video_profile.get('height')),
    273                     'tbr': tbr,
    274                     'filesize': int_or_none(node.get('fileSize')),
    275                 })
    276         self._sort_formats(formats)
    277 
    278         subtitles = {}
    279         for edge in clip.get('captionFiles', {}).get('edges', []):
    280             node = edge.get('node', {})
    281             n_url = node.get('publicLocation')
    282             if not n_url:
    283                 continue
    284             subtitles.setdefault('de', []).append({
    285                 'url': n_url,
    286             })
    287 
    288         thumbnails = []
    289         for edge in clip.get('teaserImages', {}).get('edges', []):
    290             for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []):
    291                 node = image_edge.get('node', {})
    292                 n_url = node.get('publicLocation')
    293                 if not n_url:
    294                     continue
    295                 thumbnails.append({
    296                     'url': n_url,
    297                     'width': int_or_none(node.get('width')),
    298                     'height': int_or_none(node.get('height')),
    299                 })
    300 
    301         return {
    302             'id': clip_id,
    303             'title': title,
    304             'description': clip.get('description'),
    305             'duration': int_or_none(clip.get('duration')),
    306             'timestamp': parse_iso8601(clip.get('createdAt')),
    307             'age_limit': int_or_none(clip.get('ageRestriction')),
    308             'formats': formats,
    309             'subtitles': subtitles,
    310             'thumbnails': thumbnails,
    311         }