youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

f4m.py (15432B)


      1 from __future__ import division, unicode_literals
      2 
      3 import io
      4 import itertools
      5 import time
      6 
      7 from .fragment import FragmentFD
      8 from ..compat import (
      9     compat_b64decode,
     10     compat_etree_fromstring,
     11     compat_urlparse,
     12     compat_urllib_error,
     13     compat_urllib_parse_urlparse,
     14     compat_struct_pack,
     15     compat_struct_unpack,
     16 )
     17 from ..utils import (
     18     fix_xml_ampersands,
     19     xpath_text,
     20 )
     21 
     22 
     23 class DataTruncatedError(Exception):
     24     pass
     25 
     26 
     27 class FlvReader(io.BytesIO):
     28     """
     29     Reader for Flv files
     30     The file format is documented in https://www.adobe.com/devnet/f4v.html
     31     """
     32 
     33     def read_bytes(self, n):
     34         data = self.read(n)
     35         if len(data) < n:
     36             raise DataTruncatedError(
     37                 'FlvReader error: need %d bytes while only %d bytes got' % (
     38                     n, len(data)))
     39         return data
     40 
     41     # Utility functions for reading numbers and strings
     42     def read_unsigned_long_long(self):
     43         return compat_struct_unpack('!Q', self.read_bytes(8))[0]
     44 
     45     def read_unsigned_int(self):
     46         return compat_struct_unpack('!I', self.read_bytes(4))[0]
     47 
     48     def read_unsigned_char(self):
     49         return compat_struct_unpack('!B', self.read_bytes(1))[0]
     50 
     51     def read_string(self):
     52         res = b''
     53         while True:
     54             char = self.read_bytes(1)
     55             if char == b'\x00':
     56                 break
     57             res += char
     58         return res
     59 
     60     def read_box_info(self):
     61         """
     62         Read a box and return the info as a tuple: (box_size, box_type, box_data)
     63         """
     64         real_size = size = self.read_unsigned_int()
     65         box_type = self.read_bytes(4)
     66         header_end = 8
     67         if size == 1:
     68             real_size = self.read_unsigned_long_long()
     69             header_end = 16
     70         return real_size, box_type, self.read_bytes(real_size - header_end)
     71 
     72     def read_asrt(self):
     73         # version
     74         self.read_unsigned_char()
     75         # flags
     76         self.read_bytes(3)
     77         quality_entry_count = self.read_unsigned_char()
     78         # QualityEntryCount
     79         for i in range(quality_entry_count):
     80             self.read_string()
     81 
     82         segment_run_count = self.read_unsigned_int()
     83         segments = []
     84         for i in range(segment_run_count):
     85             first_segment = self.read_unsigned_int()
     86             fragments_per_segment = self.read_unsigned_int()
     87             segments.append((first_segment, fragments_per_segment))
     88 
     89         return {
     90             'segment_run': segments,
     91         }
     92 
     93     def read_afrt(self):
     94         # version
     95         self.read_unsigned_char()
     96         # flags
     97         self.read_bytes(3)
     98         # time scale
     99         self.read_unsigned_int()
    100 
    101         quality_entry_count = self.read_unsigned_char()
    102         # QualitySegmentUrlModifiers
    103         for i in range(quality_entry_count):
    104             self.read_string()
    105 
    106         fragments_count = self.read_unsigned_int()
    107         fragments = []
    108         for i in range(fragments_count):
    109             first = self.read_unsigned_int()
    110             first_ts = self.read_unsigned_long_long()
    111             duration = self.read_unsigned_int()
    112             if duration == 0:
    113                 discontinuity_indicator = self.read_unsigned_char()
    114             else:
    115                 discontinuity_indicator = None
    116             fragments.append({
    117                 'first': first,
    118                 'ts': first_ts,
    119                 'duration': duration,
    120                 'discontinuity_indicator': discontinuity_indicator,
    121             })
    122 
    123         return {
    124             'fragments': fragments,
    125         }
    126 
    127     def read_abst(self):
    128         # version
    129         self.read_unsigned_char()
    130         # flags
    131         self.read_bytes(3)
    132 
    133         self.read_unsigned_int()  # BootstrapinfoVersion
    134         # Profile,Live,Update,Reserved
    135         flags = self.read_unsigned_char()
    136         live = flags & 0x20 != 0
    137         # time scale
    138         self.read_unsigned_int()
    139         # CurrentMediaTime
    140         self.read_unsigned_long_long()
    141         # SmpteTimeCodeOffset
    142         self.read_unsigned_long_long()
    143 
    144         self.read_string()  # MovieIdentifier
    145         server_count = self.read_unsigned_char()
    146         # ServerEntryTable
    147         for i in range(server_count):
    148             self.read_string()
    149         quality_count = self.read_unsigned_char()
    150         # QualityEntryTable
    151         for i in range(quality_count):
    152             self.read_string()
    153         # DrmData
    154         self.read_string()
    155         # MetaData
    156         self.read_string()
    157 
    158         segments_count = self.read_unsigned_char()
    159         segments = []
    160         for i in range(segments_count):
    161             box_size, box_type, box_data = self.read_box_info()
    162             assert box_type == b'asrt'
    163             segment = FlvReader(box_data).read_asrt()
    164             segments.append(segment)
    165         fragments_run_count = self.read_unsigned_char()
    166         fragments = []
    167         for i in range(fragments_run_count):
    168             box_size, box_type, box_data = self.read_box_info()
    169             assert box_type == b'afrt'
    170             fragments.append(FlvReader(box_data).read_afrt())
    171 
    172         return {
    173             'segments': segments,
    174             'fragments': fragments,
    175             'live': live,
    176         }
    177 
    178     def read_bootstrap_info(self):
    179         total_size, box_type, box_data = self.read_box_info()
    180         assert box_type == b'abst'
    181         return FlvReader(box_data).read_abst()
    182 
    183 
    184 def read_bootstrap_info(bootstrap_bytes):
    185     return FlvReader(bootstrap_bytes).read_bootstrap_info()
    186 
    187 
    188 def build_fragments_list(boot_info):
    189     """ Return a list of (segment, fragment) for each fragment in the video """
    190     res = []
    191     segment_run_table = boot_info['segments'][0]
    192     fragment_run_entry_table = boot_info['fragments'][0]['fragments']
    193     first_frag_number = fragment_run_entry_table[0]['first']
    194     fragments_counter = itertools.count(first_frag_number)
    195     for segment, fragments_count in segment_run_table['segment_run']:
    196         # In some live HDS streams (for example Rai), `fragments_count` is
    197         # abnormal and causing out-of-memory errors. It's OK to change the
    198         # number of fragments for live streams as they are updated periodically
    199         if fragments_count == 4294967295 and boot_info['live']:
    200             fragments_count = 2
    201         for _ in range(fragments_count):
    202             res.append((segment, next(fragments_counter)))
    203 
    204     if boot_info['live']:
    205         res = res[-2:]
    206 
    207     return res
    208 
    209 
    210 def write_unsigned_int(stream, val):
    211     stream.write(compat_struct_pack('!I', val))
    212 
    213 
    214 def write_unsigned_int_24(stream, val):
    215     stream.write(compat_struct_pack('!I', val)[1:])
    216 
    217 
    218 def write_flv_header(stream):
    219     """Writes the FLV header to stream"""
    220     # FLV header
    221     stream.write(b'FLV\x01')
    222     stream.write(b'\x05')
    223     stream.write(b'\x00\x00\x00\x09')
    224     stream.write(b'\x00\x00\x00\x00')
    225 
    226 
    227 def write_metadata_tag(stream, metadata):
    228     """Writes optional metadata tag to stream"""
    229     SCRIPT_TAG = b'\x12'
    230     FLV_TAG_HEADER_LEN = 11
    231 
    232     if metadata:
    233         stream.write(SCRIPT_TAG)
    234         write_unsigned_int_24(stream, len(metadata))
    235         stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
    236         stream.write(metadata)
    237         write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata))
    238 
    239 
    240 def remove_encrypted_media(media):
    241     return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib
    242                                  and 'drmAdditionalHeaderSetId' not in e.attrib,
    243                        media))
    244 
    245 
    246 def _add_ns(prop, ver=1):
    247     return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop)
    248 
    249 
    250 def get_base_url(manifest):
    251     base_url = xpath_text(
    252         manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)],
    253         'base URL', default=None)
    254     if base_url:
    255         base_url = base_url.strip()
    256     return base_url
    257 
    258 
    259 class F4mFD(FragmentFD):
    260     """
    261     A downloader for f4m manifests or AdobeHDS.
    262     """
    263 
    264     FD_NAME = 'f4m'
    265 
    266     def _get_unencrypted_media(self, doc):
    267         media = doc.findall(_add_ns('media'))
    268         if not media:
    269             self.report_error('No media found')
    270         for e in (doc.findall(_add_ns('drmAdditionalHeader'))
    271                   + doc.findall(_add_ns('drmAdditionalHeaderSet'))):
    272             # If id attribute is missing it's valid for all media nodes
    273             # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute
    274             if 'id' not in e.attrib:
    275                 self.report_error('Missing ID in f4m DRM')
    276         media = remove_encrypted_media(media)
    277         if not media:
    278             self.report_error('Unsupported DRM')
    279         return media
    280 
    281     def _get_bootstrap_from_url(self, bootstrap_url):
    282         bootstrap = self.ydl.urlopen(bootstrap_url).read()
    283         return read_bootstrap_info(bootstrap)
    284 
    285     def _update_live_fragments(self, bootstrap_url, latest_fragment):
    286         fragments_list = []
    287         retries = 30
    288         while (not fragments_list) and (retries > 0):
    289             boot_info = self._get_bootstrap_from_url(bootstrap_url)
    290             fragments_list = build_fragments_list(boot_info)
    291             fragments_list = [f for f in fragments_list if f[1] > latest_fragment]
    292             if not fragments_list:
    293                 # Retry after a while
    294                 time.sleep(5.0)
    295                 retries -= 1
    296 
    297         if not fragments_list:
    298             self.report_error('Failed to update fragments')
    299 
    300         return fragments_list
    301 
    302     def _parse_bootstrap_node(self, node, base_url):
    303         # Sometimes non empty inline bootstrap info can be specified along
    304         # with bootstrap url attribute (e.g. dummy inline bootstrap info
    305         # contains whitespace characters in [1]). We will prefer bootstrap
    306         # url over inline bootstrap info when present.
    307         # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m
    308         bootstrap_url = node.get('url')
    309         if bootstrap_url:
    310             bootstrap_url = compat_urlparse.urljoin(
    311                 base_url, bootstrap_url)
    312             boot_info = self._get_bootstrap_from_url(bootstrap_url)
    313         else:
    314             bootstrap_url = None
    315             bootstrap = compat_b64decode(node.text)
    316             boot_info = read_bootstrap_info(bootstrap)
    317         return boot_info, bootstrap_url
    318 
    319     def real_download(self, filename, info_dict):
    320         man_url = info_dict['url']
    321         requested_bitrate = info_dict.get('tbr')
    322         self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)
    323 
    324         urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
    325         man_url = urlh.geturl()
    326         # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
    327         # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244
    328         # and https://github.com/ytdl-org/youtube-dl/issues/7823)
    329         manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip()
    330 
    331         doc = compat_etree_fromstring(manifest)
    332         formats = [(int(f.attrib.get('bitrate', -1)), f)
    333                    for f in self._get_unencrypted_media(doc)]
    334         if requested_bitrate is None or len(formats) == 1:
    335             # get the best format
    336             formats = sorted(formats, key=lambda f: f[0])
    337             rate, media = formats[-1]
    338         else:
    339             rate, media = list(filter(
    340                 lambda f: int(f[0]) == requested_bitrate, formats))[0]
    341 
    342         # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec.
    343         man_base_url = get_base_url(doc) or man_url
    344 
    345         base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url'])
    346         bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
    347         boot_info, bootstrap_url = self._parse_bootstrap_node(
    348             bootstrap_node, man_base_url)
    349         live = boot_info['live']
    350         metadata_node = media.find(_add_ns('metadata'))
    351         if metadata_node is not None:
    352             metadata = compat_b64decode(metadata_node.text)
    353         else:
    354             metadata = None
    355 
    356         fragments_list = build_fragments_list(boot_info)
    357         test = self.params.get('test', False)
    358         if test:
    359             # We only download the first fragment
    360             fragments_list = fragments_list[:1]
    361         total_frags = len(fragments_list)
    362         # For some akamai manifests we'll need to add a query to the fragment url
    363         akamai_pv = xpath_text(doc, _add_ns('pv-2.0'))
    364 
    365         ctx = {
    366             'filename': filename,
    367             'total_frags': total_frags,
    368             'live': live,
    369         }
    370 
    371         self._prepare_frag_download(ctx)
    372 
    373         dest_stream = ctx['dest_stream']
    374 
    375         if ctx['complete_frags_downloaded_bytes'] == 0:
    376             write_flv_header(dest_stream)
    377             if not live:
    378                 write_metadata_tag(dest_stream, metadata)
    379 
    380         base_url_parsed = compat_urllib_parse_urlparse(base_url)
    381 
    382         self._start_frag_download(ctx)
    383 
    384         frag_index = 0
    385         while fragments_list:
    386             seg_i, frag_i = fragments_list.pop(0)
    387             frag_index += 1
    388             if frag_index <= ctx['fragment_index']:
    389                 continue
    390             name = 'Seg%d-Frag%d' % (seg_i, frag_i)
    391             query = []
    392             if base_url_parsed.query:
    393                 query.append(base_url_parsed.query)
    394             if akamai_pv:
    395                 query.append(akamai_pv.strip(';'))
    396             if info_dict.get('extra_param_to_segment_url'):
    397                 query.append(info_dict['extra_param_to_segment_url'])
    398             url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
    399             try:
    400                 success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict)
    401                 if not success:
    402                     return False
    403                 reader = FlvReader(down_data)
    404                 while True:
    405                     try:
    406                         _, box_type, box_data = reader.read_box_info()
    407                     except DataTruncatedError:
    408                         if test:
    409                             # In tests, segments may be truncated, and thus
    410                             # FlvReader may not be able to parse the whole
    411                             # chunk. If so, write the segment as is
    412                             # See https://github.com/ytdl-org/youtube-dl/issues/9214
    413                             dest_stream.write(down_data)
    414                             break
    415                         raise
    416                     if box_type == b'mdat':
    417                         self._append_fragment(ctx, box_data)
    418                         break
    419             except (compat_urllib_error.HTTPError, ) as err:
    420                 if live and (err.code == 404 or err.code == 410):
    421                     # We didn't keep up with the live window. Continue
    422                     # with the next available fragment.
    423                     msg = 'Fragment %d unavailable' % frag_i
    424                     self.report_warning(msg)
    425                     fragments_list = []
    426                 else:
    427                     raise
    428 
    429             if not fragments_list and not test and live and bootstrap_url:
    430                 fragments_list = self._update_live_fragments(bootstrap_url, frag_i)
    431                 total_frags += len(fragments_list)
    432                 if fragments_list and (fragments_list[0][1] > frag_i + 1):
    433                     msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1))
    434                     self.report_warning(msg)
    435 
    436         self._finish_frag_download(ctx)
    437 
    438         return True