f4m.py (15432B)
1 from __future__ import division, unicode_literals 2 3 import io 4 import itertools 5 import time 6 7 from .fragment import FragmentFD 8 from ..compat import ( 9 compat_b64decode, 10 compat_etree_fromstring, 11 compat_urlparse, 12 compat_urllib_error, 13 compat_urllib_parse_urlparse, 14 compat_struct_pack, 15 compat_struct_unpack, 16 ) 17 from ..utils import ( 18 fix_xml_ampersands, 19 xpath_text, 20 ) 21 22 23 class DataTruncatedError(Exception): 24 pass 25 26 27 class FlvReader(io.BytesIO): 28 """ 29 Reader for Flv files 30 The file format is documented in https://www.adobe.com/devnet/f4v.html 31 """ 32 33 def read_bytes(self, n): 34 data = self.read(n) 35 if len(data) < n: 36 raise DataTruncatedError( 37 'FlvReader error: need %d bytes while only %d bytes got' % ( 38 n, len(data))) 39 return data 40 41 # Utility functions for reading numbers and strings 42 def read_unsigned_long_long(self): 43 return compat_struct_unpack('!Q', self.read_bytes(8))[0] 44 45 def read_unsigned_int(self): 46 return compat_struct_unpack('!I', self.read_bytes(4))[0] 47 48 def read_unsigned_char(self): 49 return compat_struct_unpack('!B', self.read_bytes(1))[0] 50 51 def read_string(self): 52 res = b'' 53 while True: 54 char = self.read_bytes(1) 55 if char == b'\x00': 56 break 57 res += char 58 return res 59 60 def read_box_info(self): 61 """ 62 Read a box and return the info as a tuple: (box_size, box_type, box_data) 63 """ 64 real_size = size = self.read_unsigned_int() 65 box_type = self.read_bytes(4) 66 header_end = 8 67 if size == 1: 68 real_size = self.read_unsigned_long_long() 69 header_end = 16 70 return real_size, box_type, self.read_bytes(real_size - header_end) 71 72 def read_asrt(self): 73 # version 74 self.read_unsigned_char() 75 # flags 76 self.read_bytes(3) 77 quality_entry_count = self.read_unsigned_char() 78 # QualityEntryCount 79 for i in range(quality_entry_count): 80 self.read_string() 81 82 segment_run_count = self.read_unsigned_int() 83 segments = [] 84 for i in range(segment_run_count): 85 first_segment = self.read_unsigned_int() 86 fragments_per_segment = self.read_unsigned_int() 87 segments.append((first_segment, fragments_per_segment)) 88 89 return { 90 'segment_run': segments, 91 } 92 93 def read_afrt(self): 94 # version 95 self.read_unsigned_char() 96 # flags 97 self.read_bytes(3) 98 # time scale 99 self.read_unsigned_int() 100 101 quality_entry_count = self.read_unsigned_char() 102 # QualitySegmentUrlModifiers 103 for i in range(quality_entry_count): 104 self.read_string() 105 106 fragments_count = self.read_unsigned_int() 107 fragments = [] 108 for i in range(fragments_count): 109 first = self.read_unsigned_int() 110 first_ts = self.read_unsigned_long_long() 111 duration = self.read_unsigned_int() 112 if duration == 0: 113 discontinuity_indicator = self.read_unsigned_char() 114 else: 115 discontinuity_indicator = None 116 fragments.append({ 117 'first': first, 118 'ts': first_ts, 119 'duration': duration, 120 'discontinuity_indicator': discontinuity_indicator, 121 }) 122 123 return { 124 'fragments': fragments, 125 } 126 127 def read_abst(self): 128 # version 129 self.read_unsigned_char() 130 # flags 131 self.read_bytes(3) 132 133 self.read_unsigned_int() # BootstrapinfoVersion 134 # Profile,Live,Update,Reserved 135 flags = self.read_unsigned_char() 136 live = flags & 0x20 != 0 137 # time scale 138 self.read_unsigned_int() 139 # CurrentMediaTime 140 self.read_unsigned_long_long() 141 # SmpteTimeCodeOffset 142 self.read_unsigned_long_long() 143 144 self.read_string() # MovieIdentifier 145 server_count = self.read_unsigned_char() 146 # ServerEntryTable 147 for i in range(server_count): 148 self.read_string() 149 quality_count = self.read_unsigned_char() 150 # QualityEntryTable 151 for i in range(quality_count): 152 self.read_string() 153 # DrmData 154 self.read_string() 155 # MetaData 156 self.read_string() 157 158 segments_count = self.read_unsigned_char() 159 segments = [] 160 for i in range(segments_count): 161 box_size, box_type, box_data = self.read_box_info() 162 assert box_type == b'asrt' 163 segment = FlvReader(box_data).read_asrt() 164 segments.append(segment) 165 fragments_run_count = self.read_unsigned_char() 166 fragments = [] 167 for i in range(fragments_run_count): 168 box_size, box_type, box_data = self.read_box_info() 169 assert box_type == b'afrt' 170 fragments.append(FlvReader(box_data).read_afrt()) 171 172 return { 173 'segments': segments, 174 'fragments': fragments, 175 'live': live, 176 } 177 178 def read_bootstrap_info(self): 179 total_size, box_type, box_data = self.read_box_info() 180 assert box_type == b'abst' 181 return FlvReader(box_data).read_abst() 182 183 184 def read_bootstrap_info(bootstrap_bytes): 185 return FlvReader(bootstrap_bytes).read_bootstrap_info() 186 187 188 def build_fragments_list(boot_info): 189 """ Return a list of (segment, fragment) for each fragment in the video """ 190 res = [] 191 segment_run_table = boot_info['segments'][0] 192 fragment_run_entry_table = boot_info['fragments'][0]['fragments'] 193 first_frag_number = fragment_run_entry_table[0]['first'] 194 fragments_counter = itertools.count(first_frag_number) 195 for segment, fragments_count in segment_run_table['segment_run']: 196 # In some live HDS streams (for example Rai), `fragments_count` is 197 # abnormal and causing out-of-memory errors. It's OK to change the 198 # number of fragments for live streams as they are updated periodically 199 if fragments_count == 4294967295 and boot_info['live']: 200 fragments_count = 2 201 for _ in range(fragments_count): 202 res.append((segment, next(fragments_counter))) 203 204 if boot_info['live']: 205 res = res[-2:] 206 207 return res 208 209 210 def write_unsigned_int(stream, val): 211 stream.write(compat_struct_pack('!I', val)) 212 213 214 def write_unsigned_int_24(stream, val): 215 stream.write(compat_struct_pack('!I', val)[1:]) 216 217 218 def write_flv_header(stream): 219 """Writes the FLV header to stream""" 220 # FLV header 221 stream.write(b'FLV\x01') 222 stream.write(b'\x05') 223 stream.write(b'\x00\x00\x00\x09') 224 stream.write(b'\x00\x00\x00\x00') 225 226 227 def write_metadata_tag(stream, metadata): 228 """Writes optional metadata tag to stream""" 229 SCRIPT_TAG = b'\x12' 230 FLV_TAG_HEADER_LEN = 11 231 232 if metadata: 233 stream.write(SCRIPT_TAG) 234 write_unsigned_int_24(stream, len(metadata)) 235 stream.write(b'\x00\x00\x00\x00\x00\x00\x00') 236 stream.write(metadata) 237 write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) 238 239 240 def remove_encrypted_media(media): 241 return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib 242 and 'drmAdditionalHeaderSetId' not in e.attrib, 243 media)) 244 245 246 def _add_ns(prop, ver=1): 247 return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) 248 249 250 def get_base_url(manifest): 251 base_url = xpath_text( 252 manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], 253 'base URL', default=None) 254 if base_url: 255 base_url = base_url.strip() 256 return base_url 257 258 259 class F4mFD(FragmentFD): 260 """ 261 A downloader for f4m manifests or AdobeHDS. 262 """ 263 264 FD_NAME = 'f4m' 265 266 def _get_unencrypted_media(self, doc): 267 media = doc.findall(_add_ns('media')) 268 if not media: 269 self.report_error('No media found') 270 for e in (doc.findall(_add_ns('drmAdditionalHeader')) 271 + doc.findall(_add_ns('drmAdditionalHeaderSet'))): 272 # If id attribute is missing it's valid for all media nodes 273 # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute 274 if 'id' not in e.attrib: 275 self.report_error('Missing ID in f4m DRM') 276 media = remove_encrypted_media(media) 277 if not media: 278 self.report_error('Unsupported DRM') 279 return media 280 281 def _get_bootstrap_from_url(self, bootstrap_url): 282 bootstrap = self.ydl.urlopen(bootstrap_url).read() 283 return read_bootstrap_info(bootstrap) 284 285 def _update_live_fragments(self, bootstrap_url, latest_fragment): 286 fragments_list = [] 287 retries = 30 288 while (not fragments_list) and (retries > 0): 289 boot_info = self._get_bootstrap_from_url(bootstrap_url) 290 fragments_list = build_fragments_list(boot_info) 291 fragments_list = [f for f in fragments_list if f[1] > latest_fragment] 292 if not fragments_list: 293 # Retry after a while 294 time.sleep(5.0) 295 retries -= 1 296 297 if not fragments_list: 298 self.report_error('Failed to update fragments') 299 300 return fragments_list 301 302 def _parse_bootstrap_node(self, node, base_url): 303 # Sometimes non empty inline bootstrap info can be specified along 304 # with bootstrap url attribute (e.g. dummy inline bootstrap info 305 # contains whitespace characters in [1]). We will prefer bootstrap 306 # url over inline bootstrap info when present. 307 # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m 308 bootstrap_url = node.get('url') 309 if bootstrap_url: 310 bootstrap_url = compat_urlparse.urljoin( 311 base_url, bootstrap_url) 312 boot_info = self._get_bootstrap_from_url(bootstrap_url) 313 else: 314 bootstrap_url = None 315 bootstrap = compat_b64decode(node.text) 316 boot_info = read_bootstrap_info(bootstrap) 317 return boot_info, bootstrap_url 318 319 def real_download(self, filename, info_dict): 320 man_url = info_dict['url'] 321 requested_bitrate = info_dict.get('tbr') 322 self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) 323 324 urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) 325 man_url = urlh.geturl() 326 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests 327 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244 328 # and https://github.com/ytdl-org/youtube-dl/issues/7823) 329 manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() 330 331 doc = compat_etree_fromstring(manifest) 332 formats = [(int(f.attrib.get('bitrate', -1)), f) 333 for f in self._get_unencrypted_media(doc)] 334 if requested_bitrate is None or len(formats) == 1: 335 # get the best format 336 formats = sorted(formats, key=lambda f: f[0]) 337 rate, media = formats[-1] 338 else: 339 rate, media = list(filter( 340 lambda f: int(f[0]) == requested_bitrate, formats))[0] 341 342 # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. 343 man_base_url = get_base_url(doc) or man_url 344 345 base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) 346 bootstrap_node = doc.find(_add_ns('bootstrapInfo')) 347 boot_info, bootstrap_url = self._parse_bootstrap_node( 348 bootstrap_node, man_base_url) 349 live = boot_info['live'] 350 metadata_node = media.find(_add_ns('metadata')) 351 if metadata_node is not None: 352 metadata = compat_b64decode(metadata_node.text) 353 else: 354 metadata = None 355 356 fragments_list = build_fragments_list(boot_info) 357 test = self.params.get('test', False) 358 if test: 359 # We only download the first fragment 360 fragments_list = fragments_list[:1] 361 total_frags = len(fragments_list) 362 # For some akamai manifests we'll need to add a query to the fragment url 363 akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) 364 365 ctx = { 366 'filename': filename, 367 'total_frags': total_frags, 368 'live': live, 369 } 370 371 self._prepare_frag_download(ctx) 372 373 dest_stream = ctx['dest_stream'] 374 375 if ctx['complete_frags_downloaded_bytes'] == 0: 376 write_flv_header(dest_stream) 377 if not live: 378 write_metadata_tag(dest_stream, metadata) 379 380 base_url_parsed = compat_urllib_parse_urlparse(base_url) 381 382 self._start_frag_download(ctx) 383 384 frag_index = 0 385 while fragments_list: 386 seg_i, frag_i = fragments_list.pop(0) 387 frag_index += 1 388 if frag_index <= ctx['fragment_index']: 389 continue 390 name = 'Seg%d-Frag%d' % (seg_i, frag_i) 391 query = [] 392 if base_url_parsed.query: 393 query.append(base_url_parsed.query) 394 if akamai_pv: 395 query.append(akamai_pv.strip(';')) 396 if info_dict.get('extra_param_to_segment_url'): 397 query.append(info_dict['extra_param_to_segment_url']) 398 url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) 399 try: 400 success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) 401 if not success: 402 return False 403 reader = FlvReader(down_data) 404 while True: 405 try: 406 _, box_type, box_data = reader.read_box_info() 407 except DataTruncatedError: 408 if test: 409 # In tests, segments may be truncated, and thus 410 # FlvReader may not be able to parse the whole 411 # chunk. If so, write the segment as is 412 # See https://github.com/ytdl-org/youtube-dl/issues/9214 413 dest_stream.write(down_data) 414 break 415 raise 416 if box_type == b'mdat': 417 self._append_fragment(ctx, box_data) 418 break 419 except (compat_urllib_error.HTTPError, ) as err: 420 if live and (err.code == 404 or err.code == 410): 421 # We didn't keep up with the live window. Continue 422 # with the next available fragment. 423 msg = 'Fragment %d unavailable' % frag_i 424 self.report_warning(msg) 425 fragments_list = [] 426 else: 427 raise 428 429 if not fragments_list and not test and live and bootstrap_url: 430 fragments_list = self._update_live_fragments(bootstrap_url, frag_i) 431 total_frags += len(fragments_list) 432 if fragments_list and (fragments_list[0][1] > frag_i + 1): 433 msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) 434 self.report_warning(msg) 435 436 self._finish_frag_download(ctx) 437 438 return True