From: dirkf Date: Sat, 27 Jan 2024 15:29:25 +0000 (+0000) Subject: [InfoExtractor] Support DASH subtitle extraction (yt-dlp back-port) X-Git-Url: http://git.oshgnacknak.de/?a=commitdiff_plain;h=c58b655a9ef255eb9d02b4d57706c46cfdf35975;p=youtube-dl [InfoExtractor] Support DASH subtitle extraction (yt-dlp back-port) --- diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 3f96645de..043b62243 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -993,7 +993,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 5997.485, 'width': 1920, 'height': 1080, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/pull/14844 'urls_only', @@ -1076,7 +1077,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 4400, 'width': 1920, 'height': 1080, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/issues/20346 # Media considered unfragmented even though it contains @@ -1122,18 +1124,119 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 360, 'height': 360, 'fps': 30, - }] + }], + {}, + ), ( + 'subtitles', + 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/', + [{ + 'format_id': 'audio=128001', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'm4a', + 'tbr': 128.001, + 'asr': 48000, + 'format_note': 'DASH audio', + 'container': 'm4a_dash', + 'vcodec': 'none', + 'acodec': 'mp4a.40.2', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + }, { + 'format_id': 'video=100000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 336, + 'height': 144, + 'tbr': 100, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + }, { + 'format_id': 'video=326000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 562, + 'height': 240, + 'tbr': 326, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + }, { + 'format_id': 'video=698000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 844, + 'height': 360, + 'tbr': 698, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + }, { + 'format_id': 'video=1493000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 1126, + 'height': 480, + 'tbr': 1493, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + }, { + 'format_id': 'video=4482000', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'ext': 'mp4', + 'width': 1688, + 'height': 720, + 'tbr': 4482, + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'vcodec': 'avc1.4D401F', + 'acodec': 'none', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + }], + { + 'en': [ + { + 'ext': 'mp4', + 'manifest_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/manifest.mpd', + 'fragment_base_url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/dash/', + 'protocol': 'http_dash_segments', + } + ] + }, ) ] - for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES: + for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES: with open('./test/testdata/mpd/%s.mpd' % mpd_file, mode='r', encoding='utf-8') as f: - formats = self.ie._parse_mpd_formats( + formats, subtitles = self.ie._parse_mpd_formats_and_subtitles( compat_etree_fromstring(f.read().encode('utf-8')), mpd_base_url=mpd_base_url, mpd_url=mpd_url) self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + expect_value(self, subtitles, expected_subtitles, None) def test_parse_f4m_formats(self): _TEST_CASES = [ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d33557135..ed55d3e07 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import base64 +import collections import datetime import functools import hashlib @@ -58,6 +59,7 @@ from ..utils import ( GeoRestrictedError, GeoUtils, int_or_none, + join_nonempty, js_to_json, JSON_LD_RE, mimetype2ext, @@ -74,6 +76,7 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, + T, traverse_obj, try_get, unescapeHTML, @@ -1751,6 +1754,12 @@ class InfoExtractor(object): 'format_note': 'Quality selection URL', } + def _report_ignoring_subs(self, name): + self.report_warning(bug_reports_message( + 'Ignoring subtitle tracks found in the {0} manifest; ' + 'if any subtitle tracks are missing,'.format(name) + ), only_once=True) + def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, @@ -2191,23 +2200,46 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + def _extract_mpd_formats(self, *args, **kwargs): + fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('DASH') + return fmts + + def _extract_mpd_formats_and_subtitles( + self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, + fatal=True, data=None, headers=None, query=None): + + # TODO: or not? param not yet implemented + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( mpd_url, video_id, - note=note or 'Downloading MPD manifest', - errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal, data=data, headers=headers, query=query) + note='Downloading MPD manifest' if note is None else note, + errnote='Failed to download MPD manifest' if errnote is None else errnote, + fatal=fatal, data=data, headers=headers or {}, query=query or {}) if res is False: - return [] + return [], {} mpd_doc, urlh = res if mpd_doc is None: - return [] - mpd_base_url = base_url(urlh.geturl()) + return [], {} + + # We could have been redirected to a new url when we retrieved our mpd file. + mpd_url = urlh.geturl() + mpd_base_url = base_url(mpd_url) - return self._parse_mpd_formats( + return self._parse_mpd_formats_and_subtitles( mpd_doc, mpd_id, mpd_base_url, mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): + def _parse_mpd_formats(self, *args, **kwargs): + fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('DASH') + return fmts + + def _parse_mpd_formats_and_subtitles( + self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. References: @@ -2215,8 +2247,10 @@ class InfoExtractor(object): http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP """ - if mpd_doc.get('type') == 'dynamic': - return [] + # TODO: param not yet implemented: default like previous yt-dl logic + if not self.get_param('dynamic_mpd', False): + if mpd_doc.get('type') == 'dynamic': + return [], {} namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) @@ -2228,6 +2262,7 @@ class InfoExtractor(object): def extract_multisegment_info(element, ms_parent_info): ms_info = ms_parent_info.copy() + base_url = ms_info.get('base_url') # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some # common attributes and elements. We will only extract relevant @@ -2285,7 +2320,8 @@ class InfoExtractor(object): return ms_info mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) - formats = [] + formats, subtitles = [], {} + stream_numbers = collections.defaultdict(int) for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2295,7 +2331,7 @@ class InfoExtractor(object): for adaptation_set in period.findall(_add_ns('AdaptationSet')): if is_drm_protected(adaptation_set): continue - adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) + adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) for representation in adaptation_set.findall(_add_ns('Representation')): if is_drm_protected(representation): continue @@ -2303,27 +2339,35 @@ class InfoExtractor(object): representation_attrib.update(representation.attrib) # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] - content_type = mime_type.split('/')[0] - if content_type == 'text': - # TODO implement WebVTT downloading - pass - elif content_type in ('video', 'audio'): - base_url = '' - for element in (representation, adaptation_set, period, mpd_doc): - base_url_e = element.find(_add_ns('BaseURL')) - if base_url_e is not None: - base_url = base_url_e.text + base_url - if re.match(r'^https?://', base_url): - break - if mpd_base_url and not re.match(r'^https?://', base_url): - if not mpd_base_url.endswith('/') and not base_url.startswith('/'): - mpd_base_url += '/' - base_url = mpd_base_url + base_url - representation_id = representation_attrib.get('id') - lang = representation_attrib.get('lang') - url_el = representation.find(_add_ns('BaseURL')) - filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) - bandwidth = int_or_none(representation_attrib.get('bandwidth')) + content_type = representation_attrib.get('contentType') or mime_type.split('/')[0] + codec_str = representation_attrib.get('codecs', '') + # Some kind of binary subtitle found in some youtube livestreams + if mime_type == 'application/x-rawcc': + codecs = {'scodec': codec_str} + else: + codecs = parse_codecs(codec_str) + if content_type not in ('video', 'audio', 'text'): + if mime_type == 'image/jpeg': + content_type = mime_type + elif codecs.get('vcodec', 'none') != 'none': + content_type = 'video' + elif codecs.get('acodec', 'none') != 'none': + content_type = 'audio' + elif codecs.get('scodec', 'none') != 'none': + content_type = 'text' + elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): + content_type = 'text' + else: + self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) + continue + + representation_id = representation_attrib.get('id') + lang = representation_attrib.get('lang') + url_el = representation.find(_add_ns('BaseURL')) + filesize = int_or_none(url_el.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) + bandwidth = int_or_none(representation_attrib.get('bandwidth')) + format_id = join_nonempty(representation_id or content_type, mpd_id) + if content_type in ('video', 'audio'): f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'manifest_url': mpd_url, @@ -2338,104 +2382,125 @@ class InfoExtractor(object): 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', } - f.update(parse_codecs(representation_attrib.get('codecs'))) - representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) - - def prepare_template(template_name, identifiers): - tmpl = representation_ms_info[template_name] - # First of, % characters outside $...$ templates - # must be escaped by doubling for proper processing - # by % operator string formatting used further (see - # https://github.com/ytdl-org/youtube-dl/issues/16867). - t = '' - in_template = False - for c in tmpl: + f.update(codecs) + elif content_type == 'text': + f = { + 'ext': mimetype2ext(mime_type), + 'manifest_url': mpd_url, + 'filesize': filesize, + } + elif content_type == 'image/jpeg': + # See test case in VikiIE + # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1 + f = { + 'format_id': format_id, + 'ext': 'mhtml', + 'manifest_url': mpd_url, + 'format_note': 'DASH storyboards (jpeg)', + 'acodec': 'none', + 'vcodec': 'none', + } + if is_drm_protected(adaptation_set) or is_drm_protected(representation): + f['has_drm'] = True + representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info) + + def prepare_template(template_name, identifiers): + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/ytdl-org/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: t += c - if c == '$': - in_template = not in_template - elif c == '%' and not in_template: - t += c - # Next, $...$ templates are translated to their - # %(...) counterparts to be used with % operator - t = t.replace('$RepresentationID$', representation_id) - t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) - t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) - t.replace('$$', '$') - return t - - # @initialization is a regular template like @media one - # so it should be handled just the same way (see - # https://github.com/ytdl-org/youtube-dl/issues/11605) - if 'initialization' in representation_ms_info: - initialization_template = prepare_template( - 'initialization', - # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and - # $Time$ shall not be included for @initialization thus - # only $Bandwidth$ remains - ('Bandwidth', )) - representation_ms_info['initialization_url'] = initialization_template % { - 'Bandwidth': bandwidth, - } - - def location_key(location): - return 'url' if re.match(r'^https?://', location) else 'path' - - if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: - - media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) - media_location_key = location_key(media_template) - - # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ - # can't be used at the same time - if '%(Number' in media_template and 's' not in representation_ms_info: - segment_duration = None - if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: - segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) - representation_ms_info['fragments'] = [{ - media_location_key: media_template % { - 'Number': segment_number, - 'Bandwidth': bandwidth, - }, - 'duration': segment_duration, - } for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] - else: - # $Number*$ or $Time$ in media template with S list available - # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg - # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 - representation_ms_info['fragments'] = [] - segment_time = 0 - segment_d = None - segment_number = representation_ms_info['start_number'] - - def add_segment_url(): - segment_url = media_template % { - 'Time': segment_time, - 'Bandwidth': bandwidth, - 'Number': segment_number, - } - representation_ms_info['fragments'].append({ - media_location_key: segment_url, - 'duration': float_or_none(segment_d, representation_ms_info['timescale']), - }) + # Next, $...$ templates are translated to their + # %(...) counterparts to be used with % operator + t = t.replace('$RepresentationID$', representation_id) + t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) + t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) + t.replace('$$', '$') + return t + + # @initialization is a regular template like @media one + # so it should be handled just the same way (see + # https://github.com/ytdl-org/youtube-dl/issues/11605) + if 'initialization' in representation_ms_info: + initialization_template = prepare_template( + 'initialization', + # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and + # $Time$ shall not be included for @initialization thus + # only $Bandwidth$ remains + ('Bandwidth', )) + representation_ms_info['initialization_url'] = initialization_template % { + 'Bandwidth': bandwidth, + } - for num, s in enumerate(representation_ms_info['s']): - segment_time = s.get('t') or segment_time - segment_d = s['d'] + def location_key(location): + return 'url' if re.match(r'^https?://', location) else 'path' + + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: + + media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) + media_location_key = location_key(media_template) + + # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ + # can't be used at the same time + if '%(Number' in media_template and 's' not in representation_ms_info: + segment_duration = None + if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: + segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) + representation_ms_info['total_number'] = int(math.ceil( + float_or_none(period_duration, segment_duration, default=0))) + representation_ms_info['fragments'] = [{ + media_location_key: media_template % { + 'Number': segment_number, + 'Bandwidth': bandwidth, + }, + 'duration': segment_duration, + } for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] + else: + # $Number*$ or $Time$ in media template with S list available + # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg + # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 + representation_ms_info['fragments'] = [] + segment_time = 0 + segment_d = None + segment_number = representation_ms_info['start_number'] + + def add_segment_url(): + segment_url = media_template % { + 'Time': segment_time, + 'Bandwidth': bandwidth, + 'Number': segment_number, + } + representation_ms_info['fragments'].append({ + media_location_key: segment_url, + 'duration': float_or_none(segment_d, representation_ms_info['timescale']), + }) + + for num, s in enumerate(representation_ms_info['s']): + segment_time = s.get('t') or segment_time + segment_d = s['d'] + add_segment_url() + segment_number += 1 + for r in range(s.get('r', 0)): + segment_time += segment_d add_segment_url() segment_number += 1 - for r in range(s.get('r', 0)): - segment_time += segment_d - add_segment_url() - segment_number += 1 - segment_time += segment_d - elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: + segment_time += segment_d + elif 'segment_urls' in representation_ms_info: + fragments = [] + if 's' in representation_ms_info: # No media template # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI # or any YouTube dashsegments video - fragments = [] segment_index = 0 timescale = representation_ms_info['timescale'] for s in representation_ms_info['s']: @@ -2487,8 +2552,15 @@ class InfoExtractor(object): f['url'] = base_url formats.append(f) else: - self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - return formats + # Assuming direct URL to unfragmented media. + f['url'] = representation_ms_info['base_url'] + if content_type in ('video', 'audio', 'image/jpeg'): + f['manifest_stream_number'] = stream_numbers[f['url']] + stream_numbers[f['url']] += 1 + formats.append(f) + elif content_type == 'text': + subtitles.setdefault(lang or 'und', []).append(f) + return formats, subtitles def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle(