youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

common.py (143736B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import base64
      5 import datetime
      6 import hashlib
      7 import json
      8 import netrc
      9 import os
     10 import random
     11 import re
     12 import socket
     13 import ssl
     14 import sys
     15 import time
     16 import math
     17 
     18 from ..compat import (
     19     compat_cookiejar_Cookie,
     20     compat_cookies_SimpleCookie,
     21     compat_etree_Element,
     22     compat_etree_fromstring,
     23     compat_getpass,
     24     compat_integer_types,
     25     compat_http_client,
     26     compat_os_name,
     27     compat_str,
     28     compat_urllib_error,
     29     compat_urllib_parse_unquote,
     30     compat_urllib_parse_urlencode,
     31     compat_urllib_request,
     32     compat_urlparse,
     33     compat_xml_parse_error,
     34 )
     35 from ..downloader.f4m import (
     36     get_base_url,
     37     remove_encrypted_media,
     38 )
     39 from ..utils import (
     40     NO_DEFAULT,
     41     age_restricted,
     42     base_url,
     43     bug_reports_message,
     44     clean_html,
     45     compiled_regex_type,
     46     determine_ext,
     47     determine_protocol,
     48     dict_get,
     49     error_to_compat_str,
     50     ExtractorError,
     51     extract_attributes,
     52     fix_xml_ampersands,
     53     float_or_none,
     54     GeoRestrictedError,
     55     GeoUtils,
     56     int_or_none,
     57     js_to_json,
     58     JSON_LD_RE,
     59     mimetype2ext,
     60     orderedSet,
     61     parse_bitrate,
     62     parse_codecs,
     63     parse_duration,
     64     parse_iso8601,
     65     parse_m3u8_attributes,
     66     parse_resolution,
     67     RegexNotFoundError,
     68     sanitized_Request,
     69     sanitize_filename,
     70     str_or_none,
     71     str_to_int,
     72     strip_or_none,
     73     unescapeHTML,
     74     unified_strdate,
     75     unified_timestamp,
     76     update_Request,
     77     update_url_query,
     78     urljoin,
     79     url_basename,
     80     url_or_none,
     81     xpath_element,
     82     xpath_text,
     83     xpath_with_ns,
     84 )
     85 
     86 
     87 class InfoExtractor(object):
     88     """Information Extractor class.
     89 
     90     Information extractors are the classes that, given a URL, extract
     91     information about the video (or videos) the URL refers to. This
     92     information includes the real video URL, the video title, author and
     93     others. The information is stored in a dictionary which is then
     94     passed to the YoutubeDL. The YoutubeDL processes this
     95     information possibly downloading the video to the file system, among
     96     other possible outcomes.
     97 
     98     The type field determines the type of the result.
     99     By far the most common value (and the default if _type is missing) is
    100     "video", which indicates a single video.
    101 
    102     For a video, the dictionaries must include the following fields:
    103 
    104     id:             Video identifier.
    105     title:          Video title, unescaped.
    106 
    107     Additionally, it must contain either a formats entry or a url one:
    108 
    109     formats:        A list of dictionaries for each format available, ordered
    110                     from worst to best quality.
    111 
    112                     Potential fields:
    113                     * url        The mandatory URL representing the media:
    114                                    for plain file media - HTTP URL of this file,
    115                                    for RTMP - RTMP URL,
    116                                    for HLS - URL of the M3U8 media playlist,
    117                                    for HDS - URL of the F4M manifest,
    118                                    for DASH
    119                                      - HTTP URL to plain file media (in case of
    120                                        unfragmented media)
    121                                      - URL of the MPD manifest or base URL
    122                                        representing the media if MPD manifest
    123                                        is parsed from a string (in case of
    124                                        fragmented media)
    125                                    for MSS - URL of the ISM manifest.
    126                     * manifest_url
    127                                  The URL of the manifest file in case of
    128                                  fragmented media:
    129                                    for HLS - URL of the M3U8 master playlist,
    130                                    for HDS - URL of the F4M manifest,
    131                                    for DASH - URL of the MPD manifest,
    132                                    for MSS - URL of the ISM manifest.
    133                     * ext        Will be calculated from URL if missing
    134                     * format     A human-readable description of the format
    135                                  ("mp4 container with h264/opus").
    136                                  Calculated from the format_id, width, height.
    137                                  and format_note fields if missing.
    138                     * format_id  A short description of the format
    139                                  ("mp4_h264_opus" or "19").
    140                                 Technically optional, but strongly recommended.
    141                     * format_note Additional info about the format
    142                                  ("3D" or "DASH video")
    143                     * width      Width of the video, if known
    144                     * height     Height of the video, if known
    145                     * resolution Textual description of width and height
    146                     * tbr        Average bitrate of audio and video in KBit/s
    147                     * abr        Average audio bitrate in KBit/s
    148                     * acodec     Name of the audio codec in use
    149                     * asr        Audio sampling rate in Hertz
    150                     * vbr        Average video bitrate in KBit/s
    151                     * fps        Frame rate
    152                     * vcodec     Name of the video codec in use
    153                     * container  Name of the container format
    154                     * filesize   The number of bytes, if known in advance
    155                     * filesize_approx  An estimate for the number of bytes
    156                     * player_url SWF Player URL (used for rtmpdump).
    157                     * protocol   The protocol that will be used for the actual
    158                                  download, lower-case.
    159                                  "http", "https", "rtsp", "rtmp", "rtmpe",
    160                                  "m3u8", "m3u8_native" or "http_dash_segments".
    161                     * fragment_base_url
    162                                  Base URL for fragments. Each fragment's path
    163                                  value (if present) will be relative to
    164                                  this URL.
    165                     * fragments  A list of fragments of a fragmented media.
    166                                  Each fragment entry must contain either an url
    167                                  or a path. If an url is present it should be
    168                                  considered by a client. Otherwise both path and
    169                                  fragment_base_url must be present. Here is
    170                                  the list of all potential fields:
    171                                  * "url" - fragment's URL
    172                                  * "path" - fragment's path relative to
    173                                             fragment_base_url
    174                                  * "duration" (optional, int or float)
    175                                  * "filesize" (optional, int)
    176                     * preference Order number of this format. If this field is
    177                                  present and not None, the formats get sorted
    178                                  by this field, regardless of all other values.
    179                                  -1 for default (order by other properties),
    180                                  -2 or smaller for less than default.
    181                                  < -1000 to hide the format (if there is
    182                                     another one which is strictly better)
    183                     * language   Language code, e.g. "de" or "en-US".
    184                     * language_preference  Is this in the language mentioned in
    185                                  the URL?
    186                                  10 if it's what the URL is about,
    187                                  -1 for default (don't know),
    188                                  -10 otherwise, other values reserved for now.
    189                     * quality    Order number of the video quality of this
    190                                  format, irrespective of the file format.
    191                                  -1 for default (order by other properties),
    192                                  -2 or smaller for less than default.
    193                     * source_preference  Order number for this video source
    194                                   (quality takes higher priority)
    195                                  -1 for default (order by other properties),
    196                                  -2 or smaller for less than default.
    197                     * http_headers  A dictionary of additional HTTP headers
    198                                  to add to the request.
    199                     * stretched_ratio  If given and not 1, indicates that the
    200                                  video's pixels are not square.
    201                                  width : height ratio as float.
    202                     * no_resume  The server does not support resuming the
    203                                  (HTTP or RTMP) download. Boolean.
    204                     * downloader_options  A dictionary of downloader options as
    205                                  described in FileDownloader
    206 
    207     url:            Final video URL.
    208     ext:            Video filename extension.
    209     format:         The video format, defaults to ext (used for --get-format)
    210     player_url:     SWF Player URL (used for rtmpdump).
    211 
    212     The following fields are optional:
    213 
    214     alt_title:      A secondary title of the video.
    215     display_id      An alternative identifier for the video, not necessarily
    216                     unique, but available before title. Typically, id is
    217                     something like "4234987", title "Dancing naked mole rats",
    218                     and display_id "dancing-naked-mole-rats"
    219     thumbnails:     A list of dictionaries, with the following entries:
    220                         * "id" (optional, string) - Thumbnail format ID
    221                         * "url"
    222                         * "preference" (optional, int) - quality of the image
    223                         * "width" (optional, int)
    224                         * "height" (optional, int)
    225                         * "resolution" (optional, string "{width}x{height}",
    226                                         deprecated)
    227                         * "filesize" (optional, int)
    228     thumbnail:      Full URL to a video thumbnail image.
    229     description:    Full video description.
    230     uploader:       Full name of the video uploader.
    231     license:        License name the video is licensed under.
    232     creator:        The creator of the video.
    233     release_timestamp: UNIX timestamp of the moment the video was released.
    234     release_date:   The date (YYYYMMDD) when the video was released.
    235     timestamp:      UNIX timestamp of the moment the video became available
    236                     (uploaded).
    237     upload_date:    Video upload date (YYYYMMDD).
    238                     If not explicitly set, calculated from timestamp.
    239     uploader_id:    Nickname or id of the video uploader.
    240     uploader_url:   Full URL to a personal webpage of the video uploader.
    241     channel:        Full name of the channel the video is uploaded on.
    242                     Note that channel fields may or may not repeat uploader
    243                     fields. This depends on a particular extractor.
    244     channel_id:     Id of the channel.
    245     channel_url:    Full URL to a channel webpage.
    246     location:       Physical location where the video was filmed.
    247     subtitles:      The available subtitles as a dictionary in the format
    248                     {tag: subformats}. "tag" is usually a language code, and
    249                     "subformats" is a list sorted from lower to higher
    250                     preference, each element is a dictionary with the "ext"
    251                     entry and one of:
    252                         * "data": The subtitles file contents
    253                         * "url": A URL pointing to the subtitles file
    254                     "ext" will be calculated from URL if missing
    255     automatic_captions: Like 'subtitles', used by the YoutubeIE for
    256                     automatically generated captions
    257     duration:       Length of the video in seconds, as an integer or float.
    258     view_count:     How many users have watched the video on the platform.
    259     like_count:     Number of positive ratings of the video
    260     dislike_count:  Number of negative ratings of the video
    261     repost_count:   Number of reposts of the video
    262     average_rating: Average rating give by users, the scale used depends on the webpage
    263     comment_count:  Number of comments on the video
    264     comments:       A list of comments, each with one or more of the following
    265                     properties (all but one of text or html optional):
    266                         * "author" - human-readable name of the comment author
    267                         * "author_id" - user ID of the comment author
    268                         * "id" - Comment ID
    269                         * "html" - Comment as HTML
    270                         * "text" - Plain text of the comment
    271                         * "timestamp" - UNIX timestamp of comment
    272                         * "parent" - ID of the comment this one is replying to.
    273                                      Set to "root" to indicate that this is a
    274                                      comment to the original video.
    275     age_limit:      Age restriction for the video, as an integer (years)
    276     webpage_url:    The URL to the video webpage, if given to youtube-dl it
    277                     should allow to get the same result again. (It will be set
    278                     by YoutubeDL if it's missing)
    279     categories:     A list of categories that the video falls in, for example
    280                     ["Sports", "Berlin"]
    281     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    282     is_live:        True, False, or None (=unknown). Whether this video is a
    283                     live stream that goes on instead of a fixed-length video.
    284     start_time:     Time in seconds where the reproduction should start, as
    285                     specified in the URL.
    286     end_time:       Time in seconds where the reproduction should end, as
    287                     specified in the URL.
    288     chapters:       A list of dictionaries, with the following entries:
    289                         * "start_time" - The start time of the chapter in seconds
    290                         * "end_time" - The end time of the chapter in seconds
    291                         * "title" (optional, string)
    292 
    293     The following fields should only be used when the video belongs to some logical
    294     chapter or section:
    295 
    296     chapter:        Name or title of the chapter the video belongs to.
    297     chapter_number: Number of the chapter the video belongs to, as an integer.
    298     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
    299 
    300     The following fields should only be used when the video is an episode of some
    301     series, programme or podcast:
    302 
    303     series:         Title of the series or programme the video episode belongs to.
    304     season:         Title of the season the video episode belongs to.
    305     season_number:  Number of the season the video episode belongs to, as an integer.
    306     season_id:      Id of the season the video episode belongs to, as a unicode string.
    307     episode:        Title of the video episode. Unlike mandatory video title field,
    308                     this field should denote the exact title of the video episode
    309                     without any kind of decoration.
    310     episode_number: Number of the video episode within a season, as an integer.
    311     episode_id:     Id of the video episode, as a unicode string.
    312 
    313     The following fields should only be used when the media is a track or a part of
    314     a music album:
    315 
    316     track:          Title of the track.
    317     track_number:   Number of the track within an album or a disc, as an integer.
    318     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
    319                     as a unicode string.
    320     artist:         Artist(s) of the track.
    321     genre:          Genre(s) of the track.
    322     album:          Title of the album the track belongs to.
    323     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    324     album_artist:   List of all artists appeared on the album (e.g.
    325                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
    326                     and compilations).
    327     disc_number:    Number of the disc or other physical medium the track belongs to,
    328                     as an integer.
    329     release_year:   Year (YYYY) when the album was released.
    330 
    331     Unless mentioned otherwise, the fields should be Unicode strings.
    332 
    333     Unless mentioned otherwise, None is equivalent to absence of information.
    334 
    335 
    336     _type "playlist" indicates multiple videos.
    337     There must be a key "entries", which is a list, an iterable, or a PagedList
    338     object, each element of which is a valid dictionary by this specification.
    339 
    340     Additionally, playlists can have "id", "title", "description", "uploader",
    341     "uploader_id", "uploader_url", "duration" attributes with the same semantics
    342     as videos (see above).
    343 
    344 
    345     _type "multi_video" indicates that there are multiple videos that
    346     form a single show, for examples multiple acts of an opera or TV episode.
    347     It must have an entries key like a playlist and contain all the keys
    348     required for a video at the same time.
    349 
    350 
    351     _type "url" indicates that the video must be extracted from another
    352     location, possibly by a different extractor. Its only required key is:
    353     "url" - the next URL to extract.
    354     The key "ie_key" can be set to the class name (minus the trailing "IE",
    355     e.g. "Youtube") if the extractor class is known in advance.
    356     Additionally, the dictionary may have any properties of the resolved entity
    357     known in advance, for example "title" if the title of the referred video is
    358     known ahead of time.
    359 
    360 
    361     _type "url_transparent" entities have the same specification as "url", but
    362     indicate that the given additional information is more precise than the one
    363     associated with the resolved URL.
    364     This is useful when a site employs a video service that hosts the video and
    365     its technical metadata, but that video service does not embed a useful
    366     title, description etc.
    367 
    368 
    369     Subclasses of this one should re-define the _real_initialize() and
    370     _real_extract() methods and define a _VALID_URL regexp.
    371     Probably, they should also be added to the list of extractors.
    372 
    373     _GEO_BYPASS attribute may be set to False in order to disable
    374     geo restriction bypass mechanisms for a particular extractor.
    375     Though it won't disable explicit geo restriction bypass based on
    376     country code provided with geo_bypass_country.
    377 
    378     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    379     countries for this extractor. One of these countries will be used by
    380     geo restriction bypass mechanism right away in order to bypass
    381     geo restriction, of course, if the mechanism is not disabled.
    382 
    383     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    384     IP blocks in CIDR notation for this extractor. One of these IP blocks
    385     will be used by geo restriction bypass mechanism similarly
    386     to _GEO_COUNTRIES.
    387 
    388     Finally, the _WORKING attribute should be set to False for broken IEs
    389     in order to warn the users and skip the tests.
    390     """
    391 
    392     _ready = False
    393     _downloader = None
    394     _x_forwarded_for_ip = None
    395     _GEO_BYPASS = True
    396     _GEO_COUNTRIES = None
    397     _GEO_IP_BLOCKS = None
    398     _WORKING = True
    399 
    400     def __init__(self, downloader=None):
    401         """Constructor. Receives an optional downloader."""
    402         self._ready = False
    403         self._x_forwarded_for_ip = None
    404         self.set_downloader(downloader)
    405 
    406     @classmethod
    407     def suitable(cls, url):
    408         """Receives a URL and returns True if suitable for this IE."""
    409 
    410         # This does not use has/getattr intentionally - we want to know whether
    411         # we have cached the regexp for *this* class, whereas getattr would also
    412         # match the superclass
    413         if '_VALID_URL_RE' not in cls.__dict__:
    414             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    415         return cls._VALID_URL_RE.match(url) is not None
    416 
    @classmethod
    def _match_id(cls, url):
        """Return the 'id' group of cls._VALID_URL matched against *url*.

        The URL is assumed to be suitable for this extractor.
        """
        # Per-class regexp cache, mirroring suitable().
        pattern = cls.__dict__.get('_VALID_URL_RE')
        if pattern is None:
            pattern = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        mobj = pattern.match(url)
        assert mobj
        return compat_str(mobj.group('id'))
    424 
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        # False only for extractors explicitly marked as broken.
        return cls._WORKING
    429 
    430     def initialize(self):
    431         """Initializes an instance (authentication, etc)."""
    432         self._initialize_geo_bypass({
    433             'countries': self._GEO_COUNTRIES,
    434             'ip_blocks': self._GEO_IP_BLOCKS,
    435         })
    436         if not self._ready:
    437             self._real_initialize()
    438             self._ready = True
    439 
    440     def _initialize_geo_bypass(self, geo_bypass_context):
    441         """
    442         Initialize geo restriction bypass mechanism.
    443 
    444         This method is used to initialize geo bypass mechanism based on faking
    445         X-Forwarded-For HTTP header. A random country from provided country list
    446         is selected and a random IP belonging to this country is generated. This
    447         IP will be passed as X-Forwarded-For HTTP header in all subsequent
    448         HTTP requests.
    449 
    450         This method will be used for initial geo bypass mechanism initialization
    451         during the instance initialization with _GEO_COUNTRIES and
    452         _GEO_IP_BLOCKS.
    453 
    454         You may also manually call it from extractor's code if geo bypass
    455         information is not available beforehand (e.g. obtained during
    456         extraction) or due to some other reason. In this case you should pass
    457         this information in geo bypass context passed as first argument. It may
    458         contain following fields:
    459 
    460         countries:  List of geo unrestricted countries (similar
    461                     to _GEO_COUNTRIES)
    462         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
    463                     (similar to _GEO_IP_BLOCKS)
    464 
    465         """
    466         if not self._x_forwarded_for_ip:
    467 
    468             # Geo bypass mechanism is explicitly disabled by user
    469             if not self._downloader.params.get('geo_bypass', True):
    470                 return
    471 
    472             if not geo_bypass_context:
    473                 geo_bypass_context = {}
    474 
    475             # Backward compatibility: previously _initialize_geo_bypass
    476             # expected a list of countries, some 3rd party code may still use
    477             # it this way
    478             if isinstance(geo_bypass_context, (list, tuple)):
    479                 geo_bypass_context = {
    480                     'countries': geo_bypass_context,
    481                 }
    482 
    483             # The whole point of geo bypass mechanism is to fake IP
    484             # as X-Forwarded-For HTTP header based on some IP block or
    485             # country code.
    486 
    487             # Path 1: bypassing based on IP block in CIDR notation
    488 
    489             # Explicit IP block specified by user, use it right away
    490             # regardless of whether extractor is geo bypassable or not
    491             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
    492 
    493             # Otherwise use random IP block from geo bypass context but only
    494             # if extractor is known as geo bypassable
    495             if not ip_block:
    496                 ip_blocks = geo_bypass_context.get('ip_blocks')
    497                 if self._GEO_BYPASS and ip_blocks:
    498                     ip_block = random.choice(ip_blocks)
    499 
    500             if ip_block:
    501                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
    502                 if self._downloader.params.get('verbose', False):
    503                     self._downloader.to_screen(
    504                         '[debug] Using fake IP %s as X-Forwarded-For.'
    505                         % self._x_forwarded_for_ip)
    506                 return
    507 
    508             # Path 2: bypassing based on country code
    509 
    510             # Explicit country code specified by user, use it right away
    511             # regardless of whether extractor is geo bypassable or not
    512             country = self._downloader.params.get('geo_bypass_country', None)
    513 
    514             # Otherwise use random country code from geo bypass context but
    515             # only if extractor is known as geo bypassable
    516             if not country:
    517                 countries = geo_bypass_context.get('countries')
    518                 if self._GEO_BYPASS and countries:
    519                     country = random.choice(countries)
    520 
    521             if country:
    522                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
    523                 if self._downloader.params.get('verbose', False):
    524                     self._downloader.to_screen(
    525                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
    526                         % (self._x_forwarded_for_ip, country.upper()))
    527 
    528     def extract(self, url):
    529         """Extracts URL information and returns it in list of dicts."""
    530         try:
    531             for _ in range(2):
    532                 try:
    533                     self.initialize()
    534                     ie_result = self._real_extract(url)
    535                     if self._x_forwarded_for_ip:
    536                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
    537                     return ie_result
    538                 except GeoRestrictedError as e:
    539                     if self.__maybe_fake_ip_and_retry(e.countries):
    540                         continue
    541                     raise
    542         except ExtractorError:
    543             raise
    544         except compat_http_client.IncompleteRead as e:
    545             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    546         except (KeyError, StopIteration) as e:
    547             raise ExtractorError('An extractor error has occurred.', cause=e)
    548 
    549     def __maybe_fake_ip_and_retry(self, countries):
    550         if (not self._downloader.params.get('geo_bypass_country', None)
    551                 and self._GEO_BYPASS
    552                 and self._downloader.params.get('geo_bypass', True)
    553                 and not self._x_forwarded_for_ip
    554                 and countries):
    555             country_code = random.choice(countries)
    556             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
    557             if self._x_forwarded_for_ip:
    558                 self.report_warning(
    559                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
    560                     % (self._x_forwarded_for_ip, country_code.upper()))
    561                 return True
    562         return False
    563 
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader is the YoutubeDL instance; extractors read its
        # params and use its output helpers (to_screen etc.).
        self._downloader = downloader
    567 
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class; called at most once per
        # instance from initialize().
        pass
    571 
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class; called from extract()
        # after initialization.
        pass
    575 
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Extractor class names end in "IE" by convention; drop that suffix.
        class_name = cls.__name__
        return compat_str(class_name[:-2])
    580 
    @property
    def IE_NAME(self):
        # Human-readable extractor name: the class name without the
        # conventional trailing "IE".
        class_name = type(self).__name__
        return compat_str(class_name[:-2])
    584 
    @staticmethod
    def __can_accept_status_code(err, expected_status):
        # Decide whether the HTTP error status in *err* matches
        # *expected_status*, which may be None, a single integer, a
        # list/tuple of integers, or a predicate over the status code.
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        if isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        if callable(expected_status):
            return expected_status(err.code) is True
        # Anything else is a programming error on the caller's side.
        assert False
    598 
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        # note=None prints the default message, note=False prints nothing.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Don't override a header the caller supplied explicitly.
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        # Fold data/headers/query into a single request object, wrapping a
        # plain URL string when necessary.
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        # ssl.CertificateError does not exist on every supported Python.
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            # errnote=False silences error reporting entirely.
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
    656 
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(
            url_or_request, video_id, note, errnote, fatal,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        if urlh is False:
            # _request_webpage only returns False in non-fatal mode.
            assert not fatal
            return False
        content = self._webpage_read_content(
            urlh, url_or_request, video_id, note, errnote, fatal,
            encoding=encoding)
        return (content, urlh)
    673 
    674     @staticmethod
    675     def _guess_encoding_from_content(content_type, webpage_bytes):
    676         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    677         if m:
    678             encoding = m.group(1)
    679         else:
    680             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
    681                           webpage_bytes[:1024])
    682             if m:
    683                 encoding = m.group(1).decode('ascii')
    684             elif webpage_bytes.startswith(b'\xff\xfe'):
    685                 encoding = 'utf-16'
    686             else:
    687                 encoding = 'utf-8'
    688 
    689         return encoding
    690 
    def __check_blocked(self, content):
        # Raise ExtractorError when `content` is a recognized censorship or
        # corporate-filter block page.
        first_block = content[:512]
        # Websense corporate filtering software.
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government censorship block page.
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian government (Roskomnadzor blocklist) block page; the title
        # is TTK's Russian "access to the resource is restricted" message.
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
    718 
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of `urlh`, decode it to text and run block-page checks.

        Honors the dump_intermediate_pages and write_pages options; falls
        back to UTF-8 when the guessed encoding name is unknown.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps arbitrary bytes printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Truncate long names but keep them unique via an md5 of
                # the full original name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name: fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
    755 
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Requestobject
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        # Retry incomplete reads up to `tries` attempts, sleeping `timeout`
        # seconds between them; any other failure propagates immediately.
        attempt = 0
        while True:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                break
            except compat_http_client.IncompleteRead as e:
                attempt += 1
                if attempt >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        content, _ = res
        return content
    813 
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as an compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        xml_doc = self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal)
        return xml_doc, urlh
    834 
    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as an compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        handle_res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        if handle_res is False:
            # Download failed in non-fatal mode.
            return handle_res
        return handle_res[0]
    851 
    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        """Parse `xml_string` into a compat_etree_Element.

        On parse failure raises ExtractorError when fatal, otherwise emits
        a warning and returns None.
        """
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if not fatal:
                self.report_warning(errmsg + str(ve))
                return None
            raise ExtractorError(errmsg, cause=ve)
    863 
    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        parsed = self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal)
        return parsed, urlh
    884 
    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        handle_res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        if handle_res is False:
            # Download failed in non-fatal mode.
            return handle_res
        return handle_res[0]
    901 
    902     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
    903         if transform_source:
    904             json_string = transform_source(json_string)
    905         try:
    906             return json.loads(json_string)
    907         except ValueError as ve:
    908             errmsg = '%s: Failed to parse JSON ' % video_id
    909             if fatal:
    910                 raise ExtractorError(errmsg, cause=ve)
    911             else:
    912                 self.report_warning(errmsg + str(ve))
    913 
    def report_warning(self, msg, video_id=None):
        """Forward a warning to the downloader, tagged with IE name and video id."""
        prefix = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, prefix, msg))
    918 
    def to_screen(self, msg):
        """Print a message to the screen, prefixed with this IE's name."""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
    922 
    def report_extraction(self, id_or_name):
        """Announce that information extraction has started."""
        self.to_screen('%s: Extracting information' % id_or_name)
    926 
    def report_download_webpage(self, video_id):
        """Announce that the webpage download has started."""
        self.to_screen('%s: Downloading webpage' % video_id)
    930 
    def report_age_confirmation(self):
        """Announce an attempt to confirm the user's age."""
        self.to_screen('Confirming age')
    934 
    def report_login(self):
        """Announce an attempt to log in."""
        self.to_screen('Logging in')
    938 
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        """Abort extraction with an expected login-required error."""
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)
    944 
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """Abort extraction with a geo-restriction error listing `countries`."""
        raise GeoRestrictedError(msg, countries=countries)
    948 
    949     # Methods for following #608
    950     @staticmethod
    951     def url_result(url, ie=None, video_id=None, video_title=None):
    952         """Returns a URL that points to a page that should be processed"""
    953         # TODO: ie should be the class used for getting the info
    954         video_info = {'_type': 'url',
    955                       'url': url,
    956                       'ie_key': ie}
    957         if video_id is not None:
    958             video_info['id'] = video_id
    959         if video_title is not None:
    960             video_info['title'] = video_title
    961         return video_info
    962 
    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        """Build a playlist result from matches, deduplicating entry URLs."""
        def to_url(m):
            # `getter` maps a match to its URL; identity when absent.
            return getter(m) if getter else m

        entries = orderedSet(
            self.url_result(self._proto_relative_url(to_url(m)), ie)
            for m in matches)
        return self.playlist_result(
            entries, playlist_id=playlist_id, playlist_title=playlist_title)
    969 
    970     @staticmethod
    971     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    972         """Returns a playlist"""
    973         video_info = {'_type': 'playlist',
    974                       'entries': entries}
    975         if playlist_id:
    976             video_info['id'] = playlist_id
    977         if playlist_title:
    978             video_info['title'] = playlist_title
    979         if playlist_description:
    980             video_info['description'] = playlist_description
    981         return video_info
    982 
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # Initialize so an empty pattern sequence falls through to the
        # default/fatal handling below instead of raising NameError.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Try each pattern in turn, keeping the first that matches.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue when the terminal supports color.
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
   1016 
   1017     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
   1018         """
   1019         Like _search_regex, but strips HTML tags and unescapes entities.
   1020         """
   1021         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
   1022         if res:
   1023             return clean_html(res).strip()
   1024         else:
   1025             return res
   1026 
   1027     def _get_netrc_login_info(self, netrc_machine=None):
   1028         username = None
   1029         password = None
   1030         netrc_machine = netrc_machine or self._NETRC_MACHINE
   1031 
   1032         if self._downloader.params.get('usenetrc', False):
   1033             try:
   1034                 info = netrc.netrc().authenticators(netrc_machine)
   1035                 if info is not None:
   1036                     username = info[0]
   1037                     password = info[2]
   1038                 else:
   1039                     raise netrc.NetrcParseError(
   1040                         'No authenticators for %s' % netrc_machine)
   1041             except (IOError, netrc.NetrcParseError) as err:
   1042                 self._downloader.report_warning(
   1043                     'parsing .netrc: %s' % error_to_compat_str(err))
   1044 
   1045         return username, password
   1046 
   1047     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
   1048         """
   1049         Get the login info as (username, password)
   1050         First look for the manually specified credentials using username_option
   1051         and password_option as keys in params dictionary. If no such credentials
   1052         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
   1053         value.
   1054         If there's no info available, return (None, None)
   1055         """
   1056         if self._downloader is None:
   1057             return (None, None)
   1058 
   1059         downloader_params = self._downloader.params
   1060 
   1061         # Attempt to use provided username and password or .netrc data
   1062         if downloader_params.get(username_option) is not None:
   1063             username = downloader_params[username_option]
   1064             password = downloader_params[password_option]
   1065         else:
   1066             username, password = self._get_netrc_login_info(netrc_machine)
   1067 
   1068         return username, password
   1069 
   1070     def _get_tfa_info(self, note='two-factor verification code'):
   1071         """
   1072         Get the two-factor authentication info
   1073         TODO - asking the user will be required for sms/phone verify
   1074         currently just uses the command line option
   1075         If there's no info available, return None
   1076         """
   1077         if self._downloader is None:
   1078             return None
   1079         downloader_params = self._downloader.params
   1080 
   1081         if downloader_params.get('twofactor') is not None:
   1082             return downloader_params['twofactor']
   1083 
   1084         return compat_getpass('Type %s and press [Return]: ' % note)
   1085 
   1086     # Helper functions for extracting OpenGraph info
   1087     @staticmethod
   1088     def _og_regexes(prop):
   1089         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
   1090         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
   1091                        % {'prop': re.escape(prop)})
   1092         template = r'<meta[^>]+?%s[^>]+?%s'
   1093         return [
   1094             template % (property_re, content_re),
   1095             template % (content_re, property_re),
   1096         ]
   1097 
    @staticmethod
    def _meta_regex(prop):
        # Regex for a <meta> tag whose name/property/itemprop/id/http-equiv
        # equals `prop`, capturing its content attribute in group 'content'.
        # The lookahead lets the attributes appear in either order.
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
   1103 
   1104     def _og_search_property(self, prop, html, name=None, **kargs):
   1105         if not isinstance(prop, (list, tuple)):
   1106             prop = [prop]
   1107         if name is None:
   1108             name = 'OpenGraph %s' % prop[0]
   1109         og_regexes = []
   1110         for p in prop:
   1111             og_regexes.extend(self._og_regexes(p))
   1112         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
   1113         if escaped is None:
   1114             return None
   1115         return unescapeHTML(escaped)
   1116 
   1117     def _og_search_thumbnail(self, html, **kargs):
   1118         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
   1119 
   1120     def _og_search_description(self, html, **kargs):
   1121         return self._og_search_property('description', html, fatal=False, **kargs)
   1122 
   1123     def _og_search_title(self, html, **kargs):
   1124         return self._og_search_property('title', html, **kargs)
   1125 
   1126     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
   1127         regexes = self._og_regexes('video') + self._og_regexes('video:url')
   1128         if secure:
   1129             regexes = self._og_regexes('video:secure_url') + regexes
   1130         return self._html_search_regex(regexes, html, name, **kargs)
   1131 
   1132     def _og_search_url(self, html, **kargs):
   1133         return self._og_search_property('url', html, **kargs)
   1134 
   1135     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
   1136         if not isinstance(name, (list, tuple)):
   1137             name = [name]
   1138         if display_name is None:
   1139             display_name = name[0]
   1140         return self._html_search_regex(
   1141             [self._meta_regex(n) for n in name],
   1142             html, display_name, fatal=fatal, group='content', **kwargs)
   1143 
   1144     def _dc_search_uploader(self, html):
   1145         return self._html_search_meta('dc.creator', html, 'uploader')
   1146 
   1147     def _rta_search(self, html):
   1148         # See http://www.rtalabel.org/index.php?content=howtofaq#single
   1149         if re.search(r'(?ix)<meta\s+name="rating"\s+'
   1150                      r'     content="RTA-5042-1996-1400-1577-RTA"',
   1151                      html):
   1152             return 18
   1153         return 0
   1154 
   1155     def _media_rating_search(self, html):
   1156         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
   1157         rating = self._html_search_meta('rating', html)
   1158 
   1159         if not rating:
   1160             return None
   1161 
   1162         RATING_TABLE = {
   1163             'safe for kids': 0,
   1164             'general': 8,
   1165             '14 years': 14,
   1166             'mature': 17,
   1167             'restricted': 19,
   1168         }
   1169         return RATING_TABLE.get(rating.lower())
   1170 
   1171     def _family_friendly_search(self, html):
   1172         # See http://schema.org/VideoObject
   1173         family_friendly = self._html_search_meta(
   1174             'isFamilyFriendly', html, default=None)
   1175 
   1176         if not family_friendly:
   1177             return None
   1178 
   1179         RATING_TABLE = {
   1180             '1': 0,
   1181             'true': 0,
   1182             '0': 18,
   1183             'false': 18,
   1184         }
   1185         return RATING_TABLE.get(family_friendly.lower())
   1186 
   1187     def _twitter_search_player(self, html):
   1188         return self._html_search_meta('twitter:player', html,
   1189                                       'twitter card player')
   1190 
   1191     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
   1192         json_ld_list = list(re.finditer(JSON_LD_RE, html))
   1193         default = kwargs.get('default', NO_DEFAULT)
   1194         # JSON-LD may be malformed and thus `fatal` should be respected.
   1195         # At the same time `default` may be passed that assumes `fatal=False`
   1196         # for _search_regex. Let's simulate the same behavior here as well.
   1197         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
   1198         json_ld = []
   1199         for mobj in json_ld_list:
   1200             json_ld_item = self._parse_json(
   1201                 mobj.group('json_ld'), video_id, fatal=fatal)
   1202             if not json_ld_item:
   1203                 continue
   1204             if isinstance(json_ld_item, dict):
   1205                 json_ld.append(json_ld_item)
   1206             elif isinstance(json_ld_item, (list, tuple)):
   1207                 json_ld.extend(json_ld_item)
   1208         if json_ld:
   1209             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
   1210         if json_ld:
   1211             return json_ld
   1212         if default is not NO_DEFAULT:
   1213             return default
   1214         elif fatal:
   1215             raise RegexNotFoundError('Unable to extract JSON-LD')
   1216         else:
   1217             self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
   1218             return {}
   1219 
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Map already-extracted JSON-LD data onto an info dict.

        json_ld may be a JSON string, a dict, or a list/tuple of dicts.
        Entries are only considered when they carry an '@context' key;
        when expected_type is given, entries of other '@type's are skipped.
        Returns a dict with only the non-None fields that could be mapped.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org InteractionCounter '@type' suffixes to the
        # corresponding info-dict count field prefix ('<kind>_count').
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or a nested object
            # whose '@type' carries the actual type name.
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill info['<kind>_count'] fields from InteractionCounter
            # entries; first occurrence of each kind wins.
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Type may be a full URL (e.g. http://schema.org/WatchAction);
                # only the last path component is looked up.
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Map a schema.org VideoObject onto the info dict fields.
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # Fall back to the episode name when no title was found
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Without an expected type keep scanning further entries;
                    # with one, the first match is authoritative.
                    if expected_type is None:
                        continue
                    else:
                        break
                # Other item types may embed a VideoObject under 'video'
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)
   1349 
   1350     @staticmethod
   1351     def _hidden_inputs(html):
   1352         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
   1353         hidden_inputs = {}
   1354         for input in re.findall(r'(?i)(<input[^>]+>)', html):
   1355             attrs = extract_attributes(input)
   1356             if not input:
   1357                 continue
   1358             if attrs.get('type') not in ('hidden', 'submit'):
   1359                 continue
   1360             name = attrs.get('name') or attrs.get('id')
   1361             value = attrs.get('value')
   1362             if name and value is not None:
   1363                 hidden_inputs[name] = value
   1364         return hidden_inputs
   1365 
   1366     def _form_hidden_inputs(self, form_id, html):
   1367         form = self._search_regex(
   1368             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
   1369             html, '%s form' % form_id, group='form')
   1370         return self._hidden_inputs(form)
   1371 
   1372     def _sort_formats(self, formats, field_preference=None):
   1373         if not formats:
   1374             raise ExtractorError('No video formats found')
   1375 
   1376         for f in formats:
   1377             # Automatically determine tbr when missing based on abr and vbr (improves
   1378             # formats sorting in some cases)
   1379             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
   1380                 f['tbr'] = f['abr'] + f['vbr']
   1381 
   1382         def _formats_key(f):
   1383             # TODO remove the following workaround
   1384             from ..utils import determine_ext
   1385             if not f.get('ext') and 'url' in f:
   1386                 f['ext'] = determine_ext(f['url'])
   1387 
   1388             if isinstance(field_preference, (list, tuple)):
   1389                 return tuple(
   1390                     f.get(field)
   1391                     if f.get(field) is not None
   1392                     else ('' if field == 'format_id' else -1)
   1393                     for field in field_preference)
   1394 
   1395             preference = f.get('preference')
   1396             if preference is None:
   1397                 preference = 0
   1398                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
   1399                     preference -= 0.5
   1400 
   1401             protocol = f.get('protocol') or determine_protocol(f)
   1402             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
   1403 
   1404             if f.get('vcodec') == 'none':  # audio only
   1405                 preference -= 50
   1406                 if self._downloader.params.get('prefer_free_formats'):
   1407                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
   1408                 else:
   1409                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
   1410                 ext_preference = 0
   1411                 try:
   1412                     audio_ext_preference = ORDER.index(f['ext'])
   1413                 except ValueError:
   1414                     audio_ext_preference = -1
   1415             else:
   1416                 if f.get('acodec') == 'none':  # video only
   1417                     preference -= 40
   1418                 if self._downloader.params.get('prefer_free_formats'):
   1419                     ORDER = ['flv', 'mp4', 'webm']
   1420                 else:
   1421                     ORDER = ['webm', 'flv', 'mp4']
   1422                 try:
   1423                     ext_preference = ORDER.index(f['ext'])
   1424                 except ValueError:
   1425                     ext_preference = -1
   1426                 audio_ext_preference = 0
   1427 
   1428             return (
   1429                 preference,
   1430                 f.get('language_preference') if f.get('language_preference') is not None else -1,
   1431                 f.get('quality') if f.get('quality') is not None else -1,
   1432                 f.get('tbr') if f.get('tbr') is not None else -1,
   1433                 f.get('filesize') if f.get('filesize') is not None else -1,
   1434                 f.get('vbr') if f.get('vbr') is not None else -1,
   1435                 f.get('height') if f.get('height') is not None else -1,
   1436                 f.get('width') if f.get('width') is not None else -1,
   1437                 proto_preference,
   1438                 ext_preference,
   1439                 f.get('abr') if f.get('abr') is not None else -1,
   1440                 audio_ext_preference,
   1441                 f.get('fps') if f.get('fps') is not None else -1,
   1442                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
   1443                 f.get('source_preference') if f.get('source_preference') is not None else -1,
   1444                 f.get('format_id') if f.get('format_id') is not None else '',
   1445             )
   1446         formats.sort(key=_formats_key)
   1447 
   1448     def _check_formats(self, formats, video_id):
   1449         if formats:
   1450             formats[:] = filter(
   1451                 lambda f: self._is_valid_url(
   1452                     f['url'], video_id,
   1453                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
   1454                 formats)
   1455 
   1456     @staticmethod
   1457     def _remove_duplicate_formats(formats):
   1458         format_urls = set()
   1459         unique_formats = []
   1460         for f in formats:
   1461             if f['url'] not in format_urls:
   1462                 format_urls.add(f['url'])
   1463                 unique_formats.append(f)
   1464         formats[:] = unique_formats
   1465 
   1466     def _is_valid_url(self, url, video_id, item='video', headers={}):
   1467         url = self._proto_relative_url(url, scheme='http:')
   1468         # For now assume non HTTP(S) URLs always valid
   1469         if not (url.startswith('http://') or url.startswith('https://')):
   1470             return True
   1471         try:
   1472             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
   1473             return True
   1474         except ExtractorError as e:
   1475             self.to_screen(
   1476                 '%s: %s URL is invalid, skipping: %s'
   1477                 % (video_id, item, error_to_compat_str(e.cause)))
   1478             return False
   1479 
   1480     def http_scheme(self):
   1481         """ Either "http:" or "https:", depending on the user's preferences """
   1482         return (
   1483             'http:'
   1484             if self._downloader.params.get('prefer_insecure', False)
   1485             else 'https:')
   1486 
   1487     def _proto_relative_url(self, url, scheme=None):
   1488         if url is None:
   1489             return url
   1490         if url.startswith('//'):
   1491             if scheme is None:
   1492                 scheme = self.http_scheme()
   1493             return scheme + url
   1494         else:
   1495             return url
   1496 
   1497     def _sleep(self, timeout, video_id, msg_template=None):
   1498         if msg_template is None:
   1499             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
   1500         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
   1501         self.to_screen(msg)
   1502         time.sleep(timeout)
   1503 
   1504     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
   1505                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
   1506                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
   1507         manifest = self._download_xml(
   1508             manifest_url, video_id, 'Downloading f4m manifest',
   1509             'Unable to download f4m manifest',
   1510             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
   1511             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
   1512             transform_source=transform_source,
   1513             fatal=fatal, data=data, headers=headers, query=query)
   1514 
   1515         if manifest is False:
   1516             return []
   1517 
   1518         return self._parse_f4m_formats(
   1519             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
   1520             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
   1521 
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m (HDS) manifest XML element into a list of formats.

        Set-level manifests may reference nested f4m/m3u8 manifests, which
        are fetched and parsed recursively. DRM-protected renditions are
        dropped. Returns [] for unusable or protected manifests.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media nodes exist
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio/* mimeType marks the whole manifest as audio-only
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # Prefer the bitrate as the id suffix; fall back to the index
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
   1622 
   1623     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
   1624         return {
   1625             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
   1626             'url': m3u8_url,
   1627             'ext': ext,
   1628             'protocol': 'm3u8',
   1629             'preference': preference - 100 if preference else -100,
   1630             'resolution': 'multiple',
   1631             'format_note': 'Quality selection URL',
   1632         }
   1633 
   1634     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
   1635                               entry_protocol='m3u8', preference=None,
   1636                               m3u8_id=None, note=None, errnote=None,
   1637                               fatal=True, live=False, data=None, headers={},
   1638                               query={}):
   1639         res = self._download_webpage_handle(
   1640             m3u8_url, video_id,
   1641             note=note or 'Downloading m3u8 information',
   1642             errnote=errnote or 'Failed to download m3u8 information',
   1643             fatal=fatal, data=data, headers=headers, query=query)
   1644 
   1645         if res is False:
   1646             return []
   1647 
   1648         m3u8_doc, urlh = res
   1649         m3u8_url = urlh.geturl()
   1650 
   1651         return self._parse_m3u8_formats(
   1652             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
   1653             preference=preference, m3u8_id=m3u8_id, live=live)
   1654 
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS playlist document into a list of formats.

        Media playlists are returned as a single format; master playlists
        yield one format per EXT-X-STREAM-INF variant plus audio/video
        renditions from EXT-X-MEDIA tags. DRM-protected playlists
        (Adobe Flash Access, Apple FairPlay) return [].
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Resolve possibly-relative playlist entries against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the EXT-X-STREAM-INF tag preceding the current URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Non-tag, non-blank line: the variant URI for the preceding
                # EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

                last_stream_inf = {}
        return formats
   1832 
   1833     @staticmethod
   1834     def _xpath_ns(path, namespace=None):
   1835         if not namespace:
   1836             return path
   1837         out = []
   1838         for c in path.split('/'):
   1839             if not c or c == '.':
   1840                 out.append(c)
   1841             else:
   1842                 out.append('{%s}%s' % (namespace, c))
   1843         return '/'.join(out)
   1844 
   1845     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
   1846         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
   1847 
   1848         if smil is False:
   1849             assert not fatal
   1850             return []
   1851 
   1852         namespace = self._parse_smil_namespace(smil)
   1853 
   1854         return self._parse_smil_formats(
   1855             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
   1856 
   1857     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
   1858         smil = self._download_smil(smil_url, video_id, fatal=fatal)
   1859         if smil is False:
   1860             return {}
   1861         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
   1862 
    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        """Fetch a SMIL manifest and return it parsed as an XML document.

        Delegates to _download_xml; with fatal=False a failed download is
        reported by returning False instead of raising.
        """
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
   1867 
   1868     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
   1869         namespace = self._parse_smil_namespace(smil)
   1870 
   1871         formats = self._parse_smil_formats(
   1872             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
   1873         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
   1874 
   1875         video_id = os.path.splitext(url_basename(smil_url))[0]
   1876         title = None
   1877         description = None
   1878         upload_date = None
   1879         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
   1880             name = meta.attrib.get('name')
   1881             content = meta.attrib.get('content')
   1882             if not name or not content:
   1883                 continue
   1884             if not title and name == 'title':
   1885                 title = content
   1886             elif not description and name in ('description', 'abstract'):
   1887                 description = content
   1888             elif not upload_date and name == 'date':
   1889                 upload_date = unified_strdate(content)
   1890 
   1891         thumbnails = [{
   1892             'id': image.get('type'),
   1893             'url': image.get('src'),
   1894             'width': int_or_none(image.get('width')),
   1895             'height': int_or_none(image.get('height')),
   1896         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
   1897 
   1898         return {
   1899             'id': video_id,
   1900             'title': title or video_id,
   1901             'description': description,
   1902             'upload_date': upload_date,
   1903             'thumbnails': thumbnails,
   1904             'formats': formats,
   1905             'subtitles': subtitles,
   1906         }
   1907 
    def _parse_smil_namespace(self, smil):
        """Return the XML namespace URI of the root <smil> element, or None.

        A namespaced root tag looks like '{namespace}smil'; the regex pulls
        out the part between the braces, case-insensitively.
        """
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
   1911 
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from the <video>/<audio> elements of a SMIL
        document.

        Per-medium attributes (bitrate, size, dimensions) are collected, and
        each source is dispatched on its protocol/extension: RTMP, HLS
        (m3u8), HDS (f4m), DASH (mpd), Smooth Streaming (ism) or plain HTTP.
        transform_rtmp_url, when given, may rewrite (streamer, play_path)
        for RTMP entries.
        """
        # <meta base=...> (or httpBase) in <head> overrides the manifest URL
        # as the base for relative media URLs; the first one found wins.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # Deduplicate media by their src attribute
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            # system-bitrate is in bit/s; scale down to kbit/s
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    # Prefer the bitrate as format_id suffix when known,
                    # otherwise fall back to a running counter
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    # Caller-supplied hook may rewrite the streamer/play path
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            # Resolve relative sources against the base established above
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry result is a media playlist, not a master
                # playlist, so the SMIL-level attributes apply to it directly
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default hdcore parameters expected by Adobe HDS servers
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
   2006 
   2007     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
   2008         urls = []
   2009         subtitles = {}
   2010         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
   2011             src = textstream.get('src')
   2012             if not src or src in urls:
   2013                 continue
   2014             urls.append(src)
   2015             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
   2016             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
   2017             subtitles.setdefault(lang, []).append({
   2018                 'url': src,
   2019                 'ext': ext,
   2020             })
   2021         return subtitles
   2022 
   2023     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
   2024         xspf = self._download_xml(
   2025             xspf_url, playlist_id, 'Downloading xpsf playlist',
   2026             'Unable to download xspf manifest', fatal=fatal)
   2027         if xspf is False:
   2028             return []
   2029         return self._parse_xspf(
   2030             xspf, playlist_id, xspf_url=xspf_url,
   2031             xspf_base_url=base_url(xspf_url))
   2032 
   2033     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
   2034         NS_MAP = {
   2035             'xspf': 'http://xspf.org/ns/0/',
   2036             's1': 'http://static.streamone.nl/player/ns/0',
   2037         }
   2038 
   2039         entries = []
   2040         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
   2041             title = xpath_text(
   2042                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
   2043             description = xpath_text(
   2044                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
   2045             thumbnail = xpath_text(
   2046                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
   2047             duration = float_or_none(
   2048                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
   2049 
   2050             formats = []
   2051             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
   2052                 format_url = urljoin(xspf_base_url, location.text)
   2053                 if not format_url:
   2054                     continue
   2055                 formats.append({
   2056                     'url': format_url,
   2057                     'manifest_url': xspf_url,
   2058                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
   2059                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
   2060                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
   2061                 })
   2062             self._sort_formats(formats)
   2063 
   2064             entries.append({
   2065                 'id': playlist_id,
   2066                 'title': title,
   2067                 'description': description,
   2068                 'thumbnail': thumbnail,
   2069                 'duration': duration,
   2070                 'formats': formats,
   2071             })
   2072         return entries
   2073 
   2074     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
   2075         res = self._download_xml_handle(
   2076             mpd_url, video_id,
   2077             note=note or 'Downloading MPD manifest',
   2078             errnote=errnote or 'Failed to download MPD manifest',
   2079             fatal=fatal, data=data, headers=headers, query=query)
   2080         if res is False:
   2081             return []
   2082         mpd_doc, urlh = res
   2083         if mpd_doc is None:
   2084             return []
   2085         mpd_base_url = base_url(urlh.geturl())
   2086 
   2087         return self._parse_mpd_formats(
   2088             mpd_doc, mpd_id, mpd_base_url, mpd_url)
   2089 
   2090     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
   2091         """
   2092         Parse formats from MPD manifest.
   2093         References:
   2094          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
   2095             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
   2096          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
   2097         """
   2098         if mpd_doc.get('type') == 'dynamic':
   2099             return []
   2100 
   2101         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
   2102 
   2103         def _add_ns(path):
   2104             return self._xpath_ns(path, namespace)
   2105 
   2106         def is_drm_protected(element):
   2107             return element.find(_add_ns('ContentProtection')) is not None
   2108 
   2109         def extract_multisegment_info(element, ms_parent_info):
   2110             ms_info = ms_parent_info.copy()
   2111 
   2112             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
   2113             # common attributes and elements.  We will only extract relevant
   2114             # for us.
   2115             def extract_common(source):
   2116                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
   2117                 if segment_timeline is not None:
   2118                     s_e = segment_timeline.findall(_add_ns('S'))
   2119                     if s_e:
   2120                         ms_info['total_number'] = 0
   2121                         ms_info['s'] = []
   2122                         for s in s_e:
   2123                             r = int(s.get('r', 0))
   2124                             ms_info['total_number'] += 1 + r
   2125                             ms_info['s'].append({
   2126                                 't': int(s.get('t', 0)),
   2127                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
   2128                                 'd': int(s.attrib['d']),
   2129                                 'r': r,
   2130                             })
   2131                 start_number = source.get('startNumber')
   2132                 if start_number:
   2133                     ms_info['start_number'] = int(start_number)
   2134                 timescale = source.get('timescale')
   2135                 if timescale:
   2136                     ms_info['timescale'] = int(timescale)
   2137                 segment_duration = source.get('duration')
   2138                 if segment_duration:
   2139                     ms_info['segment_duration'] = float(segment_duration)
   2140 
   2141             def extract_Initialization(source):
   2142                 initialization = source.find(_add_ns('Initialization'))
   2143                 if initialization is not None:
   2144                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
   2145 
   2146             segment_list = element.find(_add_ns('SegmentList'))
   2147             if segment_list is not None:
   2148                 extract_common(segment_list)
   2149                 extract_Initialization(segment_list)
   2150                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
   2151                 if segment_urls_e:
   2152                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
   2153             else:
   2154                 segment_template = element.find(_add_ns('SegmentTemplate'))
   2155                 if segment_template is not None:
   2156                     extract_common(segment_template)
   2157                     media = segment_template.get('media')
   2158                     if media:
   2159                         ms_info['media'] = media
   2160                     initialization = segment_template.get('initialization')
   2161                     if initialization:
   2162                         ms_info['initialization'] = initialization
   2163                     else:
   2164                         extract_Initialization(segment_template)
   2165             return ms_info
   2166 
   2167         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
   2168         formats = []
   2169         for period in mpd_doc.findall(_add_ns('Period')):
   2170             period_duration = parse_duration(period.get('duration')) or mpd_duration
   2171             period_ms_info = extract_multisegment_info(period, {
   2172                 'start_number': 1,
   2173                 'timescale': 1,
   2174             })
   2175             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
   2176                 if is_drm_protected(adaptation_set):
   2177                     continue
   2178                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
   2179                 for representation in adaptation_set.findall(_add_ns('Representation')):
   2180                     if is_drm_protected(representation):
   2181                         continue
   2182                     representation_attrib = adaptation_set.attrib.copy()
   2183                     representation_attrib.update(representation.attrib)
   2184                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
   2185                     mime_type = representation_attrib['mimeType']
   2186                     content_type = mime_type.split('/')[0]
   2187                     if content_type == 'text':
   2188                         # TODO implement WebVTT downloading
   2189                         pass
   2190                     elif content_type in ('video', 'audio'):
   2191                         base_url = ''
   2192                         for element in (representation, adaptation_set, period, mpd_doc):
   2193                             base_url_e = element.find(_add_ns('BaseURL'))
   2194                             if base_url_e is not None:
   2195                                 base_url = base_url_e.text + base_url
   2196                                 if re.match(r'^https?://', base_url):
   2197                                     break
   2198                         if mpd_base_url and not re.match(r'^https?://', base_url):
   2199                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
   2200                                 mpd_base_url += '/'
   2201                             base_url = mpd_base_url + base_url
   2202                         representation_id = representation_attrib.get('id')
   2203                         lang = representation_attrib.get('lang')
   2204                         url_el = representation.find(_add_ns('BaseURL'))
   2205                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
   2206                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
   2207                         f = {
   2208                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
   2209                             'manifest_url': mpd_url,
   2210                             'ext': mimetype2ext(mime_type),
   2211                             'width': int_or_none(representation_attrib.get('width')),
   2212                             'height': int_or_none(representation_attrib.get('height')),
   2213                             'tbr': float_or_none(bandwidth, 1000),
   2214                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
   2215                             'fps': int_or_none(representation_attrib.get('frameRate')),
   2216                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
   2217                             'format_note': 'DASH %s' % content_type,
   2218                             'filesize': filesize,
   2219                             'container': mimetype2ext(mime_type) + '_dash',
   2220                         }
   2221                         f.update(parse_codecs(representation_attrib.get('codecs')))
   2222                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
   2223 
   2224                         def prepare_template(template_name, identifiers):
   2225                             tmpl = representation_ms_info[template_name]
   2226                             # First of, % characters outside $...$ templates
   2227                             # must be escaped by doubling for proper processing
   2228                             # by % operator string formatting used further (see
   2229                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
   2230                             t = ''
   2231                             in_template = False
   2232                             for c in tmpl:
   2233                                 t += c
   2234                                 if c == '$':
   2235                                     in_template = not in_template
   2236                                 elif c == '%' and not in_template:
   2237                                     t += c
   2238                             # Next, $...$ templates are translated to their
   2239                             # %(...) counterparts to be used with % operator
   2240                             t = t.replace('$RepresentationID$', representation_id)
   2241                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
   2242                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
   2243                             t.replace('$$', '$')
   2244                             return t
   2245 
   2246                         # @initialization is a regular template like @media one
   2247                         # so it should be handled just the same way (see
   2248                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
   2249                         if 'initialization' in representation_ms_info:
   2250                             initialization_template = prepare_template(
   2251                                 'initialization',
   2252                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
   2253                                 # $Time$ shall not be included for @initialization thus
   2254                                 # only $Bandwidth$ remains
   2255                                 ('Bandwidth', ))
   2256                             representation_ms_info['initialization_url'] = initialization_template % {
   2257                                 'Bandwidth': bandwidth,
   2258                             }
   2259 
   2260                         def location_key(location):
   2261                             return 'url' if re.match(r'^https?://', location) else 'path'
   2262 
   2263                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
   2264 
   2265                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
   2266                             media_location_key = location_key(media_template)
   2267 
   2268                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
   2269                             # can't be used at the same time
   2270                             if '%(Number' in media_template and 's' not in representation_ms_info:
   2271                                 segment_duration = None
   2272                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
   2273                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
   2274                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
   2275                                 representation_ms_info['fragments'] = [{
   2276                                     media_location_key: media_template % {
   2277                                         'Number': segment_number,
   2278                                         'Bandwidth': bandwidth,
   2279                                     },
   2280                                     'duration': segment_duration,
   2281                                 } for segment_number in range(
   2282                                     representation_ms_info['start_number'],
   2283                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
   2284                             else:
   2285                                 # $Number*$ or $Time$ in media template with S list available
   2286                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
   2287                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
   2288                                 representation_ms_info['fragments'] = []
   2289                                 segment_time = 0
   2290                                 segment_d = None
   2291                                 segment_number = representation_ms_info['start_number']
   2292 
   2293                                 def add_segment_url():
   2294                                     segment_url = media_template % {
   2295                                         'Time': segment_time,
   2296                                         'Bandwidth': bandwidth,
   2297                                         'Number': segment_number,
   2298                                     }
   2299                                     representation_ms_info['fragments'].append({
   2300                                         media_location_key: segment_url,
   2301                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
   2302                                     })
   2303 
   2304                                 for num, s in enumerate(representation_ms_info['s']):
   2305                                     segment_time = s.get('t') or segment_time
   2306                                     segment_d = s['d']
   2307                                     add_segment_url()
   2308                                     segment_number += 1
   2309                                     for r in range(s.get('r', 0)):
   2310                                         segment_time += segment_d
   2311                                         add_segment_url()
   2312                                         segment_number += 1
   2313                                     segment_time += segment_d
   2314                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
   2315                             # No media template
   2316                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
   2317                             # or any YouTube dashsegments video
   2318                             fragments = []
   2319                             segment_index = 0
   2320                             timescale = representation_ms_info['timescale']
   2321                             for s in representation_ms_info['s']:
   2322                                 duration = float_or_none(s['d'], timescale)
   2323                                 for r in range(s.get('r', 0) + 1):
   2324                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
   2325                                     fragments.append({
   2326                                         location_key(segment_uri): segment_uri,
   2327                                         'duration': duration,
   2328                                     })
   2329                                     segment_index += 1
   2330                             representation_ms_info['fragments'] = fragments
   2331                         elif 'segment_urls' in representation_ms_info:
   2332                             # Segment URLs with no SegmentTimeline
   2333                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
   2334                             # https://github.com/ytdl-org/youtube-dl/pull/14844
   2335                             fragments = []
   2336                             segment_duration = float_or_none(
   2337                                 representation_ms_info['segment_duration'],
   2338                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
   2339                             for segment_url in representation_ms_info['segment_urls']:
   2340                                 fragment = {
   2341                                     location_key(segment_url): segment_url,
   2342                                 }
   2343                                 if segment_duration:
   2344                                     fragment['duration'] = segment_duration
   2345                                 fragments.append(fragment)
   2346                             representation_ms_info['fragments'] = fragments
   2347                         # If there is a fragments key available then we correctly recognized fragmented media.
   2348                         # Otherwise we will assume unfragmented media with direct access. Technically, such
   2349                         # assumption is not necessarily correct since we may simply have no support for
   2350                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
   2351                         if 'fragments' in representation_ms_info:
   2352                             f.update({
   2353                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
   2354                                 'url': mpd_url or base_url,
   2355                                 'fragment_base_url': base_url,
   2356                                 'fragments': [],
   2357                                 'protocol': 'http_dash_segments',
   2358                             })
   2359                             if 'initialization_url' in representation_ms_info:
   2360                                 initialization_url = representation_ms_info['initialization_url']
   2361                                 if not f.get('url'):
   2362                                     f['url'] = initialization_url
   2363                                 f['fragments'].append({location_key(initialization_url): initialization_url})
   2364                             f['fragments'].extend(representation_ms_info['fragments'])
   2365                         else:
   2366                             # Assuming direct URL to unfragmented media.
   2367                             f['url'] = base_url
   2368                         formats.append(f)
   2369                     else:
   2370                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
   2371         return formats
   2372 
   2373     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
   2374         res = self._download_xml_handle(
   2375             ism_url, video_id,
   2376             note=note or 'Downloading ISM manifest',
   2377             errnote=errnote or 'Failed to download ISM manifest',
   2378             fatal=fatal, data=data, headers=headers, query=query)
   2379         if res is False:
   2380             return []
   2381         ism_doc, urlh = res
   2382         if ism_doc is None:
   2383             return []
   2384 
   2385         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
   2386 
   2387     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
   2388         """
   2389         Parse formats from ISM manifest.
   2390         References:
   2391          1. [MS-SSTR]: Smooth Streaming Protocol,
   2392             https://msdn.microsoft.com/en-us/library/ff469518.aspx
   2393         """
   2394         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
   2395             return []
   2396 
   2397         duration = int(ism_doc.attrib['Duration'])
   2398         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
   2399 
   2400         formats = []
   2401         for stream in ism_doc.findall('StreamIndex'):
   2402             stream_type = stream.get('Type')
   2403             if stream_type not in ('video', 'audio'):
   2404                 continue
   2405             url_pattern = stream.attrib['Url']
   2406             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
   2407             stream_name = stream.get('Name')
   2408             for track in stream.findall('QualityLevel'):
   2409                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
   2410                 # TODO: add support for WVC1 and WMAP
   2411                 if fourcc not in ('H264', 'AVC1', 'AACL'):
   2412                     self.report_warning('%s is not a supported codec' % fourcc)
   2413                     continue
   2414                 tbr = int(track.attrib['Bitrate']) // 1000
   2415                 # [1] does not mention Width and Height attributes. However,
   2416                 # they're often present while MaxWidth and MaxHeight are
   2417                 # missing, so should be used as fallbacks
   2418                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
   2419                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
   2420                 sampling_rate = int_or_none(track.get('SamplingRate'))
   2421 
   2422                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
   2423                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
   2424 
   2425                 fragments = []
   2426                 fragment_ctx = {
   2427                     'time': 0,
   2428                 }
   2429                 stream_fragments = stream.findall('c')
   2430                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
   2431                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
   2432                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
   2433                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
   2434                     if not fragment_ctx['duration']:
   2435                         try:
   2436                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
   2437                         except IndexError:
   2438                             next_fragment_time = duration
   2439                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
   2440                     for _ in range(fragment_repeat):
   2441                         fragments.append({
   2442                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
   2443                             'duration': fragment_ctx['duration'] / stream_timescale,
   2444                         })
   2445                         fragment_ctx['time'] += fragment_ctx['duration']
   2446 
   2447                 format_id = []
   2448                 if ism_id:
   2449                     format_id.append(ism_id)
   2450                 if stream_name:
   2451                     format_id.append(stream_name)
   2452                 format_id.append(compat_str(tbr))
   2453 
   2454                 formats.append({
   2455                     'format_id': '-'.join(format_id),
   2456                     'url': ism_url,
   2457                     'manifest_url': ism_url,
   2458                     'ext': 'ismv' if stream_type == 'video' else 'isma',
   2459                     'width': width,
   2460                     'height': height,
   2461                     'tbr': tbr,
   2462                     'asr': sampling_rate,
   2463                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
   2464                     'acodec': 'none' if stream_type == 'video' else fourcc,
   2465                     'protocol': 'ism',
   2466                     'fragments': fragments,
   2467                     '_download_params': {
   2468                         'duration': duration,
   2469                         'timescale': stream_timescale,
   2470                         'width': width or 0,
   2471                         'height': height or 0,
   2472                         'fourcc': fourcc,
   2473                         'codec_private_data': track.get('CodecPrivateData'),
   2474                         'sampling_rate': sampling_rate,
   2475                         'channels': int_or_none(track.get('Channels', 2)),
   2476                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
   2477                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
   2478                     },
   2479                 })
   2480         return formats
   2481 
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage
        (including the amp-* and dl8-* variants).

        Returns a list of dicts, each carrying 'formats', 'subtitles' and a
        'thumbnail' key; entries with neither formats nor subtitles are
        dropped.
        """
        def absolute_url(item_url):
            # Resolve item_url against the page URL.
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive ext/codec fields from a MIME type string such as
            # 'video/mp4; codecs="avc1.42E01E"'.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Expand a single src attribute into formats: m3u8/mpd manifests
            # yield several formats, a plain URL yields exactly one.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first; they have no inner content to scan for
        # <source>/<track>, hence the '' placeholder.
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to a resolution embedded in a label,
                            # e.g. "1280x720" or "720p".
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # First label containing a bitrate wins.
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # Values from the actual format dict take precedence.
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Some servers refuse requests without a Referer header.
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
   2604 
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        """Extract HDS, HLS and (when possible) progressive HTTP formats from
        an Akamai manifest URL.

        hosts may map 'hds'/'hls'/'http' to alternate host names for the
        respective protocol.
        """
        # URLs carrying an hdnea= token are signed and must not be rewritten.
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []

        hdcore_sign = 'hdcore=3.7.0'
        # Derive the HDS manifest URL from the HLS one (/i/ -> /z/).
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        # The hdcore parameter must also be appended to every segment URL.
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        # Derive the HLS manifest URL (/z/ -> /i/).
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)

        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            # The comma-separated middle group of the .csmil URL lists one
            # quality token per video rendition.
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            # The HLS playlist may contain one extra (audio-only) rendition.
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    # Skip audio-only renditions; i tracks video renditions
                    # only so it stays aligned with the qualities list.
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        i += 1

        return formats
   2660 
   2661     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
   2662         query = compat_urlparse.urlparse(url).query
   2663         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
   2664         mobj = re.search(
   2665             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
   2666         url_base = mobj.group('url')
   2667         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
   2668         formats = []
   2669 
   2670         def manifest_url(manifest):
   2671             m_url = '%s/%s' % (http_base_url, manifest)
   2672             if query:
   2673                 m_url += '?%s' % query
   2674             return m_url
   2675 
   2676         if 'm3u8' not in skip_protocols:
   2677             formats.extend(self._extract_m3u8_formats(
   2678                 manifest_url('playlist.m3u8'), video_id, 'mp4',
   2679                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
   2680         if 'f4m' not in skip_protocols:
   2681             formats.extend(self._extract_f4m_formats(
   2682                 manifest_url('manifest.f4m'),
   2683                 video_id, f4m_id='hds', fatal=False))
   2684         if 'dash' not in skip_protocols:
   2685             formats.extend(self._extract_mpd_formats(
   2686                 manifest_url('manifest.mpd'),
   2687                 video_id, mpd_id='dash', fatal=False))
   2688         if re.search(r'(?:/smil:|\.smil)', url_base):
   2689             if 'smil' not in skip_protocols:
   2690                 rtmp_formats = self._extract_smil_formats(
   2691                     manifest_url('jwplayer.smil'),
   2692                     video_id, fatal=False)
   2693                 for rtmp_format in rtmp_formats:
   2694                     rtsp_format = rtmp_format.copy()
   2695                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
   2696                     del rtsp_format['play_path']
   2697                     del rtsp_format['ext']
   2698                     rtsp_format.update({
   2699                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
   2700                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
   2701                         'protocol': 'rtsp',
   2702                     })
   2703                     formats.extend([rtmp_format, rtsp_format])
   2704         else:
   2705             for protocol in ('rtmp', 'rtsp'):
   2706                 if protocol not in skip_protocols:
   2707                     formats.append({
   2708                         'url': '%s:%s' % (protocol, url_base),
   2709                         'format_id': protocol,
   2710                         'protocol': protocol,
   2711                     })
   2712         return formats
   2713 
   2714     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
   2715         mobj = re.search(
   2716             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
   2717             webpage)
   2718         if mobj:
   2719             try:
   2720                 jwplayer_data = self._parse_json(mobj.group('options'),
   2721                                                  video_id=video_id,
   2722                                                  transform_source=transform_source)
   2723             except ExtractorError:
   2724                 pass
   2725             else:
   2726                 if isinstance(jwplayer_data, dict):
   2727                     return jwplayer_data
   2728 
   2729     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
   2730         jwplayer_data = self._find_jwplayer_data(
   2731             webpage, video_id, transform_source=js_to_json)
   2732         return self._parse_jwplayer_data(
   2733             jwplayer_data, video_id, *args, **kwargs)
   2734 
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a parsed JWPlayer config dict into an info dict.

        Returns a single entry when the playlist holds one item, otherwise a
        playlist result. NOTE: jwplayer_data is normalized in place.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks keyed by label (default 'en').
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube source is delegated to the YouTube extractor.
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
   2802 
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build a formats list from a JWPlayer 'sources' array.

        Duplicate source URLs are skipped; manifest sources (HLS/DASH/SMIL)
        are expanded, audio sources are marked vcodec=none, and RTMP URLs are
        split into url + play_path.
        """
        urls = []  # source URLs seen so far, for de-duplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        # Split "rtmp://host/app/mp4:path" into base URL and
                        # play path (keeping the "mp4:" prefix on the latter).
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
   2866 
   2867     def _live_title(self, name):
   2868         """ Generate the title for a live video """
   2869         now = datetime.datetime.now()
   2870         now_str = now.strftime('%Y-%m-%d %H:%M')
   2871         return name + ' ' + now_str
   2872 
   2873     def _int(self, v, name, fatal=False, **kwargs):
   2874         res = int_or_none(v, **kwargs)
   2875         if 'get_attr' in kwargs:
   2876             print(getattr(v, kwargs['get_attr']))
   2877         if res is None:
   2878             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
   2879             if fatal:
   2880                 raise ExtractorError(msg)
   2881             else:
   2882                 self._downloader.report_warning(msg)
   2883         return res
   2884 
   2885     def _float(self, v, name, fatal=False, **kwargs):
   2886         res = float_or_none(v, **kwargs)
   2887         if res is None:
   2888             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
   2889             if fatal:
   2890                 raise ExtractorError(msg)
   2891             else:
   2892                 self._downloader.report_warning(msg)
   2893         return res
   2894 
   2895     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
   2896                     path='/', secure=False, discard=False, rest={}, **kwargs):
   2897         cookie = compat_cookiejar_Cookie(
   2898             0, name, value, port, port is not None, domain, True,
   2899             domain.startswith('.'), path, True, secure, expire_time,
   2900             discard, None, None, rest)
   2901         self._downloader.cookiejar.set_cookie(cookie)
   2902 
   2903     def _get_cookies(self, url):
   2904         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
   2905         req = sanitized_Request(url)
   2906         self._downloader.cookiejar.add_cookie_header(req)
   2907         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
   2908 
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            # On Python 3 header values are str decoded as latin-1; round-trip
            # through iso-8859-1 (lossless) to recover the raw bytes, then
            # decode them as UTF-8. On Python 2 the value is already bytes.
            if sys.version_info[0] >= 3:
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            # Non-greedy search: picks the leftmost (i.e. first-sent)
            # occurrence of the wanted cookie name with a Domain attribute.
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                # Re-set the first cookie, overriding the later duplicate
                # the cookiejar stored per RFC 6265.
                self._set_cookie(domain, cookie, value)
                break
   2935 
   2936     def get_testcases(self, include_onlymatching=False):
   2937         t = getattr(self, '_TEST', None)
   2938         if t:
   2939             assert not hasattr(self, '_TESTS'), \
   2940                 '%s has _TEST and _TESTS' % type(self).__name__
   2941             tests = [t]
   2942         else:
   2943             tests = getattr(self, '_TESTS', [])
   2944         for t in tests:
   2945             if not include_onlymatching and t.get('only_matching', False):
   2946                 continue
   2947             t['name'] = type(self).__name__[:-len('IE')]
   2948             yield t
   2949 
   2950     def is_suitable(self, age_limit):
   2951         """ Test whether the extractor is generally suitable for the given
   2952         age limit (i.e. pornographic sites are not, all others usually are) """
   2953 
   2954         any_restricted = False
   2955         for tc in self.get_testcases(include_onlymatching=False):
   2956             if tc.get('playlist', []):
   2957                 tc = tc['playlist'][0]
   2958             is_restricted = age_restricted(
   2959                 tc.get('info_dict', {}).get('age_limit'), age_limit)
   2960             if not is_restricted:
   2961                 return True
   2962             any_restricted = any_restricted or is_restricted
   2963         return not any_restricted
   2964 
   2965     def extract_subtitles(self, *args, **kwargs):
   2966         if (self._downloader.params.get('writesubtitles', False)
   2967                 or self._downloader.params.get('listsubtitles')):
   2968             return self._get_subtitles(*args, **kwargs)
   2969         return {}
   2970 
   2971     def _get_subtitles(self, *args, **kwargs):
   2972         raise NotImplementedError('This method must be implemented by subclasses')
   2973 
   2974     @staticmethod
   2975     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
   2976         """ Merge subtitle items for one language. Items with duplicated URLs
   2977         will be dropped. """
   2978         list1_urls = set([item['url'] for item in subtitle_list1])
   2979         ret = list(subtitle_list1)
   2980         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
   2981         return ret
   2982 
   2983     @classmethod
   2984     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
   2985         """ Merge two subtitle dictionaries, language by language. """
   2986         ret = dict(subtitle_dict1)
   2987         for lang in subtitle_dict2:
   2988             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
   2989         return ret
   2990 
   2991     def extract_automatic_captions(self, *args, **kwargs):
   2992         if (self._downloader.params.get('writeautomaticsub', False)
   2993                 or self._downloader.params.get('listsubtitles')):
   2994             return self._get_automatic_captions(*args, **kwargs)
   2995         return {}
   2996 
   2997     def _get_automatic_captions(self, *args, **kwargs):
   2998         raise NotImplementedError('This method must be implemented by subclasses')
   2999 
   3000     def mark_watched(self, *args, **kwargs):
   3001         if (self._downloader.params.get('mark_watched', False)
   3002                 and (self._get_login_info()[0] is not None
   3003                      or self._downloader.params.get('cookiefile') is not None)):
   3004             self._mark_watched(*args, **kwargs)
   3005 
   3006     def _mark_watched(self, *args, **kwargs):
   3007         raise NotImplementedError('This method must be implemented by subclasses')
   3008 
   3009     def geo_verification_headers(self):
   3010         headers = {}
   3011         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
   3012         if geo_verification_proxy:
   3013             headers['Ytdl-request-proxy'] = geo_verification_proxy
   3014         return headers
   3015 
   3016     def _generic_id(self, url):
   3017         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
   3018 
   3019     def _generic_title(self, url):
   3020         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
   3021 
   3022 
   3023 class SearchInfoExtractor(InfoExtractor):
   3024     """
   3025     Base class for paged search queries extractors.
   3026     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
   3027     Instances should define _SEARCH_KEY and _MAX_RESULTS.
   3028     """
   3029 
   3030     @classmethod
   3031     def _make_valid_url(cls):
   3032         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
   3033 
   3034     @classmethod
   3035     def suitable(cls, url):
   3036         return re.match(cls._make_valid_url(), url) is not None
   3037 
   3038     def _real_extract(self, query):
   3039         mobj = re.match(self._make_valid_url(), query)
   3040         if mobj is None:
   3041             raise ExtractorError('Invalid search query "%s"' % query)
   3042 
   3043         prefix = mobj.group('prefix')
   3044         query = mobj.group('query')
   3045         if prefix == '':
   3046             return self._get_n_results(query, 1)
   3047         elif prefix == 'all':
   3048             return self._get_n_results(query, self._MAX_RESULTS)
   3049         else:
   3050             n = int(prefix)
   3051             if n <= 0:
   3052                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
   3053             elif n > self._MAX_RESULTS:
   3054                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
   3055                 n = self._MAX_RESULTS
   3056             return self._get_n_results(query, n)
   3057 
   3058     def _get_n_results(self, query, n):
   3059         """Get a specified number of results for a query"""
   3060         raise NotImplementedError('This method must be implemented by subclasses')
   3061 
   3062     @property
   3063     def SEARCH_KEY(self):
   3064         return self._SEARCH_KEY