# common.py
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import base64 5 import datetime 6 import hashlib 7 import json 8 import netrc 9 import os 10 import random 11 import re 12 import socket 13 import ssl 14 import sys 15 import time 16 import math 17 18 from ..compat import ( 19 compat_cookiejar_Cookie, 20 compat_cookies_SimpleCookie, 21 compat_etree_Element, 22 compat_etree_fromstring, 23 compat_getpass, 24 compat_integer_types, 25 compat_http_client, 26 compat_os_name, 27 compat_str, 28 compat_urllib_error, 29 compat_urllib_parse_unquote, 30 compat_urllib_parse_urlencode, 31 compat_urllib_request, 32 compat_urlparse, 33 compat_xml_parse_error, 34 ) 35 from ..downloader.f4m import ( 36 get_base_url, 37 remove_encrypted_media, 38 ) 39 from ..utils import ( 40 NO_DEFAULT, 41 age_restricted, 42 base_url, 43 bug_reports_message, 44 clean_html, 45 compiled_regex_type, 46 determine_ext, 47 determine_protocol, 48 dict_get, 49 error_to_compat_str, 50 ExtractorError, 51 extract_attributes, 52 fix_xml_ampersands, 53 float_or_none, 54 GeoRestrictedError, 55 GeoUtils, 56 int_or_none, 57 js_to_json, 58 JSON_LD_RE, 59 mimetype2ext, 60 orderedSet, 61 parse_bitrate, 62 parse_codecs, 63 parse_duration, 64 parse_iso8601, 65 parse_m3u8_attributes, 66 parse_resolution, 67 RegexNotFoundError, 68 sanitized_Request, 69 sanitize_filename, 70 str_or_none, 71 str_to_int, 72 strip_or_none, 73 unescapeHTML, 74 unified_strdate, 75 unified_timestamp, 76 update_Request, 77 update_url_query, 78 urljoin, 79 url_basename, 80 url_or_none, 81 xpath_element, 82 xpath_text, 83 xpath_with_ns, 84 ) 85 86 87 class InfoExtractor(object): 88 """Information Extractor class. 89 90 Information extractors are the classes that, given a URL, extract 91 information about the video (or videos) the URL refers to. This 92 information includes the real video URL, the video title, author and 93 others. The information is stored in a dictionary which is then 94 passed to the YoutubeDL. 
The YoutubeDL processes this 95 information possibly downloading the video to the file system, among 96 other possible outcomes. 97 98 The type field determines the type of the result. 99 By far the most common value (and the default if _type is missing) is 100 "video", which indicates a single video. 101 102 For a video, the dictionaries must include the following fields: 103 104 id: Video identifier. 105 title: Video title, unescaped. 106 107 Additionally, it must contain either a formats entry or a url one: 108 109 formats: A list of dictionaries for each format available, ordered 110 from worst to best quality. 111 112 Potential fields: 113 * url The mandatory URL representing the media: 114 for plain file media - HTTP URL of this file, 115 for RTMP - RTMP URL, 116 for HLS - URL of the M3U8 media playlist, 117 for HDS - URL of the F4M manifest, 118 for DASH 119 - HTTP URL to plain file media (in case of 120 unfragmented media) 121 - URL of the MPD manifest or base URL 122 representing the media if MPD manifest 123 is parsed from a string (in case of 124 fragmented media) 125 for MSS - URL of the ISM manifest. 126 * manifest_url 127 The URL of the manifest file in case of 128 fragmented media: 129 for HLS - URL of the M3U8 master playlist, 130 for HDS - URL of the F4M manifest, 131 for DASH - URL of the MPD manifest, 132 for MSS - URL of the ISM manifest. 133 * ext Will be calculated from URL if missing 134 * format A human-readable description of the format 135 ("mp4 container with h264/opus"). 136 Calculated from the format_id, width, height. 137 and format_note fields if missing. 138 * format_id A short description of the format 139 ("mp4_h264_opus" or "19"). 140 Technically optional, but strongly recommended. 
141 * format_note Additional info about the format 142 ("3D" or "DASH video") 143 * width Width of the video, if known 144 * height Height of the video, if known 145 * resolution Textual description of width and height 146 * tbr Average bitrate of audio and video in KBit/s 147 * abr Average audio bitrate in KBit/s 148 * acodec Name of the audio codec in use 149 * asr Audio sampling rate in Hertz 150 * vbr Average video bitrate in KBit/s 151 * fps Frame rate 152 * vcodec Name of the video codec in use 153 * container Name of the container format 154 * filesize The number of bytes, if known in advance 155 * filesize_approx An estimate for the number of bytes 156 * player_url SWF Player URL (used for rtmpdump). 157 * protocol The protocol that will be used for the actual 158 download, lower-case. 159 "http", "https", "rtsp", "rtmp", "rtmpe", 160 "m3u8", "m3u8_native" or "http_dash_segments". 161 * fragment_base_url 162 Base URL for fragments. Each fragment's path 163 value (if present) will be relative to 164 this URL. 165 * fragments A list of fragments of a fragmented media. 166 Each fragment entry must contain either an url 167 or a path. If an url is present it should be 168 considered by a client. Otherwise both path and 169 fragment_base_url must be present. Here is 170 the list of all potential fields: 171 * "url" - fragment's URL 172 * "path" - fragment's path relative to 173 fragment_base_url 174 * "duration" (optional, int or float) 175 * "filesize" (optional, int) 176 * preference Order number of this format. If this field is 177 present and not None, the formats get sorted 178 by this field, regardless of all other values. 179 -1 for default (order by other properties), 180 -2 or smaller for less than default. 181 < -1000 to hide the format (if there is 182 another one which is strictly better) 183 * language Language code, e.g. "de" or "en-US". 184 * language_preference Is this in the language mentioned in 185 the URL? 
186 10 if it's what the URL is about, 187 -1 for default (don't know), 188 -10 otherwise, other values reserved for now. 189 * quality Order number of the video quality of this 190 format, irrespective of the file format. 191 -1 for default (order by other properties), 192 -2 or smaller for less than default. 193 * source_preference Order number for this video source 194 (quality takes higher priority) 195 -1 for default (order by other properties), 196 -2 or smaller for less than default. 197 * http_headers A dictionary of additional HTTP headers 198 to add to the request. 199 * stretched_ratio If given and not 1, indicates that the 200 video's pixels are not square. 201 width : height ratio as float. 202 * no_resume The server does not support resuming the 203 (HTTP or RTMP) download. Boolean. 204 * downloader_options A dictionary of downloader options as 205 described in FileDownloader 206 207 url: Final video URL. 208 ext: Video filename extension. 209 format: The video format, defaults to ext (used for --get-format) 210 player_url: SWF Player URL (used for rtmpdump). 211 212 The following fields are optional: 213 214 alt_title: A secondary title of the video. 215 display_id An alternative identifier for the video, not necessarily 216 unique, but available before title. Typically, id is 217 something like "4234987", title "Dancing naked mole rats", 218 and display_id "dancing-naked-mole-rats" 219 thumbnails: A list of dictionaries, with the following entries: 220 * "id" (optional, string) - Thumbnail format ID 221 * "url" 222 * "preference" (optional, int) - quality of the image 223 * "width" (optional, int) 224 * "height" (optional, int) 225 * "resolution" (optional, string "{width}x{height}", 226 deprecated) 227 * "filesize" (optional, int) 228 thumbnail: Full URL to a video thumbnail image. 229 description: Full video description. 230 uploader: Full name of the video uploader. 231 license: License name the video is licensed under. 
232 creator: The creator of the video. 233 release_timestamp: UNIX timestamp of the moment the video was released. 234 release_date: The date (YYYYMMDD) when the video was released. 235 timestamp: UNIX timestamp of the moment the video became available 236 (uploaded). 237 upload_date: Video upload date (YYYYMMDD). 238 If not explicitly set, calculated from timestamp. 239 uploader_id: Nickname or id of the video uploader. 240 uploader_url: Full URL to a personal webpage of the video uploader. 241 channel: Full name of the channel the video is uploaded on. 242 Note that channel fields may or may not repeat uploader 243 fields. This depends on a particular extractor. 244 channel_id: Id of the channel. 245 channel_url: Full URL to a channel webpage. 246 location: Physical location where the video was filmed. 247 subtitles: The available subtitles as a dictionary in the format 248 {tag: subformats}. "tag" is usually a language code, and 249 "subformats" is a list sorted from lower to higher 250 preference, each element is a dictionary with the "ext" 251 entry and one of: 252 * "data": The subtitles file contents 253 * "url": A URL pointing to the subtitles file 254 "ext" will be calculated from URL if missing 255 automatic_captions: Like 'subtitles', used by the YoutubeIE for 256 automatically generated captions 257 duration: Length of the video in seconds, as an integer or float. 258 view_count: How many users have watched the video on the platform. 
259 like_count: Number of positive ratings of the video 260 dislike_count: Number of negative ratings of the video 261 repost_count: Number of reposts of the video 262 average_rating: Average rating give by users, the scale used depends on the webpage 263 comment_count: Number of comments on the video 264 comments: A list of comments, each with one or more of the following 265 properties (all but one of text or html optional): 266 * "author" - human-readable name of the comment author 267 * "author_id" - user ID of the comment author 268 * "id" - Comment ID 269 * "html" - Comment as HTML 270 * "text" - Plain text of the comment 271 * "timestamp" - UNIX timestamp of comment 272 * "parent" - ID of the comment this one is replying to. 273 Set to "root" to indicate that this is a 274 comment to the original video. 275 age_limit: Age restriction for the video, as an integer (years) 276 webpage_url: The URL to the video webpage, if given to youtube-dl it 277 should allow to get the same result again. (It will be set 278 by YoutubeDL if it's missing) 279 categories: A list of categories that the video falls in, for example 280 ["Sports", "Berlin"] 281 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] 282 is_live: True, False, or None (=unknown). Whether this video is a 283 live stream that goes on instead of a fixed-length video. 284 start_time: Time in seconds where the reproduction should start, as 285 specified in the URL. 286 end_time: Time in seconds where the reproduction should end, as 287 specified in the URL. 288 chapters: A list of dictionaries, with the following entries: 289 * "start_time" - The start time of the chapter in seconds 290 * "end_time" - The end time of the chapter in seconds 291 * "title" (optional, string) 292 293 The following fields should only be used when the video belongs to some logical 294 chapter or section: 295 296 chapter: Name or title of the chapter the video belongs to. 
297 chapter_number: Number of the chapter the video belongs to, as an integer. 298 chapter_id: Id of the chapter the video belongs to, as a unicode string. 299 300 The following fields should only be used when the video is an episode of some 301 series, programme or podcast: 302 303 series: Title of the series or programme the video episode belongs to. 304 season: Title of the season the video episode belongs to. 305 season_number: Number of the season the video episode belongs to, as an integer. 306 season_id: Id of the season the video episode belongs to, as a unicode string. 307 episode: Title of the video episode. Unlike mandatory video title field, 308 this field should denote the exact title of the video episode 309 without any kind of decoration. 310 episode_number: Number of the video episode within a season, as an integer. 311 episode_id: Id of the video episode, as a unicode string. 312 313 The following fields should only be used when the media is a track or a part of 314 a music album: 315 316 track: Title of the track. 317 track_number: Number of the track within an album or a disc, as an integer. 318 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), 319 as a unicode string. 320 artist: Artist(s) of the track. 321 genre: Genre(s) of the track. 322 album: Title of the album the track belongs to. 323 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). 324 album_artist: List of all artists appeared on the album (e.g. 325 "Ash Borer / Fell Voices" or "Various Artists", useful for splits 326 and compilations). 327 disc_number: Number of the disc or other physical medium the track belongs to, 328 as an integer. 329 release_year: Year (YYYY) when the album was released. 330 331 Unless mentioned otherwise, the fields should be Unicode strings. 332 333 Unless mentioned otherwise, None is equivalent to absence of information. 334 335 336 _type "playlist" indicates multiple videos. 
337 There must be a key "entries", which is a list, an iterable, or a PagedList 338 object, each element of which is a valid dictionary by this specification. 339 340 Additionally, playlists can have "id", "title", "description", "uploader", 341 "uploader_id", "uploader_url", "duration" attributes with the same semantics 342 as videos (see above). 343 344 345 _type "multi_video" indicates that there are multiple videos that 346 form a single show, for examples multiple acts of an opera or TV episode. 347 It must have an entries key like a playlist and contain all the keys 348 required for a video at the same time. 349 350 351 _type "url" indicates that the video must be extracted from another 352 location, possibly by a different extractor. Its only required key is: 353 "url" - the next URL to extract. 354 The key "ie_key" can be set to the class name (minus the trailing "IE", 355 e.g. "Youtube") if the extractor class is known in advance. 356 Additionally, the dictionary may have any properties of the resolved entity 357 known in advance, for example "title" if the title of the referred video is 358 known ahead of time. 359 360 361 _type "url_transparent" entities have the same specification as "url", but 362 indicate that the given additional information is more precise than the one 363 associated with the resolved URL. 364 This is useful when a site employs a video service that hosts the video and 365 its technical metadata, but that video service does not embed a useful 366 title, description etc. 367 368 369 Subclasses of this one should re-define the _real_initialize() and 370 _real_extract() methods and define a _VALID_URL regexp. 371 Probably, they should also be added to the list of extractors. 372 373 _GEO_BYPASS attribute may be set to False in order to disable 374 geo restriction bypass mechanisms for a particular extractor. 375 Though it won't disable explicit geo restriction bypass based on 376 country code provided with geo_bypass_country. 
377 378 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted 379 countries for this extractor. One of these countries will be used by 380 geo restriction bypass mechanism right away in order to bypass 381 geo restriction, of course, if the mechanism is not disabled. 382 383 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted 384 IP blocks in CIDR notation for this extractor. One of these IP blocks 385 will be used by geo restriction bypass mechanism similarly 386 to _GEO_COUNTRIES. 387 388 Finally, the _WORKING attribute should be set to False for broken IEs 389 in order to warn the users and skip the tests. 390 """ 391 392 _ready = False 393 _downloader = None 394 _x_forwarded_for_ip = None 395 _GEO_BYPASS = True 396 _GEO_COUNTRIES = None 397 _GEO_IP_BLOCKS = None 398 _WORKING = True 399 400 def __init__(self, downloader=None): 401 """Constructor. Receives an optional downloader.""" 402 self._ready = False 403 self._x_forwarded_for_ip = None 404 self.set_downloader(downloader) 405 406 @classmethod 407 def suitable(cls, url): 408 """Receives a URL and returns True if suitable for this IE.""" 409 410 # This does not use has/getattr intentionally - we want to know whether 411 # we have cached the regexp for *this* class, whereas getattr would also 412 # match the superclass 413 if '_VALID_URL_RE' not in cls.__dict__: 414 cls._VALID_URL_RE = re.compile(cls._VALID_URL) 415 return cls._VALID_URL_RE.match(url) is not None 416 417 @classmethod 418 def _match_id(cls, url): 419 if '_VALID_URL_RE' not in cls.__dict__: 420 cls._VALID_URL_RE = re.compile(cls._VALID_URL) 421 m = cls._VALID_URL_RE.match(url) 422 assert m 423 return compat_str(m.group('id')) 424 425 @classmethod 426 def working(cls): 427 """Getter method for _WORKING.""" 428 return cls._WORKING 429 430 def initialize(self): 431 """Initializes an instance (authentication, etc).""" 432 self._initialize_geo_bypass({ 433 'countries': self._GEO_COUNTRIES, 434 'ip_blocks': 
self._GEO_IP_BLOCKS, 435 }) 436 if not self._ready: 437 self._real_initialize() 438 self._ready = True 439 440 def _initialize_geo_bypass(self, geo_bypass_context): 441 """ 442 Initialize geo restriction bypass mechanism. 443 444 This method is used to initialize geo bypass mechanism based on faking 445 X-Forwarded-For HTTP header. A random country from provided country list 446 is selected and a random IP belonging to this country is generated. This 447 IP will be passed as X-Forwarded-For HTTP header in all subsequent 448 HTTP requests. 449 450 This method will be used for initial geo bypass mechanism initialization 451 during the instance initialization with _GEO_COUNTRIES and 452 _GEO_IP_BLOCKS. 453 454 You may also manually call it from extractor's code if geo bypass 455 information is not available beforehand (e.g. obtained during 456 extraction) or due to some other reason. In this case you should pass 457 this information in geo bypass context passed as first argument. It may 458 contain following fields: 459 460 countries: List of geo unrestricted countries (similar 461 to _GEO_COUNTRIES) 462 ip_blocks: List of geo unrestricted IP blocks in CIDR notation 463 (similar to _GEO_IP_BLOCKS) 464 465 """ 466 if not self._x_forwarded_for_ip: 467 468 # Geo bypass mechanism is explicitly disabled by user 469 if not self._downloader.params.get('geo_bypass', True): 470 return 471 472 if not geo_bypass_context: 473 geo_bypass_context = {} 474 475 # Backward compatibility: previously _initialize_geo_bypass 476 # expected a list of countries, some 3rd party code may still use 477 # it this way 478 if isinstance(geo_bypass_context, (list, tuple)): 479 geo_bypass_context = { 480 'countries': geo_bypass_context, 481 } 482 483 # The whole point of geo bypass mechanism is to fake IP 484 # as X-Forwarded-For HTTP header based on some IP block or 485 # country code. 
486 487 # Path 1: bypassing based on IP block in CIDR notation 488 489 # Explicit IP block specified by user, use it right away 490 # regardless of whether extractor is geo bypassable or not 491 ip_block = self._downloader.params.get('geo_bypass_ip_block', None) 492 493 # Otherwise use random IP block from geo bypass context but only 494 # if extractor is known as geo bypassable 495 if not ip_block: 496 ip_blocks = geo_bypass_context.get('ip_blocks') 497 if self._GEO_BYPASS and ip_blocks: 498 ip_block = random.choice(ip_blocks) 499 500 if ip_block: 501 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) 502 if self._downloader.params.get('verbose', False): 503 self._downloader.to_screen( 504 '[debug] Using fake IP %s as X-Forwarded-For.' 505 % self._x_forwarded_for_ip) 506 return 507 508 # Path 2: bypassing based on country code 509 510 # Explicit country code specified by user, use it right away 511 # regardless of whether extractor is geo bypassable or not 512 country = self._downloader.params.get('geo_bypass_country', None) 513 514 # Otherwise use random country code from geo bypass context but 515 # only if extractor is known as geo bypassable 516 if not country: 517 countries = geo_bypass_context.get('countries') 518 if self._GEO_BYPASS and countries: 519 country = random.choice(countries) 520 521 if country: 522 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) 523 if self._downloader.params.get('verbose', False): 524 self._downloader.to_screen( 525 '[debug] Using fake IP %s (%s) as X-Forwarded-For.' 
526 % (self._x_forwarded_for_ip, country.upper())) 527 528 def extract(self, url): 529 """Extracts URL information and returns it in list of dicts.""" 530 try: 531 for _ in range(2): 532 try: 533 self.initialize() 534 ie_result = self._real_extract(url) 535 if self._x_forwarded_for_ip: 536 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip 537 return ie_result 538 except GeoRestrictedError as e: 539 if self.__maybe_fake_ip_and_retry(e.countries): 540 continue 541 raise 542 except ExtractorError: 543 raise 544 except compat_http_client.IncompleteRead as e: 545 raise ExtractorError('A network error has occurred.', cause=e, expected=True) 546 except (KeyError, StopIteration) as e: 547 raise ExtractorError('An extractor error has occurred.', cause=e) 548 549 def __maybe_fake_ip_and_retry(self, countries): 550 if (not self._downloader.params.get('geo_bypass_country', None) 551 and self._GEO_BYPASS 552 and self._downloader.params.get('geo_bypass', True) 553 and not self._x_forwarded_for_ip 554 and countries): 555 country_code = random.choice(countries) 556 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) 557 if self._x_forwarded_for_ip: 558 self.report_warning( 559 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' 560 % (self._x_forwarded_for_ip, country_code.upper())) 561 return True 562 return False 563 564 def set_downloader(self, downloader): 565 """Sets the downloader for this IE.""" 566 self._downloader = downloader 567 568 def _real_initialize(self): 569 """Real initialization process. Redefine in subclasses.""" 570 pass 571 572 def _real_extract(self, url): 573 """Real extraction process. 
Redefine in subclasses.""" 574 pass 575 576 @classmethod 577 def ie_key(cls): 578 """A string for getting the InfoExtractor with get_info_extractor""" 579 return compat_str(cls.__name__[:-2]) 580 581 @property 582 def IE_NAME(self): 583 return compat_str(type(self).__name__[:-2]) 584 585 @staticmethod 586 def __can_accept_status_code(err, expected_status): 587 assert isinstance(err, compat_urllib_error.HTTPError) 588 if expected_status is None: 589 return False 590 if isinstance(expected_status, compat_integer_types): 591 return err.code == expected_status 592 elif isinstance(expected_status, (list, tuple)): 593 return err.code in expected_status 594 elif callable(expected_status): 595 return expected_status(err.code) is True 596 else: 597 assert False 598 599 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): 600 """ 601 Return the response handle. 602 603 See _download_webpage docstring for arguments specification. 604 """ 605 if note is None: 606 self.report_download_webpage(video_id) 607 elif note is not False: 608 if video_id is None: 609 self.to_screen('%s' % (note,)) 610 else: 611 self.to_screen('%s: %s' % (video_id, note)) 612 613 # Some sites check X-Forwarded-For HTTP header in order to figure out 614 # the origin of the client behind proxy. This allows bypassing geo 615 # restriction by faking this header's value to IP that belongs to some 616 # geo unrestricted country. We will do so once we encounter any 617 # geo restriction error. 
618 if self._x_forwarded_for_ip: 619 if 'X-Forwarded-For' not in headers: 620 headers['X-Forwarded-For'] = self._x_forwarded_for_ip 621 622 if isinstance(url_or_request, compat_urllib_request.Request): 623 url_or_request = update_Request( 624 url_or_request, data=data, headers=headers, query=query) 625 else: 626 if query: 627 url_or_request = update_url_query(url_or_request, query) 628 if data is not None or headers: 629 url_or_request = sanitized_Request(url_or_request, data, headers) 630 exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error] 631 if hasattr(ssl, 'CertificateError'): 632 exceptions.append(ssl.CertificateError) 633 try: 634 return self._downloader.urlopen(url_or_request) 635 except tuple(exceptions) as err: 636 if isinstance(err, compat_urllib_error.HTTPError): 637 if self.__can_accept_status_code(err, expected_status): 638 # Retain reference to error to prevent file object from 639 # being closed before it can be read. Works around the 640 # effects of <https://bugs.python.org/issue15002> 641 # introduced in Python 3.4.1. 642 err.fp._error = err 643 return err.fp 644 645 if errnote is False: 646 return False 647 if errnote is None: 648 errnote = 'Unable to download webpage' 649 650 errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) 651 if fatal: 652 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) 653 else: 654 self._downloader.report_warning(errmsg) 655 return False 656 657 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): 658 """ 659 Return a tuple (page content as string, URL handle). 660 661 See _download_webpage docstring for arguments specification. 
662 """ 663 # Strip hashes from the URL (#1038) 664 if isinstance(url_or_request, (compat_str, str)): 665 url_or_request = url_or_request.partition('#')[0] 666 667 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) 668 if urlh is False: 669 assert not fatal 670 return False 671 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) 672 return (content, urlh) 673 674 @staticmethod 675 def _guess_encoding_from_content(content_type, webpage_bytes): 676 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) 677 if m: 678 encoding = m.group(1) 679 else: 680 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', 681 webpage_bytes[:1024]) 682 if m: 683 encoding = m.group(1).decode('ascii') 684 elif webpage_bytes.startswith(b'\xff\xfe'): 685 encoding = 'utf-16' 686 else: 687 encoding = 'utf-8' 688 689 return encoding 690 691 def __check_blocked(self, content): 692 first_block = content[:512] 693 if ('<title>Access to this site is blocked</title>' in content 694 and 'Websense' in first_block): 695 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' 696 blocked_iframe = self._html_search_regex( 697 r'<iframe src="([^"]+)"', content, 698 'Websense information URL', default=None) 699 if blocked_iframe: 700 msg += ' Visit %s for more details' % blocked_iframe 701 raise ExtractorError(msg, expected=True) 702 if '<title>The URL you requested has been blocked</title>' in first_block: 703 msg = ( 704 'Access to this webpage has been blocked by Indian censorship. 
' 705 'Use a VPN or proxy server (with --proxy) to route around it.') 706 block_msg = self._html_search_regex( 707 r'</h1><p>(.*?)</p>', 708 content, 'block message', default=None) 709 if block_msg: 710 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') 711 raise ExtractorError(msg, expected=True) 712 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content 713 and 'blocklist.rkn.gov.ru' in content): 714 raise ExtractorError( 715 'Access to this webpage has been blocked by decision of the Russian government. ' 716 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', 717 expected=True) 718 719 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): 720 content_type = urlh.headers.get('Content-Type', '') 721 webpage_bytes = urlh.read() 722 if prefix is not None: 723 webpage_bytes = prefix + webpage_bytes 724 if not encoding: 725 encoding = self._guess_encoding_from_content(content_type, webpage_bytes) 726 if self._downloader.params.get('dump_intermediate_pages', False): 727 self.to_screen('Dumping request to ' + urlh.geturl()) 728 dump = base64.b64encode(webpage_bytes).decode('ascii') 729 self._downloader.to_screen(dump) 730 if self._downloader.params.get('write_pages', False): 731 basen = '%s_%s' % (video_id, urlh.geturl()) 732 if len(basen) > 240: 733 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() 734 basen = basen[:240 - len(h)] + h 735 raw_filename = basen + '.dump' 736 filename = sanitize_filename(raw_filename, restricted=True) 737 self.to_screen('Saving request to ' + filename) 738 # Working around MAX_PATH limitation on Windows (see 739 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) 740 if compat_os_name == 'nt': 741 absfilepath = os.path.abspath(filename) 742 if len(absfilepath) > 259: 743 filename = '\\\\?\\' + absfilepath 744 with open(filename, 'wb') as outf: 745 outf.write(webpage_bytes) 746 747 try: 748 content = 
webpage_bytes.decode(encoding, 'replace') 749 except LookupError: 750 content = webpage_bytes.decode('utf-8', 'replace') 751 752 self.__check_blocked(content) 753 754 return content 755 756 def _download_webpage( 757 self, url_or_request, video_id, note=None, errnote=None, 758 fatal=True, tries=1, timeout=5, encoding=None, data=None, 759 headers={}, query={}, expected_status=None): 760 """ 761 Return the data of the page as a string. 762 763 Arguments: 764 url_or_request -- plain text URL as a string or 765 a compat_urllib_request.Requestobject 766 video_id -- Video/playlist/item identifier (string) 767 768 Keyword arguments: 769 note -- note printed before downloading (string) 770 errnote -- note printed in case of an error (string) 771 fatal -- flag denoting whether error should be considered fatal, 772 i.e. whether it should cause ExtractionError to be raised, 773 otherwise a warning will be reported and extraction continued 774 tries -- number of tries 775 timeout -- sleep interval between tries 776 encoding -- encoding for a page content decoding, guessed automatically 777 when not explicitly specified 778 data -- POST data (bytes) 779 headers -- HTTP headers (dict) 780 query -- URL query (dict) 781 expected_status -- allows to accept failed HTTP requests (non 2xx 782 status code) by explicitly specifying a set of accepted status 783 codes. Can be any of the following entities: 784 - an integer type specifying an exact failed status code to 785 accept 786 - a list or a tuple of integer types specifying a list of 787 failed status codes to accept 788 - a callable accepting an actual failed status code and 789 returning True if it should be accepted 790 Note that this argument does not affect success status codes (2xx) 791 which are always accepted. 
792 """ 793 794 success = False 795 try_count = 0 796 while success is False: 797 try: 798 res = self._download_webpage_handle( 799 url_or_request, video_id, note, errnote, fatal, 800 encoding=encoding, data=data, headers=headers, query=query, 801 expected_status=expected_status) 802 success = True 803 except compat_http_client.IncompleteRead as e: 804 try_count += 1 805 if try_count >= tries: 806 raise e 807 self._sleep(timeout, video_id) 808 if res is False: 809 return res 810 else: 811 content, _ = res 812 return content 813 814 def _download_xml_handle( 815 self, url_or_request, video_id, note='Downloading XML', 816 errnote='Unable to download XML', transform_source=None, 817 fatal=True, encoding=None, data=None, headers={}, query={}, 818 expected_status=None): 819 """ 820 Return a tuple (xml as an compat_etree_Element, URL handle). 821 822 See _download_webpage docstring for arguments specification. 823 """ 824 res = self._download_webpage_handle( 825 url_or_request, video_id, note, errnote, fatal=fatal, 826 encoding=encoding, data=data, headers=headers, query=query, 827 expected_status=expected_status) 828 if res is False: 829 return res 830 xml_string, urlh = res 831 return self._parse_xml( 832 xml_string, video_id, transform_source=transform_source, 833 fatal=fatal), urlh 834 835 def _download_xml( 836 self, url_or_request, video_id, 837 note='Downloading XML', errnote='Unable to download XML', 838 transform_source=None, fatal=True, encoding=None, 839 data=None, headers={}, query={}, expected_status=None): 840 """ 841 Return the xml as an compat_etree_Element. 842 843 See _download_webpage docstring for arguments specification. 
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        # _download_xml_handle returns False on non-fatal failure,
        # otherwise a (xml, urlh) tuple - keep only the parsed XML
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        # Parse an XML string into a compat_etree_Element; on parse errors
        # either raise ExtractorError (fatal) or warn and return None
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        # Parse a JSON string; on errors either raise ExtractorError (fatal)
        # or warn and return None
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        # Forward a warning to the downloader, tagged with the extractor
        # name and, when given, the video id
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        # Raise a user-facing, expected error asking for credentials
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.'
% msg, 943 expected=True) 944 945 @staticmethod 946 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None): 947 raise GeoRestrictedError(msg, countries=countries) 948 949 # Methods for following #608 950 @staticmethod 951 def url_result(url, ie=None, video_id=None, video_title=None): 952 """Returns a URL that points to a page that should be processed""" 953 # TODO: ie should be the class used for getting the info 954 video_info = {'_type': 'url', 955 'url': url, 956 'ie_key': ie} 957 if video_id is not None: 958 video_info['id'] = video_id 959 if video_title is not None: 960 video_info['title'] = video_title 961 return video_info 962 963 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): 964 urls = orderedSet( 965 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) 966 for m in matches) 967 return self.playlist_result( 968 urls, playlist_id=playlist_id, playlist_title=playlist_title) 969 970 @staticmethod 971 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): 972 """Returns a playlist""" 973 video_info = {'_type': 'playlist', 974 'entries': entries} 975 if playlist_id: 976 video_info['id'] = playlist_id 977 if playlist_title: 978 video_info['title'] = playlist_title 979 if playlist_description: 980 video_info['description'] = playlist_description 981 return video_info 982 983 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 984 """ 985 Perform a regex search on the given string, using a single or a list of 986 patterns returning the first matching group. 987 In case of failure return a default value or raise a WARNING or a 988 RegexNotFoundError, depending on fatal, specifying the field name. 
989 """ 990 if isinstance(pattern, (str, compat_str, compiled_regex_type)): 991 mobj = re.search(pattern, string, flags) 992 else: 993 for p in pattern: 994 mobj = re.search(p, string, flags) 995 if mobj: 996 break 997 998 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): 999 _name = '\033[0;34m%s\033[0m' % name 1000 else: 1001 _name = name 1002 1003 if mobj: 1004 if group is None: 1005 # return the first matching group 1006 return next(g for g in mobj.groups() if g is not None) 1007 else: 1008 return mobj.group(group) 1009 elif default is not NO_DEFAULT: 1010 return default 1011 elif fatal: 1012 raise RegexNotFoundError('Unable to extract %s' % _name) 1013 else: 1014 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) 1015 return None 1016 1017 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 1018 """ 1019 Like _search_regex, but strips HTML tags and unescapes entities. 
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            # Strip tags and unescape HTML entities from the raw match
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        # Look up (username, password) for netrc_machine (defaults to
        # self._NETRC_MACHINE) in the user's .netrc when --netrc is enabled;
        # returns (None, None) when nothing is found
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                # Best effort: a broken or missing .netrc only warns
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
        value.
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        # Fall back to prompting the user interactively
        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # Build regexes matching <meta property="og:prop" content="..."> with
        # either attribute order and any quoting style
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        # Regex for a generic <meta ... content="..."> tag keyed by prop via
        # any of the common key attributes (itemprop/name/property/id/http-equiv)
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        # prop may be a single OpenGraph property name or a list of fallbacks
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # Prefer og:video:secure_url over og:video/og:video:url when secure
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        # name may be a single meta key or a list of fallback keys
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # Map the textual rating label to a minimum age; unknown labels
        # yield None
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        # Extract metadata from all JSON-LD blocks embedded in the page
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
1197 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False 1198 json_ld = [] 1199 for mobj in json_ld_list: 1200 json_ld_item = self._parse_json( 1201 mobj.group('json_ld'), video_id, fatal=fatal) 1202 if not json_ld_item: 1203 continue 1204 if isinstance(json_ld_item, dict): 1205 json_ld.append(json_ld_item) 1206 elif isinstance(json_ld_item, (list, tuple)): 1207 json_ld.extend(json_ld_item) 1208 if json_ld: 1209 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) 1210 if json_ld: 1211 return json_ld 1212 if default is not NO_DEFAULT: 1213 return default 1214 elif fatal: 1215 raise RegexNotFoundError('Unable to extract JSON-LD') 1216 else: 1217 self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message()) 1218 return {} 1219 1220 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): 1221 if isinstance(json_ld, compat_str): 1222 json_ld = self._parse_json(json_ld, video_id, fatal=fatal) 1223 if not json_ld: 1224 return {} 1225 info = {} 1226 if not isinstance(json_ld, (list, tuple, dict)): 1227 return info 1228 if isinstance(json_ld, dict): 1229 json_ld = [json_ld] 1230 1231 INTERACTION_TYPE_MAP = { 1232 'CommentAction': 'comment', 1233 'AgreeAction': 'like', 1234 'DisagreeAction': 'dislike', 1235 'LikeAction': 'like', 1236 'DislikeAction': 'dislike', 1237 'ListenAction': 'view', 1238 'WatchAction': 'view', 1239 'ViewAction': 'view', 1240 } 1241 1242 def extract_interaction_type(e): 1243 interaction_type = e.get('interactionType') 1244 if isinstance(interaction_type, dict): 1245 interaction_type = interaction_type.get('@type') 1246 return str_or_none(interaction_type) 1247 1248 def extract_interaction_statistic(e): 1249 interaction_statistic = e.get('interactionStatistic') 1250 if isinstance(interaction_statistic, dict): 1251 interaction_statistic = [interaction_statistic] 1252 if not isinstance(interaction_statistic, list): 1253 return 1254 for is_e in 
interaction_statistic: 1255 if not isinstance(is_e, dict): 1256 continue 1257 if is_e.get('@type') != 'InteractionCounter': 1258 continue 1259 interaction_type = extract_interaction_type(is_e) 1260 if not interaction_type: 1261 continue 1262 # For interaction count some sites provide string instead of 1263 # an integer (as per spec) with non digit characters (e.g. ",") 1264 # so extracting count with more relaxed str_to_int 1265 interaction_count = str_to_int(is_e.get('userInteractionCount')) 1266 if interaction_count is None: 1267 continue 1268 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) 1269 if not count_kind: 1270 continue 1271 count_key = '%s_count' % count_kind 1272 if info.get(count_key) is not None: 1273 continue 1274 info[count_key] = interaction_count 1275 1276 def extract_video_object(e): 1277 assert e['@type'] == 'VideoObject' 1278 author = e.get('author') 1279 info.update({ 1280 'url': url_or_none(e.get('contentUrl')), 1281 'title': unescapeHTML(e.get('name')), 1282 'description': unescapeHTML(e.get('description')), 1283 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 1284 'duration': parse_duration(e.get('duration')), 1285 'timestamp': unified_timestamp(e.get('uploadDate')), 1286 # author can be an instance of 'Organization' or 'Person' types. 1287 # both types can have 'name' property(inherited from 'Thing' type). [1] 1288 # however some websites are using 'Text' type instead. 1289 # 1. 
https://schema.org/VideoObject 1290 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, 1291 'filesize': float_or_none(e.get('contentSize')), 1292 'tbr': int_or_none(e.get('bitrate')), 1293 'width': int_or_none(e.get('width')), 1294 'height': int_or_none(e.get('height')), 1295 'view_count': int_or_none(e.get('interactionCount')), 1296 }) 1297 extract_interaction_statistic(e) 1298 1299 for e in json_ld: 1300 if '@context' in e: 1301 item_type = e.get('@type') 1302 if expected_type is not None and expected_type != item_type: 1303 continue 1304 if item_type in ('TVEpisode', 'Episode'): 1305 episode_name = unescapeHTML(e.get('name')) 1306 info.update({ 1307 'episode': episode_name, 1308 'episode_number': int_or_none(e.get('episodeNumber')), 1309 'description': unescapeHTML(e.get('description')), 1310 }) 1311 if not info.get('title') and episode_name: 1312 info['title'] = episode_name 1313 part_of_season = e.get('partOfSeason') 1314 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): 1315 info.update({ 1316 'season': unescapeHTML(part_of_season.get('name')), 1317 'season_number': int_or_none(part_of_season.get('seasonNumber')), 1318 }) 1319 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') 1320 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): 1321 info['series'] = unescapeHTML(part_of_series.get('name')) 1322 elif item_type == 'Movie': 1323 info.update({ 1324 'title': unescapeHTML(e.get('name')), 1325 'description': unescapeHTML(e.get('description')), 1326 'duration': parse_duration(e.get('duration')), 1327 'timestamp': unified_timestamp(e.get('dateCreated')), 1328 }) 1329 elif item_type in ('Article', 'NewsArticle'): 1330 info.update({ 1331 'timestamp': parse_iso8601(e.get('datePublished')), 1332 'title': unescapeHTML(e.get('headline')), 1333 'description': 
unescapeHTML(e.get('articleBody')), 1334 }) 1335 elif item_type == 'VideoObject': 1336 extract_video_object(e) 1337 if expected_type is None: 1338 continue 1339 else: 1340 break 1341 video = e.get('video') 1342 if isinstance(video, dict) and video.get('@type') == 'VideoObject': 1343 extract_video_object(video) 1344 if expected_type is None: 1345 continue 1346 else: 1347 break 1348 return dict((k, v) for k, v in info.items() if v is not None) 1349 1350 @staticmethod 1351 def _hidden_inputs(html): 1352 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) 1353 hidden_inputs = {} 1354 for input in re.findall(r'(?i)(<input[^>]+>)', html): 1355 attrs = extract_attributes(input) 1356 if not input: 1357 continue 1358 if attrs.get('type') not in ('hidden', 'submit'): 1359 continue 1360 name = attrs.get('name') or attrs.get('id') 1361 value = attrs.get('value') 1362 if name and value is not None: 1363 hidden_inputs[name] = value 1364 return hidden_inputs 1365 1366 def _form_hidden_inputs(self, form_id, html): 1367 form = self._search_regex( 1368 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, 1369 html, '%s form' % form_id, group='form') 1370 return self._hidden_inputs(form) 1371 1372 def _sort_formats(self, formats, field_preference=None): 1373 if not formats: 1374 raise ExtractorError('No video formats found') 1375 1376 for f in formats: 1377 # Automatically determine tbr when missing based on abr and vbr (improves 1378 # formats sorting in some cases) 1379 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None: 1380 f['tbr'] = f['abr'] + f['vbr'] 1381 1382 def _formats_key(f): 1383 # TODO remove the following workaround 1384 from ..utils import determine_ext 1385 if not f.get('ext') and 'url' in f: 1386 f['ext'] = determine_ext(f['url']) 1387 1388 if isinstance(field_preference, (list, tuple)): 1389 return tuple( 1390 f.get(field) 1391 if f.get(field) is not None 1392 else ('' if field == 'format_id' else -1) 1393 for field in 
field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Sort key tuple, most significant field first; missing numeric
            # fields sort as -1 (i.e. worst)
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)

    def _check_formats(self, formats, video_id):
        # Drop formats whose URL does not respond; mutates formats in place
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        # Keep only the first occurrence of each URL; mutates formats in place
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video', headers={}):
        # Probe url with a request and report invalid URLs to the screen
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
            return True
        except ExtractorError as e:
            self.to_screen(
                '%s: %s URL is invalid, skipping: %s'
                % (video_id, item, error_to_compat_str(e.cause)))
            return False

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative URLs (//host/path) using scheme or the
        # user's preferred scheme; other URLs are returned unchanged
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        # Announce and perform a blocking wait of timeout seconds
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda
s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        # Map an HDS (f4m) manifest element to a list of format dicts
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # Audio-only manifests get vcodec 'none' on every format
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
        # Synthetic format entry pointing at the master playlist itself so
        # quality selection can happen at download time
        return {
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False, data=None, headers={},
                              query={}):
        # Download an HLS playlist and delegate to _parse_m3u8_formats;
        # the final URL (after redirects) is used as the base URL
        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal, data=data, headers=headers, query=query)

        if res is False:
            return []

        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        return self._parse_m3u8_formats(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, m3u8_id=m3u8_id, live=live)

    def _parse_m3u8_formats(self,
m3u8_doc, m3u8_url, ext=None, 1656 entry_protocol='m3u8', preference=None, 1657 m3u8_id=None, live=False): 1658 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access 1659 return [] 1660 1661 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay 1662 return [] 1663 1664 formats = [] 1665 1666 format_url = lambda u: ( 1667 u 1668 if re.match(r'^https?://', u) 1669 else compat_urlparse.urljoin(m3u8_url, u)) 1670 1671 # References: 1672 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 1673 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211 1674 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923 1675 1676 # We should try extracting formats only from master playlists [1, 4.3.4], 1677 # i.e. playlists that describe available qualities. On the other hand 1678 # media playlists [1, 4.3.3] should be returned as is since they contain 1679 # just the media without qualities renditions. 1680 # Fortunately, master playlist can be easily distinguished from media 1681 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] 1682 # master playlist tags MUST NOT appear in a media playlist and vice versa. 1683 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every 1684 # media playlist and MUST NOT appear in master playlist thus we can 1685 # clearly detect media playlist with this criterion. 
1686 1687 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is 1688 return [{ 1689 'url': m3u8_url, 1690 'format_id': m3u8_id, 1691 'ext': ext, 1692 'protocol': entry_protocol, 1693 'preference': preference, 1694 }] 1695 1696 groups = {} 1697 last_stream_inf = {} 1698 1699 def extract_media(x_media_line): 1700 media = parse_m3u8_attributes(x_media_line) 1701 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED 1702 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') 1703 if not (media_type and group_id and name): 1704 return 1705 groups.setdefault(group_id, []).append(media) 1706 if media_type not in ('VIDEO', 'AUDIO'): 1707 return 1708 media_url = media.get('URI') 1709 if media_url: 1710 format_id = [] 1711 for v in (m3u8_id, group_id, name): 1712 if v: 1713 format_id.append(v) 1714 f = { 1715 'format_id': '-'.join(format_id), 1716 'url': format_url(media_url), 1717 'manifest_url': m3u8_url, 1718 'language': media.get('LANGUAGE'), 1719 'ext': ext, 1720 'protocol': entry_protocol, 1721 'preference': preference, 1722 } 1723 if media_type == 'AUDIO': 1724 f['vcodec'] = 'none' 1725 formats.append(f) 1726 1727 def build_stream_name(): 1728 # Despite specification does not mention NAME attribute for 1729 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1] 1730 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats) 1731 # 1. 
http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 1732 stream_name = last_stream_inf.get('NAME') 1733 if stream_name: 1734 return stream_name 1735 # If there is no NAME in EXT-X-STREAM-INF it will be obtained 1736 # from corresponding rendition group 1737 stream_group_id = last_stream_inf.get('VIDEO') 1738 if not stream_group_id: 1739 return 1740 stream_group = groups.get(stream_group_id) 1741 if not stream_group: 1742 return stream_group_id 1743 rendition = stream_group[0] 1744 return rendition.get('NAME') or stream_group_id 1745 1746 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the 1747 # chance to detect video only formats when EXT-X-STREAM-INF tags 1748 # precede EXT-X-MEDIA tags in HLS manifest such as [3]. 1749 for line in m3u8_doc.splitlines(): 1750 if line.startswith('#EXT-X-MEDIA:'): 1751 extract_media(line) 1752 1753 for line in m3u8_doc.splitlines(): 1754 if line.startswith('#EXT-X-STREAM-INF:'): 1755 last_stream_inf = parse_m3u8_attributes(line) 1756 elif line.startswith('#') or not line.strip(): 1757 continue 1758 else: 1759 tbr = float_or_none( 1760 last_stream_inf.get('AVERAGE-BANDWIDTH') 1761 or last_stream_inf.get('BANDWIDTH'), scale=1000) 1762 format_id = [] 1763 if m3u8_id: 1764 format_id.append(m3u8_id) 1765 stream_name = build_stream_name() 1766 # Bandwidth of live streams may differ over time thus making 1767 # format_id unpredictable. So it's better to keep provided 1768 # format_id intact. 
1769 if not live: 1770 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) 1771 manifest_url = format_url(line.strip()) 1772 f = { 1773 'format_id': '-'.join(format_id), 1774 'url': manifest_url, 1775 'manifest_url': m3u8_url, 1776 'tbr': tbr, 1777 'ext': ext, 1778 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), 1779 'protocol': entry_protocol, 1780 'preference': preference, 1781 } 1782 resolution = last_stream_inf.get('RESOLUTION') 1783 if resolution: 1784 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) 1785 if mobj: 1786 f['width'] = int(mobj.group('width')) 1787 f['height'] = int(mobj.group('height')) 1788 # Unified Streaming Platform 1789 mobj = re.search( 1790 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) 1791 if mobj: 1792 abr, vbr = mobj.groups() 1793 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) 1794 f.update({ 1795 'vbr': vbr, 1796 'abr': abr, 1797 }) 1798 codecs = parse_codecs(last_stream_inf.get('CODECS')) 1799 f.update(codecs) 1800 audio_group_id = last_stream_inf.get('AUDIO') 1801 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which 1802 # references a rendition group MUST have a CODECS attribute. 1803 # However, this is not always respected, for example, [2] 1804 # contains EXT-X-STREAM-INF tag which references AUDIO 1805 # rendition group but does not have CODECS and despite 1806 # referencing an audio group it represents a complete 1807 # (with audio and video) format. So, for such cases we will 1808 # ignore references to rendition groups and treat them 1809 # as complete formats. 
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    # Emit a plain-HTTP twin of the HLS format pointing at the
                    # progressive URI advertised in the variant line.
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

                last_stream_inf = {}
        return formats

    @staticmethod
    def _xpath_ns(path, namespace=None):
        """Qualify every component of an XPath with the given XML namespace.

        '.' and empty components are left untouched so relative paths keep
        working. Returns the path unchanged when namespace is falsy.
        """
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)

    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        """Download a SMIL manifest and extract formats from it."""
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

        if smil is False:
            # _download_smil only returns False on a non-fatal failure
            assert not fatal
            return []

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        """Download a SMIL manifest and build a complete info dict from it."""
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        if smil is False:
            return {}
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        """Fetch a SMIL document and parse it as XML (False on non-fatal failure)."""
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build an info dict (id, title, thumbnails, formats, subtitles)
        from an already parsed SMIL document."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # From here on the id is derived from the SMIL URL basename,
        # replacing the video_id that was passed in
        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        # Only the first occurrence of each metadata name wins
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _parse_smil_namespace(self, smil):
        """Extract the XML namespace from the root <smil> tag, if any."""
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from the <video>/<audio> nodes of a SMIL document,
        dispatching on protocol/extension to RTMP, HLS, HDS, DASH, MSS or
        plain HTTP handling."""
        # <meta base=...> (or httpBase) overrides the manifest URL as the
        # base for relative src attributes
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            # Skip duplicates of an already seen src
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    # Let the caller rewrite streamer/play_path in place
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    # Single rendition: fold the SMIL-level metadata into it
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Collect <textstream> nodes from a SMIL document into a subtitles
        dict keyed by language (falling back to subtitles_lang)."""
        urls = []
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
                continue
            urls.append(src)
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles

    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        """Download an XSPF playlist and parse its tracks into entries."""
        # NOTE(review): 'xpsf' below looks like a typo for 'xspf' in the
        # user-visible progress note (string left untouched here)
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if xspf is False:
            return []
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))

    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        """Turn a parsed XSPF document into a list of entry dicts, honoring
        the StreamOne (s1) extension attributes for format metadata."""
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = []
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                if not format_url:
                    continue
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
                })
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries

    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """Download a DASH MPD manifest and extract formats from it.

        Returns [] when the download failed non-fatally or yielded no
        document. headers/query use shared mutable defaults (existing
        file-wide convention; not mutated here).
        """
        res = self._download_xml_handle(
            mpd_url, video_id,
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return []
        mpd_doc, urlh = res
        if mpd_doc is None:
            return []
        # Resolve relative BaseURLs against the final (post-redirect) URL
        mpd_base_url = base_url(urlh.geturl())

        return self._parse_mpd_formats(
            mpd_doc, mpd_id, mpd_base_url, mpd_url)

    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # Live (dynamic) manifests are not supported here
        if mpd_doc.get('type') == 'dynamic':
            return []

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Child elements inherit and may override the parent's
            # multisegment info, so work on a copy
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements.  We will only extract relevant
            # for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            # @r is a repeat count, so each S contributes 1 + r segments
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats = []
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                if is_drm_protected(adaptation_set):
                    continue
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                        continue
                    # Representation attributes override AdaptationSet ones
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                        pass
                    elif content_type in ('video', 'audio'):
                        # Walk up the element hierarchy collecting BaseURL
                        # fragments until an absolute URL is assembled
                        base_url = ''
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                                    break
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                                mpd_base_url += '/'
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                        f = {
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        }
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            tmpl = representation_ms_info[template_name]
                            # First off, % characters outside $...$ templates
                            # must be escaped by doubling for proper processing
                            # by % operator string formatting used further (see
                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
                            t = ''
                            in_template = False
                            for c in tmpl:
                                t += c
                                if c == '$':
                                    in_template = not in_template
                                elif c == '%' and not in_template:
                                    # Double the '%' so it survives %-formatting
                                    t += c
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): str.replace returns a new string;
                            # the result here is discarded, so literal '$$'
                            # escapes are never unescaped — verify intent
                            t.replace('$$', '$')
                            return t

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/ytdl-org/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                'initialization',
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                                ('Bandwidth', ))
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,
                            }

                        def location_key(location):
                            # Fragments carry either an absolute 'url' or a
                            # relative 'path' resolved against fragment_base_url
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    },
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                            else:
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_time = 0
                                segment_d = None
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    }
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                    })

                                for num, s in enumerate(representation_ms_info['s']):
                                    # An explicit @t resets the timeline position
                                    segment_time = s.get('t') or segment_time
                                    segment_d = s['d']
                                    add_segment_url()
                                    segment_number += 1
                                    # @r repeats the same segment duration r more times
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                        add_segment_url()
                                        segment_number += 1
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # No media template
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            fragments = []
                            segment_index = 0
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                    fragments.append({
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                                    })
                                    segment_index += 1
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/ytdl-org/youtube-dl/pull/14844
                            fragments = []
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                fragment = {
                                    location_key(segment_url): segment_url,
                                }
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # If there is a fragments key available then we correctly recognized fragmented media.
                        # Otherwise we will assume unfragmented media with direct access. Technically, such
                        # assumption is not necessarily correct since we may simply have no support for
                        # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                        if 'fragments' in representation_ms_info:
                            f.update({
                                # NB: mpd_url may be empty when MPD manifest is parsed from a string
                                'url': mpd_url or base_url,
                                'fragment_base_url': base_url,
                                'fragments': [],
                                'protocol': 'http_dash_segments',
                            })
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                # The initialization segment is prepended to the fragment list
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                        else:
                            # Assuming direct URL to unfragmented media.
2367 f['url'] = base_url 2368 formats.append(f) 2369 else: 2370 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) 2371 return formats 2372 2373 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): 2374 res = self._download_xml_handle( 2375 ism_url, video_id, 2376 note=note or 'Downloading ISM manifest', 2377 errnote=errnote or 'Failed to download ISM manifest', 2378 fatal=fatal, data=data, headers=headers, query=query) 2379 if res is False: 2380 return [] 2381 ism_doc, urlh = res 2382 if ism_doc is None: 2383 return [] 2384 2385 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) 2386 2387 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): 2388 """ 2389 Parse formats from ISM manifest. 2390 References: 2391 1. [MS-SSTR]: Smooth Streaming Protocol, 2392 https://msdn.microsoft.com/en-us/library/ff469518.aspx 2393 """ 2394 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: 2395 return [] 2396 2397 duration = int(ism_doc.attrib['Duration']) 2398 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 2399 2400 formats = [] 2401 for stream in ism_doc.findall('StreamIndex'): 2402 stream_type = stream.get('Type') 2403 if stream_type not in ('video', 'audio'): 2404 continue 2405 url_pattern = stream.attrib['Url'] 2406 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale 2407 stream_name = stream.get('Name') 2408 for track in stream.findall('QualityLevel'): 2409 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) 2410 # TODO: add support for WVC1 and WMAP 2411 if fourcc not in ('H264', 'AVC1', 'AACL'): 2412 self.report_warning('%s is not a supported codec' % fourcc) 2413 continue 2414 tbr = int(track.attrib['Bitrate']) // 1000 2415 # [1] does not mention Width and Height attributes. 
However, 2416 # they're often present while MaxWidth and MaxHeight are 2417 # missing, so should be used as fallbacks 2418 width = int_or_none(track.get('MaxWidth') or track.get('Width')) 2419 height = int_or_none(track.get('MaxHeight') or track.get('Height')) 2420 sampling_rate = int_or_none(track.get('SamplingRate')) 2421 2422 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) 2423 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) 2424 2425 fragments = [] 2426 fragment_ctx = { 2427 'time': 0, 2428 } 2429 stream_fragments = stream.findall('c') 2430 for stream_fragment_index, stream_fragment in enumerate(stream_fragments): 2431 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time'] 2432 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1 2433 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d')) 2434 if not fragment_ctx['duration']: 2435 try: 2436 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t']) 2437 except IndexError: 2438 next_fragment_time = duration 2439 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat 2440 for _ in range(fragment_repeat): 2441 fragments.append({ 2442 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), 2443 'duration': fragment_ctx['duration'] / stream_timescale, 2444 }) 2445 fragment_ctx['time'] += fragment_ctx['duration'] 2446 2447 format_id = [] 2448 if ism_id: 2449 format_id.append(ism_id) 2450 if stream_name: 2451 format_id.append(stream_name) 2452 format_id.append(compat_str(tbr)) 2453 2454 formats.append({ 2455 'format_id': '-'.join(format_id), 2456 'url': ism_url, 2457 'manifest_url': ism_url, 2458 'ext': 'ismv' if stream_type == 'video' else 'isma', 2459 'width': width, 2460 'height': height, 2461 'tbr': tbr, 2462 'asr': sampling_rate, 2463 'vcodec': 'none' if stream_type == 'audio' else fourcc, 2464 'acodec': 'none' if 
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags in webpage.

        Also recognizes the AMP (amp-video/amp-audio) and Delight VR
        (dl8-video, dl8-live-video, ...) flavors of those tags.

        Returns a list of info dicts (one per media tag that produced at
        least one format or subtitle track), each carrying 'formats',
        'subtitles' and 'thumbnail' keys.  A Referer header pointing at
        base_url is attached to every extracted format.
        """
        def absolute_url(item_url):
            # Resolve a possibly-relative URL against the page URL.
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Split a MIME type attribute like 'video/mp4; codecs="avc1"'
            # into {'ext': ..., 'vcodec': ..., 'acodec': ...}.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        # NOTE: the mutable default type_info={} is only read, never
        # mutated, so it is safe despite the usual Python pitfall.
        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) are
            # expanded into their constituent formats, plain files yield a
            # single format dict.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no inner content, hence the '' placeholder.
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to a "1080p"-style label/title.
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe a Wowza streaming server URL for every supported protocol.

        Any trailing manifest filename (manifest/playlist/jwplayer
        .m3u8/.f4m/.mpd/.smil) is stripped from url first, then HLS, HDS,
        DASH and either SMIL-derived RTMP/RTSP (for /smil: or .smil URLs)
        or plain RTMP/RTSP formats are collected.  Protocols named in
        skip_protocols are not probed.  The original query string is
        preserved on every manifest request.

        NOTE: the mutable default skip_protocols=[] is only read, never
        mutated, so it is safe here.
        """
        query = compat_urlparse.urlparse(url).query
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # Keep the secure suffix ('s') if the input scheme had one.
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            # Append the manifest filename and re-attach the query string.
            m_url = '%s/%s' % (http_base_url, manifest)
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    # Derive an RTSP sibling for each RTMP format: the RTSP
                    # URL is the RTMP URL with its play_path folded in and
                    # the scheme swapped.
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer setup() options dict into youtube-dl entries.

        Normalizes the several historical JWPlayer config shapes (flattened
        playlist, single playlist item, flattened sources), extracts formats
        via _parse_jwplayer_formats and subtitle tracks from 'tracks'.

        Returns a single info dict when the playlist has exactly one item,
        otherwise a playlist result.  Raises KeyError if a playlist item
        lacks 'mediaid' while video_id is not given, or lacks 'title' while
        require_title is true.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    # Tracks without a label are filed under 'en'.
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # A lone YouTube URL is delegated to the YouTube extractor.
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
m3u8_id=m3u8_id, fatal=False)) 2821 elif source_type == 'dash' or ext == 'mpd': 2822 formats.extend(self._extract_mpd_formats( 2823 source_url, video_id, mpd_id=mpd_id, fatal=False)) 2824 elif ext == 'smil': 2825 formats.extend(self._extract_smil_formats( 2826 source_url, video_id, fatal=False)) 2827 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 2828 elif source_type.startswith('audio') or ext in ( 2829 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): 2830 formats.append({ 2831 'url': source_url, 2832 'vcodec': 'none', 2833 'ext': ext, 2834 }) 2835 else: 2836 height = int_or_none(source.get('height')) 2837 if height is None: 2838 # Often no height is provided but there is a label in 2839 # format like "1080p", "720p SD", or 1080. 2840 height = int_or_none(self._search_regex( 2841 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), 2842 'height', default=None)) 2843 a_format = { 2844 'url': source_url, 2845 'width': int_or_none(source.get('width')), 2846 'height': height, 2847 'tbr': int_or_none(source.get('bitrate')), 2848 'ext': ext, 2849 } 2850 if source_url.startswith('rtmp'): 2851 a_format['ext'] = 'flv' 2852 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as 2853 # of jwplayer.flash.swf 2854 rtmp_url_parts = re.split( 2855 r'((?:mp4|mp3|flv):)', source_url, 1) 2856 if len(rtmp_url_parts) == 3: 2857 rtmp_url, prefix, play_path = rtmp_url_parts 2858 a_format.update({ 2859 'url': rtmp_url, 2860 'play_path': prefix + play_path, 2861 }) 2862 if rtmp_params: 2863 a_format.update(rtmp_params) 2864 formats.append(a_format) 2865 return formats 2866 2867 def _live_title(self, name): 2868 """ Generate the title for a live video """ 2869 now = datetime.datetime.now() 2870 now_str = now.strftime('%Y-%m-%d %H:%M') 2871 return name + ' ' + now_str 2872 2873 def _int(self, v, name, fatal=False, **kwargs): 2874 res = int_or_none(v, **kwargs) 2875 if 'get_attr' in kwargs: 2876 print(getattr(v, kwargs['get_attr'])) 2877 if res 
is None: 2878 msg = 'Failed to extract %s: Could not parse value %r' % (name, v) 2879 if fatal: 2880 raise ExtractorError(msg) 2881 else: 2882 self._downloader.report_warning(msg) 2883 return res 2884 2885 def _float(self, v, name, fatal=False, **kwargs): 2886 res = float_or_none(v, **kwargs) 2887 if res is None: 2888 msg = 'Failed to extract %s: Could not parse value %r' % (name, v) 2889 if fatal: 2890 raise ExtractorError(msg) 2891 else: 2892 self._downloader.report_warning(msg) 2893 return res 2894 2895 def _set_cookie(self, domain, name, value, expire_time=None, port=None, 2896 path='/', secure=False, discard=False, rest={}, **kwargs): 2897 cookie = compat_cookiejar_Cookie( 2898 0, name, value, port, port is not None, domain, True, 2899 domain.startswith('.'), path, True, secure, expire_time, 2900 discard, None, None, rest) 2901 self._downloader.cookiejar.set_cookie(cookie) 2902 2903 def _get_cookies(self, url): 2904 """ Return a compat_cookies_SimpleCookie with the cookies for the url """ 2905 req = sanitized_Request(url) 2906 self._downloader.cookiejar.add_cookie_header(req) 2907 return compat_cookies_SimpleCookie(req.get_header('Cookie')) 2908 2909 def _apply_first_set_cookie_header(self, url_handle, cookie): 2910 """ 2911 Apply first Set-Cookie header instead of the last. Experimental. 2912 2913 Some sites (e.g. [1-3]) may serve two cookies under the same name 2914 in Set-Cookie header and expect the first (old) one to be set rather 2915 than second (new). However, as of RFC6265 the newer one cookie 2916 should be set into cookie store what actually happens. 2917 We will workaround this issue by resetting the cookie to 2918 the first one manually. 2919 1. https://new.vk.com/ 2920 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201 2921 3. 
https://learning.oreilly.com/ 2922 """ 2923 for header, cookies in url_handle.headers.items(): 2924 if header.lower() != 'set-cookie': 2925 continue 2926 if sys.version_info[0] >= 3: 2927 cookies = cookies.encode('iso-8859-1') 2928 cookies = cookies.decode('utf-8') 2929 cookie_value = re.search( 2930 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies) 2931 if cookie_value: 2932 value, domain = cookie_value.groups() 2933 self._set_cookie(domain, cookie, value) 2934 break 2935 2936 def get_testcases(self, include_onlymatching=False): 2937 t = getattr(self, '_TEST', None) 2938 if t: 2939 assert not hasattr(self, '_TESTS'), \ 2940 '%s has _TEST and _TESTS' % type(self).__name__ 2941 tests = [t] 2942 else: 2943 tests = getattr(self, '_TESTS', []) 2944 for t in tests: 2945 if not include_onlymatching and t.get('only_matching', False): 2946 continue 2947 t['name'] = type(self).__name__[:-len('IE')] 2948 yield t 2949 2950 def is_suitable(self, age_limit): 2951 """ Test whether the extractor is generally suitable for the given 2952 age limit (i.e. 
pornographic sites are not, all others usually are) """ 2953 2954 any_restricted = False 2955 for tc in self.get_testcases(include_onlymatching=False): 2956 if tc.get('playlist', []): 2957 tc = tc['playlist'][0] 2958 is_restricted = age_restricted( 2959 tc.get('info_dict', {}).get('age_limit'), age_limit) 2960 if not is_restricted: 2961 return True 2962 any_restricted = any_restricted or is_restricted 2963 return not any_restricted 2964 2965 def extract_subtitles(self, *args, **kwargs): 2966 if (self._downloader.params.get('writesubtitles', False) 2967 or self._downloader.params.get('listsubtitles')): 2968 return self._get_subtitles(*args, **kwargs) 2969 return {} 2970 2971 def _get_subtitles(self, *args, **kwargs): 2972 raise NotImplementedError('This method must be implemented by subclasses') 2973 2974 @staticmethod 2975 def _merge_subtitle_items(subtitle_list1, subtitle_list2): 2976 """ Merge subtitle items for one language. Items with duplicated URLs 2977 will be dropped. """ 2978 list1_urls = set([item['url'] for item in subtitle_list1]) 2979 ret = list(subtitle_list1) 2980 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) 2981 return ret 2982 2983 @classmethod 2984 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): 2985 """ Merge two subtitle dictionaries, language by language. 
""" 2986 ret = dict(subtitle_dict1) 2987 for lang in subtitle_dict2: 2988 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) 2989 return ret 2990 2991 def extract_automatic_captions(self, *args, **kwargs): 2992 if (self._downloader.params.get('writeautomaticsub', False) 2993 or self._downloader.params.get('listsubtitles')): 2994 return self._get_automatic_captions(*args, **kwargs) 2995 return {} 2996 2997 def _get_automatic_captions(self, *args, **kwargs): 2998 raise NotImplementedError('This method must be implemented by subclasses') 2999 3000 def mark_watched(self, *args, **kwargs): 3001 if (self._downloader.params.get('mark_watched', False) 3002 and (self._get_login_info()[0] is not None 3003 or self._downloader.params.get('cookiefile') is not None)): 3004 self._mark_watched(*args, **kwargs) 3005 3006 def _mark_watched(self, *args, **kwargs): 3007 raise NotImplementedError('This method must be implemented by subclasses') 3008 3009 def geo_verification_headers(self): 3010 headers = {} 3011 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') 3012 if geo_verification_proxy: 3013 headers['Ytdl-request-proxy'] = geo_verification_proxy 3014 return headers 3015 3016 def _generic_id(self, url): 3017 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) 3018 3019 def _generic_title(self, url): 3020 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) 3021 3022 3023 class SearchInfoExtractor(InfoExtractor): 3024 """ 3025 Base class for paged search queries extractors. 3026 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} 3027 Instances should define _SEARCH_KEY and _MAX_RESULTS. 
3028 """ 3029 3030 @classmethod 3031 def _make_valid_url(cls): 3032 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY 3033 3034 @classmethod 3035 def suitable(cls, url): 3036 return re.match(cls._make_valid_url(), url) is not None 3037 3038 def _real_extract(self, query): 3039 mobj = re.match(self._make_valid_url(), query) 3040 if mobj is None: 3041 raise ExtractorError('Invalid search query "%s"' % query) 3042 3043 prefix = mobj.group('prefix') 3044 query = mobj.group('query') 3045 if prefix == '': 3046 return self._get_n_results(query, 1) 3047 elif prefix == 'all': 3048 return self._get_n_results(query, self._MAX_RESULTS) 3049 else: 3050 n = int(prefix) 3051 if n <= 0: 3052 raise ExtractorError('invalid download number %s for query "%s"' % (n, query)) 3053 elif n > self._MAX_RESULTS: 3054 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) 3055 n = self._MAX_RESULTS 3056 return self._get_n_results(query, n) 3057 3058 def _get_n_results(self, query, n): 3059 """Get a specified number of results for a query""" 3060 raise NotImplementedError('This method must be implemented by subclasses') 3061 3062 @property 3063 def SEARCH_KEY(self): 3064 return self._SEARCH_KEY