youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

cda.py (8325B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import codecs
      5 import re
      6 
      7 from .common import InfoExtractor
      8 from ..compat import (
      9     compat_chr,
     10     compat_ord,
     11     compat_urllib_parse_unquote,
     12 )
     13 from ..utils import (
     14     ExtractorError,
     15     float_or_none,
     16     int_or_none,
     17     merge_dicts,
     18     multipart_encode,
     19     parse_duration,
     20     random_birthday,
     21     urljoin,
     22 )
     23 
     24 
     25 class CDAIE(InfoExtractor):
     26     _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
     27     _BASE_URL = 'http://www.cda.pl/'
     28     _TESTS = [{
     29         'url': 'http://www.cda.pl/video/5749950c',
     30         'md5': '6f844bf51b15f31fae165365707ae970',
     31         'info_dict': {
     32             'id': '5749950c',
     33             'ext': 'mp4',
     34             'height': 720,
     35             'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
     36             'description': 'md5:269ccd135d550da90d1662651fcb9772',
     37             'thumbnail': r're:^https?://.*\.jpg$',
     38             'average_rating': float,
     39             'duration': 39,
     40             'age_limit': 0,
     41         }
     42     }, {
     43         'url': 'http://www.cda.pl/video/57413289',
     44         'md5': 'a88828770a8310fc00be6c95faf7f4d5',
     45         'info_dict': {
     46             'id': '57413289',
     47             'ext': 'mp4',
     48             'title': 'Lądowanie na lotnisku na Maderze',
     49             'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
     50             'thumbnail': r're:^https?://.*\.jpg$',
     51             'uploader': 'crash404',
     52             'view_count': int,
     53             'average_rating': float,
     54             'duration': 137,
     55             'age_limit': 0,
     56         }
     57     }, {
     58         # Age-restricted
     59         'url': 'http://www.cda.pl/video/1273454c4',
     60         'info_dict': {
     61             'id': '1273454c4',
     62             'ext': 'mp4',
     63             'title': 'Bronson (2008) napisy HD 1080p',
     64             'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
     65             'height': 1080,
     66             'uploader': 'boniek61',
     67             'thumbnail': r're:^https?://.*\.jpg$',
     68             'duration': 5554,
     69             'age_limit': 18,
     70             'view_count': int,
     71             'average_rating': float,
     72         },
     73     }, {
     74         'url': 'http://ebd.cda.pl/0x0/5749950c',
     75         'only_matching': True,
     76     }]
     77 
     78     def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
     79         form_data = random_birthday('rok', 'miesiac', 'dzien')
     80         form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
     81         data, content_type = multipart_encode(form_data)
     82         return self._download_webpage(
     83             urljoin(url, '/a/validatebirth'), video_id, *args,
     84             data=data, headers={
     85                 'Referer': url,
     86                 'Content-Type': content_type,
     87             }, **kwargs)
     88 
     89     def _real_extract(self, url):
     90         video_id = self._match_id(url)
     91         self._set_cookie('cda.pl', 'cda.player', 'html5')
     92         webpage = self._download_webpage(
     93             self._BASE_URL + '/video/' + video_id, video_id)
     94 
     95         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
     96             raise ExtractorError('This video is only available for premium users.', expected=True)
     97 
     98         if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
     99             self.raise_geo_restricted()
    100 
    101         need_confirm_age = False
    102         if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
    103                                    webpage, 'birthday validate form', default=None):
    104             webpage = self._download_age_confirm_page(
    105                 url, video_id, note='Confirming age')
    106             need_confirm_age = True
    107 
    108         formats = []
    109 
    110         uploader = self._search_regex(r'''(?x)
    111             <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
    112             (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
    113             <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
    114         ''', webpage, 'uploader', default=None, group='uploader')
    115         view_count = self._search_regex(
    116             r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
    117             'view_count', default=None)
    118         average_rating = self._search_regex(
    119             (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
    120              r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
    121             group='rating_value')
    122 
    123         info_dict = {
    124             'id': video_id,
    125             'title': self._og_search_title(webpage),
    126             'description': self._og_search_description(webpage),
    127             'uploader': uploader,
    128             'view_count': int_or_none(view_count),
    129             'average_rating': float_or_none(average_rating),
    130             'thumbnail': self._og_search_thumbnail(webpage),
    131             'formats': formats,
    132             'duration': None,
    133             'age_limit': 18 if need_confirm_age else 0,
    134         }
    135 
    136         info = self._search_json_ld(webpage, video_id, default={})
    137 
    138         # Source: https://www.cda.pl/js/player.js?t=1606154898
    139         def decrypt_file(a):
    140             for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
    141                 a = a.replace(p, '')
    142             a = compat_urllib_parse_unquote(a)
    143             b = []
    144             for c in a:
    145                 f = compat_ord(c)
    146                 b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f))
    147             a = ''.join(b)
    148             a = a.replace('.cda.mp4', '')
    149             for p in ('.2cda.pl', '.3cda.pl'):
    150                 a = a.replace(p, '.cda.pl')
    151             if '/upstream' in a:
    152                 a = a.replace('/upstream', '.mp4/upstream')
    153                 return 'https://' + a
    154             return 'https://' + a + '.mp4'
    155 
    156         def extract_format(page, version):
    157             json_str = self._html_search_regex(
    158                 r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
    159                 '%s player_json' % version, fatal=False, group='player_data')
    160             if not json_str:
    161                 return
    162             player_data = self._parse_json(
    163                 json_str, '%s player_data' % version, fatal=False)
    164             if not player_data:
    165                 return
    166             video = player_data.get('video')
    167             if not video or 'file' not in video:
    168                 self.report_warning('Unable to extract %s version information' % version)
    169                 return
    170             if video['file'].startswith('uggc'):
    171                 video['file'] = codecs.decode(video['file'], 'rot_13')
    172                 if video['file'].endswith('adc.mp4'):
    173                     video['file'] = video['file'].replace('adc.mp4', '.mp4')
    174             elif not video['file'].startswith('http'):
    175                 video['file'] = decrypt_file(video['file'])
    176             f = {
    177                 'url': video['file'],
    178             }
    179             m = re.search(
    180                 r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p',
    181                 page)
    182             if m:
    183                 f.update({
    184                     'format_id': m.group('format_id'),
    185                     'height': int(m.group('height')),
    186                 })
    187             info_dict['formats'].append(f)
    188             if not info_dict['duration']:
    189                 info_dict['duration'] = parse_duration(video.get('duration'))
    190 
    191         extract_format(webpage, 'default')
    192 
    193         for href, resolution in re.findall(
    194                 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
    195                 webpage):
    196             if need_confirm_age:
    197                 handler = self._download_age_confirm_page
    198             else:
    199                 handler = self._download_webpage
    200 
    201             webpage = handler(
    202                 urljoin(self._BASE_URL, href), video_id,
    203                 'Downloading %s version information' % resolution, fatal=False)
    204             if not webpage:
    205                 # Manually report warning because empty page is returned when
    206                 # invalid version is requested.
    207                 self.report_warning('Unable to download %s version information' % resolution)
    208                 continue
    209 
    210             extract_format(webpage, resolution)
    211 
    212         self._sort_formats(formats)
    213 
    214         return merge_dicts(info_dict, info)