cda.py (8325B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import codecs 5 import re 6 7 from .common import InfoExtractor 8 from ..compat import ( 9 compat_chr, 10 compat_ord, 11 compat_urllib_parse_unquote, 12 ) 13 from ..utils import ( 14 ExtractorError, 15 float_or_none, 16 int_or_none, 17 merge_dicts, 18 multipart_encode, 19 parse_duration, 20 random_birthday, 21 urljoin, 22 ) 23 24 25 class CDAIE(InfoExtractor): 26 _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' 27 _BASE_URL = 'http://www.cda.pl/' 28 _TESTS = [{ 29 'url': 'http://www.cda.pl/video/5749950c', 30 'md5': '6f844bf51b15f31fae165365707ae970', 31 'info_dict': { 32 'id': '5749950c', 33 'ext': 'mp4', 34 'height': 720, 35 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', 36 'description': 'md5:269ccd135d550da90d1662651fcb9772', 37 'thumbnail': r're:^https?://.*\.jpg$', 38 'average_rating': float, 39 'duration': 39, 40 'age_limit': 0, 41 } 42 }, { 43 'url': 'http://www.cda.pl/video/57413289', 44 'md5': 'a88828770a8310fc00be6c95faf7f4d5', 45 'info_dict': { 46 'id': '57413289', 47 'ext': 'mp4', 48 'title': 'Lądowanie na lotnisku na Maderze', 49 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', 50 'thumbnail': r're:^https?://.*\.jpg$', 51 'uploader': 'crash404', 52 'view_count': int, 53 'average_rating': float, 54 'duration': 137, 55 'age_limit': 0, 56 } 57 }, { 58 # Age-restricted 59 'url': 'http://www.cda.pl/video/1273454c4', 60 'info_dict': { 61 'id': '1273454c4', 62 'ext': 'mp4', 63 'title': 'Bronson (2008) napisy HD 1080p', 64 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', 65 'height': 1080, 66 'uploader': 'boniek61', 67 'thumbnail': r're:^https?://.*\.jpg$', 68 'duration': 5554, 69 'age_limit': 18, 70 'view_count': int, 71 'average_rating': float, 72 }, 73 }, { 74 'url': 'http://ebd.cda.pl/0x0/5749950c', 75 'only_matching': True, 76 }] 77 78 def _download_age_confirm_page(self, url, video_id, *args, **kwargs): 79 form_data = random_birthday('rok', 'miesiac', 'dzien') 80 form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) 81 data, content_type = multipart_encode(form_data) 82 return self._download_webpage( 83 urljoin(url, '/a/validatebirth'), video_id, *args, 84 data=data, headers={ 85 'Referer': url, 86 'Content-Type': content_type, 87 }, **kwargs) 88 89 def _real_extract(self, url): 90 video_id = self._match_id(url) 91 self._set_cookie('cda.pl', 'cda.player', 'html5') 92 webpage = self._download_webpage( 93 self._BASE_URL + '/video/' + video_id, video_id) 94 95 if 'Ten film jest dostępny dla użytkowników premium' in webpage: 96 raise ExtractorError('This video is only available for premium users.', expected=True) 97 98 if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage): 99 self.raise_geo_restricted() 100 101 need_confirm_age = False 102 if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")', 103 webpage, 'birthday validate form', default=None): 104 webpage = self._download_age_confirm_page( 105 url, video_id, note='Confirming age') 106 need_confirm_age = True 107 108 formats = [] 109 110 uploader = self._search_regex(r'''(?x) 111 <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*> 112 (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*? 113 <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3> 114 ''', webpage, 'uploader', default=None, group='uploader') 115 view_count = self._search_regex( 116 r'Odsłony:(?:\s| )*([0-9]+)', webpage, 117 'view_count', default=None) 118 average_rating = self._search_regex( 119 (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', 120 r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, 121 group='rating_value') 122 123 info_dict = { 124 'id': video_id, 125 'title': self._og_search_title(webpage), 126 'description': self._og_search_description(webpage), 127 'uploader': uploader, 128 'view_count': int_or_none(view_count), 129 'average_rating': float_or_none(average_rating), 130 'thumbnail': self._og_search_thumbnail(webpage), 131 'formats': formats, 132 'duration': None, 133 'age_limit': 18 if need_confirm_age else 0, 134 } 135 136 info = self._search_json_ld(webpage, video_id, default={}) 137 138 # Source: https://www.cda.pl/js/player.js?t=1606154898 139 def decrypt_file(a): 140 for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): 141 a = a.replace(p, '') 142 a = compat_urllib_parse_unquote(a) 143 b = [] 144 for c in a: 145 f = compat_ord(c) 146 b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f)) 147 a = ''.join(b) 148 a = a.replace('.cda.mp4', '') 149 for p in ('.2cda.pl', '.3cda.pl'): 150 a = a.replace(p, '.cda.pl') 151 if '/upstream' in a: 152 a = a.replace('/upstream', '.mp4/upstream') 153 return 'https://' + a 154 return 'https://' + a + '.mp4' 155 156 def extract_format(page, version): 157 json_str = self._html_search_regex( 158 r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, 159 '%s player_json' % version, fatal=False, group='player_data') 160 if not json_str: 161 return 162 player_data = self._parse_json( 163 json_str, '%s player_data' % version, fatal=False) 164 if not player_data: 165 return 166 video = player_data.get('video') 167 if not video or 'file' not in video: 168 self.report_warning('Unable to extract %s version information' % version) 169 return 170 if video['file'].startswith('uggc'): 171 video['file'] = codecs.decode(video['file'], 'rot_13') 172 if video['file'].endswith('adc.mp4'): 173 video['file'] = video['file'].replace('adc.mp4', '.mp4') 174 elif not video['file'].startswith('http'): 175 video['file'] = decrypt_file(video['file']) 176 f = { 177 'url': video['file'], 178 } 179 m = re.search( 180 r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p', 181 page) 182 if m: 183 f.update({ 184 'format_id': m.group('format_id'), 185 'height': int(m.group('height')), 186 }) 187 info_dict['formats'].append(f) 188 if not info_dict['duration']: 189 info_dict['duration'] = parse_duration(video.get('duration')) 190 191 extract_format(webpage, 'default') 192 193 for href, resolution in re.findall( 194 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', 195 webpage): 196 if need_confirm_age: 197 handler = self._download_age_confirm_page 198 else: 199 handler = self._download_webpage 200 201 webpage = handler( 202 urljoin(self._BASE_URL, href), video_id, 203 'Downloading %s version information' % resolution, fatal=False) 204 if not webpage: 205 # Manually report warning because empty page is returned when 206 # invalid version is requested. 207 self.report_warning('Unable to download %s version information' % resolution) 208 continue 209 210 extract_format(webpage, resolution) 211 212 self._sort_formats(formats) 213 214 return merge_dicts(info_dict, info)