youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

leeco.py (13103B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import datetime
      5 import hashlib
      6 import re
      7 import time
      8 
      9 from .common import InfoExtractor
     10 from ..compat import (
     11     compat_b64decode,
     12     compat_ord,
     13     compat_str,
     14     compat_urllib_parse_urlencode,
     15 )
     16 from ..utils import (
     17     determine_ext,
     18     encode_data_uri,
     19     ExtractorError,
     20     int_or_none,
     21     orderedSet,
     22     parse_iso8601,
     23     str_or_none,
     24     url_basename,
     25     urshift,
     26 )
     27 
     28 
     29 class LeIE(InfoExtractor):
     30     IE_DESC = '乐视网'
     31     _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|(?:sports\.le|(?:www\.)?lesports)\.com/(?:match|video))/(?P<id>\d+)\.html'
     32     _GEO_COUNTRIES = ['CN']
     33     _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
     34 
     35     _TESTS = [{
     36         'url': 'http://www.le.com/ptv/vplay/22005890.html',
     37         'md5': 'edadcfe5406976f42f9f266057ee5e40',
     38         'info_dict': {
     39             'id': '22005890',
     40             'ext': 'mp4',
     41             'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家',
     42             'description': 'md5:a9cb175fd753e2962176b7beca21a47c',
     43         },
     44         'params': {
     45             'hls_prefer_native': True,
     46         },
     47     }, {
     48         'url': 'http://www.le.com/ptv/vplay/1415246.html',
     49         'info_dict': {
     50             'id': '1415246',
     51             'ext': 'mp4',
     52             'title': '美人天下01',
     53             'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
     54         },
     55         'params': {
     56             'hls_prefer_native': True,
     57         },
     58     }, {
     59         'note': 'This video is available only in Mainland China, thus a proxy is needed',
     60         'url': 'http://www.le.com/ptv/vplay/1118082.html',
     61         'md5': '2424c74948a62e5f31988438979c5ad1',
     62         'info_dict': {
     63             'id': '1118082',
     64             'ext': 'mp4',
     65             'title': '与龙共舞 完整版',
     66             'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
     67         },
     68         'params': {
     69             'hls_prefer_native': True,
     70         },
     71     }, {
     72         'url': 'http://sports.le.com/video/25737697.html',
     73         'only_matching': True,
     74     }, {
     75         'url': 'http://www.lesports.com/match/1023203003.html',
     76         'only_matching': True,
     77     }, {
     78         'url': 'http://sports.le.com/match/1023203003.html',
     79         'only_matching': True,
     80     }]
     81 
     82     # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf
     83     def ror(self, param1, param2):
     84         _loc3_ = 0
     85         while _loc3_ < param2:
     86             param1 = urshift(param1, 1) + ((param1 & 1) << 31)
     87             _loc3_ += 1
     88         return param1
     89 
     90     def calc_time_key(self, param1):
     91         _loc2_ = 185025305
     92         return self.ror(param1, _loc2_ % 17) ^ _loc2_
     93 
     94     # see M3U8Encryption class in KLetvPlayer.swf
     95     @staticmethod
     96     def decrypt_m3u8(encrypted_data):
     97         if encrypted_data[:5].decode('utf-8').lower() != 'vc_01':
     98             return encrypted_data
     99         encrypted_data = encrypted_data[5:]
    100 
    101         _loc4_ = bytearray(2 * len(encrypted_data))
    102         for idx, val in enumerate(encrypted_data):
    103             b = compat_ord(val)
    104             _loc4_[2 * idx] = b // 16
    105             _loc4_[2 * idx + 1] = b % 16
    106         idx = len(_loc4_) - 11
    107         _loc4_ = _loc4_[idx:] + _loc4_[:idx]
    108         _loc7_ = bytearray(len(encrypted_data))
    109         for i in range(len(encrypted_data)):
    110             _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1]
    111 
    112         return bytes(_loc7_)
    113 
    114     def _check_errors(self, play_json):
    115         # Check for errors
    116         playstatus = play_json['msgs']['playstatus']
    117         if playstatus['status'] == 0:
    118             flag = playstatus['flag']
    119             if flag == 1:
    120                 self.raise_geo_restricted()
    121             else:
    122                 raise ExtractorError('Generic error. flag = %d' % flag, expected=True)
    123 
    124     def _real_extract(self, url):
    125         media_id = self._match_id(url)
    126         page = self._download_webpage(url, media_id)
    127 
    128         play_json_flash = self._download_json(
    129             'http://player-pc.le.com/mms/out/video/playJson',
    130             media_id, 'Downloading flash playJson data', query={
    131                 'id': media_id,
    132                 'platid': 1,
    133                 'splatid': 105,
    134                 'format': 1,
    135                 'source': 1000,
    136                 'tkey': self.calc_time_key(int(time.time())),
    137                 'domain': 'www.le.com',
    138                 'region': 'cn',
    139             },
    140             headers=self.geo_verification_headers())
    141         self._check_errors(play_json_flash)
    142 
    143         def get_flash_urls(media_url, format_id):
    144             nodes_data = self._download_json(
    145                 media_url, media_id,
    146                 'Download JSON metadata for format %s' % format_id,
    147                 query={
    148                     'm3v': 1,
    149                     'format': 1,
    150                     'expect': 3,
    151                     'tss': 'ios',
    152                 })
    153 
    154             req = self._request_webpage(
    155                 nodes_data['nodelist'][0]['location'], media_id,
    156                 note='Downloading m3u8 information for format %s' % format_id)
    157 
    158             m3u8_data = self.decrypt_m3u8(req.read())
    159 
    160             return {
    161                 'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
    162             }
    163 
    164         extracted_formats = []
    165         formats = []
    166         playurl = play_json_flash['msgs']['playurl']
    167         play_domain = playurl['domain'][0]
    168 
    169         for format_id, format_data in playurl.get('dispatch', []).items():
    170             if format_id in extracted_formats:
    171                 continue
    172             extracted_formats.append(format_id)
    173 
    174             media_url = play_domain + format_data[0]
    175             for protocol, format_url in get_flash_urls(media_url, format_id).items():
    176                 f = {
    177                     'url': format_url,
    178                     'ext': determine_ext(format_data[1]),
    179                     'format_id': '%s-%s' % (protocol, format_id),
    180                     'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
    181                     'quality': int_or_none(format_id),
    182                 }
    183 
    184                 if format_id[-1:] == 'p':
    185                     f['height'] = int_or_none(format_id[:-1])
    186 
    187                 formats.append(f)
    188         self._sort_formats(formats, ('height', 'quality', 'format_id'))
    189 
    190         publish_time = parse_iso8601(self._html_search_regex(
    191             r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
    192             delimiter=' ', timezone=datetime.timedelta(hours=8))
    193         description = self._html_search_meta('description', page, fatal=False)
    194 
    195         return {
    196             'id': media_id,
    197             'formats': formats,
    198             'title': playurl['title'],
    199             'thumbnail': playurl['pic'],
    200             'description': description,
    201             'timestamp': publish_time,
    202         }
    203 
    204 
    205 class LePlaylistIE(InfoExtractor):
    206     _VALID_URL = r'https?://[a-z]+\.le\.com/(?!video)[a-z]+/(?P<id>[a-z0-9_]+)'
    207 
    208     _TESTS = [{
    209         'url': 'http://www.le.com/tv/46177.html',
    210         'info_dict': {
    211             'id': '46177',
    212             'title': '美人天下',
    213             'description': 'md5:395666ff41b44080396e59570dbac01c'
    214         },
    215         'playlist_count': 35
    216     }, {
    217         'url': 'http://tv.le.com/izt/wuzetian/index.html',
    218         'info_dict': {
    219             'id': 'wuzetian',
    220             'title': '武媚娘传奇',
    221             'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
    222         },
    223         # This playlist contains some extra videos other than the drama itself
    224         'playlist_mincount': 96
    225     }, {
    226         'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml',
    227         # This series is moved to http://www.le.com/tv/10005297.html
    228         'only_matching': True,
    229     }, {
    230         'url': 'http://www.le.com/comic/92063.html',
    231         'only_matching': True,
    232     }, {
    233         'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html',
    234         'only_matching': True,
    235     }]
    236 
    237     @classmethod
    238     def suitable(cls, url):
    239         return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url)
    240 
    241     def _real_extract(self, url):
    242         playlist_id = self._match_id(url)
    243         page = self._download_webpage(url, playlist_id)
    244 
    245         # Currently old domain names are still used in playlists
    246         media_ids = orderedSet(re.findall(
    247             r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page))
    248         entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le')
    249                    for media_id in media_ids]
    250 
    251         title = self._html_search_meta('keywords', page,
    252                                        fatal=False).split(',')[0]
    253         description = self._html_search_meta('description', page, fatal=False)
    254 
    255         return self.playlist_result(entries, playlist_id, playlist_title=title,
    256                                     playlist_description=description)
    257 
    258 
    259 class LetvCloudIE(InfoExtractor):
    260     # Most of *.letv.com is changed to *.le.com on 2016/01/02
    261     # but yuntv.letv.com is kept, so also keep the extractor name
    262     IE_DESC = '乐视云'
    263     _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
    264 
    265     _TESTS = [{
    266         'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf',
    267         'md5': '26450599afd64c513bc77030ad15db44',
    268         'info_dict': {
    269             'id': 'p7jnfw5hw9_467623dedf',
    270             'ext': 'mp4',
    271             'title': 'Video p7jnfw5hw9_467623dedf',
    272         },
    273     }, {
    274         'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360',
    275         'md5': 'e03d9cc8d9c13191e1caf277e42dbd31',
    276         'info_dict': {
    277             'id': 'p7jnfw5hw9_ec93197892',
    278             'ext': 'mp4',
    279             'title': 'Video p7jnfw5hw9_ec93197892',
    280         },
    281     }, {
    282         'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd',
    283         'md5': 'cb988699a776b22d4a41b9d43acfb3ac',
    284         'info_dict': {
    285             'id': 'p7jnfw5hw9_187060b6fd',
    286             'ext': 'mp4',
    287             'title': 'Video p7jnfw5hw9_187060b6fd',
    288         },
    289     }]
    290 
    291     @staticmethod
    292     def sign_data(obj):
    293         if obj['cf'] == 'flash':
    294             salt = '2f9d6924b33a165a6d8b5d3d42f4f987'
    295             items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu']
    296         elif obj['cf'] == 'html5':
    297             salt = 'fbeh5player12c43eccf2bec3300344'
    298             items = ['cf', 'ran', 'uu', 'bver', 'vu']
    299         input_data = ''.join([item + obj[item] for item in items]) + salt
    300         obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest()
    301 
    302     def _get_formats(self, cf, uu, vu, media_id):
    303         def get_play_json(cf, timestamp):
    304             data = {
    305                 'cf': cf,
    306                 'ver': '2.2',
    307                 'bver': 'firefox44.0',
    308                 'format': 'json',
    309                 'uu': uu,
    310                 'vu': vu,
    311                 'ran': compat_str(timestamp),
    312             }
    313             self.sign_data(data)
    314             return self._download_json(
    315                 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse_urlencode(data),
    316                 media_id, 'Downloading playJson data for type %s' % cf)
    317 
    318         play_json = get_play_json(cf, time.time())
    319         # The server time may be different from local time
    320         if play_json.get('code') == 10071:
    321             play_json = get_play_json(cf, play_json['timestamp'])
    322 
    323         if not play_json.get('data'):
    324             if play_json.get('message'):
    325                 raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True)
    326             elif play_json.get('code'):
    327                 raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True)
    328             else:
    329                 raise ExtractorError('Letv cloud returned an unknown error')
    330 
    331         def b64decode(s):
    332             return compat_b64decode(s).decode('utf-8')
    333 
    334         formats = []
    335         for media in play_json['data']['video_info']['media'].values():
    336             play_url = media['play_url']
    337             url = b64decode(play_url['main_url'])
    338             decoded_url = b64decode(url_basename(url))
    339             formats.append({
    340                 'url': url,
    341                 'ext': determine_ext(decoded_url),
    342                 'format_id': str_or_none(play_url.get('vtype')),
    343                 'format_note': str_or_none(play_url.get('definition')),
    344                 'width': int_or_none(play_url.get('vwidth')),
    345                 'height': int_or_none(play_url.get('vheight')),
    346             })
    347 
    348         return formats
    349 
    350     def _real_extract(self, url):
    351         uu_mobj = re.search(r'uu=([\w]+)', url)
    352         vu_mobj = re.search(r'vu=([\w]+)', url)
    353 
    354         if not uu_mobj or not vu_mobj:
    355             raise ExtractorError('Invalid URL: %s' % url, expected=True)
    356 
    357         uu = uu_mobj.group(1)
    358         vu = vu_mobj.group(1)
    359         media_id = uu + '_' + vu
    360 
    361         formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id)
    362         self._sort_formats(formats)
    363 
    364         return {
    365             'id': media_id,
    366             'title': 'Video %s' % media_id,
    367             'formats': formats,
    368         }