youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

bilibili.py (16619B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import hashlib
      5 import re
      6 
      7 from .common import InfoExtractor
      8 from ..compat import (
      9     compat_parse_qs,
     10     compat_urlparse,
     11 )
     12 from ..utils import (
     13     ExtractorError,
     14     int_or_none,
     15     float_or_none,
     16     parse_iso8601,
     17     smuggle_url,
     18     str_or_none,
     19     strip_jsonp,
     20     unified_timestamp,
     21     unsmuggle_url,
     22     urlencode_postdata,
     23 )
     24 
     25 
     26 class BiliBiliIE(InfoExtractor):
     27     _VALID_URL = r'''(?x)
     28                     https?://
     29                         (?:(?:www|bangumi)\.)?
     30                         bilibili\.(?:tv|com)/
     31                         (?:
     32                             (?:
     33                                 video/[aA][vV]|
     34                                 anime/(?P<anime_id>\d+)/play\#
     35                             )(?P<id_bv>\d+)|
     36                             video/[bB][vV](?P<id>[^/?#&]+)
     37                         )
     38                     '''
     39 
     40     _TESTS = [{
     41         'url': 'http://www.bilibili.tv/video/av1074402/',
     42         'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
     43         'info_dict': {
     44             'id': '1074402',
     45             'ext': 'flv',
     46             'title': '【金坷垃】金泡沫',
     47             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
     48             'duration': 308.067,
     49             'timestamp': 1398012678,
     50             'upload_date': '20140420',
     51             'thumbnail': r're:^https?://.+\.jpg',
     52             'uploader': '菊子桑',
     53             'uploader_id': '156160',
     54         },
     55     }, {
     56         # Tested in BiliBiliBangumiIE
     57         'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
     58         'only_matching': True,
     59     }, {
     60         'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
     61         'md5': '3f721ad1e75030cc06faf73587cfec57',
     62         'info_dict': {
     63             'id': '100643',
     64             'ext': 'mp4',
     65             'title': 'CHAOS;CHILD',
     66             'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
     67         },
     68         'skip': 'Geo-restricted to China',
     69     }, {
     70         # Title with double quotes
     71         'url': 'http://www.bilibili.com/video/av8903802/',
     72         'info_dict': {
     73             'id': '8903802',
     74             'title': '阿滴英文|英文歌分享#6 "Closer',
     75             'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
     76         },
     77         'playlist': [{
     78             'info_dict': {
     79                 'id': '8903802_part1',
     80                 'ext': 'flv',
     81                 'title': '阿滴英文|英文歌分享#6 "Closer',
     82                 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
     83                 'uploader': '阿滴英文',
     84                 'uploader_id': '65880958',
     85                 'timestamp': 1488382634,
     86                 'upload_date': '20170301',
     87             },
     88             'params': {
     89                 'skip_download': True,  # Test metadata only
     90             },
     91         }, {
     92             'info_dict': {
     93                 'id': '8903802_part2',
     94                 'ext': 'flv',
     95                 'title': '阿滴英文|英文歌分享#6 "Closer',
     96                 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
     97                 'uploader': '阿滴英文',
     98                 'uploader_id': '65880958',
     99                 'timestamp': 1488382634,
    100                 'upload_date': '20170301',
    101             },
    102             'params': {
    103                 'skip_download': True,  # Test metadata only
    104             },
    105         }]
    106     }, {
    107         # new BV video id format
    108         'url': 'https://www.bilibili.com/video/BV1JE411F741',
    109         'only_matching': True,
    110     }]
    111 
    112     _APP_KEY = 'iVGUTjsxvpLeuDCf'
    113     _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
    114 
    115     def _report_error(self, result):
    116         if 'message' in result:
    117             raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
    118         elif 'code' in result:
    119             raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
    120         else:
    121             raise ExtractorError('Can\'t extract Bangumi episode ID')
    122 
    123     def _real_extract(self, url):
    124         url, smuggled_data = unsmuggle_url(url, {})
    125 
    126         mobj = re.match(self._VALID_URL, url)
    127         video_id = mobj.group('id') or mobj.group('id_bv')
    128         anime_id = mobj.group('anime_id')
    129         webpage = self._download_webpage(url, video_id)
    130 
    131         if 'anime/' not in url:
    132             cid = self._search_regex(
    133                 r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
    134                 default=None
    135             ) or compat_parse_qs(self._search_regex(
    136                 [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
    137                  r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
    138                  r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
    139                 webpage, 'player parameters'))['cid'][0]
    140         else:
    141             if 'no_bangumi_tip' not in smuggled_data:
    142                 self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % (
    143                     video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
    144             headers = {
    145                 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    146                 'Referer': url
    147             }
    148             headers.update(self.geo_verification_headers())
    149 
    150             js = self._download_json(
    151                 'http://bangumi.bilibili.com/web_api/get_source', video_id,
    152                 data=urlencode_postdata({'episode_id': video_id}),
    153                 headers=headers)
    154             if 'result' not in js:
    155                 self._report_error(js)
    156             cid = js['result']['cid']
    157 
    158         headers = {
    159             'Accept': 'application/json',
    160             'Referer': url
    161         }
    162         headers.update(self.geo_verification_headers())
    163 
    164         entries = []
    165 
    166         RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
    167         for num, rendition in enumerate(RENDITIONS, start=1):
    168             payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
    169             sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
    170 
    171             video_info = self._download_json(
    172                 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
    173                 video_id, note='Downloading video info page',
    174                 headers=headers, fatal=num == len(RENDITIONS))
    175 
    176             if not video_info:
    177                 continue
    178 
    179             if 'durl' not in video_info:
    180                 if num < len(RENDITIONS):
    181                     continue
    182                 self._report_error(video_info)
    183 
    184             for idx, durl in enumerate(video_info['durl']):
    185                 formats = [{
    186                     'url': durl['url'],
    187                     'filesize': int_or_none(durl['size']),
    188                 }]
    189                 for backup_url in durl.get('backup_url', []):
    190                     formats.append({
    191                         'url': backup_url,
    192                         # backup URLs have lower priorities
    193                         'preference': -2 if 'hd.mp4' in backup_url else -3,
    194                     })
    195 
    196                 for a_format in formats:
    197                     a_format.setdefault('http_headers', {}).update({
    198                         'Referer': url,
    199                     })
    200 
    201                 self._sort_formats(formats)
    202 
    203                 entries.append({
    204                     'id': '%s_part%s' % (video_id, idx),
    205                     'duration': float_or_none(durl.get('length'), 1000),
    206                     'formats': formats,
    207                 })
    208             break
    209 
    210         title = self._html_search_regex(
    211             ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
    212              '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
    213             group='title')
    214         description = self._html_search_meta('description', webpage)
    215         timestamp = unified_timestamp(self._html_search_regex(
    216             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
    217             default=None) or self._html_search_meta(
    218             'uploadDate', webpage, 'timestamp', default=None))
    219         thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
    220 
    221         # TODO 'view_count' requires deobfuscating Javascript
    222         info = {
    223             'id': video_id,
    224             'title': title,
    225             'description': description,
    226             'timestamp': timestamp,
    227             'thumbnail': thumbnail,
    228             'duration': float_or_none(video_info.get('timelength'), scale=1000),
    229         }
    230 
    231         uploader_mobj = re.search(
    232             r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
    233             webpage)
    234         if uploader_mobj:
    235             info.update({
    236                 'uploader': uploader_mobj.group('name').strip(),
    237                 'uploader_id': uploader_mobj.group('id'),
    238             })
    239         if not info.get('uploader'):
    240             info['uploader'] = self._html_search_meta(
    241                 'author', webpage, 'uploader', default=None)
    242 
    243         for entry in entries:
    244             entry.update(info)
    245 
    246         if len(entries) == 1:
    247             return entries[0]
    248         else:
    249             for idx, entry in enumerate(entries):
    250                 entry['id'] = '%s_part%d' % (video_id, (idx + 1))
    251 
    252             return {
    253                 '_type': 'multi_video',
    254                 'id': video_id,
    255                 'title': title,
    256                 'description': description,
    257                 'entries': entries,
    258             }
    259 
    260 
    261 class BiliBiliBangumiIE(InfoExtractor):
    262     _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'
    263 
    264     IE_NAME = 'bangumi.bilibili.com'
    265     IE_DESC = 'BiliBili番剧'
    266 
    267     _TESTS = [{
    268         'url': 'http://bangumi.bilibili.com/anime/1869',
    269         'info_dict': {
    270             'id': '1869',
    271             'title': '混沌武士',
    272             'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
    273         },
    274         'playlist_count': 26,
    275     }, {
    276         'url': 'http://bangumi.bilibili.com/anime/1869',
    277         'info_dict': {
    278             'id': '1869',
    279             'title': '混沌武士',
    280             'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
    281         },
    282         'playlist': [{
    283             'md5': '91da8621454dd58316851c27c68b0c13',
    284             'info_dict': {
    285                 'id': '40062',
    286                 'ext': 'mp4',
    287                 'title': '混沌武士',
    288                 'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...',
    289                 'timestamp': 1414538739,
    290                 'upload_date': '20141028',
    291                 'episode': '疾风怒涛 Tempestuous Temperaments',
    292                 'episode_number': 1,
    293             },
    294         }],
    295         'params': {
    296             'playlist_items': '1',
    297         },
    298     }]
    299 
    300     @classmethod
    301     def suitable(cls, url):
    302         return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url)
    303 
    304     def _real_extract(self, url):
    305         bangumi_id = self._match_id(url)
    306 
    307         # Sometimes this API returns a JSONP response
    308         season_info = self._download_json(
    309             'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
    310             bangumi_id, transform_source=strip_jsonp)['result']
    311 
    312         entries = [{
    313             '_type': 'url_transparent',
    314             'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
    315             'ie_key': BiliBiliIE.ie_key(),
    316             'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
    317             'episode': episode.get('index_title'),
    318             'episode_number': int_or_none(episode.get('index')),
    319         } for episode in season_info['episodes']]
    320 
    321         entries = sorted(entries, key=lambda entry: entry.get('episode_number'))
    322 
    323         return self.playlist_result(
    324             entries, bangumi_id,
    325             season_info.get('bangumi_title'), season_info.get('evaluate'))
    326 
    327 
    328 class BilibiliAudioBaseIE(InfoExtractor):
    329     def _call_api(self, path, sid, query=None):
    330         if not query:
    331             query = {'sid': sid}
    332         return self._download_json(
    333             'https://www.bilibili.com/audio/music-service-c/web/' + path,
    334             sid, query=query)['data']
    335 
    336 
    337 class BilibiliAudioIE(BilibiliAudioBaseIE):
    338     _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
    339     _TEST = {
    340         'url': 'https://www.bilibili.com/audio/au1003142',
    341         'md5': 'fec4987014ec94ef9e666d4d158ad03b',
    342         'info_dict': {
    343             'id': '1003142',
    344             'ext': 'm4a',
    345             'title': '【tsukimi】YELLOW / 神山羊',
    346             'artist': 'tsukimi',
    347             'comment_count': int,
    348             'description': 'YELLOW的mp3版!',
    349             'duration': 183,
    350             'subtitles': {
    351                 'origin': [{
    352                     'ext': 'lrc',
    353                 }],
    354             },
    355             'thumbnail': r're:^https?://.+\.jpg',
    356             'timestamp': 1564836614,
    357             'upload_date': '20190803',
    358             'uploader': 'tsukimi-つきみぐー',
    359             'view_count': int,
    360         },
    361     }
    362 
    363     def _real_extract(self, url):
    364         au_id = self._match_id(url)
    365 
    366         play_data = self._call_api('url', au_id)
    367         formats = [{
    368             'url': play_data['cdns'][0],
    369             'filesize': int_or_none(play_data.get('size')),
    370         }]
    371 
    372         song = self._call_api('song/info', au_id)
    373         title = song['title']
    374         statistic = song.get('statistic') or {}
    375 
    376         subtitles = None
    377         lyric = song.get('lyric')
    378         if lyric:
    379             subtitles = {
    380                 'origin': [{
    381                     'url': lyric,
    382                 }]
    383             }
    384 
    385         return {
    386             'id': au_id,
    387             'title': title,
    388             'formats': formats,
    389             'artist': song.get('author'),
    390             'comment_count': int_or_none(statistic.get('comment')),
    391             'description': song.get('intro'),
    392             'duration': int_or_none(song.get('duration')),
    393             'subtitles': subtitles,
    394             'thumbnail': song.get('cover'),
    395             'timestamp': int_or_none(song.get('passtime')),
    396             'uploader': song.get('uname'),
    397             'view_count': int_or_none(statistic.get('play')),
    398         }
    399 
    400 
    401 class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
    402     _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
    403     _TEST = {
    404         'url': 'https://www.bilibili.com/audio/am10624',
    405         'info_dict': {
    406             'id': '10624',
    407             'title': '每日新曲推荐(每日11:00更新)',
    408             'description': '每天11:00更新,为你推送最新音乐',
    409         },
    410         'playlist_count': 19,
    411     }
    412 
    413     def _real_extract(self, url):
    414         am_id = self._match_id(url)
    415 
    416         songs = self._call_api(
    417             'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']
    418 
    419         entries = []
    420         for song in songs:
    421             sid = str_or_none(song.get('id'))
    422             if not sid:
    423                 continue
    424             entries.append(self.url_result(
    425                 'https://www.bilibili.com/audio/au' + sid,
    426                 BilibiliAudioIE.ie_key(), sid))
    427 
    428         if entries:
    429             album_data = self._call_api('menu/info', am_id) or {}
    430             album_title = album_data.get('title')
    431             if album_title:
    432                 for entry in entries:
    433                     entry['album'] = album_title
    434                 return self.playlist_result(
    435                     entries, am_id, album_title, album_data.get('intro'))
    436 
    437         return self.playlist_result(entries, am_id)
    438 
    439 
    440 class BiliBiliPlayerIE(InfoExtractor):
    441     _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)'
    442     _TEST = {
    443         'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1',
    444         'only_matching': True,
    445     }
    446 
    447     def _real_extract(self, url):
    448         video_id = self._match_id(url)
    449         return self.url_result(
    450             'http://www.bilibili.tv/video/av%s/' % video_id,
    451             ie=BiliBiliIE.ie_key(), video_id=video_id)