youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

piksel.py (7045B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import re
      5 
      6 from .common import InfoExtractor
      7 from ..compat import compat_str
      8 from ..utils import (
      9     dict_get,
     10     ExtractorError,
     11     int_or_none,
     12     parse_iso8601,
     13     try_get,
     14     unescapeHTML,
     15 )
     16 
     17 
     18 class PikselIE(InfoExtractor):
     19     _VALID_URL = r'''(?x)https?://
     20         (?:
     21             (?:
     22                 player\.
     23                     (?:
     24                         olympusattelecom|
     25                         vibebyvista
     26                     )|
     27                 (?:api|player)\.multicastmedia|
     28                 (?:api-ovp|player)\.piksel
     29             )\.com|
     30             (?:
     31                 mz-edge\.stream\.co|
     32                 movie-s\.nhk\.or
     33             )\.jp|
     34             vidego\.baltimorecity\.gov
     35         )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)'''
     36     _TESTS = [
     37         {
     38             'url': 'http://player.piksel.com/v/ums2867l',
     39             'md5': '34e34c8d89dc2559976a6079db531e85',
     40             'info_dict': {
     41                 'id': 'ums2867l',
     42                 'ext': 'mp4',
     43                 'title': 'GX-005 with Caption',
     44                 'timestamp': 1481335659,
     45                 'upload_date': '20161210'
     46             }
     47         },
     48         {
     49             # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al
     50             'url': 'https://player.piksel.com/v/v80kqp41',
     51             'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d',
     52             'info_dict': {
     53                 'id': 'v80kqp41',
     54                 'ext': 'mp4',
     55                 'title': 'WAW- State of Washington vs. Donald J. Trump, et al',
     56                 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. Robart presiding.',
     57                 'timestamp': 1486171129,
     58                 'upload_date': '20170204'
     59             }
     60         },
     61         {
     62             # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/
     63             'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477',
     64             'only_matching': True,
     65         }
     66     ]
     67 
     68     @staticmethod
     69     def _extract_url(webpage):
     70         mobj = re.search(
     71             r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)',
     72             webpage)
     73         if mobj:
     74             return mobj.group('url')
     75 
     76     def _call_api(self, app_token, resource, display_id, query, fatal=True):
     77         response = (self._download_json(
     78             'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token),
     79             display_id, query=query, fatal=fatal) or {}).get('response')
     80         failure = try_get(response, lambda x: x['failure']['reason'])
     81         if failure:
     82             if fatal:
     83                 raise ExtractorError(failure, expected=True)
     84             self.report_warning(failure)
     85         return response
     86 
     87     def _real_extract(self, url):
     88         ref_id, display_id = re.match(self._VALID_URL, url).groups()
     89         webpage = self._download_webpage(url, display_id)
     90         app_token = self._search_regex([
     91             r'clientAPI\s*:\s*"([^"]+)"',
     92             r'data-de-api-key\s*=\s*"([^"]+)"'
     93         ], webpage, 'app token')
     94         query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id}
     95         program = self._call_api(
     96             app_token, 'program', display_id, query)['WsProgramResponse']['program']
     97         video_id = program['uuid']
     98         video_data = program['asset']
     99         title = video_data['title']
    100         asset_type = dict_get(video_data, ['assetType', 'asset_type'])
    101 
    102         formats = []
    103 
    104         def process_asset_file(asset_file):
    105             if not asset_file:
    106                 return
    107             # TODO: extract rtmp formats
    108             http_url = asset_file.get('http_url')
    109             if not http_url:
    110                 return
    111             tbr = None
    112             vbr = int_or_none(asset_file.get('videoBitrate'), 1024)
    113             abr = int_or_none(asset_file.get('audioBitrate'), 1024)
    114             if asset_type == 'video':
    115                 tbr = vbr + abr
    116             elif asset_type == 'audio':
    117                 tbr = abr
    118 
    119             format_id = ['http']
    120             if tbr:
    121                 format_id.append(compat_str(tbr))
    122 
    123             formats.append({
    124                 'format_id': '-'.join(format_id),
    125                 'url': unescapeHTML(http_url),
    126                 'vbr': vbr,
    127                 'abr': abr,
    128                 'width': int_or_none(asset_file.get('videoWidth')),
    129                 'height': int_or_none(asset_file.get('videoHeight')),
    130                 'filesize': int_or_none(asset_file.get('filesize')),
    131                 'tbr': tbr,
    132             })
    133 
    134         def process_asset_files(asset_files):
    135             for asset_file in (asset_files or []):
    136                 process_asset_file(asset_file)
    137 
    138         process_asset_files(video_data.get('assetFiles'))
    139         process_asset_file(video_data.get('referenceFile'))
    140         if not formats:
    141             asset_id = video_data.get('assetid') or program.get('assetid')
    142             if asset_id:
    143                 process_asset_files(try_get(self._call_api(
    144                     app_token, 'asset_file', display_id, {
    145                         'assetid': asset_id,
    146                     }, False), lambda x: x['WsAssetFileResponse']['AssetFiles']))
    147 
    148         m3u8_url = dict_get(video_data, [
    149             'm3u8iPadURL',
    150             'ipadM3u8Url',
    151             'm3u8AndroidURL',
    152             'm3u8iPhoneURL',
    153             'iphoneM3u8Url'])
    154         if m3u8_url:
    155             formats.extend(self._extract_m3u8_formats(
    156                 m3u8_url, video_id, 'mp4', 'm3u8_native',
    157                 m3u8_id='hls', fatal=False))
    158 
    159         smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil'])
    160         if smil_url:
    161             transform_source = None
    162             if ref_id == 'nhkworld':
    163                 # TODO: figure out if this is something to be fixed in urljoin,
    164                 # _parse_smil_formats or keep it here
    165                 transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"')
    166             formats.extend(self._extract_smil_formats(
    167                 re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id,
    168                 transform_source=transform_source, fatal=False))
    169 
    170         self._sort_formats(formats)
    171 
    172         subtitles = {}
    173         for caption in video_data.get('captions', []):
    174             caption_url = caption.get('url')
    175             if caption_url:
    176                 subtitles.setdefault(caption.get('locale', 'en'), []).append({
    177                     'url': caption_url})
    178 
    179         return {
    180             'id': video_id,
    181             'title': title,
    182             'description': video_data.get('description'),
    183             'thumbnail': video_data.get('thumbnailUrl'),
    184             'timestamp': parse_iso8601(video_data.get('dateadd')),
    185             'formats': formats,
    186             'subtitles': subtitles,
    187         }