youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

arcpublishing.py (7970B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import re
      5 
      6 from .common import InfoExtractor
      7 from ..utils import (
      8     extract_attributes,
      9     int_or_none,
     10     parse_iso8601,
     11     try_get,
     12 )
     13 
     14 
     15 class ArcPublishingIE(InfoExtractor):
     16     _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
     17     _VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX
     18     _TESTS = [{
     19         # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/
     20         'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
     21         'only_matching': True,
     22     }, {
     23         # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/
     24         'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',
     25         'only_matching': True,
     26     }, {
     27         # https://www.actionnewsjax.com/video/live-stream/
     28         'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',
     29         'only_matching': True,
     30     }, {
     31         # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/
     32         'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',
     33         'only_matching': True,
     34     }, {
     35         # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/
     36         'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',
     37         'only_matching': True,
     38     }, {
     39         # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/
     40         'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',
     41         'only_matching': True,
     42     }, {
     43         # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/
     44         'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',
     45         'only_matching': True,
     46     }, {
     47         # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/
     48         'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',
     49         'only_matching': True,
     50     }, {
     51         # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/
     52         'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',
     53         'only_matching': True,
     54     }, {
     55         # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/
     56         'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',
     57         'only_matching': True,
     58     }, {
     59         # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/
     60         'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',
     61         'only_matching': True,
     62     }, {
     63         # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html
     64         'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',
     65         'only_matching': True,
     66     }]
     67     _POWA_DEFAULTS = [
     68         (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),
     69         ([
     70             'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',
     71             'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',
     72             'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',
     73         ], 'video-api-cdn.%s.arcpublishing.com/api'),
     74     ]
     75 
     76     @staticmethod
     77     def _extract_urls(webpage):
     78         entries = []
     79         # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
     80         for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):
     81             powa = extract_attributes(powa_el) or {}
     82             org = powa.get('data-org')
     83             uuid = powa.get('data-uuid')
     84             if org and uuid:
     85                 entries.append('arcpublishing:%s:%s' % (org, uuid))
     86         return entries
     87 
     88     def _real_extract(self, url):
     89         org, uuid = re.match(self._VALID_URL, url).groups()
     90         for orgs, tmpl in self._POWA_DEFAULTS:
     91             if org in orgs:
     92                 base_api_tmpl = tmpl
     93                 break
     94         else:
     95             base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api'
     96         if org == 'wapo':
     97             org = 'washpost'
     98         video = self._download_json(
     99             'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org),
    100             uuid, query={'uuid': uuid})[0]
    101         title = video['headlines']['basic']
    102         is_live = video.get('status') == 'live'
    103 
    104         urls = []
    105         formats = []
    106         for s in video.get('streams', []):
    107             s_url = s.get('url')
    108             if not s_url or s_url in urls:
    109                 continue
    110             urls.append(s_url)
    111             stream_type = s.get('stream_type')
    112             if stream_type == 'smil':
    113                 smil_formats = self._extract_smil_formats(
    114                     s_url, uuid, fatal=False)
    115                 for f in smil_formats:
    116                     if f['url'].endswith('/cfx/st'):
    117                         f['app'] = 'cfx/st'
    118                         if not f['play_path'].startswith('mp4:'):
    119                             f['play_path'] = 'mp4:' + f['play_path']
    120                         if isinstance(f['tbr'], float):
    121                             f['vbr'] = f['tbr'] * 1000
    122                             del f['tbr']
    123                             f['format_id'] = 'rtmp-%d' % f['vbr']
    124                 formats.extend(smil_formats)
    125             elif stream_type in ('ts', 'hls'):
    126                 m3u8_formats = self._extract_m3u8_formats(
    127                     s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native',
    128                     m3u8_id='hls', fatal=False)
    129                 if all([f.get('acodec') == 'none' for f in m3u8_formats]):
    130                     continue
    131                 for f in m3u8_formats:
    132                     if f.get('acodec') == 'none':
    133                         f['preference'] = -40
    134                     elif f.get('vcodec') == 'none':
    135                         f['preference'] = -50
    136                     height = f.get('height')
    137                     if not height:
    138                         continue
    139                     vbr = self._search_regex(
    140                         r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None)
    141                     if vbr:
    142                         f['vbr'] = int(vbr)
    143                 formats.extend(m3u8_formats)
    144             else:
    145                 vbr = int_or_none(s.get('bitrate'))
    146                 formats.append({
    147                     'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type,
    148                     'vbr': vbr,
    149                     'width': int_or_none(s.get('width')),
    150                     'height': int_or_none(s.get('height')),
    151                     'filesize': int_or_none(s.get('filesize')),
    152                     'url': s_url,
    153                     'preference': -1,
    154                 })
    155         self._sort_formats(
    156             formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id'))
    157 
    158         subtitles = {}
    159         for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):
    160             subtitle_url = subtitle.get('url')
    161             if subtitle_url:
    162                 subtitles.setdefault('en', []).append({'url': subtitle_url})
    163 
    164         return {
    165             'id': uuid,
    166             'title': self._live_title(title) if is_live else title,
    167             'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
    168             'description': try_get(video, lambda x: x['subheadlines']['basic']),
    169             'formats': formats,
    170             'duration': int_or_none(video.get('duration'), 100),
    171             'timestamp': parse_iso8601(video.get('created_date')),
    172             'subtitles': subtitles,
    173             'is_live': is_live,
    174         }