youtube-dl

Another place where youtube-dl lives on.
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

nytimes.py (10270B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import hmac
      5 import hashlib
      6 import base64
      7 
      8 from .common import InfoExtractor
      9 from ..utils import (
     10     determine_ext,
     11     float_or_none,
     12     int_or_none,
     13     js_to_json,
     14     mimetype2ext,
     15     parse_iso8601,
     16     remove_start,
     17 )
     18 
     19 
     20 class NYTimesBaseIE(InfoExtractor):
     21     _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'
     22 
     23     def _extract_video_from_id(self, video_id):
     24         # Authorization generation algorithm is reverse engineered from `signer` in
     25         # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
     26         path = '/svc/video/api/v3/video/' + video_id
     27         hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
     28         video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={
     29             'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
     30             'X-NYTV': 'vhs',
     31         }, fatal=False)
     32         if not video_data:
     33             video_data = self._download_json(
     34                 'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
     35                 video_id, 'Downloading video JSON')
     36 
     37         title = video_data['headline']
     38 
     39         def get_file_size(file_size):
     40             if isinstance(file_size, int):
     41                 return file_size
     42             elif isinstance(file_size, dict):
     43                 return int(file_size.get('value', 0))
     44             else:
     45                 return None
     46 
     47         urls = []
     48         formats = []
     49         for video in video_data.get('renditions', []):
     50             video_url = video.get('url')
     51             format_id = video.get('type')
     52             if not video_url or format_id == 'thumbs' or video_url in urls:
     53                 continue
     54             urls.append(video_url)
     55             ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
     56             if ext == 'm3u8':
     57                 formats.extend(self._extract_m3u8_formats(
     58                     video_url, video_id, 'mp4', 'm3u8_native',
     59                     m3u8_id=format_id or 'hls', fatal=False))
     60             elif ext == 'mpd':
     61                 continue
     62             #     formats.extend(self._extract_mpd_formats(
     63             #         video_url, video_id, format_id or 'dash', fatal=False))
     64             else:
     65                 formats.append({
     66                     'url': video_url,
     67                     'format_id': format_id,
     68                     'vcodec': video.get('videoencoding') or video.get('video_codec'),
     69                     'width': int_or_none(video.get('width')),
     70                     'height': int_or_none(video.get('height')),
     71                     'filesize': get_file_size(video.get('file_size') or video.get('fileSize')),
     72                     'tbr': int_or_none(video.get('bitrate'), 1000) or None,
     73                     'ext': ext,
     74                 })
     75         self._sort_formats(formats, ('height', 'width', 'filesize', 'tbr', 'fps', 'format_id'))
     76 
     77         thumbnails = []
     78         for image in video_data.get('images', []):
     79             image_url = image.get('url')
     80             if not image_url:
     81                 continue
     82             thumbnails.append({
     83                 'url': 'http://www.nytimes.com/' + image_url,
     84                 'width': int_or_none(image.get('width')),
     85                 'height': int_or_none(image.get('height')),
     86             })
     87 
     88         publication_date = video_data.get('publication_date')
     89         timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None
     90 
     91         return {
     92             'id': video_id,
     93             'title': title,
     94             'description': video_data.get('summary'),
     95             'timestamp': timestamp,
     96             'uploader': video_data.get('byline'),
     97             'duration': float_or_none(video_data.get('duration'), 1000),
     98             'formats': formats,
     99             'thumbnails': thumbnails,
    100         }
    101 
    102 
    103 class NYTimesIE(NYTimesBaseIE):
    104     _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
    105 
    106     _TESTS = [{
    107         'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
    108         'md5': 'd665342765db043f7e225cff19df0f2d',
    109         'info_dict': {
    110             'id': '100000002847155',
    111             'ext': 'mov',
    112             'title': 'Verbatim: What Is a Photocopier?',
    113             'description': 'md5:93603dada88ddbda9395632fdc5da260',
    114             'timestamp': 1398631707,
    115             'upload_date': '20140427',
    116             'uploader': 'Brett Weiner',
    117             'duration': 419,
    118         }
    119     }, {
    120         'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
    121         'only_matching': True,
    122     }]
    123 
    124     def _real_extract(self, url):
    125         video_id = self._match_id(url)
    126 
    127         return self._extract_video_from_id(video_id)
    128 
    129 
    130 class NYTimesArticleIE(NYTimesBaseIE):
    131     _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
    132     _TESTS = [{
    133         'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
    134         'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
    135         'info_dict': {
    136             'id': '100000003628438',
    137             'ext': 'mov',
    138             'title': 'New Minimum Wage: $70,000 a Year',
    139             'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
    140             'timestamp': 1429033037,
    141             'upload_date': '20150414',
    142             'uploader': 'Matthew Williams',
    143         }
    144     }, {
    145         'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
    146         'md5': 'e0d52040cafb07662acf3c9132db3575',
    147         'info_dict': {
    148             'id': '100000004709062',
    149             'title': 'The Run-Up: ‘He Was Like an Octopus’',
    150             'ext': 'mp3',
    151             'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4',
    152             'series': 'The Run-Up',
    153             'episode': '‘He Was Like an Octopus’',
    154             'episode_number': 20,
    155             'duration': 2130,
    156         }
    157     }, {
    158         'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
    159         'info_dict': {
    160             'id': '100000004709479',
    161             'title': 'The Rise of Hitler',
    162             'ext': 'mp3',
    163             'description': 'md5:bce877fd9e3444990cb141875fab0028',
    164             'creator': 'Pamela Paul',
    165             'duration': 3475,
    166         },
    167         'params': {
    168             'skip_download': True,
    169         },
    170     }, {
    171         'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
    172         'only_matching': True,
    173     }]
    174 
    175     def _extract_podcast_from_json(self, json, page_id, webpage):
    176         podcast_audio = self._parse_json(
    177             json, page_id, transform_source=js_to_json)
    178 
    179         audio_data = podcast_audio['data']
    180         track = audio_data['track']
    181 
    182         episode_title = track['title']
    183         video_url = track['source']
    184 
    185         description = track.get('description') or self._html_search_meta(
    186             ['og:description', 'twitter:description'], webpage)
    187 
    188         podcast_title = audio_data.get('podcast', {}).get('title')
    189         title = ('%s: %s' % (podcast_title, episode_title)
    190                  if podcast_title else episode_title)
    191 
    192         episode = audio_data.get('podcast', {}).get('episode') or ''
    193         episode_number = int_or_none(self._search_regex(
    194             r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None))
    195 
    196         return {
    197             'id': remove_start(podcast_audio.get('target'), 'FT') or page_id,
    198             'url': video_url,
    199             'title': title,
    200             'description': description,
    201             'creator': track.get('credit'),
    202             'series': podcast_title,
    203             'episode': episode_title,
    204             'episode_number': episode_number,
    205             'duration': int_or_none(track.get('duration')),
    206         }
    207 
    208     def _real_extract(self, url):
    209         page_id = self._match_id(url)
    210 
    211         webpage = self._download_webpage(url, page_id)
    212 
    213         video_id = self._search_regex(
    214             r'data-videoid=["\'](\d+)', webpage, 'video id',
    215             default=None, fatal=False)
    216         if video_id is not None:
    217             return self._extract_video_from_id(video_id)
    218 
    219         podcast_data = self._search_regex(
    220             (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script',
    221              r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
    222             webpage, 'podcast data')
    223         return self._extract_podcast_from_json(podcast_data, page_id, webpage)
    224 
    225 
    226 class NYTimesCookingIE(NYTimesBaseIE):
    227     _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)'
    228     _TESTS = [{
    229         'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
    230         'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3',
    231         'info_dict': {
    232             'id': '100000004756089',
    233             'ext': 'mov',
    234             'timestamp': 1479383008,
    235             'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON',
    236             'title': 'Cranberry Tart',
    237             'upload_date': '20161117',
    238             'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.',
    239         },
    240     }, {
    241         'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
    242         'md5': '4b2e8c70530a89b8d905a2b572316eb8',
    243         'info_dict': {
    244             'id': '100000003951728',
    245             'ext': 'mov',
    246             'timestamp': 1445509539,
    247             'description': 'Turkey guide',
    248             'upload_date': '20151022',
    249             'title': 'Turkey',
    250         }
    251     }]
    252 
    253     def _real_extract(self, url):
    254         page_id = self._match_id(url)
    255 
    256         webpage = self._download_webpage(url, page_id)
    257 
    258         video_id = self._search_regex(
    259             r'data-video-id=["\'](\d+)', webpage, 'video id')
    260 
    261         return self._extract_video_from_id(video_id)