wsj.py (4694B)
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    float_or_none,
    unified_strdate,
)


class WSJIE(InfoExtractor):
    """Extractor for videos addressed by a 36-character GUID.

    Matches the video-api.wsj.com iframe player URL, wsj.com / barrons.com
    ``/video/`` pages, and the internal ``wsj:<guid>`` form used by
    WSJArticleIE.
    """

    _VALID_URL = r'''(?x)
                        (?:
                            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
                            https?://(?:www\.)?(?:wsj|barrons)\.com/video/(?:[^/]+/)+|
                            wsj:
                        )
                        (?P<id>[a-fA-F0-9-]{36})
                    '''
    IE_DESC = 'Wall Street Journal'
    _TESTS = [{
        'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
        'md5': 'e230a5bb249075e40793b655a54a02e4',
        'info_dict': {
            'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
            'ext': 'mp4',
            'upload_date': '20150202',
            'uploader_id': 'jdesai',
            'creator': 'jdesai',
            'categories': list,  # a long list
            'duration': 90,
            'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
        },
    }, {
        'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',
        'only_matching': True,
    }, {
        'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html',
        'only_matching': True,
    }, {
        'url': 'https://www.wsj.com/video/series/a-brief-history-of/the-modern-cell-carrier-how-we-got-here/980E2187-401D-48A1-B82B-1486CEE06CB9',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Look up the video GUID in the WSJ video API and build the info dict.

        Formats are gathered from three optional fields of the API item:
        ``videoURL`` (passed to the f4m extractor), ``hls`` (passed to the
        m3u8 extractor) and ``videoMP4List`` (direct HTTP URLs).
        """
        video_id = self._match_id(url)

        info = self._download_json(
            'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
            query={
                'type': 'guid',
                'count': 1,
                'query': video_id,
                'fields': ','.join((
                    'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
                    'description', 'name', 'duration', 'videoURL', 'titletag',
                    'formattedCreationDate', 'keywords', 'editor')),
            })['items'][0]
        # Use `or` rather than a dict.get default so that a present-but-empty
        # (or null) 'name' still falls back to 'titletag'.
        title = info.get('name') or info.get('titletag')

        formats = []

        f4m_url = info.get('videoURL')
        if f4m_url:
            formats.extend(self._extract_f4m_formats(
                f4m_url, video_id, f4m_id='hds', fatal=False))

        m3u8_url = info.get('hls')
        if m3u8_url:
            formats.extend(self._extract_m3u8_formats(
                m3u8_url, video_id, ext='mp4',
                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))

        # `or []` also covers an explicit null for 'videoMP4List', which a
        # dict.get default would let through and make the loop raise.
        for v in info.get('videoMP4List') or []:
            mp4_url = v.get('url')
            if not mp4_url:
                continue
            tbr = int_or_none(v.get('bitrate'))
            formats.append({
                'url': mp4_url,
                # Suffix the bitrate only when it is known and non-zero
                'format_id': 'http' + ('-%d' % tbr if tbr else ''),
                'tbr': tbr,
                'width': int_or_none(v.get('width')),
                'height': int_or_none(v.get('height')),
                'fps': float_or_none(v.get('fps')),
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'formats': formats,
            # Thumbnails are conveniently in the correct format already
            'thumbnails': info.get('thumbnailList'),
            'creator': info.get('author'),
            'uploader_id': info.get('editor'),
            'duration': int_or_none(info.get('duration')),
            'upload_date': unified_strdate(info.get(
                'formattedCreationDate'), day_first=False),
            'title': title,
            'categories': info.get('keywords'),
        }


class WSJArticleIE(InfoExtractor):
    """Extractor for wsj.com article pages that embed a WSJ video."""

    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
        'info_dict': {
            'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
            'ext': 'mp4',
            'upload_date': '20170221',
            'uploader_id': 'ralcaraz',
            'title': 'Bao Bao the Panda Leaves for China',
        }
    }

    def _real_extract(self, url):
        """Find the embedded video GUID in the article page and delegate to WSJIE."""
        article_id = self._match_id(url)
        webpage = self._download_webpage(url, article_id)
        # The GUID is taken from the first data-src attribute whose value
        # starts with 36 hex/dash characters.
        video_id = self._search_regex(
            r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
        return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)