imdb.py (5248B)
from __future__ import unicode_literals

import base64
import json
import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    mimetype2ext,
    parse_duration,
    qualities,
    try_get,
    url_or_none,
)


class ImdbIE(InfoExtractor):
    # Extractor for a single IMDb video identified by its numeric "vi" id.
    IE_NAME = 'imdb'
    IE_DESC = 'Internet Movie Database trailers'
    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).*?[/-]vi(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.imdb.com/video/imdb/vi2524815897',
        'info_dict': {
            'id': '2524815897',
            'ext': 'mp4',
            'title': 'No. 2',
            'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
            'duration': 152,
        }
    }, {
        'url': 'http://www.imdb.com/video/_/vi2524815897',
        'only_matching': True,
    }, {
        'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
        'only_matching': True,
    }, {
        'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
        'only_matching': True,
    }, {
        'url': 'http://www.imdb.com/videoplayer/vi1562949145',
        'only_matching': True,
    }, {
        'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
        'only_matching': True,
    }, {
        'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video referenced by ``url``.

        Formats come from the ``VIDEO_PLAYBACK_DATA`` JSON endpoint; title,
        description, thumbnail and duration come from the video web page.

        Raises ``KeyError`` if no title can be found in the page meta tags,
        the ``<title>`` element or the embedded video info.
        """
        video_id = self._match_id(url)

        # The endpoint takes a base64-encoded JSON descriptor as its ``key``
        # query parameter; the first element of the response is used.
        data = self._download_json(
            'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
            query={
                'key': base64.b64encode(json.dumps({
                    'type': 'VIDEO_PLAYER',
                    'subType': 'FORCE_LEGACY',
                    'id': 'vi%s' % video_id,
                }).encode()).decode(),
            })[0]

        quality = qualities(('SD', '480p', '720p', '1080p'))
        formats = []
        for encoding in data['videoLegacyEncodings']:
            # Skip empty or malformed entries instead of failing on them.
            if not encoding or not isinstance(encoding, dict):
                continue
            video_url = url_or_none(encoding.get('url'))
            if not video_url:
                continue
            # Prefer the declared MIME type; fall back to the URL's extension.
            ext = mimetype2ext(
                encoding.get('mimeType')) or determine_ext(video_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    preference=1, m3u8_id='hls', fatal=False))
                continue
            format_id = encoding.get('definition')
            formats.append({
                'format_id': format_id,
                'url': video_url,
                'ext': ext,
                'quality': quality(format_id),
            })
        self._sort_formats(formats)

        webpage = self._download_webpage(
            'https://www.imdb.com/video/vi' + video_id, video_id)
        video_metadata = self._parse_json(self._search_regex(
            r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
            'video metadata'), video_id)

        video_info = video_metadata.get('VIDEO_INFO')
        if video_info and isinstance(video_info, dict):
            # try_get yields None when the first entry is missing or is not a
            # dict; fall back to an empty dict so the .get() lookups below
            # stay safe instead of failing on None.
            info = try_get(
                video_info, lambda x: x[list(video_info.keys())[0]][0],
                dict) or {}
        else:
            info = {}

        # The title is looked up in the page meta tags first, then in the
        # <title> element, and finally in the embedded video info.
        title = self._html_search_meta(
            ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'title',
            default=None) or info['videoTitle']

        return {
            'id': video_id,
            'title': title,
            'alt_title': info.get('videoSubTitle'),
            'formats': formats,
            'description': info.get('videoDescription'),
            'thumbnail': url_or_none(try_get(
                video_metadata, lambda x: x['videoSlate']['source'])),
            'duration': parse_duration(info.get('videoRuntime')),
        }


class ImdbListIE(InfoExtractor):
    # Extractor for an IMDb list page; the negative lookahead in _VALID_URL
    # excludes list URLs that point at a single video player page.
    IE_NAME = 'imdb:list'
    IE_DESC = 'Internet Movie Database lists'
    _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P<id>\d{9})(?!/videoplayer/vi\d+)'
    _TEST = {
        'url': 'https://www.imdb.com/list/ls009921623/',
        'info_dict': {
            'id': '009921623',
            'title': 'The Bourne Legacy',
            'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.',
        },
        'playlist_count': 8,
    }

    def _real_extract(self, url):
        """Return a playlist of the video player links found on the list page."""
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)

        # Each matching link on the page becomes a URL result handed to the
        # 'Imdb' extractor.
        entries = [
            self.url_result('http://www.imdb.com' + m, 'Imdb')
            for m in re.findall(r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)]

        list_title = self._html_search_regex(
            r'<h1[^>]+class="[^"]*header[^"]*"[^>]*>(.*?)</h1>',
            webpage, 'list title')
        list_description = self._html_search_regex(
            r'<div[^>]+class="[^"]*list-description[^"]*"[^>]*><p>(.*?)</p>',
            webpage, 'list description')

        return self.playlist_result(entries, list_id, list_title, list_description)