gdcvault.py (8596B)
1 from __future__ import unicode_literals 2 3 import re 4 5 from .common import InfoExtractor 6 from .kaltura import KalturaIE 7 from ..utils import ( 8 HEADRequest, 9 remove_start, 10 sanitized_Request, 11 smuggle_url, 12 urlencode_postdata, 13 ) 14 15 16 class GDCVaultIE(InfoExtractor): 17 _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)(?:/(?P<name>[\w-]+))?' 18 _NETRC_MACHINE = 'gdcvault' 19 _TESTS = [ 20 { 21 'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple', 22 'md5': '7ce8388f544c88b7ac11c7ab1b593704', 23 'info_dict': { 24 'id': '201311826596_AWNY', 25 'display_id': 'Doki-Doki-Universe-Sweet-Simple', 26 'ext': 'mp4', 27 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)' 28 } 29 }, 30 { 31 'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of', 32 'info_dict': { 33 'id': '201203272_1330951438328RSXR', 34 'display_id': 'Embracing-the-Dark-Art-of', 35 'ext': 'flv', 36 'title': 'Embracing the Dark Art of Mathematical Modeling in AI' 37 }, 38 'params': { 39 'skip_download': True, # Requires rtmpdump 40 } 41 }, 42 { 43 'url': 'http://www.gdcvault.com/play/1015301/Thexder-Meets-Windows-95-or', 44 'md5': 'a5eb77996ef82118afbbe8e48731b98e', 45 'info_dict': { 46 'id': '1015301', 47 'display_id': 'Thexder-Meets-Windows-95-or', 48 'ext': 'flv', 49 'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment', 50 }, 51 'skip': 'Requires login', 52 }, 53 { 54 'url': 'http://gdcvault.com/play/1020791/', 55 'only_matching': True, 56 }, 57 { 58 # Hard-coded hostname 59 'url': 'http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface', 60 'md5': 'a8efb6c31ed06ca8739294960b2dbabd', 61 'info_dict': { 62 'id': '840376_BQRC', 63 'ext': 'mp4', 64 'display_id': 'Tenacious-Design-and-The-Interface', 65 'title': 'Tenacious Design and The Interface of \'Destiny\'', 66 }, 67 }, 68 { 69 # Multiple audios 70 'url': 'http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC', 71 'info_dict': { 72 'id': '12396_1299111843500GMPX', 73 'ext': 'mp4', 74 'title': 'How to Create a Good Game - From My Experience of Designing Pac-Man', 75 }, 76 # 'params': { 77 # 'skip_download': True, # Requires rtmpdump 78 # 'format': 'jp', # The japanese audio 79 # } 80 }, 81 { 82 # gdc-player.html 83 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo', 84 'info_dict': { 85 'id': '9350_1238021887562UHXB', 86 'display_id': 'An-American-engine-in-Tokyo', 87 'ext': 'mp4', 88 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT', 89 }, 90 }, 91 { 92 # Kaltura Embed 93 'url': 'https://www.gdcvault.com/play/1026180/Mastering-the-Apex-of-Scaling', 94 'info_dict': { 95 'id': '0_h1fg8j3p', 96 'ext': 'mp4', 97 'title': 'Mastering the Apex of Scaling Game Servers (Presented by Multiplay)', 98 'timestamp': 1554401811, 99 'upload_date': '20190404', 100 'uploader_id': 'joe@blazestreaming.com', 101 }, 102 'params': { 103 'format': 'mp4-408', 104 }, 105 }, 106 { 107 # Kaltura embed, whitespace between quote and embedded URL in iframe's src 108 'url': 'https://www.gdcvault.com/play/1025699', 109 'info_dict': { 110 'id': '0_zagynv0a', 111 'ext': 'mp4', 112 'title': 'Tech Toolbox', 113 'upload_date': '20190408', 114 'uploader_id': 'joe@blazestreaming.com', 115 'timestamp': 1554764629, 116 }, 117 'params': { 118 'skip_download': True, 119 }, 120 }, 121 { 122 # HTML5 video 123 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', 124 'only_matching': True, 125 }, 126 ] 127 128 def _login(self, webpage_url, display_id): 129 username, password = self._get_login_info() 130 if username is None or password is None: 131 self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.') 132 return None 133 134 mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url) 135 login_url = mobj.group('root_url') + 'api/login.php' 136 logout_url = mobj.group('root_url') + 'logout' 137 138 login_form = { 139 'email': username, 140 'password': password, 141 } 142 143 request = sanitized_Request(login_url, urlencode_postdata(login_form)) 144 request.add_header('Content-Type', 'application/x-www-form-urlencoded') 145 self._download_webpage(request, display_id, 'Logging in') 146 start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') 147 self._download_webpage(logout_url, display_id, 'Logging out') 148 149 return start_page 150 151 def _real_extract(self, url): 152 video_id, name = re.match(self._VALID_URL, url).groups() 153 display_id = name or video_id 154 155 webpage_url = 'http://www.gdcvault.com/play/' + video_id 156 start_page = self._download_webpage(webpage_url, display_id) 157 158 direct_url = self._search_regex( 159 r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);', 160 start_page, 'url', default=None) 161 if direct_url: 162 title = self._html_search_regex( 163 r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>', 164 start_page, 'title') 165 video_url = 'http://www.gdcvault.com' + direct_url 166 # resolve the url so that we can detect the correct extension 167 video_url = self._request_webpage( 168 HEADRequest(video_url), video_id).geturl() 169 170 return { 171 'id': video_id, 172 'display_id': display_id, 173 'url': video_url, 174 'title': title, 175 } 176 177 embed_url = KalturaIE._extract_url(start_page) 178 if embed_url: 179 embed_url = smuggle_url(embed_url, {'source_url': url}) 180 ie_key = 'Kaltura' 181 else: 182 PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' 183 184 xml_root = self._html_search_regex( 185 PLAYER_REGEX, start_page, 'xml root', default=None) 186 if xml_root is None: 187 # Probably need to authenticate 188 login_res = self._login(webpage_url, display_id) 189 if login_res is None: 190 self.report_warning('Could not login.') 191 else: 192 start_page = login_res 193 # Grab the url from the authenticated page 194 xml_root = self._html_search_regex( 195 PLAYER_REGEX, start_page, 'xml root') 196 197 xml_name = self._html_search_regex( 198 r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>', 199 start_page, 'xml filename', default=None) 200 if not xml_name: 201 info = self._parse_html5_media_entries(url, start_page, video_id)[0] 202 info.update({ 203 'title': remove_start(self._search_regex( 204 r'>Session Name:\s*<.*?>\s*<td>(.+?)</td>', start_page, 205 'title', default=None) or self._og_search_title( 206 start_page, default=None), 'GDC Vault - '), 207 'id': video_id, 208 'display_id': display_id, 209 }) 210 return info 211 embed_url = '%s/xml/%s' % (xml_root, xml_name) 212 ie_key = 'DigitallySpeaking' 213 214 return { 215 '_type': 'url_transparent', 216 'id': video_id, 217 'display_id': display_id, 218 'url': embed_url, 219 'ie_key': ie_key, 220 }