instagram.py (17625B)
1 from __future__ import unicode_literals 2 3 import itertools 4 import hashlib 5 import json 6 import re 7 8 from .common import InfoExtractor 9 from ..compat import ( 10 compat_str, 11 compat_HTTPError, 12 ) 13 from ..utils import ( 14 ExtractorError, 15 float_or_none, 16 get_element_by_attribute, 17 int_or_none, 18 lowercase_escape, 19 std_headers, 20 try_get, 21 url_or_none, 22 ) 23 24 25 class InstagramIE(InfoExtractor): 26 _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' 27 _TESTS = [{ 28 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 29 'md5': '0d2da106a9d2631273e192b372806516', 30 'info_dict': { 31 'id': 'aye83DjauH', 32 'ext': 'mp4', 33 'title': 'Video by naomipq', 34 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 35 'thumbnail': r're:^https?://.*\.jpg', 36 'duration': 0, 37 'timestamp': 1371748545, 38 'upload_date': '20130620', 39 'uploader_id': 'naomipq', 40 'uploader': 'B E A U T Y F O R A S H E S', 41 'like_count': int, 42 'comment_count': int, 43 'comments': list, 44 }, 45 }, { 46 # missing description 47 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', 48 'info_dict': { 49 'id': 'BA-pQFBG8HZ', 50 'ext': 'mp4', 51 'title': 'Video by britneyspears', 52 'thumbnail': r're:^https?://.*\.jpg', 53 'duration': 0, 54 'timestamp': 1453760977, 55 'upload_date': '20160125', 56 'uploader_id': 'britneyspears', 57 'uploader': 'Britney Spears', 58 'like_count': int, 59 'comment_count': int, 60 'comments': list, 61 }, 62 'params': { 63 'skip_download': True, 64 }, 65 }, { 66 # multi video post 67 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', 68 'playlist': [{ 69 'info_dict': { 70 'id': 'BQ0dSaohpPW', 71 'ext': 'mp4', 72 'title': 'Video 1', 73 }, 74 }, { 75 'info_dict': { 76 'id': 'BQ0dTpOhuHT', 77 'ext': 'mp4', 78 'title': 'Video 2', 79 }, 80 }, { 81 'info_dict': { 82 'id': 'BQ0dT7RBFeF', 83 'ext': 'mp4', 84 'title': 'Video 3', 85 }, 86 }], 87 'info_dict': { 88 'id': 'BQ0eAlwhDrw', 89 'title': 'Post by instagram', 90 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', 91 }, 92 }, { 93 # IGTV 94 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', 95 'info_dict': { 96 'id': 'BkfuX9UB-eK', 97 'ext': 'mp4', 98 'title': 'Fingerboarding Tricks with @cass.fb', 99 'thumbnail': r're:^https?://.*\.jpg', 100 'duration': 53.83, 101 'timestamp': 1530032919, 102 'upload_date': '20180626', 103 'uploader_id': 'instagram', 104 'uploader': 'Instagram', 105 'like_count': int, 106 'comment_count': int, 107 'comments': list, 108 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', 109 } 110 }, { 111 'url': 'https://instagram.com/p/-Cmh1cukG2/', 112 'only_matching': True, 113 }, { 114 'url': 'http://instagram.com/p/9o6LshA7zy/embed/', 115 'only_matching': True, 116 }, { 117 'url': 'https://www.instagram.com/tv/aye83DjauH/', 118 'only_matching': True, 119 }, { 120 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', 121 'only_matching': True, 122 }] 123 124 @staticmethod 125 def _extract_embed_url(webpage): 126 mobj = re.search( 127 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', 128 webpage) 129 if mobj: 130 return mobj.group('url') 131 132 blockquote_el = get_element_by_attribute( 133 'class', 'instagram-media', webpage) 134 if blockquote_el is None: 135 return 136 137 mobj = re.search( 138 r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) 139 if mobj: 140 return mobj.group('link') 141 142 def _real_extract(self, url): 143 mobj = re.match(self._VALID_URL, url) 144 video_id = mobj.group('id') 145 url = mobj.group('url') 146 147 webpage = self._download_webpage(url, video_id) 148 149 (media, video_url, description, thumbnail, timestamp, uploader, 150 uploader_id, like_count, comment_count, comments, height, 151 width) = [None] * 12 152 153 shared_data = self._parse_json( 154 self._search_regex( 155 r'window\._sharedData\s*=\s*({.+?});', 156 webpage, 'shared data', default='{}'), 157 video_id, fatal=False) 158 if shared_data: 159 media = try_get( 160 shared_data, 161 (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], 162 lambda x: x['entry_data']['PostPage'][0]['media']), 163 dict) 164 # _sharedData.entry_data.PostPage is empty when authenticated (see 165 # https://github.com/ytdl-org/youtube-dl/pull/22880) 166 if not media: 167 additional_data = self._parse_json( 168 self._search_regex( 169 r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', 170 webpage, 'additional data', default='{}'), 171 video_id, fatal=False) 172 if additional_data: 173 media = try_get( 174 additional_data, lambda x: x['graphql']['shortcode_media'], 175 dict) 176 if media: 177 video_url = media.get('video_url') 178 height = int_or_none(media.get('dimensions', {}).get('height')) 179 width = int_or_none(media.get('dimensions', {}).get('width')) 180 description = try_get( 181 media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], 182 compat_str) or media.get('caption') 183 title = media.get('title') 184 thumbnail = media.get('display_src') or media.get('display_url') 185 duration = float_or_none(media.get('video_duration')) 186 timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) 187 uploader = media.get('owner', {}).get('full_name') 188 uploader_id = media.get('owner', {}).get('username') 189 190 def get_count(keys, kind): 191 if not isinstance(keys, (list, tuple)): 192 keys = [keys] 193 for key in keys: 194 count = int_or_none(try_get( 195 media, (lambda x: x['edge_media_%s' % key]['count'], 196 lambda x: x['%ss' % kind]['count']))) 197 if count is not None: 198 return count 199 like_count = get_count('preview_like', 'like') 200 comment_count = get_count( 201 ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') 202 203 comments = [{ 204 'author': comment.get('user', {}).get('username'), 205 'author_id': comment.get('user', {}).get('id'), 206 'id': comment.get('id'), 207 'text': comment.get('text'), 208 'timestamp': int_or_none(comment.get('created_at')), 209 } for comment in media.get( 210 'comments', {}).get('nodes', []) if comment.get('text')] 211 if not video_url: 212 edges = try_get( 213 media, lambda x: x['edge_sidecar_to_children']['edges'], 214 list) or [] 215 if edges: 216 entries = [] 217 for edge_num, edge in enumerate(edges, start=1): 218 node = try_get(edge, lambda x: x['node'], dict) 219 if not node: 220 continue 221 node_video_url = url_or_none(node.get('video_url')) 222 if not node_video_url: 223 continue 224 entries.append({ 225 'id': node.get('shortcode') or node['id'], 226 'title': node.get('title') or 'Video %d' % edge_num, 227 'url': node_video_url, 228 'thumbnail': node.get('display_url'), 229 'duration': float_or_none(node.get('video_duration')), 230 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), 231 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), 232 'view_count': int_or_none(node.get('video_view_count')), 233 }) 234 return self.playlist_result( 235 entries, video_id, 236 'Post by %s' % uploader_id if uploader_id else None, 237 description) 238 239 if not video_url: 240 video_url = self._og_search_video_url(webpage, secure=False) 241 242 formats = [{ 243 'url': video_url, 244 'width': width, 245 'height': height, 246 }] 247 248 if not uploader_id: 249 uploader_id = self._search_regex( 250 r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', 251 webpage, 'uploader id', fatal=False) 252 253 if not description: 254 description = self._search_regex( 255 r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None) 256 if description is not None: 257 description = lowercase_escape(description) 258 259 if not thumbnail: 260 thumbnail = self._og_search_thumbnail(webpage) 261 262 return { 263 'id': video_id, 264 'formats': formats, 265 'ext': 'mp4', 266 'title': title or 'Video by %s' % uploader_id, 267 'description': description, 268 'duration': duration, 269 'thumbnail': thumbnail, 270 'timestamp': timestamp, 271 'uploader_id': uploader_id, 272 'uploader': uploader, 273 'like_count': like_count, 274 'comment_count': comment_count, 275 'comments': comments, 276 } 277 278 279 class InstagramPlaylistIE(InfoExtractor): 280 # A superclass for handling any kind of query based on GraphQL which 281 # results in a playlist. 282 283 _gis_tmpl = None # used to cache GIS request type 284 285 def _parse_graphql(self, webpage, item_id): 286 # Reads a webpage and returns its GraphQL data. 287 return self._parse_json( 288 self._search_regex( 289 r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), 290 item_id) 291 292 def _extract_graphql(self, data, url): 293 # Parses GraphQL queries containing videos and generates a playlist. 294 def get_count(suffix): 295 return int_or_none(try_get( 296 node, lambda x: x['edge_media_' + suffix]['count'])) 297 298 uploader_id = self._match_id(url) 299 csrf_token = data['config']['csrf_token'] 300 rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' 301 302 cursor = '' 303 for page_num in itertools.count(1): 304 variables = { 305 'first': 12, 306 'after': cursor, 307 } 308 variables.update(self._query_vars_for(data)) 309 variables = json.dumps(variables) 310 311 if self._gis_tmpl: 312 gis_tmpls = [self._gis_tmpl] 313 else: 314 gis_tmpls = [ 315 '%s' % rhx_gis, 316 '', 317 '%s:%s' % (rhx_gis, csrf_token), 318 '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), 319 ] 320 321 # try all of the ways to generate a GIS query, and not only use the 322 # first one that works, but cache it for future requests 323 for gis_tmpl in gis_tmpls: 324 try: 325 json_data = self._download_json( 326 'https://www.instagram.com/graphql/query/', uploader_id, 327 'Downloading JSON page %d' % page_num, headers={ 328 'X-Requested-With': 'XMLHttpRequest', 329 'X-Instagram-GIS': hashlib.md5( 330 ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), 331 }, query={ 332 'query_hash': self._QUERY_HASH, 333 'variables': variables, 334 }) 335 media = self._parse_timeline_from(json_data) 336 self._gis_tmpl = gis_tmpl 337 break 338 except ExtractorError as e: 339 # if it's an error caused by a bad query, and there are 340 # more GIS templates to try, ignore it and keep trying 341 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: 342 if gis_tmpl != gis_tmpls[-1]: 343 continue 344 raise 345 346 edges = media.get('edges') 347 if not edges or not isinstance(edges, list): 348 break 349 350 for edge in edges: 351 node = edge.get('node') 352 if not node or not isinstance(node, dict): 353 continue 354 if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: 355 continue 356 video_id = node.get('shortcode') 357 if not video_id: 358 continue 359 360 info = self.url_result( 361 'https://instagram.com/p/%s/' % video_id, 362 ie=InstagramIE.ie_key(), video_id=video_id) 363 364 description = try_get( 365 node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], 366 compat_str) 367 thumbnail = node.get('thumbnail_src') or node.get('display_src') 368 timestamp = int_or_none(node.get('taken_at_timestamp')) 369 370 comment_count = get_count('to_comment') 371 like_count = get_count('preview_like') 372 view_count = int_or_none(node.get('video_view_count')) 373 374 info.update({ 375 'description': description, 376 'thumbnail': thumbnail, 377 'timestamp': timestamp, 378 'comment_count': comment_count, 379 'like_count': like_count, 380 'view_count': view_count, 381 }) 382 383 yield info 384 385 page_info = media.get('page_info') 386 if not page_info or not isinstance(page_info, dict): 387 break 388 389 has_next_page = page_info.get('has_next_page') 390 if not has_next_page: 391 break 392 393 cursor = page_info.get('end_cursor') 394 if not cursor or not isinstance(cursor, compat_str): 395 break 396 397 def _real_extract(self, url): 398 user_or_tag = self._match_id(url) 399 webpage = self._download_webpage(url, user_or_tag) 400 data = self._parse_graphql(webpage, user_or_tag) 401 402 self._set_cookie('instagram.com', 'ig_pr', '1') 403 404 return self.playlist_result( 405 self._extract_graphql(data, url), user_or_tag, user_or_tag) 406 407 408 class InstagramUserIE(InstagramPlaylistIE): 409 _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' 410 IE_DESC = 'Instagram user profile' 411 IE_NAME = 'instagram:user' 412 _TEST = { 413 'url': 'https://instagram.com/porsche', 414 'info_dict': { 415 'id': 'porsche', 416 'title': 'porsche', 417 }, 418 'playlist_count': 5, 419 'params': { 420 'extract_flat': True, 421 'skip_download': True, 422 'playlistend': 5, 423 } 424 } 425 426 _QUERY_HASH = '42323d64886122307be10013ad2dcc44', 427 428 @staticmethod 429 def _parse_timeline_from(data): 430 # extracts the media timeline data from a GraphQL result 431 return data['data']['user']['edge_owner_to_timeline_media'] 432 433 @staticmethod 434 def _query_vars_for(data): 435 # returns a dictionary of variables to add to the timeline query based 436 # on the GraphQL of the original page 437 return { 438 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] 439 } 440 441 442 class InstagramTagIE(InstagramPlaylistIE): 443 _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' 444 IE_DESC = 'Instagram hashtag search' 445 IE_NAME = 'instagram:tag' 446 _TEST = { 447 'url': 'https://instagram.com/explore/tags/lolcats', 448 'info_dict': { 449 'id': 'lolcats', 450 'title': 'lolcats', 451 }, 452 'playlist_count': 50, 453 'params': { 454 'extract_flat': True, 455 'skip_download': True, 456 'playlistend': 50, 457 } 458 } 459 460 _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', 461 462 @staticmethod 463 def _parse_timeline_from(data): 464 # extracts the media timeline data from a GraphQL result 465 return data['data']['hashtag']['edge_hashtag_to_media'] 466 467 @staticmethod 468 def _query_vars_for(data): 469 # returns a dictionary of variables to add to the timeline query based 470 # on the GraphQL of the original page 471 return { 472 'tag_name': 473 data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] 474 }