pinterest.py (7598B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import json 5 import re 6 7 from .common import InfoExtractor 8 from ..compat import compat_str 9 from ..utils import ( 10 determine_ext, 11 float_or_none, 12 int_or_none, 13 try_get, 14 unified_timestamp, 15 url_or_none, 16 ) 17 18 19 class PinterestBaseIE(InfoExtractor): 20 _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' 21 22 def _call_api(self, resource, video_id, options): 23 return self._download_json( 24 'https://www.pinterest.com/resource/%sResource/get/' % resource, 25 video_id, 'Download %s JSON metadata' % resource, query={ 26 'data': json.dumps({'options': options}) 27 })['resource_response'] 28 29 def _extract_video(self, data, extract_formats=True): 30 video_id = data['id'] 31 32 title = (data.get('title') or data.get('grid_title') or video_id).strip() 33 34 urls = [] 35 formats = [] 36 duration = None 37 if extract_formats: 38 for format_id, format_dict in data['videos']['video_list'].items(): 39 if not isinstance(format_dict, dict): 40 continue 41 format_url = url_or_none(format_dict.get('url')) 42 if not format_url or format_url in urls: 43 continue 44 urls.append(format_url) 45 duration = float_or_none(format_dict.get('duration'), scale=1000) 46 ext = determine_ext(format_url) 47 if 'hls' in format_id.lower() or ext == 'm3u8': 48 formats.extend(self._extract_m3u8_formats( 49 format_url, video_id, 'mp4', entry_protocol='m3u8_native', 50 m3u8_id=format_id, fatal=False)) 51 else: 52 formats.append({ 53 'url': format_url, 54 'format_id': format_id, 55 'width': int_or_none(format_dict.get('width')), 56 'height': int_or_none(format_dict.get('height')), 57 'duration': duration, 58 }) 59 self._sort_formats( 60 formats, field_preference=('height', 'width', 'tbr', 'format_id')) 61 62 description = data.get('description') or data.get('description_html') or data.get('seo_description') 63 timestamp = unified_timestamp(data.get('created_at')) 64 65 def _u(field): 66 return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) 67 68 uploader = _u('full_name') 69 uploader_id = _u('id') 70 71 repost_count = int_or_none(data.get('repin_count')) 72 comment_count = int_or_none(data.get('comment_count')) 73 categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) 74 tags = data.get('hashtags') 75 76 thumbnails = [] 77 images = data.get('images') 78 if isinstance(images, dict): 79 for thumbnail_id, thumbnail in images.items(): 80 if not isinstance(thumbnail, dict): 81 continue 82 thumbnail_url = url_or_none(thumbnail.get('url')) 83 if not thumbnail_url: 84 continue 85 thumbnails.append({ 86 'url': thumbnail_url, 87 'width': int_or_none(thumbnail.get('width')), 88 'height': int_or_none(thumbnail.get('height')), 89 }) 90 91 return { 92 'id': video_id, 93 'title': title, 94 'description': description, 95 'duration': duration, 96 'timestamp': timestamp, 97 'thumbnails': thumbnails, 98 'uploader': uploader, 99 'uploader_id': uploader_id, 100 'repost_count': repost_count, 101 'comment_count': comment_count, 102 'categories': categories, 103 'tags': tags, 104 'formats': formats, 105 'extractor_key': PinterestIE.ie_key(), 106 } 107 108 109 class PinterestIE(PinterestBaseIE): 110 _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE 111 _TESTS = [{ 112 'url': 'https://www.pinterest.com/pin/664281013778109217/', 113 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', 114 'info_dict': { 115 'id': '664281013778109217', 116 'ext': 'mp4', 117 'title': 'Origami', 118 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', 119 'duration': 57.7, 120 'timestamp': 1593073622, 121 'upload_date': '20200625', 122 'uploader': 'Love origami -I am Dafei', 123 'uploader_id': '586523688879454212', 124 'repost_count': 50, 125 'comment_count': 0, 126 'categories': list, 127 'tags': list, 128 }, 129 }, { 130 'url': 'https://co.pinterest.com/pin/824721750502199491/', 131 'only_matching': True, 132 }] 133 134 def _real_extract(self, url): 135 video_id = self._match_id(url) 136 data = self._call_api( 137 'Pin', video_id, { 138 'field_set_key': 'unauth_react_main_pin', 139 'id': video_id, 140 })['data'] 141 return self._extract_video(data) 142 143 144 class PinterestCollectionIE(PinterestBaseIE): 145 _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE 146 _TESTS = [{ 147 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', 148 'info_dict': { 149 'id': '585890301462791043', 150 'title': 'cool diys', 151 }, 152 'playlist_count': 8, 153 }, { 154 'url': 'https://www.pinterest.ca/fudohub/videos/', 155 'info_dict': { 156 'id': '682858430939307450', 157 'title': 'VIDEOS', 158 }, 159 'playlist_mincount': 365, 160 'skip': 'Test with extract_formats=False', 161 }] 162 163 @classmethod 164 def suitable(cls, url): 165 return False if PinterestIE.suitable(url) else super( 166 PinterestCollectionIE, cls).suitable(url) 167 168 def _real_extract(self, url): 169 username, slug = re.match(self._VALID_URL, url).groups() 170 board = self._call_api( 171 'Board', slug, { 172 'slug': slug, 173 'username': username 174 })['data'] 175 board_id = board['id'] 176 options = { 177 'board_id': board_id, 178 'page_size': 250, 179 } 180 bookmark = None 181 entries = [] 182 while True: 183 if bookmark: 184 options['bookmarks'] = [bookmark] 185 board_feed = self._call_api('BoardFeed', board_id, options) 186 for item in (board_feed.get('data') or []): 187 if not isinstance(item, dict) or item.get('type') != 'pin': 188 continue 189 video_id = item.get('id') 190 if video_id: 191 # Some pins may not be available anonymously via pin URL 192 # video = self._extract_video(item, extract_formats=False) 193 # video.update({ 194 # '_type': 'url_transparent', 195 # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, 196 # }) 197 # entries.append(video) 198 entries.append(self._extract_video(item)) 199 bookmark = board_feed.get('bookmark') 200 if not bookmark: 201 break 202 return self.playlist_result( 203 entries, playlist_id=board_id, playlist_title=board.get('name'))