spotify.py (5739B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import json 5 import re 6 7 from .common import InfoExtractor 8 from ..utils import ( 9 clean_podcast_url, 10 float_or_none, 11 int_or_none, 12 strip_or_none, 13 try_get, 14 unified_strdate, 15 ) 16 17 18 class SpotifyBaseIE(InfoExtractor): 19 _ACCESS_TOKEN = None 20 _OPERATION_HASHES = { 21 'Episode': '8276d4423d709ae9b68ec1b74cc047ba0f7479059a37820be730f125189ac2bf', 22 'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0', 23 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', 24 } 25 _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P<id>[^/?&#]+)' 26 27 def _real_initialize(self): 28 self._ACCESS_TOKEN = self._download_json( 29 'https://open.spotify.com/get_access_token', None)['accessToken'] 30 31 def _call_api(self, operation, video_id, variables): 32 return self._download_json( 33 'https://api-partner.spotify.com/pathfinder/v1/query', video_id, query={ 34 'operationName': 'query' + operation, 35 'variables': json.dumps(variables), 36 'extensions': json.dumps({ 37 'persistedQuery': { 38 'sha256Hash': self._OPERATION_HASHES[operation], 39 }, 40 }) 41 }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data'] 42 43 def _extract_episode(self, episode, series): 44 episode_id = episode['id'] 45 title = episode['name'].strip() 46 47 formats = [] 48 audio_preview = episode.get('audioPreview') or {} 49 audio_preview_url = audio_preview.get('url') 50 if audio_preview_url: 51 f = { 52 'url': audio_preview_url.replace('://p.scdn.co/mp3-preview/', '://anon-podcast.scdn.co/'), 53 'vcodec': 'none', 54 } 55 audio_preview_format = audio_preview.get('format') 56 if audio_preview_format: 57 f['format_id'] = audio_preview_format 58 mobj = re.match(r'([0-9A-Z]{3})_(?:[A-Z]+_)?(\d+)', audio_preview_format) 59 if mobj: 60 f.update({ 61 'abr': int(mobj.group(2)), 62 'ext': mobj.group(1).lower(), 63 }) 64 formats.append(f) 65 66 for item in (try_get(episode, lambda x: x['audio']['items']) or []): 67 item_url = item.get('url') 68 if not (item_url and item.get('externallyHosted')): 69 continue 70 formats.append({ 71 'url': clean_podcast_url(item_url), 72 'vcodec': 'none', 73 }) 74 75 thumbnails = [] 76 for source in (try_get(episode, lambda x: x['coverArt']['sources']) or []): 77 source_url = source.get('url') 78 if not source_url: 79 continue 80 thumbnails.append({ 81 'url': source_url, 82 'width': int_or_none(source.get('width')), 83 'height': int_or_none(source.get('height')), 84 }) 85 86 return { 87 'id': episode_id, 88 'title': title, 89 'formats': formats, 90 'thumbnails': thumbnails, 91 'description': strip_or_none(episode.get('description')), 92 'duration': float_or_none(try_get( 93 episode, lambda x: x['duration']['totalMilliseconds']), 1000), 94 'release_date': unified_strdate(try_get( 95 episode, lambda x: x['releaseDate']['isoString'])), 96 'series': series, 97 } 98 99 100 class SpotifyIE(SpotifyBaseIE): 101 IE_NAME = 'spotify' 102 _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode' 103 _TEST = { 104 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo', 105 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b', 106 'info_dict': { 107 'id': '4Z7GAJ50bgctf6uclHlWKo', 108 'ext': 'mp3', 109 'title': 'From the archive: Why time management is ruining our lives', 110 'description': 'md5:b120d9c4ff4135b42aa9b6d9cde86935', 111 'duration': 2083.605, 112 'release_date': '20201217', 113 'series': "The Guardian's Audio Long Reads", 114 } 115 } 116 117 def _real_extract(self, url): 118 episode_id = self._match_id(url) 119 episode = self._call_api('Episode', episode_id, { 120 'uri': 'spotify:episode:' + episode_id 121 })['episode'] 122 return self._extract_episode( 123 episode, try_get(episode, lambda x: x['podcast']['name'])) 124 125 126 class SpotifyShowIE(SpotifyBaseIE): 127 IE_NAME = 'spotify:show' 128 _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'show' 129 _TEST = { 130 'url': 'https://open.spotify.com/show/4PM9Ke6l66IRNpottHKV9M', 131 'info_dict': { 132 'id': '4PM9Ke6l66IRNpottHKV9M', 133 'title': 'The Story from the Guardian', 134 'description': 'The Story podcast is dedicated to our finest audio documentaries, investigations and long form stories', 135 }, 136 'playlist_mincount': 36, 137 } 138 139 def _real_extract(self, url): 140 show_id = self._match_id(url) 141 podcast = self._call_api('ShowEpisodes', show_id, { 142 'limit': 1000000000, 143 'offset': 0, 144 'uri': 'spotify:show:' + show_id, 145 })['podcast'] 146 podcast_name = podcast.get('name') 147 148 entries = [] 149 for item in (try_get(podcast, lambda x: x['episodes']['items']) or []): 150 episode = item.get('episode') 151 if not episode: 152 continue 153 entries.append(self._extract_episode(episode, podcast_name)) 154 155 return self.playlist_result( 156 entries, show_id, podcast_name, podcast.get('description'))