kuwo.py (12536B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..compat import compat_urlparse 8 from ..utils import ( 9 get_element_by_id, 10 clean_html, 11 ExtractorError, 12 InAdvancePagedList, 13 remove_start, 14 ) 15 16 17 class KuwoBaseIE(InfoExtractor): 18 _FORMATS = [ 19 {'format': 'ape', 'ext': 'ape', 'preference': 100}, 20 {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, 21 {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70}, 22 {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60}, 23 {'format': 'wma', 'ext': 'wma', 'preference': 20}, 24 {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} 25 ] 26 27 def _get_formats(self, song_id, tolerate_ip_deny=False): 28 formats = [] 29 for file_format in self._FORMATS: 30 query = { 31 'format': file_format['ext'], 32 'br': file_format.get('br', ''), 33 'rid': 'MUSIC_%s' % song_id, 34 'type': 'convert_url', 35 'response': 'url' 36 } 37 38 song_url = self._download_webpage( 39 'http://antiserver.kuwo.cn/anti.s', 40 song_id, note='Download %s url info' % file_format['format'], 41 query=query, headers=self.geo_verification_headers(), 42 ) 43 44 if song_url == 'IPDeny' and not tolerate_ip_deny: 45 raise ExtractorError('This song is blocked in this region', expected=True) 46 47 if song_url.startswith('http://') or song_url.startswith('https://'): 48 formats.append({ 49 'url': song_url, 50 'format_id': file_format['format'], 51 'format': file_format['format'], 52 'preference': file_format['preference'], 53 'abr': file_format.get('abr'), 54 }) 55 56 return formats 57 58 59 class KuwoIE(KuwoBaseIE): 60 IE_NAME = 'kuwo:song' 61 IE_DESC = '酷我音乐' 62 _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P<id>\d+)' 63 _TESTS = [{ 64 'url': 'http://www.kuwo.cn/yinyue/635632/', 65 'info_dict': { 66 'id': '635632', 67 'ext': 'ape', 68 'title': '爱我别走', 69 'creator': '张震岳', 70 'upload_date': '20080122', 71 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' 72 }, 73 'skip': 'this song has been offline because of copyright issues', 74 }, { 75 'url': 'http://www.kuwo.cn/yinyue/6446136/', 76 'info_dict': { 77 'id': '6446136', 78 'ext': 'mp3', 79 'title': '心', 80 'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c', 81 'creator': 'IU', 82 'upload_date': '20150518', 83 }, 84 'params': { 85 'format': 'mp3-320', 86 }, 87 }, { 88 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', 89 'only_matching': True, 90 }] 91 92 def _real_extract(self, url): 93 song_id = self._match_id(url) 94 webpage, urlh = self._download_webpage_handle( 95 url, song_id, note='Download song detail info', 96 errnote='Unable to get song detail info') 97 if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: 98 raise ExtractorError('this song has been offline because of copyright issues', expected=True) 99 100 song_name = self._html_search_regex( 101 r'<p[^>]+id="lrcName">([^<]+)</p>', webpage, 'song name') 102 singer_name = remove_start(self._html_search_regex( 103 r'<a[^>]+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">', 104 webpage, 'singer name', fatal=False), '歌手') 105 lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) 106 if lrc_content == '暂无': # indicates no lyrics 107 lrc_content = None 108 109 formats = self._get_formats(song_id) 110 self._sort_formats(formats) 111 112 album_id = self._html_search_regex( 113 r'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"', 114 webpage, 'album id', fatal=False) 115 116 publish_time = None 117 if album_id is not None: 118 album_info_page = self._download_webpage( 119 'http://www.kuwo.cn/album/%s/' % album_id, song_id, 120 note='Download album detail info', 121 errnote='Unable to get album detail info') 122 123 publish_time = self._html_search_regex( 124 r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page, 125 'publish time', fatal=False) 126 if publish_time: 127 publish_time = publish_time.replace('-', '') 128 129 return { 130 'id': song_id, 131 'title': song_name, 132 'creator': singer_name, 133 'upload_date': publish_time, 134 'description': lrc_content, 135 'formats': formats, 136 } 137 138 139 class KuwoAlbumIE(InfoExtractor): 140 IE_NAME = 'kuwo:album' 141 IE_DESC = '酷我音乐 - 专辑' 142 _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P<id>\d+?)/' 143 _TEST = { 144 'url': 'http://www.kuwo.cn/album/502294/', 145 'info_dict': { 146 'id': '502294', 147 'title': 'Made\xa0Series\xa0《M》', 148 'description': 'md5:d463f0d8a0ff3c3ea3d6ed7452a9483f', 149 }, 150 'playlist_count': 2, 151 } 152 153 def _real_extract(self, url): 154 album_id = self._match_id(url) 155 156 webpage = self._download_webpage( 157 url, album_id, note='Download album info', 158 errnote='Unable to get album info') 159 160 album_name = self._html_search_regex( 161 r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage, 162 'album name') 163 album_intro = remove_start( 164 clean_html(get_element_by_id('intro', webpage)), 165 '%s简介:' % album_name) 166 167 entries = [ 168 self.url_result(song_url, 'Kuwo') for song_url in re.findall( 169 r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', 170 webpage) 171 ] 172 return self.playlist_result(entries, album_id, album_name, album_intro) 173 174 175 class KuwoChartIE(InfoExtractor): 176 IE_NAME = 'kuwo:chart' 177 IE_DESC = '酷我音乐 - 排行榜' 178 _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' 179 _TEST = { 180 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 181 'info_dict': { 182 'id': '香港中文龙虎榜', 183 }, 184 'playlist_mincount': 7, 185 } 186 187 def _real_extract(self, url): 188 chart_id = self._match_id(url) 189 webpage = self._download_webpage( 190 url, chart_id, note='Download chart info', 191 errnote='Unable to get chart info') 192 193 entries = [ 194 self.url_result(song_url, 'Kuwo') for song_url in re.findall( 195 r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) 196 ] 197 return self.playlist_result(entries, chart_id) 198 199 200 class KuwoSingerIE(InfoExtractor): 201 IE_NAME = 'kuwo:singer' 202 IE_DESC = '酷我音乐 - 歌手' 203 _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P<id>[^/]+)' 204 _TESTS = [{ 205 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 206 'info_dict': { 207 'id': 'bruno+mars', 208 'title': 'Bruno\xa0Mars', 209 }, 210 'playlist_mincount': 329, 211 }, { 212 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', 213 'info_dict': { 214 'id': 'Ali', 215 'title': 'Ali', 216 }, 217 'playlist_mincount': 95, 218 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/ytdl-org/youtube-dl/jobs/78878540 219 }] 220 221 PAGE_SIZE = 15 222 223 def _real_extract(self, url): 224 singer_id = self._match_id(url) 225 webpage = self._download_webpage( 226 url, singer_id, note='Download singer info', 227 errnote='Unable to get singer info') 228 229 singer_name = self._html_search_regex( 230 r'<h1>([^<]+)</h1>', webpage, 'singer name') 231 232 artist_id = self._html_search_regex( 233 r'data-artistid="(\d+)"', webpage, 'artist id') 234 235 page_count = int(self._html_search_regex( 236 r'data-page="(\d+)"', webpage, 'page count')) 237 238 def page_func(page_num): 239 webpage = self._download_webpage( 240 'http://www.kuwo.cn/artist/contentMusicsAjax', 241 singer_id, note='Download song list page #%d' % (page_num + 1), 242 errnote='Unable to get song list page #%d' % (page_num + 1), 243 query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) 244 245 return [ 246 self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo') 247 for song_url in re.findall( 248 r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)', 249 webpage) 250 ] 251 252 entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE) 253 254 return self.playlist_result(entries, singer_id, singer_name) 255 256 257 class KuwoCategoryIE(InfoExtractor): 258 IE_NAME = 'kuwo:category' 259 IE_DESC = '酷我音乐 - 分类' 260 _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' 261 _TEST = { 262 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 263 'info_dict': { 264 'id': '86375', 265 'title': '八十年代精选', 266 'description': '这些都是属于八十年代的回忆!', 267 }, 268 'playlist_mincount': 24, 269 } 270 271 def _real_extract(self, url): 272 category_id = self._match_id(url) 273 webpage = self._download_webpage( 274 url, category_id, note='Download category info', 275 errnote='Unable to get category info') 276 277 category_name = self._html_search_regex( 278 r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name') 279 280 category_desc = remove_start( 281 get_element_by_id('intro', webpage).strip(), 282 '%s简介:' % category_name) 283 if category_desc == '暂无': 284 category_desc = None 285 286 jsonm = self._parse_json(self._html_search_regex( 287 r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) 288 289 entries = [ 290 self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo') 291 for song in jsonm['musiclist'] 292 ] 293 return self.playlist_result(entries, category_id, category_name, category_desc) 294 295 296 class KuwoMvIE(KuwoBaseIE): 297 IE_NAME = 'kuwo:mv' 298 IE_DESC = '酷我音乐 - MV' 299 _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P<id>\d+?)/' 300 _TEST = { 301 'url': 'http://www.kuwo.cn/mv/6480076/', 302 'info_dict': { 303 'id': '6480076', 304 'ext': 'mp4', 305 'title': 'My HouseMV', 306 'creator': '2PM', 307 }, 308 # In this video, music URLs (anti.s) are blocked outside China and 309 # USA, while the MV URL (mvurl) is available globally, so force the MV 310 # URL for consistent results in different countries 311 'params': { 312 'format': 'mv', 313 }, 314 } 315 _FORMATS = KuwoBaseIE._FORMATS + [ 316 {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, 317 {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, 318 ] 319 320 def _real_extract(self, url): 321 song_id = self._match_id(url) 322 webpage = self._download_webpage( 323 url, song_id, note='Download mv detail info: %s' % song_id, 324 errnote='Unable to get mv detail info: %s' % song_id) 325 326 mobj = re.search( 327 r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"', 328 webpage) 329 if mobj: 330 song_name = mobj.group('song') 331 singer_name = mobj.group('singer') 332 else: 333 raise ExtractorError('Unable to find song or singer names') 334 335 formats = self._get_formats(song_id, tolerate_ip_deny=True) 336 337 mv_url = self._download_webpage( 338 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, 339 song_id, note='Download %s MV URL' % song_id) 340 formats.append({ 341 'url': mv_url, 342 'format_id': 'mv', 343 }) 344 345 self._sort_formats(formats) 346 347 return { 348 'id': song_id, 349 'title': song_name, 350 'creator': singer_name, 351 'formats': formats, 352 }