libsyn.py (3637B)
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    clean_html,
    get_element_by_class,
    parse_duration,
    strip_or_none,
    unified_strdate,
)


class LibsynIE(InfoExtractor):
    """Extractor for episodes served by the Libsyn HTML5 embed player."""

    # 'mainurl' captures the canonical embed URL without the trailing player
    # options (height/width/theme/...); 'id' is the numeric episode id.
    _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'

    _TESTS = [{
        'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',
        'md5': '2a55e75496c790cdeb058e7e6c087746',
        'info_dict': {
            'id': '6385796',
            'ext': 'mp3',
            'title': "Champion Minded - Developing a Growth Mindset",
            # description fetched using another request:
            # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796
            # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
            'upload_date': '20180320',
            'thumbnail': 're:^https?://.*',
        },
    }, {
        'url': 'https://html5-player.libsyn.com/embed/episode/id/3727166/height/75/width/200/theme/standard/direction/no/autoplay/no/autonext/no/thumbnail/no/preload/no/no_addthis/no/',
        'md5': '6c5cb21acd622d754d3b1a92b582ce42',
        'info_dict': {
            'id': '3727166',
            'ext': 'mp3',
            'title': 'Clients From Hell Podcast - How a Sex Toy Company Kickstarted my Freelance Career',
            'upload_date': '20150818',
            'thumbnail': 're:^https?://.*',
        }
    }]

    def _real_extract(self, url):
        """Extract metadata and media URLs for a single embedded episode.

        The embed page is downloaded and its ``playlistItem`` JavaScript
        object is parsed as JSON; most fields come from that object, with
        HTML scraping used as a fallback or complement.

        :param url: a URL matching ``_VALID_URL``.
        :returns: an info dict with the keys ``id``, ``title``,
            ``description``, ``thumbnail``, ``upload_date``, ``duration``
            and ``formats``.
        """
        # _VALID_URL has exactly two groups: the trimmed embed URL and the id.
        # The trimmed URL replaces the incoming one for the download.
        url, video_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, video_id)

        data = self._parse_json(self._search_regex(
            r'var\s+playlistItem\s*=\s*({.+?});',
            webpage, 'JSON data block'), video_id)

        episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage)
        if not episode_title:
            # Last-resort fallback. The result must be assigned: previously it
            # was discarded, so the .strip() below failed on a None title.
            # No default is passed here, so a page with no title at all is
            # presumably reported by _search_regex itself.
            episode_title = self._search_regex(
                [r'data-title="([^"]+)"', r'<title>(.+?)</title>'],
                webpage, 'episode title')
        episode_title = episode_title.strip()

        # The podcast title is optional; either source may be absent.
        podcast_title = strip_or_none(clean_html(self._search_regex(
            r'<h3>([^<]+)</h3>', webpage, 'podcast title',
            default=None) or get_element_by_class('podcast-title', webpage)))

        title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title

        # Each JSON key that carries a media URL becomes one format entry.
        formats = []
        for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')):
            f_url = data.get(k)
            if not f_url:
                continue
            formats.append({
                'url': f_url,
                'format_id': format_id,
            })

        description = self._html_search_regex(
            r'<p\s+id="info_text_body">(.+?)</p>', webpage,
            'description', default=None)
        if description:
            # Strip non-breaking and normal spaces
            description = description.replace('\u00A0', ' ').strip()

        # Prefer the date shown in the page; fall back to the JSON field.
        release_date = unified_strdate(self._search_regex(
            r'<div class="release_date">Released: ([^<]+)<',
            webpage, 'release date', default=None) or data.get('release_date'))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': data.get('thumbnail_url'),
            'upload_date': release_date,
            'duration': parse_duration(data.get('duration')),
            'formats': formats,
        }