cjsw.py (2412B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..utils import ( 8 determine_ext, 9 unescapeHTML, 10 ) 11 12 13 class CJSWIE(InfoExtractor): 14 _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)' 15 _TESTS = [{ 16 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', 17 'md5': 'cee14d40f1e9433632c56e3d14977120', 18 'info_dict': { 19 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', 20 'ext': 'mp3', 21 'title': 'Freshly Squeezed – Episode June 20, 2017', 22 'description': 'md5:c967d63366c3898a80d0c7b0ff337202', 23 'series': 'Freshly Squeezed', 24 'episode_id': '20170620', 25 }, 26 }, { 27 # no description 28 'url': 'http://cjsw.com/program/road-pops/episode/20170707/', 29 'only_matching': True, 30 }] 31 32 def _real_extract(self, url): 33 mobj = re.match(self._VALID_URL, url) 34 program, episode_id = mobj.group('program', 'id') 35 audio_id = '%s/%s' % (program, episode_id) 36 37 webpage = self._download_webpage(url, episode_id) 38 39 title = unescapeHTML(self._search_regex( 40 (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)', 41 r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), 42 webpage, 'title', group='title')) 43 44 audio_url = self._search_regex( 45 r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', 46 webpage, 'audio url', group='url') 47 48 audio_id = self._search_regex( 49 r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', 50 audio_url, 'audio id', default=audio_id) 51 52 formats = [{ 53 'url': audio_url, 54 'ext': determine_ext(audio_url, 'mp3'), 55 'vcodec': 'none', 56 }] 57 58 description = self._html_search_regex( 59 r'<p>(?P<description>.+?)</p>', webpage, 'description', 60 default=None) 61 series = self._search_regex( 62 r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, 63 'series', default=program, group='name') 64 65 return { 66 'id': audio_id, 67 'title': title, 68 'description': description, 69 'formats': formats, 70 'series': series, 71 'episode_id': episode_id, 72 }