youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

stitcher.py (5371B)


      1 from __future__ import unicode_literals
      2 
      3 from .common import InfoExtractor
      4 from ..compat import compat_str
      5 from ..utils import (
      6     clean_html,
      7     clean_podcast_url,
      8     ExtractorError,
      9     int_or_none,
     10     str_or_none,
     11     try_get,
     12     url_or_none,
     13 )
     14 
     15 
     16 class StitcherBaseIE(InfoExtractor):
     17     _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/'
     18 
     19     def _call_api(self, path, video_id, query):
     20         resp = self._download_json(
     21             'https://api.prod.stitcher.com/' + path,
     22             video_id, query=query)
     23         error_massage = try_get(resp, lambda x: x['errors'][0]['message'])
     24         if error_massage:
     25             raise ExtractorError(error_massage, expected=True)
     26         return resp['data']
     27 
     28     def _extract_description(self, data):
     29         return clean_html(data.get('html_description') or data.get('description'))
     30 
     31     def _extract_audio_url(self, episode):
     32         return url_or_none(episode.get('audio_url') or episode.get('guid'))
     33 
     34     def _extract_show_info(self, show):
     35         return {
     36             'thumbnail': show.get('image_base_url'),
     37             'series': show.get('title'),
     38         }
     39 
     40     def _extract_episode(self, episode, audio_url, show_info):
     41         info = {
     42             'id': compat_str(episode['id']),
     43             'display_id': episode.get('slug'),
     44             'title': episode['title'].strip(),
     45             'description': self._extract_description(episode),
     46             'duration': int_or_none(episode.get('duration')),
     47             'url': clean_podcast_url(audio_url),
     48             'vcodec': 'none',
     49             'timestamp': int_or_none(episode.get('date_published')),
     50             'season_number': int_or_none(episode.get('season')),
     51             'season_id': str_or_none(episode.get('season_id')),
     52         }
     53         info.update(show_info)
     54         return info
     55 
     56 
     57 class StitcherIE(StitcherBaseIE):
     58     _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)'
     59     _TESTS = [{
     60         'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
     61         'md5': 'e9635098e0da10b21a0e2b85585530f6',
     62         'info_dict': {
     63             'id': '40789481',
     64             'ext': 'mp3',
     65             'title': 'Machine Learning Mastery and Cancer Clusters',
     66             'description': 'md5:547adb4081864be114ae3831b4c2b42f',
     67             'duration': 1604,
     68             'thumbnail': r're:^https?://.*\.jpg',
     69             'upload_date': '20151008',
     70             'timestamp': 1444285800,
     71             'series': 'Talking Machines',
     72         },
     73     }, {
     74         'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
     75         'info_dict': {
     76             'id': '40846275',
     77             'display_id': 'the-rare-hourlong-comedy-plus',
     78             'ext': 'mp3',
     79             'title': "The CW's 'Crazy Ex-Girlfriend'",
     80             'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
     81             'duration': 2235,
     82             'thumbnail': r're:^https?://.*\.jpg',
     83         },
     84         'params': {
     85             'skip_download': True,
     86         },
     87         'skip': 'Page Not Found',
     88     }, {
     89         # escaped title
     90         'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
     91         'only_matching': True,
     92     }, {
     93         'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
     94         'only_matching': True,
     95     }, {
     96         'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584',
     97         'only_matching': True,
     98     }]
     99 
    100     def _real_extract(self, url):
    101         audio_id = self._match_id(url)
    102         data = self._call_api(
    103             'shows/episodes', audio_id, {'episode_ids': audio_id})
    104         episode = data['episodes'][0]
    105         audio_url = self._extract_audio_url(episode)
    106         if not audio_url:
    107             self.raise_login_required()
    108         show = try_get(data, lambda x: x['shows'][0], dict) or {}
    109         return self._extract_episode(
    110             episode, audio_url, self._extract_show_info(show))
    111 
    112 
    113 class StitcherShowIE(StitcherBaseIE):
    114     _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)'
    115     _TESTS = [{
    116         'url': 'http://www.stitcher.com/podcast/the-talking-machines',
    117         'info_dict': {
    118             'id': 'the-talking-machines',
    119             'title': 'Talking Machines',
    120             'description': 'md5:831f0995e40f26c10231af39cf1ebf0b',
    121         },
    122         'playlist_mincount': 106,
    123     }, {
    124         'url': 'https://www.stitcher.com/show/the-talking-machines',
    125         'only_matching': True,
    126     }]
    127 
    128     def _real_extract(self, url):
    129         show_slug = self._match_id(url)
    130         data = self._call_api(
    131             'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000})
    132         show = try_get(data, lambda x: x['shows'][0], dict) or {}
    133         show_info = self._extract_show_info(show)
    134 
    135         entries = []
    136         for episode in (data.get('episodes') or []):
    137             audio_url = self._extract_audio_url(episode)
    138             if not audio_url:
    139                 continue
    140             entries.append(self._extract_episode(episode, audio_url, show_info))
    141 
    142         return self.playlist_result(
    143             entries, show_slug, show.get('title'),
    144             self._extract_description(show))