democracynow.py (3101B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 import os.path 6 7 from .common import InfoExtractor 8 from ..compat import compat_urlparse 9 from ..utils import ( 10 url_basename, 11 remove_start, 12 ) 13 14 15 class DemocracynowIE(InfoExtractor): 16 _VALID_URL = r'https?://(?:www\.)?democracynow\.org/(?P<id>[^\?]*)' 17 IE_NAME = 'democracynow' 18 _TESTS = [{ 19 'url': 'http://www.democracynow.org/shows/2015/7/3', 20 'md5': '3757c182d3d84da68f5c8f506c18c196', 21 'info_dict': { 22 'id': '2015-0703-001', 23 'ext': 'mp4', 24 'title': 'Daily Show for July 03, 2015', 25 'description': 'md5:80eb927244d6749900de6072c7cc2c86', 26 }, 27 }, { 28 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', 29 'info_dict': { 30 'id': '2015-0703-001', 31 'ext': 'mp4', 32 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', 33 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', 34 }, 35 'params': { 36 'skip_download': True, 37 }, 38 }] 39 40 def _real_extract(self, url): 41 display_id = self._match_id(url) 42 43 webpage = self._download_webpage(url, display_id) 44 45 json_data = self._parse_json(self._search_regex( 46 r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), 47 display_id) 48 49 title = json_data['title'] 50 formats = [] 51 52 video_id = None 53 54 for key in ('file', 'audio', 'video', 'high_res_video'): 55 media_url = json_data.get(key, '') 56 if not media_url: 57 continue 58 media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) 59 video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') 60 formats.append({ 61 'url': media_url, 62 'vcodec': 'none' if key == 'audio' else None, 63 }) 64 65 self._sort_formats(formats) 66 67 default_lang = 'en' 68 subtitles = {} 69 70 def add_subtitle_item(lang, info_dict): 71 if lang not in subtitles: 72 subtitles[lang] = [] 73 subtitles[lang].append(info_dict) 74 75 # chapter_file are not subtitles 76 if 'caption_file' in json_data: 77 add_subtitle_item(default_lang, { 78 'url': compat_urlparse.urljoin(url, json_data['caption_file']), 79 }) 80 81 for subtitle_item in json_data.get('captions', []): 82 lang = subtitle_item.get('language', '').lower() or default_lang 83 add_subtitle_item(lang, { 84 'url': compat_urlparse.urljoin(url, subtitle_item['url']), 85 }) 86 87 description = self._og_search_description(webpage, default=None) 88 89 return { 90 'id': video_id or display_id, 91 'title': title, 92 'description': description, 93 'thumbnail': json_data.get('image'), 94 'subtitles': subtitles, 95 'formats': formats, 96 }