youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

channel9.py (10284B)


      1 from __future__ import unicode_literals
      2 
      3 import re
      4 
      5 from .common import InfoExtractor
      6 from ..utils import (
      7     clean_html,
      8     ExtractorError,
      9     int_or_none,
     10     parse_iso8601,
     11     qualities,
     12     unescapeHTML,
     13 )
     14 
     15 
class Channel9IE(InfoExtractor):
    """Extractor for Microsoft Channel 9 (channel9.msdn.com / s.ch9.ms).

    Supports single recordings (event sessions and blog posts) as well as
    RSS-based listings: any matched URL whose path ends in ``/RSS`` is
    treated as a playlist and delegated to :meth:`_extract_list`.
    """
    IE_DESC = 'Channel 9'
    IE_NAME = 'channel9'
    # <contentpath> captures the whole path after the host; the optional
    # <rss> group (a literal trailing "/RSS") switches playlist mode on.
    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    _TESTS = [{
        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
        'md5': '32083d4eaf1946db6d454313f44510ca',
        'info_dict': {
            'id': '6c413323-383a-49dc-88f9-a22800cab024',
            'ext': 'wmv',
            'title': 'Developer Kick-Off Session: Stuff We Love',
            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
            'duration': 4576,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1377717420,
            'upload_date': '20130828',
            'session_code': 'KOS002',
            'session_room': 'Arena 1A',
            'session_speakers': 'count:5',
        },
    }, {
        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
        'info_dict': {
            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
            'ext': 'wmv',
            'title': 'Self-service BI with Power BI - nuclear testing',
            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
            'duration': 1540,
            'thumbnail': r're:https?://.*\.jpg',
            'timestamp': 1386381991,
            'upload_date': '20131207',
            'authors': ['Mike Wilmot'],
        },
    }, {
        # low quality mp4 is best
        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
        'info_dict': {
            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
            'ext': 'mp4',
            'title': 'Ranges for the Standard Library',
            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
            'duration': 5646,
            'thumbnail': r're:https?://.*\.jpg',
            'upload_date': '20150930',
            'timestamp': 1443640735,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
        'info_dict': {
            'id': 'Events/DEVintersection/DEVintersection-2016',
            'title': 'DEVintersection 2016 Orlando Sessions',
        },
        'playlist_mincount': 14,
    }, {
        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
        'only_matching': True,
    }, {
        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
        'only_matching': True,
    }]

    # Template for the RSS feed of a given content path.
    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    @staticmethod
    def _extract_urls(webpage):
        # Return the base URLs of any Channel 9 player iframes embedded
        # in the given webpage (used to discover embeds from other pages).
        return re.findall(
            r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b',
            webpage)

    def _extract_list(self, video_id, rss_url=None):
        """Build a playlist result from a Channel 9 RSS feed.

        video_id -- content path, reused as the playlist id
        rss_url  -- explicit feed URL; defaults to ``_RSS_URL % video_id``

        Each ``<item><link>`` in the feed becomes a url_result delegated
        back to this extractor; the feed's ``<title>`` becomes the
        playlist title.
        """
        if not rss_url:
            rss_url = self._RSS_URL % video_id
        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, video_id, title_text)

    def _real_extract(self, url):
        content_path, rss = re.match(self._VALID_URL, url).groups()

        # "/RSS" URLs are listings, not single videos.
        if rss:
            return self._extract_list(content_path, url)

        webpage = self._download_webpage(
            url, content_path, 'Downloading web page')

        # Modern pages embed an HTML-escaped JSON blob in data-episode;
        # without it we fall back to treating the path as a listing.
        episode_data = self._search_regex(
            r"data-episode='([^']+)'", webpage, 'episode data', default=None)
        if episode_data:
            episode_data = self._parse_json(unescapeHTML(
                episode_data), content_path)
            content_id = episode_data['contentId']
            # Sessions (events) and posts use different OData entity sets
            # and expose different metadata fields (see $select below).
            is_session = '/Sessions(' in episode_data['api']
            content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + '?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,'
            if is_session:
                content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers'
            else:
                content_url += 'Authors,Body&$expand=Authors'
            content_data = self._download_json(content_url, content_id)
            title = content_data['Title']

            # Preference order for format ids, worst to best.
            QUALITIES = (
                'mp3',
                'wmv', 'mp4',
                'wmv-low', 'mp4-low',
                'wmv-mid', 'mp4-mid',
                'wmv-high', 'mp4-high',
            )

            quality_key = qualities(QUALITIES)

            def quality(quality_id, format_url):
                # "_Source." files are originals: rank them above every
                # entry in QUALITIES regardless of their format id.
                return (len(QUALITIES) if '_Source.' in format_url
                        else quality_key(quality_id))

            formats = []
            urls = set()  # dedupe across the <select> and the OData API

            # Labels used by the page's download <select>, mapped to our
            # canonical format ids.
            SITE_QUALITIES = {
                'MP3': 'mp3',
                'MP4': 'mp4',
                'Low Quality WMV': 'wmv-low',
                'Low Quality MP4': 'mp4-low',
                'Mid Quality WMV': 'wmv-mid',
                'Mid Quality MP4': 'mp4-mid',
                'High Quality WMV': 'wmv-high',
                'High Quality MP4': 'mp4-high',
            }

            # First source of formats: the on-page format selector.
            formats_select = self._search_regex(
                r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
                'formats select', default=None)
            if formats_select:
                for mobj in re.finditer(
                        r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
                        formats_select):
                    format_url = mobj.group('url')
                    if format_url in urls:
                        continue
                    urls.add(format_url)
                    format_id = mobj.group('format')
                    # Unknown labels fall through as-is (quality -1).
                    quality_id = SITE_QUALITIES.get(format_id, format_id)
                    formats.append({
                        'url': format_url,
                        'format_id': quality_id,
                        'quality': quality(quality_id, format_url),
                        'vcodec': 'none' if quality_id == 'mp3' else None,
                    })

            # Second source of formats: direct fields on the OData entity.
            API_QUALITIES = {
                'VideoMP4Low': 'mp4-low',
                'VideoWMV': 'wmv-mid',
                'VideoMP4Medium': 'mp4-mid',
                'VideoMP4High': 'mp4-high',
                # NOTE(review): 'wmv-hq' is not present in QUALITIES above,
                # so quality_key() ranks it at -1 (below everything) —
                # possibly meant to be 'wmv-high'; confirm before changing,
                # since the format_id is user-visible.
                'VideoWMVHQ': 'wmv-hq',
            }

            for format_id, q in API_QUALITIES.items():
                q_url = content_data.get(format_id)
                if not q_url or q_url in urls:
                    continue
                urls.add(q_url)
                formats.append({
                    'url': q_url,
                    'format_id': q,
                    'quality': quality(q, q_url),
                })

            self._sort_formats(formats)

            slides = content_data.get('Slides')
            zip_file = content_data.get('ZipFile')

            # Some entries have only slides and/or a source zip; error out
            # only when there is nothing downloadable at all.
            if not formats and not slides and not zip_file:
                raise ExtractorError(
                    'None of recording, slides or zip are available for %s' % content_path)

            subtitles = {}
            for caption in content_data.get('Captions', []):
                caption_url = caption.get('Url')
                if not caption_url:
                    continue
                subtitles.setdefault(caption.get('Language', 'en'), []).append({
                    'url': caption_url,
                    'ext': 'vtt',
                })

            # Metadata shared by every entry (video, slides, zip).
            common = {
                'id': content_id,
                'title': title,
                'description': clean_html(content_data.get('Description') or content_data.get('Body')),
                'thumbnail': content_data.get('VideoPlayerPreviewImage'),
                'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
                'timestamp': parse_iso8601(content_data.get('PublishedDate')),
                'avg_rating': int_or_none(content_data.get('Rating')),
                'rating_count': int_or_none(content_data.get('RatingCount')),
                'view_count': int_or_none(content_data.get('Views')),
                'comment_count': int_or_none(content_data.get('CommentCount')),
                'subtitles': subtitles,
            }
            if is_session:
                speakers = []
                for s in content_data.get('Speakers', []):
                    speaker_name = s.get('FullName')
                    if not speaker_name:
                        continue
                    speakers.append(speaker_name)

                common.update({
                    'session_code': content_data.get('Code'),
                    'session_room': content_data.get('Room'),
                    'session_speakers': speakers,
                })
            else:
                authors = []
                for a in content_data.get('Authors', []):
                    author_name = a.get('DisplayName')
                    if not author_name:
                        continue
                    authors.append(author_name)
                common['authors'] = authors

            # Emit up to three entries: slides and zip as plain direct
            # downloads (single 'url'), the recording with its formats.
            contents = []

            if slides:
                d = common.copy()
                d.update({'title': title + '-Slides', 'url': slides})
                contents.append(d)

            if zip_file:
                d = common.copy()
                d.update({'title': title + '-Zip', 'url': zip_file})
                contents.append(d)

            if formats:
                d = common.copy()
                d.update({'title': title, 'formats': formats})
                contents.append(d)
            # NOTE(review): no playlist id/title is passed here, unlike the
            # RSS path above — the playlist result carries entries only.
            return self.playlist_result(contents)
        else:
            return self._extract_list(content_path)