microsoftvirtualacademy.py (7476B)
1 from __future__ import unicode_literals 2 3 import re 4 5 from .common import InfoExtractor 6 from ..compat import ( 7 compat_xpath, 8 ) 9 from ..utils import ( 10 int_or_none, 11 parse_duration, 12 smuggle_url, 13 unsmuggle_url, 14 xpath_text, 15 ) 16 17 18 class MicrosoftVirtualAcademyBaseIE(InfoExtractor): 19 def _extract_base_url(self, course_id, display_id): 20 return self._download_json( 21 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, 22 display_id, 'Downloading course base URL') 23 24 def _extract_chapter_and_title(self, title): 25 if not title: 26 return None, None 27 m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title) 28 return (int(m.group('chapter')), m.group('title')) if m else (None, title) 29 30 31 class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): 32 IE_NAME = 'mva' 33 IE_DESC = 'Microsoft Virtual Academy videos' 34 _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME 35 36 _TESTS = [{ 37 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', 38 'md5': '7826c44fc31678b12ad8db11f6b5abb9', 39 'info_dict': { 40 'id': 'gfVXISmEB_6804984382', 41 'ext': 'mp4', 42 'title': 'Course Introduction', 43 'formats': 'mincount:3', 44 'subtitles': { 45 'en': [{ 46 'ext': 'ttml', 47 }], 48 }, 49 } 50 }, { 51 'url': 'mva:11788:gfVXISmEB_6804984382', 52 'only_matching': True, 53 }] 54 55 def _real_extract(self, url): 56 url, smuggled_data = unsmuggle_url(url, {}) 57 58 mobj = re.match(self._VALID_URL, url) 59 course_id = mobj.group('course_id') 60 video_id = mobj.group('id') 61 62 base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) 63 64 settings = self._download_xml( 65 '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), 66 video_id, 'Downloading video settings XML') 67 68 _, title = self._extract_chapter_and_title(xpath_text( 69 settings, './/Title', 'title', fatal=True)) 70 71 formats = [] 72 73 for sources in settings.findall(compat_xpath('.//MediaSources')): 74 sources_type = sources.get('videoType') 75 for source in sources.findall(compat_xpath('./MediaSource')): 76 video_url = source.text 77 if not video_url or not video_url.startswith('http'): 78 continue 79 if sources_type == 'smoothstreaming': 80 formats.extend(self._extract_ism_formats( 81 video_url, video_id, 'mss', fatal=False)) 82 continue 83 video_mode = source.get('videoMode') 84 height = int_or_none(self._search_regex( 85 r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) 86 codec = source.get('codec') 87 acodec, vcodec = [None] * 2 88 if codec: 89 codecs = codec.split(',') 90 if len(codecs) == 2: 91 acodec, vcodec = codecs 92 elif len(codecs) == 1: 93 vcodec = codecs[0] 94 formats.append({ 95 'url': video_url, 96 'format_id': video_mode, 97 'height': height, 98 'acodec': acodec, 99 'vcodec': vcodec, 100 }) 101 self._sort_formats(formats) 102 103 subtitles = {} 104 for source in settings.findall(compat_xpath('.//MarkerResourceSource')): 105 subtitle_url = source.text 106 if not subtitle_url: 107 continue 108 subtitles.setdefault('en', []).append({ 109 'url': '%s/%s' % (base_url, subtitle_url), 110 'ext': source.get('type'), 111 }) 112 113 return { 114 'id': video_id, 115 'title': title, 116 'subtitles': subtitles, 117 'formats': formats 118 } 119 120 121 class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): 122 IE_NAME = 'mva:course' 123 IE_DESC = 'Microsoft Virtual Academy courses' 124 _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME 125 126 _TESTS = [{ 127 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', 128 'info_dict': { 129 'id': '11788', 130 'title': 'Microsoft Azure Fundamentals: Virtual Machines', 131 }, 132 'playlist_count': 36, 133 }, { 134 # with emphasized chapters 135 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', 136 'info_dict': { 137 'id': '16335', 138 'title': 'Developing Windows 10 Games with Construct 2', 139 }, 140 'playlist_count': 10, 141 }, { 142 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', 143 'only_matching': True, 144 }, { 145 'url': 'mva:course:11788', 146 'only_matching': True, 147 }] 148 149 @classmethod 150 def suitable(cls, url): 151 return False if MicrosoftVirtualAcademyIE.suitable(url) else super( 152 MicrosoftVirtualAcademyCourseIE, cls).suitable(url) 153 154 def _real_extract(self, url): 155 mobj = re.match(self._VALID_URL, url) 156 course_id = mobj.group('id') 157 display_id = mobj.group('display_id') 158 159 base_url = self._extract_base_url(course_id, display_id) 160 161 manifest = self._download_json( 162 '%s/imsmanifestlite.json' % base_url, 163 display_id, 'Downloading course manifest JSON')['manifest'] 164 165 organization = manifest['organizations']['organization'][0] 166 167 entries = [] 168 for chapter in organization['item']: 169 chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) 170 chapter_id = chapter.get('@identifier') 171 for item in chapter.get('item', []): 172 item_id = item.get('@identifier') 173 if not item_id: 174 continue 175 metadata = item.get('resource', {}).get('metadata') or {} 176 if metadata.get('learningresourcetype') != 'Video': 177 continue 178 _, title = self._extract_chapter_and_title(item.get('title')) 179 duration = parse_duration(metadata.get('duration')) 180 description = metadata.get('description') 181 entries.append({ 182 '_type': 'url_transparent', 183 'url': smuggle_url( 184 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), 185 'title': title, 186 'description': description, 187 'duration': duration, 188 'chapter': chapter_title, 189 'chapter_number': chapter_number, 190 'chapter_id': chapter_id, 191 }) 192 193 title = organization.get('title') or manifest.get('metadata', {}).get('title') 194 195 return self.playlist_result(entries, course_id, title)