youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

lecturio.py (8437B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import re
      5 
      6 from .common import InfoExtractor
      7 from ..utils import (
      8     clean_html,
      9     determine_ext,
     10     ExtractorError,
     11     float_or_none,
     12     int_or_none,
     13     str_or_none,
     14     url_or_none,
     15     urlencode_postdata,
     16     urljoin,
     17 )
     18 
     19 
     20 class LecturioBaseIE(InfoExtractor):
     21     _API_BASE_URL = 'https://app.lecturio.com/api/en/latest/html5/'
     22     _LOGIN_URL = 'https://app.lecturio.com/en/login'
     23     _NETRC_MACHINE = 'lecturio'
     24 
     25     def _real_initialize(self):
     26         self._login()
     27 
     28     def _login(self):
     29         username, password = self._get_login_info()
     30         if username is None:
     31             return
     32 
     33         # Sets some cookies
     34         _, urlh = self._download_webpage_handle(
     35             self._LOGIN_URL, None, 'Downloading login popup')
     36 
     37         def is_logged(url_handle):
     38             return self._LOGIN_URL not in url_handle.geturl()
     39 
     40         # Already logged in
     41         if is_logged(urlh):
     42             return
     43 
     44         login_form = {
     45             'signin[email]': username,
     46             'signin[password]': password,
     47             'signin[remember]': 'on',
     48         }
     49 
     50         response, urlh = self._download_webpage_handle(
     51             self._LOGIN_URL, None, 'Logging in',
     52             data=urlencode_postdata(login_form))
     53 
     54         # Logged in successfully
     55         if is_logged(urlh):
     56             return
     57 
     58         errors = self._html_search_regex(
     59             r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response,
     60             'errors', default=None)
     61         if errors:
     62             raise ExtractorError('Unable to login: %s' % errors, expected=True)
     63         raise ExtractorError('Unable to log in')
     64 
     65 
     66 class LecturioIE(LecturioBaseIE):
     67     _VALID_URL = r'''(?x)
     68                     https://
     69                         (?:
     70                             app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))|
     71                             (?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag
     72                         )
     73                     '''
     74     _TESTS = [{
     75         'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos',
     76         'md5': '9a42cf1d8282a6311bf7211bbde26fde',
     77         'info_dict': {
     78             'id': '39634',
     79             'ext': 'mp4',
     80             'title': 'Important Concepts and Terms — Introduction to Microbiology',
     81         },
     82         'skip': 'Requires lecturio account credentials',
     83     }, {
     84         'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag',
     85         'only_matching': True,
     86     }, {
     87         'url': 'https://app.lecturio.com/#/lecture/c/6434/39634',
     88         'only_matching': True,
     89     }]
     90 
     91     _CC_LANGS = {
     92         'Arabic': 'ar',
     93         'Bulgarian': 'bg',
     94         'German': 'de',
     95         'English': 'en',
     96         'Spanish': 'es',
     97         'Persian': 'fa',
     98         'French': 'fr',
     99         'Japanese': 'ja',
    100         'Polish': 'pl',
    101         'Pashto': 'ps',
    102         'Russian': 'ru',
    103     }
    104 
    105     def _real_extract(self, url):
    106         mobj = re.match(self._VALID_URL, url)
    107         nt = mobj.group('nt') or mobj.group('nt_de')
    108         lecture_id = mobj.group('id')
    109         display_id = nt or lecture_id
    110         api_path = 'lectures/' + lecture_id if lecture_id else 'lecture/' + nt + '.json'
    111         video = self._download_json(
    112             self._API_BASE_URL + api_path, display_id)
    113         title = video['title'].strip()
    114         if not lecture_id:
    115             pid = video.get('productId') or video.get('uid')
    116             if pid:
    117                 spid = pid.split('_')
    118                 if spid and len(spid) == 2:
    119                     lecture_id = spid[1]
    120 
    121         formats = []
    122         for format_ in video['content']['media']:
    123             if not isinstance(format_, dict):
    124                 continue
    125             file_ = format_.get('file')
    126             if not file_:
    127                 continue
    128             ext = determine_ext(file_)
    129             if ext == 'smil':
    130                 # smil contains only broken RTMP formats anyway
    131                 continue
    132             file_url = url_or_none(file_)
    133             if not file_url:
    134                 continue
    135             label = str_or_none(format_.get('label'))
    136             filesize = int_or_none(format_.get('fileSize'))
    137             f = {
    138                 'url': file_url,
    139                 'format_id': label,
    140                 'filesize': float_or_none(filesize, invscale=1000)
    141             }
    142             if label:
    143                 mobj = re.match(r'(\d+)p\s*\(([^)]+)\)', label)
    144                 if mobj:
    145                     f.update({
    146                         'format_id': mobj.group(2),
    147                         'height': int(mobj.group(1)),
    148                     })
    149             formats.append(f)
    150         self._sort_formats(formats)
    151 
    152         subtitles = {}
    153         automatic_captions = {}
    154         captions = video.get('captions') or []
    155         for cc in captions:
    156             cc_url = cc.get('url')
    157             if not cc_url:
    158                 continue
    159             cc_label = cc.get('translatedCode')
    160             lang = cc.get('languageCode') or self._search_regex(
    161                 r'/([a-z]{2})_', cc_url, 'lang',
    162                 default=cc_label.split()[0] if cc_label else 'en')
    163             original_lang = self._search_regex(
    164                 r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang',
    165                 default=None)
    166             sub_dict = (automatic_captions
    167                         if 'auto-translated' in cc_label or original_lang
    168                         else subtitles)
    169             sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({
    170                 'url': cc_url,
    171             })
    172 
    173         return {
    174             'id': lecture_id or nt,
    175             'title': title,
    176             'formats': formats,
    177             'subtitles': subtitles,
    178             'automatic_captions': automatic_captions,
    179         }
    180 
    181 
    182 class LecturioCourseIE(LecturioBaseIE):
    183     _VALID_URL = r'https://app\.lecturio\.com/(?:[^/]+/(?P<nt>[^/?#&]+)\.course|(?:#/)?course/c/(?P<id>\d+))'
    184     _TESTS = [{
    185         'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/',
    186         'info_dict': {
    187             'id': 'microbiology-introduction',
    188             'title': 'Microbiology: Introduction',
    189             'description': 'md5:13da8500c25880c6016ae1e6d78c386a',
    190         },
    191         'playlist_count': 45,
    192         'skip': 'Requires lecturio account credentials',
    193     }, {
    194         'url': 'https://app.lecturio.com/#/course/c/6434',
    195         'only_matching': True,
    196     }]
    197 
    198     def _real_extract(self, url):
    199         nt, course_id = re.match(self._VALID_URL, url).groups()
    200         display_id = nt or course_id
    201         api_path = 'courses/' + course_id if course_id else 'course/content/' + nt + '.json'
    202         course = self._download_json(
    203             self._API_BASE_URL + api_path, display_id)
    204         entries = []
    205         for lecture in course.get('lectures', []):
    206             lecture_id = str_or_none(lecture.get('id'))
    207             lecture_url = lecture.get('url')
    208             if lecture_url:
    209                 lecture_url = urljoin(url, lecture_url)
    210             else:
    211                 lecture_url = 'https://app.lecturio.com/#/lecture/c/%s/%s' % (course_id, lecture_id)
    212             entries.append(self.url_result(
    213                 lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id))
    214         return self.playlist_result(
    215             entries, display_id, course.get('title'),
    216             clean_html(course.get('description')))
    217 
    218 
    219 class LecturioDeCourseIE(LecturioBaseIE):
    220     _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs'
    221     _TEST = {
    222         'url': 'https://www.lecturio.de/jura/grundrechte.kurs',
    223         'only_matching': True,
    224     }
    225 
    226     def _real_extract(self, url):
    227         display_id = self._match_id(url)
    228 
    229         webpage = self._download_webpage(url, display_id)
    230 
    231         entries = []
    232         for mobj in re.finditer(
    233                 r'(?s)<td[^>]+\bdata-lecture-id=["\'](?P<id>\d+).+?\bhref=(["\'])(?P<url>(?:(?!\2).)+\.vortrag)\b[^>]+>',
    234                 webpage):
    235             lecture_url = urljoin(url, mobj.group('url'))
    236             lecture_id = mobj.group('id')
    237             entries.append(self.url_result(
    238                 lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id))
    239 
    240         title = self._search_regex(
    241             r'<h1[^>]*>([^<]+)', webpage, 'title', default=None)
    242 
    243         return self.playlist_result(entries, display_id, title)