youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

udn.py (3575B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import re
      5 
      6 from .common import InfoExtractor
      7 from ..utils import (
      8     determine_ext,
      9     int_or_none,
     10     js_to_json,
     11 )
     12 from ..compat import compat_urlparse
     13 
     14 
     15 class UDNEmbedIE(InfoExtractor):
     16     IE_DESC = '聯合影音'
     17     _PROTOCOL_RELATIVE_VALID_URL = r'//video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
     18     _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL
     19     _TESTS = [{
     20         'url': 'http://video.udn.com/embed/news/300040',
     21         'info_dict': {
     22             'id': '300040',
     23             'ext': 'mp4',
     24             'title': '生物老師男變女 全校挺"做自己"',
     25             'thumbnail': r're:^https?://.*\.jpg$',
     26         },
     27         'params': {
     28             # m3u8 download
     29             'skip_download': True,
     30         },
     31         'expected_warnings': ['Failed to parse JSON Expecting value'],
     32     }, {
     33         'url': 'https://video.udn.com/embed/news/300040',
     34         'only_matching': True,
     35     }, {
     36         # From https://video.udn.com/news/303776
     37         'url': 'https://video.udn.com/play/news/303776',
     38         'only_matching': True,
     39     }]
     40 
     41     def _real_extract(self, url):
     42         video_id = self._match_id(url)
     43 
     44         page = self._download_webpage(url, video_id)
     45 
     46         options_str = self._html_search_regex(
     47             r'var\s+options\s*=\s*([^;]+);', page, 'options')
     48         trans_options_str = js_to_json(options_str)
     49         options = self._parse_json(trans_options_str, 'options', fatal=False) or {}
     50         if options:
     51             video_urls = options['video']
     52             title = options['title']
     53             poster = options.get('poster')
     54         else:
     55             video_urls = self._parse_json(self._html_search_regex(
     56                 r'"video"\s*:\s*({.+?})\s*,', trans_options_str, 'video urls'), 'video urls')
     57             title = self._html_search_regex(
     58                 r"title\s*:\s*'(.+?)'\s*,", options_str, 'title')
     59             poster = self._html_search_regex(
     60                 r"poster\s*:\s*'(.+?)'\s*,", options_str, 'poster', default=None)
     61 
     62         if video_urls.get('youtube'):
     63             return self.url_result(video_urls.get('youtube'), 'Youtube')
     64 
     65         formats = []
     66         for video_type, api_url in video_urls.items():
     67             if not api_url:
     68                 continue
     69 
     70             video_url = self._download_webpage(
     71                 compat_urlparse.urljoin(url, api_url), video_id,
     72                 note='retrieve url for %s video' % video_type)
     73 
     74             ext = determine_ext(video_url)
     75             if ext == 'm3u8':
     76                 formats.extend(self._extract_m3u8_formats(
     77                     video_url, video_id, ext='mp4', m3u8_id='hls'))
     78             elif ext == 'f4m':
     79                 formats.extend(self._extract_f4m_formats(
     80                     video_url, video_id, f4m_id='hds'))
     81             else:
     82                 mobj = re.search(r'_(?P<height>\d+)p_(?P<tbr>\d+)\.mp4', video_url)
     83                 a_format = {
     84                     'url': video_url,
     85                     # video_type may be 'mp4', which confuses YoutubeDL
     86                     'format_id': 'http-' + video_type,
     87                 }
     88                 if mobj:
     89                     a_format.update({
     90                         'height': int_or_none(mobj.group('height')),
     91                         'tbr': int_or_none(mobj.group('tbr')),
     92                     })
     93                 formats.append(a_format)
     94 
     95         self._sort_formats(formats)
     96 
     97         return {
     98             'id': video_id,
     99             'formats': formats,
    100             'title': title,
    101             'thumbnail': poster,
    102         }