youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

nextmedia.py (8975B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 from .common import InfoExtractor
      5 from ..compat import compat_urlparse
      6 from ..utils import (
      7     clean_html,
      8     get_element_by_class,
      9     int_or_none,
     10     parse_iso8601,
     11     remove_start,
     12     unified_timestamp,
     13 )
     14 
     15 
     16 class NextMediaIE(InfoExtractor):
     17     IE_DESC = '蘋果日報'
     18     _VALID_URL = r'https?://hk\.apple\.nextmedia\.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
     19     _TESTS = [{
     20         'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
     21         'md5': 'dff9fad7009311c421176d1ac90bfe4f',
     22         'info_dict': {
     23             'id': '53109199',
     24             'ext': 'mp4',
     25             'title': '【佔領金鐘】50外國領事議員撐場 讚學生勇敢香港有希望',
     26             'thumbnail': r're:^https?://.*\.jpg$',
     27             'description': 'md5:28222b9912b6665a21011b034c70fcc7',
     28             'timestamp': 1415456273,
     29             'upload_date': '20141108',
     30         }
     31     }]
     32 
     33     _URL_PATTERN = r'\{ url: \'(.+)\' \}'
     34 
     35     def _real_extract(self, url):
     36         news_id = self._match_id(url)
     37         page = self._download_webpage(url, news_id)
     38         return self._extract_from_nextmedia_page(news_id, url, page)
     39 
     40     def _extract_from_nextmedia_page(self, news_id, url, page):
     41         redirection_url = self._search_regex(
     42             r'window\.location\.href\s*=\s*([\'"])(?P<url>(?!\1).+)\1',
     43             page, 'redirection URL', default=None, group='url')
     44         if redirection_url:
     45             return self.url_result(compat_urlparse.urljoin(url, redirection_url))
     46 
     47         title = self._fetch_title(page)
     48         video_url = self._search_regex(self._URL_PATTERN, page, 'video url')
     49 
     50         attrs = {
     51             'id': news_id,
     52             'title': title,
     53             'url': video_url,  # ext can be inferred from url
     54             'thumbnail': self._fetch_thumbnail(page),
     55             'description': self._fetch_description(page),
     56         }
     57 
     58         timestamp = self._fetch_timestamp(page)
     59         if timestamp:
     60             attrs['timestamp'] = timestamp
     61         else:
     62             attrs['upload_date'] = self._fetch_upload_date(url)
     63 
     64         return attrs
     65 
     66     def _fetch_title(self, page):
     67         return self._og_search_title(page)
     68 
     69     def _fetch_thumbnail(self, page):
     70         return self._og_search_thumbnail(page)
     71 
     72     def _fetch_timestamp(self, page):
     73         dateCreated = self._search_regex('"dateCreated":"([^"]+)"', page, 'created time')
     74         return parse_iso8601(dateCreated)
     75 
     76     def _fetch_upload_date(self, url):
     77         return self._search_regex(self._VALID_URL, url, 'upload date', group='date')
     78 
     79     def _fetch_description(self, page):
     80         return self._og_search_property('description', page)
     81 
     82 
     83 class NextMediaActionNewsIE(NextMediaIE):
     84     IE_DESC = '蘋果日報 - 動新聞'
     85     _VALID_URL = r'https?://hk\.dv\.nextmedia\.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
     86     _TESTS = [{
     87         'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
     88         'md5': '05fce8ffeed7a5e00665d4b7cf0f9201',
     89         'info_dict': {
     90             'id': '19009428',
     91             'ext': 'mp4',
     92             'title': '【壹週刊】細10年男友偷食 50歲邵美琪再失戀',
     93             'thumbnail': r're:^https?://.*\.jpg$',
     94             'description': 'md5:cd802fad1f40fd9ea178c1e2af02d659',
     95             'timestamp': 1421791200,
     96             'upload_date': '20150120',
     97         }
     98     }]
     99 
    100     def _real_extract(self, url):
    101         news_id = self._match_id(url)
    102         actionnews_page = self._download_webpage(url, news_id)
    103         article_url = self._og_search_url(actionnews_page)
    104         article_page = self._download_webpage(article_url, news_id)
    105         return self._extract_from_nextmedia_page(news_id, url, article_page)
    106 
    107 
    108 class AppleDailyIE(NextMediaIE):
    109     IE_DESC = '臺灣蘋果日報'
    110     _VALID_URL = r'https?://(www|ent)\.appledaily\.com\.tw/[^/]+/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
    111     _TESTS = [{
    112         'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
    113         'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
    114         'info_dict': {
    115             'id': '36354694',
    116             'ext': 'mp4',
    117             'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
    118             'thumbnail': r're:^https?://.*\.jpg$',
    119             'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
    120             'upload_date': '20150128',
    121         }
    122     }, {
    123         'url': 'http://www.appledaily.com.tw/realtimenews/article/strange/20150128/550549/%E4%B8%8D%E6%BB%BF%E8%A2%AB%E8%B8%A9%E8%85%B3%E3%80%80%E5%B1%B1%E6%9D%B1%E5%85%A9%E5%A4%A7%E5%AA%BD%E4%B8%80%E8%B7%AF%E6%89%93%E4%B8%8B%E8%BB%8A',
    124         'md5': '86b4e9132d158279c7883822d94ccc49',
    125         'info_dict': {
    126             'id': '550549',
    127             'ext': 'mp4',
    128             'title': '不滿被踩腳 山東兩大媽一路打下車',
    129             'thumbnail': r're:^https?://.*\.jpg$',
    130             'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
    131             'upload_date': '20150128',
    132         }
    133     }, {
    134         'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
    135         'md5': '03df296d95dedc2d5886debbb80cb43f',
    136         'info_dict': {
    137             'id': '5003671',
    138             'ext': 'mp4',
    139             'title': '20正妹熱舞 《刀龍傳說Online》火辣上市',
    140             'thumbnail': r're:^https?://.*\.jpg$',
    141             'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd',
    142             'upload_date': '20150128',
    143         },
    144         'skip': 'redirect to http://www.appledaily.com.tw/animation/',
    145     }, {
    146         # No thumbnail
    147         'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/',
    148         'md5': 'b06182cd386ea7bc6115ec7ff0f72aeb',
    149         'info_dict': {
    150             'id': '5003673',
    151             'ext': 'mp4',
    152             'title': '半夜尿尿 好像會看到___',
    153             'description': 'md5:61d2da7fe117fede148706cdb85ac066',
    154             'upload_date': '20150128',
    155         },
    156         'expected_warnings': [
    157             'video thumbnail',
    158         ],
    159         'skip': 'redirect to http://www.appledaily.com.tw/animation/',
    160     }, {
    161         'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
    162         'md5': 'eaa20e6b9df418c912d7f5dec2ba734d',
    163         'info_dict': {
    164             'id': '35770334',
    165             'ext': 'mp4',
    166             'title': '咖啡占卜測 XU裝熟指數',
    167             'thumbnail': r're:^https?://.*\.jpg$',
    168             'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748',
    169             'upload_date': '20140417',
    170         },
    171     }, {
    172         'url': 'http://www.appledaily.com.tw/actionnews/appledaily/7/20161003/960588/',
    173         'only_matching': True,
    174     }, {
    175         # Redirected from http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694
    176         'url': 'http://ent.appledaily.com.tw/section/article/headline/20150128/36354694',
    177         'only_matching': True,
    178     }]
    179 
    180     _URL_PATTERN = r'\{url: \'(.+)\'\}'
    181 
    182     def _fetch_title(self, page):
    183         return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None)
    184                 or self._html_search_meta('description', page, 'news title'))
    185 
    186     def _fetch_thumbnail(self, page):
    187         return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
    188 
    189     def _fetch_timestamp(self, page):
    190         return None
    191 
    192     def _fetch_description(self, page):
    193         return self._html_search_meta('description', page, 'news description')
    194 
    195 
    196 class NextTVIE(InfoExtractor):
    197     IE_DESC = '壹電視'
    198     _VALID_URL = r'https?://(?:www\.)?nexttv\.com\.tw/(?:[^/]+/)+(?P<id>\d+)'
    199 
    200     _TEST = {
    201         'url': 'http://www.nexttv.com.tw/news/realtime/politics/11779671',
    202         'info_dict': {
    203             'id': '11779671',
    204             'ext': 'mp4',
    205             'title': '「超收稅」近4千億! 藍議員籲發消費券',
    206             'thumbnail': r're:^https?://.*\.jpg$',
    207             'timestamp': 1484825400,
    208             'upload_date': '20170119',
    209             'view_count': int,
    210         },
    211     }
    212 
    213     def _real_extract(self, url):
    214         video_id = self._match_id(url)
    215 
    216         webpage = self._download_webpage(url, video_id)
    217 
    218         title = self._html_search_regex(
    219             r'<h1[^>]*>([^<]+)</h1>', webpage, 'title')
    220 
    221         data = self._hidden_inputs(webpage)
    222 
    223         video_url = data['ntt-vod-src-detailview']
    224 
    225         date_str = get_element_by_class('date', webpage)
    226         timestamp = unified_timestamp(date_str + '+0800') if date_str else None
    227 
    228         view_count = int_or_none(remove_start(
    229             clean_html(get_element_by_class('click', webpage)), '點閱:'))
    230 
    231         return {
    232             'id': video_id,
    233             'title': title,
    234             'url': video_url,
    235             'thumbnail': data.get('ntt-vod-img-src'),
    236             'timestamp': timestamp,
    237             'view_count': view_count,
    238         }