youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

polskieradio.py (6668B)


      1 # coding: utf-8
      2 from __future__ import unicode_literals
      3 
      4 import itertools
      5 import re
      6 
      7 from .common import InfoExtractor
      8 from ..compat import (
      9     compat_str,
     10     compat_urllib_parse_unquote,
     11     compat_urlparse
     12 )
     13 from ..utils import (
     14     extract_attributes,
     15     int_or_none,
     16     strip_or_none,
     17     unified_timestamp,
     18 )
     19 
     20 
     21 class PolskieRadioIE(InfoExtractor):
     22     _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
     23     _TESTS = [{
     24         'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
     25         'info_dict': {
     26             'id': '1587943',
     27             'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
     28             'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
     29         },
     30         'playlist': [{
     31             'md5': '2984ee6ce9046d91fc233bc1a864a09a',
     32             'info_dict': {
     33                 'id': '1540576',
     34                 'ext': 'mp3',
     35                 'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
     36                 'timestamp': 1456594200,
     37                 'upload_date': '20160227',
     38                 'duration': 2364,
     39                 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
     40             },
     41         }],
     42     }, {
     43         'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
     44         'info_dict': {
     45             'id': '1635803',
     46             'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
     47             'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
     48         },
     49         'playlist_mincount': 12,
     50     }, {
     51         'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
     52         'only_matching': True,
     53     }, {
     54         'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
     55         'only_matching': True,
     56     }, {
     57         # with mp4 video
     58         'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
     59         'only_matching': True,
     60     }]
     61 
     62     def _real_extract(self, url):
     63         playlist_id = self._match_id(url)
     64 
     65         webpage = self._download_webpage(url, playlist_id)
     66 
     67         content = self._search_regex(
     68             r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
     69             webpage, 'content')
     70 
     71         timestamp = unified_timestamp(self._html_search_regex(
     72             r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
     73             webpage, 'timestamp', fatal=False))
     74 
     75         thumbnail_url = self._og_search_thumbnail(webpage)
     76 
     77         entries = []
     78 
     79         media_urls = set()
     80 
     81         for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
     82             media = self._parse_json(data_media, playlist_id, fatal=False)
     83             if not media.get('file') or not media.get('desc'):
     84                 continue
     85             media_url = self._proto_relative_url(media['file'], 'http:')
     86             if media_url in media_urls:
     87                 continue
     88             media_urls.add(media_url)
     89             entries.append({
     90                 'id': compat_str(media['id']),
     91                 'url': media_url,
     92                 'title': compat_urllib_parse_unquote(media['desc']),
     93                 'duration': int_or_none(media.get('length')),
     94                 'vcodec': 'none' if media.get('provider') == 'audio' else None,
     95                 'timestamp': timestamp,
     96                 'thumbnail': thumbnail_url
     97             })
     98 
     99         title = self._og_search_title(webpage).strip()
    100         description = strip_or_none(self._og_search_description(webpage))
    101 
    102         return self.playlist_result(entries, playlist_id, title, description)
    103 
    104 
    105 class PolskieRadioCategoryIE(InfoExtractor):
    106     _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)'
    107     _TESTS = [{
    108         'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
    109         'info_dict': {
    110             'id': '5102',
    111             'title': 'HISTORIA ŻYWA',
    112         },
    113         'playlist_mincount': 38,
    114     }, {
    115         'url': 'http://www.polskieradio.pl/7/4807',
    116         'info_dict': {
    117             'id': '4807',
    118             'title': 'Vademecum 1050. rocznicy Chrztu Polski'
    119         },
    120         'playlist_mincount': 5
    121     }, {
    122         'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
    123         'only_matching': True
    124     }, {
    125         'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
    126         'info_dict': {
    127             'id': '4143',
    128             'title': 'Kierunek Kraków',
    129         },
    130         'playlist_mincount': 61
    131     }, {
    132         'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
    133         'info_dict': {
    134             'id': '214',
    135             'title': 'Muzyka',
    136         },
    137         'playlist_mincount': 61
    138     }, {
    139         'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
    140         'only_matching': True,
    141     }, {
    142         'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
    143         'only_matching': True,
    144     }]
    145 
    146     @classmethod
    147     def suitable(cls, url):
    148         return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url)
    149 
    150     def _entries(self, url, page, category_id):
    151         content = page
    152         for page_num in itertools.count(2):
    153             for a_entry, entry_id in re.findall(
    154                     r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
    155                     content):
    156                 entry = extract_attributes(a_entry)
    157                 href = entry.get('href')
    158                 if not href:
    159                     continue
    160                 yield self.url_result(
    161                     compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(),
    162                     entry_id, entry.get('title'))
    163             mobj = re.search(
    164                 r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
    165                 content)
    166             if not mobj:
    167                 break
    168             next_url = compat_urlparse.urljoin(url, mobj.group('url'))
    169             content = self._download_webpage(
    170                 next_url, category_id, 'Downloading page %s' % page_num)
    171 
    172     def _real_extract(self, url):
    173         category_id = self._match_id(url)
    174         webpage = self._download_webpage(url, category_id)
    175         title = self._html_search_regex(
    176             r'<title>([^<]+) - [^<]+ - [^<]+</title>',
    177             webpage, 'title', fatal=False)
    178         return self.playlist_result(
    179             self._entries(url, webpage, category_id),
    180             category_id, title)