youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

newgrounds.py (5583B)


      1 from __future__ import unicode_literals
      2 
      3 import re
      4 
      5 from .common import InfoExtractor
      6 from ..utils import (
      7     extract_attributes,
      8     int_or_none,
      9     parse_duration,
     10     parse_filesize,
     11     unified_timestamp,
     12 )
     13 
     14 
     15 class NewgroundsIE(InfoExtractor):
     16     _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)'
     17     _TESTS = [{
     18         'url': 'https://www.newgrounds.com/audio/listen/549479',
     19         'md5': 'fe6033d297591288fa1c1f780386f07a',
     20         'info_dict': {
     21             'id': '549479',
     22             'ext': 'mp3',
     23             'title': 'B7 - BusMode',
     24             'uploader': 'Burn7',
     25             'timestamp': 1378878540,
     26             'upload_date': '20130911',
     27             'duration': 143,
     28         },
     29     }, {
     30         'url': 'https://www.newgrounds.com/portal/view/673111',
     31         'md5': '3394735822aab2478c31b1004fe5e5bc',
     32         'info_dict': {
     33             'id': '673111',
     34             'ext': 'mp4',
     35             'title': 'Dancin',
     36             'uploader': 'Squirrelman82',
     37             'timestamp': 1460256780,
     38             'upload_date': '20160410',
     39         },
     40     }, {
     41         # source format unavailable, additional mp4 formats
     42         'url': 'http://www.newgrounds.com/portal/view/689400',
     43         'info_dict': {
     44             'id': '689400',
     45             'ext': 'mp4',
     46             'title': 'ZTV News Episode 8',
     47             'uploader': 'BennettTheSage',
     48             'timestamp': 1487965140,
     49             'upload_date': '20170224',
     50         },
     51         'params': {
     52             'skip_download': True,
     53         },
     54     }]
     55 
     56     def _real_extract(self, url):
     57         media_id = self._match_id(url)
     58 
     59         webpage = self._download_webpage(url, media_id)
     60 
     61         title = self._html_search_regex(
     62             r'<title>([^>]+)</title>', webpage, 'title')
     63 
     64         media_url = self._parse_json(self._search_regex(
     65             r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id)
     66 
     67         formats = [{
     68             'url': media_url,
     69             'format_id': 'source',
     70             'quality': 1,
     71         }]
     72 
     73         max_resolution = int_or_none(self._search_regex(
     74             r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
     75             default=None))
     76         if max_resolution:
     77             url_base = media_url.rpartition('.')[0]
     78             for resolution in (360, 720, 1080):
     79                 if resolution > max_resolution:
     80                     break
     81                 formats.append({
     82                     'url': '%s.%dp.mp4' % (url_base, resolution),
     83                     'format_id': '%dp' % resolution,
     84                     'height': resolution,
     85                 })
     86 
     87         self._check_formats(formats, media_id)
     88         self._sort_formats(formats)
     89 
     90         uploader = self._html_search_regex(
     91             (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>',
     92              r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
     93             fatal=False)
     94 
     95         timestamp = unified_timestamp(self._html_search_regex(
     96             (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
     97              r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp',
     98             default=None))
     99         duration = parse_duration(self._search_regex(
    100             r'(?s)<dd>\s*Song\s*</dd>\s*<dd>.+?</dd>\s*<dd>([^<]+)', webpage,
    101             'duration', default=None))
    102 
    103         filesize_approx = parse_filesize(self._html_search_regex(
    104             r'(?s)<dd>\s*Song\s*</dd>\s*<dd>(.+?)</dd>', webpage, 'filesize',
    105             default=None))
    106         if len(formats) == 1:
    107             formats[0]['filesize_approx'] = filesize_approx
    108 
    109         if '<dd>Song' in webpage:
    110             formats[0]['vcodec'] = 'none'
    111 
    112         return {
    113             'id': media_id,
    114             'title': title,
    115             'uploader': uploader,
    116             'timestamp': timestamp,
    117             'duration': duration,
    118             'formats': formats,
    119         }
    120 
    121 
    122 class NewgroundsPlaylistIE(InfoExtractor):
    123     _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)'
    124     _TESTS = [{
    125         'url': 'https://www.newgrounds.com/collection/cats',
    126         'info_dict': {
    127             'id': 'cats',
    128             'title': 'Cats',
    129         },
    130         'playlist_mincount': 46,
    131     }, {
    132         'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA',
    133         'info_dict': {
    134             'id': 'ZONE-SAMA',
    135             'title': 'Portal Search: ZONE-SAMA',
    136         },
    137         'playlist_mincount': 47,
    138     }, {
    139         'url': 'http://www.newgrounds.com/audio/search/title/cats',
    140         'only_matching': True,
    141     }]
    142 
    143     def _real_extract(self, url):
    144         playlist_id = self._match_id(url)
    145 
    146         webpage = self._download_webpage(url, playlist_id)
    147 
    148         title = self._search_regex(
    149             r'<title>([^>]+)</title>', webpage, 'title', default=None)
    150 
    151         # cut left menu
    152         webpage = self._search_regex(
    153             r'(?s)<div[^>]+\bclass=["\']column wide(.+)',
    154             webpage, 'wide column', default=webpage)
    155 
    156         entries = []
    157         for a, path, media_id in re.findall(
    158                 r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)',
    159                 webpage):
    160             a_class = extract_attributes(a).get('class')
    161             if a_class not in ('item-portalsubmission', 'item-audiosubmission'):
    162                 continue
    163             entries.append(
    164                 self.url_result(
    165                     'https://www.newgrounds.com/%s' % path,
    166                     ie=NewgroundsIE.ie_key(), video_id=media_id))
    167 
    168         return self.playlist_result(entries, playlist_id, title)