youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

redtube.py (5233B)


      1 from __future__ import unicode_literals
      2 
      3 import re
      4 
      5 from .common import InfoExtractor
      6 from ..utils import (
      7     determine_ext,
      8     ExtractorError,
      9     int_or_none,
     10     merge_dicts,
     11     str_to_int,
     12     unified_strdate,
     13     url_or_none,
     14 )
     15 
     16 
     17 class RedTubeIE(InfoExtractor):
     18     _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
     19     _TESTS = [{
     20         'url': 'http://www.redtube.com/66418',
     21         'md5': 'fc08071233725f26b8f014dba9590005',
     22         'info_dict': {
     23             'id': '66418',
     24             'ext': 'mp4',
     25             'title': 'Sucked on a toilet',
     26             'upload_date': '20110811',
     27             'duration': 596,
     28             'view_count': int,
     29             'age_limit': 18,
     30         }
     31     }, {
     32         'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
     33         'only_matching': True,
     34     }, {
     35         'url': 'http://it.redtube.com/66418',
     36         'only_matching': True,
     37     }]
     38 
     39     @staticmethod
     40     def _extract_urls(webpage):
     41         return re.findall(
     42             r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
     43             webpage)
     44 
     45     def _real_extract(self, url):
     46         video_id = self._match_id(url)
     47         webpage = self._download_webpage(
     48             'http://www.redtube.com/%s' % video_id, video_id)
     49 
     50         ERRORS = (
     51             (('video-deleted-info', '>This video has been removed'), 'has been removed'),
     52             (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'),
     53         )
     54 
     55         for patterns, message in ERRORS:
     56             if any(p in webpage for p in patterns):
     57                 raise ExtractorError(
     58                     'Video %s %s' % (video_id, message), expected=True)
     59 
     60         info = self._search_json_ld(webpage, video_id, default={})
     61 
     62         if not info.get('title'):
     63             info['title'] = self._html_search_regex(
     64                 (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
     65                  r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',),
     66                 webpage, 'title', group='title',
     67                 default=None) or self._og_search_title(webpage)
     68 
     69         formats = []
     70         sources = self._parse_json(
     71             self._search_regex(
     72                 r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
     73             video_id, fatal=False)
     74         if sources and isinstance(sources, dict):
     75             for format_id, format_url in sources.items():
     76                 if format_url:
     77                     formats.append({
     78                         'url': format_url,
     79                         'format_id': format_id,
     80                         'height': int_or_none(format_id),
     81                     })
     82         medias = self._parse_json(
     83             self._search_regex(
     84                 r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage,
     85                 'media definitions', default='{}'),
     86             video_id, fatal=False)
     87         if medias and isinstance(medias, list):
     88             for media in medias:
     89                 format_url = url_or_none(media.get('videoUrl'))
     90                 if not format_url:
     91                     continue
     92                 if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8':
     93                     formats.extend(self._extract_m3u8_formats(
     94                         format_url, video_id, 'mp4',
     95                         entry_protocol='m3u8_native', m3u8_id='hls',
     96                         fatal=False))
     97                     continue
     98                 format_id = media.get('quality')
     99                 formats.append({
    100                     'url': format_url,
    101                     'format_id': format_id,
    102                     'height': int_or_none(format_id),
    103                 })
    104         if not formats:
    105             video_url = self._html_search_regex(
    106                 r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
    107             formats.append({'url': video_url})
    108         self._sort_formats(formats)
    109 
    110         thumbnail = self._og_search_thumbnail(webpage)
    111         upload_date = unified_strdate(self._search_regex(
    112             r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<',
    113             webpage, 'upload date', default=None))
    114         duration = int_or_none(self._og_search_property(
    115             'video:duration', webpage, default=None) or self._search_regex(
    116                 r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
    117         view_count = str_to_int(self._search_regex(
    118             (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)',
    119              r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)',
    120              r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'),
    121             webpage, 'view count', default=None))
    122 
    123         # No self-labeling, but they describe themselves as
    124         # "Home of Videos Porno"
    125         age_limit = 18
    126 
    127         return merge_dicts(info, {
    128             'id': video_id,
    129             'ext': 'mp4',
    130             'thumbnail': thumbnail,
    131             'upload_date': upload_date,
    132             'duration': duration,
    133             'view_count': view_count,
    134             'age_limit': age_limit,
    135             'formats': formats,
    136         })