redtube.py (5233B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    ExtractorError,
    int_or_none,
    merge_dicts,
    str_to_int,
    unified_strdate,
    url_or_none,
)


class RedTubeIE(InfoExtractor):
    """Extractor for redtube.com watch pages and embed.redtube.com embeds."""

    _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://www.redtube.com/66418',
        'md5': 'fc08071233725f26b8f014dba9590005',
        'info_dict': {
            'id': '66418',
            'ext': 'mp4',
            'title': 'Sucked on a toilet',
            'upload_date': '20110811',
            'duration': 596,
            'view_count': int,
            'age_limit': 18,
        }
    }, {
        'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
        'only_matching': True,
    }, {
        'url': 'http://it.redtube.com/66418',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` URLs of all embed.redtube.com iframes in *webpage*.

        The result is a list of strings (possibly empty); scheme-relative
        URLs (``//embed.redtube.com/...``) are returned as found.
        """
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
            webpage)

    def _real_extract(self, url):
        """Build the info dict for the video identified by *url*.

        The canonical ``www.redtube.com/<id>`` page is always the one that
        is downloaded, regardless of which URL form matched.

        Raises:
            ExtractorError: (expected) when the page carries one of the
                "removed" or "private" markers listed in ``ERRORS``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://www.redtube.com/%s' % video_id, video_id)

        # Each entry pairs a tuple of page markers with the message that is
        # reported when any one of those markers occurs in the page.
        ERRORS = (
            (('video-deleted-info', '>This video has been removed'), 'has been removed'),
            (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'),
        )

        for patterns, message in ERRORS:
            if any(p in webpage for p in patterns):
                raise ExtractorError(
                    'Video %s %s' % (video_id, message), expected=True)

        info = self._search_json_ld(webpage, video_id, default={})

        # JSON-LD may be missing or lack a title: fall back to the page
        # markup / inline player config, then to the OpenGraph title.
        if not info.get('title'):
            info['title'] = self._html_search_regex(
                (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
                 r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',),
                webpage, 'title', group='title',
                default=None) or self._og_search_title(webpage)

        formats = []

        # Source 1: a "sources" JSON object whose keys are used as both the
        # format id and (when numeric) the height.
        sources = self._parse_json(
            self._search_regex(
                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
            video_id, fatal=False)
        if sources and isinstance(sources, dict):
            for format_id, format_url in sources.items():
                if format_url:
                    formats.append({
                        'url': format_url,
                        'format_id': format_id,
                        'height': int_or_none(format_id),
                    })

        # Source 2: a "mediaDefinition" JSON list of per-format objects.
        medias = self._parse_json(
            self._search_regex(
                r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage,
                'media definitions', default='{}'),
            video_id, fatal=False)
        if medias and isinstance(medias, list):
            for media in medias:
                # Skip malformed entries (e.g. null) instead of failing the
                # whole extraction with an AttributeError on .get().
                if not isinstance(media, dict):
                    continue
                format_url = url_or_none(media.get('videoUrl'))
                if not format_url:
                    continue
                if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4',
                        entry_protocol='m3u8_native', m3u8_id='hls',
                        fatal=False))
                    continue
                format_id = media.get('quality')
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                    'height': int_or_none(format_id),
                })

        # Last resort: a plain <source> tag. Unlike the lookups above, no
        # default is supplied for this search.
        if not formats:
            video_url = self._html_search_regex(
                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
            formats.append({'url': video_url})
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)
        upload_date = unified_strdate(self._search_regex(
            r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<',
            webpage, 'upload date', default=None))
        # Prefer the OpenGraph duration; fall back to the inline player value.
        duration = int_or_none(self._og_search_property(
            'video:duration', webpage, default=None) or self._search_regex(
                r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
        view_count = str_to_int(self._search_regex(
            (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)',
             r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)',
             r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'),
            webpage, 'view count', default=None))

        # No self-labeling, but they describe themselves as
        # "Home of Videos Porno"
        age_limit = 18

        return merge_dicts(info, {
            'id': video_id,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'duration': duration,
            'view_count': view_count,
            'age_limit': age_limit,
            'formats': formats,
        })