drtuber.py (3968B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    NO_DEFAULT,
    parse_duration,
    str_to_int,
)


# Extractor for drtuber.com pages. _VALID_URL accepts the bare, "www." and
# "m." hosts, both /video/<id>[/<display_id>] and /embed/<id> paths.
class DrTuberIE(InfoExtractor):
    _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?'
    _TESTS = [{
        'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf',
        'md5': '93e680cf2536ad0dfb7e74d94a89facd',
        'info_dict': {
            'id': '1740434',
            'display_id': 'hot-perky-blonde-naked-golf',
            'ext': 'mp4',
            'title': 'hot perky blonde naked golf',
            'like_count': int,
            'comment_count': int,
            'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'],
            'thumbnail': r're:https?://.*\.jpg$',
            'age_limit': 18,
        }
    }, {
        'url': 'http://www.drtuber.com/embed/489939',
        'only_matching': True,
    }, {
        'url': 'http://m.drtuber.com/video/3893529/lingerie-blowjob-from-beautiful-teen',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the drtuber.com embed URLs used as <iframe> src in *webpage*.

        The scheme is optional in the pattern, so protocol-relative URLs
        ("//drtuber.com/embed/123") are returned as they appear in the page.
        """
        # Exactly one capturing group, so re.findall yields a flat list of
        # the matched URL strings.
        embed_re = r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)'
        return re.findall(embed_re, webpage)

    def _real_extract(self, url):
        """Build the info dict for the video addressed by *url*.

        Fetches the canonical video page and the player configuration JSON,
        then combines the format URLs from the JSON with metadata scraped
        from the page HTML.
        """
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        # Embed URLs carry no slug; fall back to the numeric id.
        display_id = match.group('display_id') or video_id

        # Always request the www /video/ page, whichever URL form was given.
        page_url = 'http://www.drtuber.com/video/%s' % video_id
        webpage = self._download_webpage(page_url, display_id)

        config_query = {
            'vid': video_id,
            'embed': 0,
            'aid': 0,
            'domain_id': 0,
        }
        video_data = self._download_json(
            'http://www.drtuber.com/player_config_json/', video_id,
            query=config_query)

        # One format per non-empty entry of the "files" mapping; the "hq"
        # entry is given the higher quality value.
        formats = [
            {
                'format_id': format_id,
                'quality': 2 if format_id == 'hq' else 1,
                'url': video_url
            }
            for format_id, video_url in video_data['files'].items()
            if video_url
        ]
        self._sort_formats(formats)

        # Use the "duration" field when it yields a truthy integer,
        # otherwise parse the "duration_format" field.
        duration = int_or_none(video_data.get('duration'))
        if not duration:
            duration = parse_duration(video_data.get('duration_format'))

        # Candidate title patterns, passed to the helper in this order.
        title_patterns = (
            r'<h1[^>]+class=["\']title[^>]+>([^<]+)',
            r'<title>([^<]+)\s*@\s+DrTuber',
            r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<',
            r'<p[^>]+class="title_substrate">([^<]+)</p>',
            r'<title>([^<]+) - \d+',
        )
        title = self._html_search_regex(title_patterns, webpage, 'title')

        thumbnail = self._html_search_regex(
            r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)

        def count_for(marker, label, default=NO_DEFAULT):
            # Read the number inside the <span> whose class or id equals
            # *marker* and convert it with str_to_int.
            raw = self._html_search_regex(
                r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % marker,
                webpage, '%s count' % label, default=default, fatal=False)
            return str_to_int(raw)

        like_count = count_for('rate_likes', 'like')
        dislike_count = count_for('rate_dislikes', 'dislike', default=None)
        comment_count = count_for('comments_count', 'comment')

        # Categories are the title attributes of the links inside the
        # "categories_list" div; an empty list when that div is not found.
        cats_html = self._search_regex(
            r'<div[^>]+class="categories_list">(.+?)</div>',
            webpage, 'categories', fatal=False)
        if cats_html:
            categories = re.findall(r'<a title="([^"]+)"', cats_html)
        else:
            categories = []

        age_limit = self._rta_search(webpage)

        return {
            'id': video_id,
            'display_id': display_id,
            'formats': formats,
            'title': title,
            'thumbnail': thumbnail,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'categories': categories,
            'age_limit': age_limit,
            'duration': duration,
        }