thisav.py (2531B)
1 # coding: utf-8 2 from __future__ import unicode_literals 3 4 import re 5 6 from .common import InfoExtractor 7 from ..utils import remove_end 8 9 10 class ThisAVIE(InfoExtractor): 11 _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' 12 _TESTS = [{ 13 # jwplayer 14 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', 15 'md5': '0480f1ef3932d901f0e0e719f188f19b', 16 'info_dict': { 17 'id': '47734', 18 'ext': 'flv', 19 'title': '高樹マリア - Just fit', 20 'uploader': 'dj7970', 21 'uploader_id': 'dj7970' 22 } 23 }, { 24 # html5 media 25 'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html', 26 'md5': 'ba90c076bd0f80203679e5b60bf523ee', 27 'info_dict': { 28 'id': '242352', 29 'ext': 'mp4', 30 'title': 'Nerdy 18yo Big Ass Tattoos and Glasses', 31 'uploader': 'cybersluts', 32 'uploader_id': 'cybersluts', 33 }, 34 }] 35 36 def _real_extract(self, url): 37 mobj = re.match(self._VALID_URL, url) 38 39 video_id = mobj.group('id') 40 webpage = self._download_webpage(url, video_id) 41 title = remove_end(self._html_search_regex( 42 r'<title>([^<]+)</title>', webpage, 'title'), 43 ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') 44 video_url = self._html_search_regex( 45 r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) 46 if video_url: 47 info_dict = { 48 'formats': [{ 49 'url': video_url, 50 }], 51 } 52 else: 53 entries = self._parse_html5_media_entries(url, webpage, video_id) 54 if entries: 55 info_dict = entries[0] 56 else: 57 info_dict = self._extract_jwplayer_data( 58 webpage, video_id, require_title=False) 59 uploader = self._html_search_regex( 60 r': <a href="http://www\.thisav\.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', 61 webpage, 'uploader name', fatal=False) 62 uploader_id = self._html_search_regex( 63 r': <a href="http://www\.thisav\.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', 64 webpage, 'uploader id', fatal=False) 65 66 info_dict.update({ 67 'id': video_id, 68 'uploader': uploader, 69 'uploader_id': uploader_id, 70 'title': title, 71 }) 72 73 return info_dict