youjizz.py (3126B)
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    int_or_none,
    parse_duration,
    url_or_none,
)


class YouJizzIE(InfoExtractor):
    # Extractor for youjizz.com video pages and player embeds.
    #
    # _VALID_URL captures the numeric video ID in one of two groups:
    # `id` for regular pages (".../videos/<slug>-<id>.html", the slug may be
    # empty) and `embed_id` for embeds (".../videos/embed/<id>").
    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'
    _TESTS = [{
        'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
        'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',
        'info_dict': {
            'id': '2189178',
            'ext': 'mp4',
            'title': 'Zeichentrick 1',
            'age_limit': 18,
            'duration': 2874,
        }
    }, {
        'url': 'http://www.youjizz.com/videos/-2189178.html',
        'only_matching': True,
    }, {
        'url': 'https://www.youjizz.com/videos/embed/31991001',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the page at `url` and build the info dict for its video.

        Formats are read from the JavaScript `encodings` array embedded in
        the page; when that yields nothing, the page's HTML5 media markup is
        parsed instead. The returned dict always carries `id`, `title`,
        `age_limit`, `duration` and `uploader` (the last two may be None
        when the page does not expose them).
        """
        mobj = re.match(self._VALID_URL, url)
        # Exactly one of the two ID groups participates in a match.
        video_id = mobj.group('id') or mobj.group('embed_id')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'title')

        formats = []

        # With fatal=False a malformed JSON blob gives None rather than an
        # exception; fall back to an empty list so the loop is skipped and
        # the HTML5 fallback below still gets its chance.
        encodings = self._parse_json(
            self._search_regex(
                r'[Ee]ncodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
                default='[]'),
            video_id, fatal=False) or []
        for encoding in encodings:
            if not isinstance(encoding, dict):
                continue
            format_url = url_or_none(encoding.get('filename'))
            if not format_url:
                continue
            if determine_ext(format_url) == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
            else:
                # The label is optional and comes from untrusted JSON, so it
                # may be missing or not a string (e.g. a bare number). Only
                # run the regex on a real, stringified label.
                label = encoding.get('name') or encoding.get('quality')
                format_id = None
                height = None
                if label:
                    format_id = '%s' % (label,)
                    # Labels such as "720p" encode the height up front.
                    height = int_or_none(self._search_regex(
                        r'^(\d+)[pP]', format_id, 'height', default=None))
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                    'height': height,
                })

        if formats:
            info_dict = {
                'formats': formats,
            }
        else:
            # YouJizz's HTML5 player has invalid HTML
            webpage = webpage.replace('"controls', '" controls')
            # NOTE(review): assumes at least one media entry is found; an
            # empty result would fail on the [0] index - confirm whether a
            # clearer extractor error is wanted here.
            info_dict = self._parse_html5_media_entries(
                url, webpage, video_id)[0]

        duration = parse_duration(self._search_regex(
            r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',
            default=None))
        uploader = self._search_regex(
            r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',
            default=None)

        info_dict.update({
            'id': video_id,
            'title': title,
            'age_limit': self._rta_search(webpage),
            'duration': duration,
            'uploader': uploader,
        })

        return info_dict