motherless.py (8892B)
from __future__ import unicode_literals

import datetime
import re

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    ExtractorError,
    InAdvancePagedList,
    orderedSet,
    str_to_int,
    unified_strdate,
)


class MotherlessIE(InfoExtractor):
    """Extractor for a single motherless.com media page."""

    _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
    _TESTS = [{
        'url': 'http://motherless.com/AC3FFE1',
        'md5': '310f62e325a9fafe64f68c0bccb6e75f',
        'info_dict': {
            'id': 'AC3FFE1',
            'ext': 'mp4',
            'title': 'Fucked in the ass while playing PS3',
            'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
            'upload_date': '20100913',
            'uploader_id': 'famouslyfuckedup',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        }
    }, {
        'url': 'http://motherless.com/532291B',
        'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
        'info_dict': {
            'id': '532291B',
            'ext': 'mp4',
            'title': 'Amazing girl playing the omegle game, PERFECT!',
            'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
                           'game', 'hairy'],
            'upload_date': '20140622',
            'uploader_id': 'Sulivana7x',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        },
        'skip': '404',
    }, {
        'url': 'http://motherless.com/g/cosplay/633979F',
        'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
        'info_dict': {
            'id': '633979F',
            'ext': 'mp4',
            'title': 'Turtlette',
            'categories': ['superheroine heroine superher'],
            'upload_date': '20140827',
            'uploader_id': 'shade0230',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        }
    }, {
        # no keywords
        'url': 'http://motherless.com/8B4BBC1',
        'only_matching': True,
    }, {
        # see https://motherless.com/videos/recent for recent videos with
        # uploaded date in "ago" format
        'url': 'https://motherless.com/3C3E2CF',
        'info_dict': {
            'id': '3C3E2CF',
            'ext': 'mp4',
            'title': 'a/ Hot Teens',
            'categories': list,
            'upload_date': '20210104',
            'uploader_id': 'yonbiw',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Maps the unit suffix of a relative "<N>h ago" / "<N>d ago" upload time
    # to the matching datetime.timedelta keyword argument.
    _AGO_UNITS = {
        'h': 'hours',
        'd': 'days',
    }

    def _extract_upload_date(self, webpage):
        """Return the upload date string for webpage, or None if not found.

        An absolute "<day> <Mon> <year>" date is tried first and passed
        through unified_strdate(). Otherwise a relative "<N>h ago" /
        "<N>d ago" marker is subtracted from the current UTC time and
        formatted as %Y%m%d.
        """
        upload_date = unified_strdate(self._search_regex(
            r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
            'upload date', default=None))
        if upload_date:
            return upload_date

        uploaded_ago = self._search_regex(
            r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
            default=None)
        if not uploaded_ago:
            return None

        # The regex only admits an 'h' or 'd' suffix, so the lookup cannot miss
        kwargs = {self._AGO_UNITS[uploaded_ago[-1]]: int(uploaded_ago[:-1])}
        return (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')

    def _real_extract(self, url):
        """Download the media page at url and build its info dict.

        Raises ExtractorError (expected) when the page reports that the
        video does not exist or is restricted to friends only.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        if any(p in webpage for p in (
                '<title>404 - MOTHERLESS.COM<',
                ">The page you're looking for cannot be found.<")):
            raise ExtractorError('Video %s does not exist' % video_id, expected=True)

        if '>The content you are trying to view is for friends only.' in webpage:
            raise ExtractorError('Video %s is for friends only' % video_id, expected=True)

        title = self._html_search_regex(
            (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
             r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
        # Fall back to a URL built from the video id when the page exposes
        # no explicit file URL
        video_url = (self._html_search_regex(
            (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
             r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
            webpage, 'video URL', default=None, group='url')
            or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
        age_limit = self._rta_search(webpage)
        view_count = str_to_int(self._html_search_regex(
            (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
            webpage, 'view count', fatal=False))
        like_count = str_to_int(self._html_search_regex(
            (r'>([\d,.]+)\s+Favorites<',
             r'<strong>Favorited</strong>\s+([^<]+)<'),
            webpage, 'like count', fatal=False))

        upload_date = self._extract_upload_date(webpage)

        comment_count = webpage.count('class="media-comment-contents"')
        # uploader_id is optional metadata: a missing or changed uploader
        # block must not abort an otherwise successful extraction
        uploader_id = self._html_search_regex(
            r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
            webpage, 'uploader_id', fatal=False)

        categories = self._html_search_meta('keywords', webpage, default=None)
        if categories:
            categories = [cat.strip() for cat in categories.split(',')]

        return {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': self._og_search_thumbnail(webpage),
            'categories': categories,
            'view_count': view_count,
            'like_count': like_count,
            'comment_count': comment_count,
            'age_limit': age_limit,
            'url': video_url,
        }


class MotherlessGroupIE(InfoExtractor):
    """Extractor for a motherless.com group, returned as a paged playlist."""

    _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
    _TESTS = [{
        'url': 'http://motherless.com/g/movie_scenes',
        'info_dict': {
            'id': 'movie_scenes',
            'title': 'Movie Scenes',
            'description': 'Hot and sexy scenes from "regular" movies... '
                           'Beautiful actresses fully nude... A looot of '
                           'skin! :)Enjoy!',
        },
        'playlist_mincount': 662,
    }, {
        'url': 'http://motherless.com/gv/sex_must_be_funny',
        'info_dict': {
            'id': 'sex_must_be_funny',
            'title': 'Sex must be funny',
            'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
                           'any kind!'
        },
        'playlist_mincount': 9,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject any URL that MotherlessIE already accepts."""
        return (False if MotherlessIE.suitable(url)
                else super(MotherlessGroupIE, cls).suitable(url))

    def _extract_entries(self, webpage, base):
        """Return url_result entries for the videos linked from webpage.

        Relative hrefs are resolved against base. Only links that
        MotherlessIE accepts are kept. If none are found, entries are
        built from the page's data-codename attributes instead.
        """
        entries = []
        for mobj in re.finditer(
                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
                webpage):
            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
            if not MotherlessIE.suitable(video_url):
                continue
            video_id = MotherlessIE._match_id(video_url)
            title = mobj.group('title')
            entries.append(self.url_result(
                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
                video_title=title))
        # Alternative fallback
        if not entries:
            entries = [
                self.url_result(
                    compat_urlparse.urljoin(base, '/' + entry_id),
                    ie=MotherlessIE.ie_key(), video_id=entry_id)
                for entry_id in orderedSet(re.findall(
                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
        return entries

    def _real_extract(self, url):
        """Build a playlist result whose entries are fetched page by page."""
        group_id = self._match_id(url)
        page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
        webpage = self._download_webpage(page_url, group_id)
        title = self._search_regex(
            r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
        description = self._html_search_meta(
            'description', webpage, fatal=False)
        page_count_str = self._search_regex(
            r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
            webpage, 'page_count', default=None)
        if page_count_str:
            page_count = self._int(page_count_str, 'page_count')
        else:
            # No pagination markup matched (presumably a group that fits on
            # one page; confirm against live markup). Treat it as a single
            # page instead of failing the whole extraction.
            self.report_warning(
                'Unable to extract page count; assuming a single page',
                group_id)
            page_count = 1
        PAGE_SIZE = 80

        def _get_page(idx):
            # idx is zero-based, the site's "page" query parameter is one-based
            webpage = self._download_webpage(
                page_url, group_id, query={'page': idx + 1},
                note='Downloading page %d/%d' % (idx + 1, page_count)
            )
            for entry in self._extract_entries(webpage, url):
                yield entry

        playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)

        return {
            '_type': 'playlist',
            'id': group_id,
            'title': title,
            'description': description,
            'entries': playlist
        }