youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git

motherless.py (8892B)


from __future__ import unicode_literals

import datetime
import re

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    ExtractorError,
    InAdvancePagedList,
    orderedSet,
    str_to_int,
    unified_strdate,
)

class MotherlessIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
    _TESTS = [{
        'url': 'http://motherless.com/AC3FFE1',
        'md5': '310f62e325a9fafe64f68c0bccb6e75f',
        'info_dict': {
            'id': 'AC3FFE1',
            'ext': 'mp4',
            'title': 'Fucked in the ass while playing PS3',
            'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
            'upload_date': '20100913',
            'uploader_id': 'famouslyfuckedup',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        }
    }, {
        'url': 'http://motherless.com/532291B',
        'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
        'info_dict': {
            'id': '532291B',
            'ext': 'mp4',
            'title': 'Amazing girl playing the omegle game, PERFECT!',
            'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
                           'game', 'hairy'],
            'upload_date': '20140622',
            'uploader_id': 'Sulivana7x',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        },
        'skip': '404',
    }, {
        'url': 'http://motherless.com/g/cosplay/633979F',
        'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
        'info_dict': {
            'id': '633979F',
            'ext': 'mp4',
            'title': 'Turtlette',
            'categories': ['superheroine heroine  superher'],
            'upload_date': '20140827',
            'uploader_id': 'shade0230',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        }
    }, {
        # no keywords
        'url': 'http://motherless.com/8B4BBC1',
        'only_matching': True,
    }, {
        # see https://motherless.com/videos/recent for recent videos with
        # uploaded date in "ago" format
        'url': 'https://motherless.com/3C3E2CF',
        'info_dict': {
            'id': '3C3E2CF',
            'ext': 'mp4',
            'title': 'a/ Hot Teens',
            'categories': list,
            'upload_date': '20210104',
            'uploader_id': 'yonbiw',
            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        if any(p in webpage for p in (
                '<title>404 - MOTHERLESS.COM<',
                ">The page you're looking for cannot be found.<")):
            raise ExtractorError('Video %s does not exist' % video_id, expected=True)

        if '>The content you are trying to view is for friends only.' in webpage:
            raise ExtractorError('Video %s is for friends only' % video_id, expected=True)

        title = self._html_search_regex(
            (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
             r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
        # Prefer the URL from the embedded player setup; fall back to the
        # predictable CDN URL derived from the video id.
        video_url = (self._html_search_regex(
            (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
             r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
            webpage, 'video URL', default=None, group='url')
            or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
        age_limit = self._rta_search(webpage)
        view_count = str_to_int(self._html_search_regex(
            (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
            webpage, 'view count', fatal=False))
        like_count = str_to_int(self._html_search_regex(
            (r'>([\d,.]+)\s+Favorites<',
             r'<strong>Favorited</strong>\s+([^<]+)<'),
            webpage, 'like count', fatal=False))

        upload_date = unified_strdate(self._search_regex(
            r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
            'upload date', default=None))
        if not upload_date:
            # Recent uploads only carry a relative timestamp such as "12h ago"
            # or "3d ago"; convert it to an absolute YYYYMMDD date.
            uploaded_ago = self._search_regex(
                r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
                default=None)
            if uploaded_ago:
                delta = int(uploaded_ago[:-1])
                _AGO_UNITS = {
                    'h': 'hours',
                    'd': 'days',
                }
                kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
                upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')

        comment_count = webpage.count('class="media-comment-contents"')
        uploader_id = self._html_search_regex(
            r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
            webpage, 'uploader_id')

        categories = self._html_search_meta('keywords', webpage, default=None)
        if categories:
            categories = [cat.strip() for cat in categories.split(',')]

        return {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': self._og_search_thumbnail(webpage),
            'categories': categories,
            'view_count': view_count,
            'like_count': like_count,
            'comment_count': comment_count,
            'age_limit': age_limit,
            'url': video_url,
        }


class MotherlessGroupIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
    _TESTS = [{
        'url': 'http://motherless.com/g/movie_scenes',
        'info_dict': {
            'id': 'movie_scenes',
            'title': 'Movie Scenes',
            'description': 'Hot and sexy scenes from "regular" movies... '
                           'Beautiful actresses fully nude... A looot of '
                           'skin! :)Enjoy!',
        },
        'playlist_mincount': 662,
    }, {
        'url': 'http://motherless.com/gv/sex_must_be_funny',
        'info_dict': {
            'id': 'sex_must_be_funny',
            'title': 'Sex must be funny',
            'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
                           'any kind!'
        },
        'playlist_mincount': 9,
    }]

    @classmethod
    def suitable(cls, url):
        # Single-video URLs of the form /g/<group>/<ID> also match this
        # pattern, so defer to MotherlessIE for anything it can handle.
        return (False if MotherlessIE.suitable(url)
                else super(MotherlessGroupIE, cls).suitable(url))

    def _extract_entries(self, webpage, base):
        entries = []
        for mobj in re.finditer(
                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
                webpage):
            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
            if not MotherlessIE.suitable(video_url):
                continue
            video_id = MotherlessIE._match_id(video_url)
            title = mobj.group('title')
            entries.append(self.url_result(
                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
                video_title=title))
        # Fallback: collect video ids exposed via data-codename attributes
        if not entries:
            entries = [
                self.url_result(
                    compat_urlparse.urljoin(base, '/' + entry_id),
                    ie=MotherlessIE.ie_key(), video_id=entry_id)
                for entry_id in orderedSet(re.findall(
                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
        return entries

    def _real_extract(self, url):
        group_id = self._match_id(url)
        page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
        webpage = self._download_webpage(page_url, group_id)
        title = self._search_regex(
            r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
        description = self._html_search_meta(
            'description', webpage, fatal=False)
        page_count = self._int(self._search_regex(
            r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
            webpage, 'page_count'), 'page_count')
        PAGE_SIZE = 80

        def _get_page(idx):
            webpage = self._download_webpage(
                page_url, group_id, query={'page': idx + 1},
                note='Downloading page %d/%d' % (idx + 1, page_count)
            )
            for entry in self._extract_entries(webpage, url):
                yield entry

        playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)

        return {
            '_type': 'playlist',
            'id': group_id,
            'title': title,
            'description': description,
            'entries': playlist
        }
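

For reference, a minimal usage sketch of how these extractors can be driven through youtube-dl's Python API. This assumes the youtube_dl package from this repository is importable and network access is available; the URLs are taken from the test cases above, and the options used are standard YoutubeDL options.

import youtube_dl

# Fetch metadata for a single video (MotherlessIE) without downloading the file.
with youtube_dl.YoutubeDL({'quiet': True, 'skip_download': True}) as ydl:
    info = ydl.extract_info('http://motherless.com/AC3FFE1', download=False)
    print(info['id'], info.get('upload_date'), info['title'])

# List a group (MotherlessGroupIE) as a flat playlist so the individual
# videos are not re-extracted one by one.
with youtube_dl.YoutubeDL({'quiet': True, 'extract_flat': 'in_playlist'}) as ydl:
    group = ydl.extract_info('http://motherless.com/g/movie_scenes', download=False)
    print(group['id'], group['title'], len(group['entries']))

The first call resolves MotherlessIE and returns the info dict built in its _real_extract; the second resolves MotherlessGroupIE and returns a playlist with one entry per video found by _extract_entries across the group's pages.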