[motherless] Fix review issues and improve extraction (closes #26495, closes #27450)
author Sergey M․ <dstftw@gmail.com>
Tue, 5 Jan 2021 00:40:06 +0000 (07:40 +0700)
committer Sergey M․ <dstftw@gmail.com>
Tue, 5 Jan 2021 00:40:30 +0000 (07:40 +0700)
youtube_dl/extractor/motherless.py

index 6cc36b308fbe1d66d4c191dff468a8b70398e46e..ef1e081f20e7359139dc209275f29c162311d9dc 100644 (file)
@@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor):
         # no keywords
         'url': 'http://motherless.com/8B4BBC1',
         'only_matching': True,
+    }, {
+        # see https://motherless.com/videos/recent for recent videos with
+        # uploaded date in "ago" format
+        'url': 'https://motherless.com/3C3E2CF',
+        'info_dict': {
+            'id': '3C3E2CF',
+            'ext': 'mp4',
+            'title': 'a/ Hot Teens',
+            'categories': list,
+            'upload_date': '20210104',
+            'uploader_id': 'yonbiw',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -85,29 +102,28 @@ class MotherlessIE(InfoExtractor):
             or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
         age_limit = self._rta_search(webpage)
         view_count = str_to_int(self._html_search_regex(
-            (r'>([\d,.]+)\s+Views<',  # 1,234,567 Views
-             r'<strong>Views</strong>\s+([^<]+)<'),
+            (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
             webpage, 'view count', fatal=False))
         like_count = str_to_int(self._html_search_regex(
-            (r'>([\d,.]+)\s+Favorites<',  # 1,234 Favorites
+            (r'>([\d,.]+)\s+Favorites<',
              r'<strong>Favorited</strong>\s+([^<]+)<'),
             webpage, 'like count', fatal=False))
 
-        upload_date = self._html_search_regex(
-            (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<',
-             r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<',  # 20h/1d ago
-             r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date')
-        relative = re.match(r'(\d+)([hd])$', upload_date)
-        if relative:
-            delta = int(relative.group(1))
-            unit = relative.group(2)
-            if unit == 'h':
-                delta_t = datetime.timedelta(hours=delta)
-            else:  # unit == 'd'
-                delta_t = datetime.timedelta(days=delta)
-            upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d')
-        else:
-            upload_date = unified_strdate(upload_date)
+        upload_date = unified_strdate(self._search_regex(
+            r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
+            'upload date', default=None))
+        if not upload_date:
+            uploaded_ago = self._search_regex(
+                r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
+                default=None)
+            if uploaded_ago:
+                delta = int(uploaded_ago[:-1])
+                _AGO_UNITS = {
+                    'h': 'hours',
+                    'd': 'days',
+                }
+                kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
+                upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
 
         comment_count = webpage.count('class="media-comment-contents"')
         uploader_id = self._html_search_regex(