[xhamster] Extract formats from xplayer settings and extract filesizes (closes #28114)
authorSergey M․ <dstftw@gmail.com>
Mon, 8 Feb 2021 08:46:32 +0000 (15:46 +0700)
committerSergey M․ <dstftw@gmail.com>
Mon, 8 Feb 2021 08:47:12 +0000 (15:47 +0700)
youtube_dl/extractor/xhamster.py

index 76aeaf9a46a6f67a054bfbc3313b8e6d4309a7f2..f73b9778f6a9ffca618825b57fbf365955dc0ccb 100644 (file)
@@ -11,11 +11,14 @@ from ..utils import (
     dict_get,
     extract_attributes,
     ExtractorError,
+    float_or_none,
     int_or_none,
     parse_duration,
+    str_or_none,
     try_get,
     unified_strdate,
     url_or_none,
+    urljoin,
 )
 
 
@@ -146,36 +149,89 @@ class XHamsterIE(InfoExtractor):
             video = initials['videoModel']
             title = video['title']
             formats = []
-            for format_id, formats_dict in video['sources'].items():
+            format_urls = set()
+            format_sizes = {}
+            sources = try_get(video, lambda x: x['sources'], dict) or {}
+            for format_id, formats_dict in sources.items():
                 if not isinstance(formats_dict, dict):
                     continue
+                download_sources = try_get(sources, lambda x: x['download'], dict) or {}
+                for quality, format_dict in download_sources.items():
+                    if not isinstance(format_dict, dict):
+                        continue
+                    format_sizes[quality] = float_or_none(format_dict.get('size'))
                 for quality, format_item in formats_dict.items():
                     if format_id == 'download':
                         # Download link takes some time to be generated,
                         # skipping for now
                         continue
-                        if not isinstance(format_item, dict):
-                            continue
-                        format_url = format_item.get('link')
-                        filesize = int_or_none(
-                            format_item.get('size'), invscale=1000000)
-                    else:
-                        format_url = format_item
-                        filesize = None
+                    format_url = format_item
                     format_url = url_or_none(format_url)
-                    if not format_url:
+                    if not format_url or format_url in format_urls:
                         continue
+                    format_urls.add(format_url)
                     formats.append({
                         'format_id': '%s-%s' % (format_id, quality),
                         'url': format_url,
                         'ext': determine_ext(format_url, 'mp4'),
                         'height': get_height(quality),
-                        'filesize': filesize,
+                        'filesize': format_sizes.get(quality),
                         'http_headers': {
                             'Referer': urlh.geturl(),
                         },
                     })
-            self._sort_formats(formats)
+            xplayer_sources = try_get(
+                initials, lambda x: x['xplayerSettings']['sources'], dict)
+            if xplayer_sources:
+                hls_sources = xplayer_sources.get('hls')
+                if isinstance(hls_sources, dict):
+                    for hls_format_key in ('url', 'fallback'):
+                        hls_url = hls_sources.get(hls_format_key)
+                        if not hls_url:
+                            continue
+                        hls_url = urljoin(url, hls_url)
+                        if not hls_url or hls_url in format_urls:
+                            continue
+                        format_urls.add(hls_url)
+                        formats.extend(self._extract_m3u8_formats(
+                            hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                            m3u8_id='hls', fatal=False))
+                standard_sources = xplayer_sources.get('standard')
+                if isinstance(standard_sources, dict):
+                    for format_id, formats_list in standard_sources.items():
+                        if not isinstance(formats_list, list):
+                            continue
+                        for standard_format in formats_list:
+                            if not isinstance(standard_format, dict):
+                                continue
+                            for standard_format_key in ('url', 'fallback'):
+                                standard_url = standard_format.get(standard_format_key)
+                                if not standard_url:
+                                    continue
+                                standard_url = urljoin(url, standard_url)
+                                if not standard_url or standard_url in format_urls:
+                                    continue
+                                format_urls.add(standard_url)
+                                ext = determine_ext(standard_url, 'mp4')
+                                if ext == 'm3u8':
+                                    formats.extend(self._extract_m3u8_formats(
+                                        standard_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                                        m3u8_id='hls', fatal=False))
+                                    continue
+                                quality = (str_or_none(standard_format.get('quality'))
+                                           or str_or_none(standard_format.get('label'))
+                                           or '')
+                                formats.append({
+                                    'format_id': '%s-%s' % (format_id, quality),
+                                    'url': standard_url,
+                                    'ext': ext,
+                                    'height': get_height(quality),
+                                    'filesize': format_sizes.get(quality),
+                                    'http_headers': {
+                                        'Referer': standard_url,
+                                    },
+                                })
+            self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
 
             categories_list = video.get('categories')
             if isinstance(categories_list, list):