[generic] Extract RSS video itunes metadata
author Sergey M․ <dstftw@gmail.com>
Sun, 6 Dec 2020 16:08:03 +0000 (23:08 +0700)
committer Sergey M․ <dstftw@gmail.com>
Sun, 6 Dec 2020 16:08:03 +0000 (23:08 +0700)
youtube_dl/extractor/generic.py

index d0e0d69194abc1873695fdb90cc0842b452e1211..d2ba0783991a2453e9a755a2ec8b4375b48df481 100644 (file)
@@ -20,12 +20,14 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     HEADRequest,
+    int_or_none,
     is_html,
     js_to_json,
     KNOWN_EXTENSIONS,
     merge_dicts,
     mimetype2ext,
     orderedSet,
+    parse_duration,
     sanitized_Request,
     smuggle_url,
     unescapeHTML,
@@ -33,7 +35,9 @@ from ..utils import (
     unified_timestamp,
     unsmuggle_url,
     UnsupportedError,
+    url_or_none,
     xpath_text,
+    xpath_with_ns,
 )
 from .commonprotocols import RtmpIE
 from .brightcove import (
@@ -206,10 +210,12 @@ class GenericIE(InfoExtractor):
             'playlist': [{
                 'info_dict': {
                     'ext': 'mov',
-                    'id': 'pdv_maddow_netcast_mov-12-03-2020-223726',
-                    'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726',
+                    'id': 'pdv_maddow_netcast_mov-12-04-2020-224335',
+                    'title': 're:MSNBC Rachel Maddow',
                     'description': 're:.*her unique approach to storytelling.*',
-                    'upload_date': '20201204',
+                    'timestamp': int,
+                    'upload_date': compat_str,
+                    'duration': float,
                 },
             }],
         },
@@ -2189,6 +2195,10 @@ class GenericIE(InfoExtractor):
         playlist_desc_el = doc.find('./channel/description')
         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
 
+        NS_MAP = {
+            'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+        }
+
         entries = []
         for it in doc.findall('./channel/item'):
             next_url = None
@@ -2204,6 +2214,20 @@ class GenericIE(InfoExtractor):
             if not next_url:
                 continue
 
+            def itunes(key):
+                return xpath_text(
+                    it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
+                    default=None)
+
+            duration = itunes('duration')
+            explicit = itunes('explicit')
+            if explicit == 'true':
+                age_limit = 18
+            elif explicit == 'false':
+                age_limit = 0
+            else:
+                age_limit = None
+
             entries.append({
                 '_type': 'url_transparent',
                 'url': next_url,
@@ -2211,6 +2235,12 @@ class GenericIE(InfoExtractor):
                 'description': xpath_text(it, 'description', default=None),
                 'timestamp': unified_timestamp(
                     xpath_text(it, 'pubDate', default=None)),
+                'duration': int_or_none(duration) or parse_duration(duration),
+                'thumbnail': url_or_none(itunes('image')),
+                'episode': itunes('title'),
+                'episode_number': int_or_none(itunes('episode')),
+                'season_number': int_or_none(itunes('season')),
+                'age_limit': age_limit,
             })
 
         return {