[tv2] improve MTV Uutiset Article extraction
author Remita Amine <remitamine@gmail.com>
Mon, 25 Jan 2021 13:46:04 +0000 (14:46 +0100)
committer Remita Amine <remitamine@gmail.com>
Mon, 25 Jan 2021 13:46:04 +0000 (14:46 +0100)
youtube_dl/extractor/extractors.py
youtube_dl/extractor/tv2.py

index 2331b0e1501afa0438f389dee88041e33cb2e602..c554a8504ecc9a3095dc0bf1d7c0cac3510ee199 100644 (file)
@@ -1260,7 +1260,7 @@ from .tv2 import (
     TV2IE,
     TV2ArticleIE,
     KatsomoIE,
-    MTVuutisetIE,
+    MTVUutisetArticleIE,
 )
 from .tv2dk import (
     TV2DKIE,
index 42a9af126db5917364ada129dafcdb8661d826e6..334b7d540499e047f14381f879abf9a443e3c999 100644 (file)
@@ -20,7 +20,7 @@ from ..utils import (
 
 class TV2IE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.tv2.no/v/916509/',
         'info_dict': {
             'id': '916509',
@@ -33,7 +33,7 @@ class TV2IE(InfoExtractor):
             'view_count': int,
             'categories': list,
         },
-    }
+    }]
     _API_DOMAIN = 'sumo.tv2.no'
     _PROTOCOLS = ('HDS', 'HLS', 'DASH')
     _GEO_COUNTRIES = ['NO']
@@ -42,6 +42,12 @@ class TV2IE(InfoExtractor):
         video_id = self._match_id(url)
         api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id)
 
+        asset = self._download_json(
+            api_base + '.json', video_id,
+            'Downloading metadata JSON')['asset']
+        title = asset.get('subtitle') or asset['title']
+        is_live = asset.get('live') is True
+
         formats = []
         format_urls = []
         for protocol in self._PROTOCOLS:
@@ -81,7 +87,8 @@ class TV2IE(InfoExtractor):
                 elif ext == 'm3u8':
                     if not data.get('drmProtected'):
                         formats.extend(self._extract_m3u8_formats(
-                            video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                            video_url, video_id, 'mp4',
+                            'm3u8' if is_live else 'm3u8_native',
                             m3u8_id=format_id, fatal=False))
                 elif ext == 'mpd':
                     formats.extend(self._extract_mpd_formats(
@@ -99,11 +106,6 @@ class TV2IE(InfoExtractor):
             raise ExtractorError('This video is DRM protected.', expected=True)
         self._sort_formats(formats)
 
-        asset = self._download_json(
-            api_base + '.json', video_id,
-            'Downloading metadata JSON')['asset']
-        title = asset['title']
-
         thumbnails = [{
             'id': thumbnail.get('@type'),
             'url': thumbnail.get('url'),
@@ -112,7 +114,7 @@ class TV2IE(InfoExtractor):
         return {
             'id': video_id,
             'url': video_url,
-            'title': title,
+            'title': self._live_title(title) if is_live else title,
             'description': strip_or_none(asset.get('description')),
             'thumbnails': thumbnails,
             'timestamp': parse_iso8601(asset.get('createTime')),
@@ -120,6 +122,7 @@ class TV2IE(InfoExtractor):
             'view_count': int_or_none(asset.get('views')),
             'categories': asset.get('keywords', '').split(','),
             'formats': formats,
+            'is_live': is_live,
         }
 
 
@@ -168,13 +171,13 @@ class TV2ArticleIE(InfoExtractor):
 
 
 class KatsomoIE(TV2IE):
-    _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)'
+    _TESTS = [{
         'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321',
         'info_dict': {
             'id': '1181321',
             'ext': 'mp4',
-            'title': 'MTV Uutiset Live',
+            'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle',
             'description': 'Päätöksen teki Pelicansin hallitus.',
             'timestamp': 1575116484,
             'upload_date': '20191130',
@@ -186,20 +189,29 @@ class KatsomoIE(TV2IE):
             # m3u8 download
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mtvuutiset.fi/video/prog1311159',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.katsomo.fi/#!/jakso/1311159',
+        'only_matching': True,
+    }]
     _API_DOMAIN = 'api.katsomo.fi'
     _PROTOCOLS = ('HLS', 'MPD')
     _GEO_COUNTRIES = ['FI']
 
 
-class MTVuutisetIE(KatsomoIE):
-    _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/(?:artikkeli/[0-9a-z-]+/|video/prog)(?P<id>\d+)'
-    _TEST = {
+class MTVUutisetArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)'
+    _TESTS = [{
         'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384',
         'info_dict': {
             'id': '1311159',
             'ext': 'mp4',
-            'title': 'MTV Uutiset Live',
+            'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla',
             'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla',
             'timestamp': 1600608966,
             'upload_date': '20200920',
@@ -211,11 +223,26 @@ class MTVuutisetIE(KatsomoIE):
             # m3u8 download
             'skip_download': True,
         },
-    }
+    }, {
+        # multiple Youtube embeds
+        'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        art_id = self._match_id(url)
-        webpage = self._download_webpage(url, art_id)
-        video_id = self._html_search_regex(
-            r'<div class=\'player-container\' .*data-katsomoid="(.+?)"', webpage, 'video_id')
-        return self.url_result("http://mtv.fi/a/0/a/%s" % video_id, video_id=video_id, ie="Katsomo")
+        article_id = self._match_id(url)
+        article = self._download_json(
+            'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id,
+            article_id)
+
+        def entries():
+            for video in (article.get('videos') or []):
+                video_type = video.get('videotype')
+                video_url = video.get('url')
+                if not (video_url and video_type in ('katsomo', 'youtube')):
+                    continue
+                yield self.url_result(
+                    video_url, video_type.capitalize(), video.get('video_id'))
+
+        return self.playlist_result(
+            entries(), article_id, article.get('title'), article.get('description'))