From f66372403fd9e1661199fea100ba2600fa9697b2 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 21 Feb 2024 00:09:48 +0000 Subject: [PATCH] [InfoExtractor] Rework and improve JWPlayer extraction * use traverse_obj() and _search_json() * support playlist `.load({**video1},{**video2}, ...)` * support transform_source=... for _extract_jwplayer_data() --- youtube_dl/extractor/common.py | 55 ++++++++++++++-------------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b5e95a318..7fae9e57b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -3021,25 +3021,22 @@ class InfoExtractor(object): return formats def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): - mobj = re.search( - r'''(?s)jwplayer\s*\(\s*(?P'|")(?!(?P=q)).+(?P=q)\s*\)(?!).*?\.\s*setup\s*\(\s*(?P(?:\([^)]*\)|[^)])+)\s*\)''', - webpage) - if mobj: - try: - jwplayer_data = self._parse_json(mobj.group('options'), - video_id=video_id, - transform_source=transform_source) - except ExtractorError: - pass - else: - if isinstance(jwplayer_data, dict): - return jwplayer_data + return self._search_json( + r'''(?'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!).)*?\.\s*(?:setup\s*\(|(?Pload)\s*\(\s*\[)''', + webpage, 'JWPlayer data', video_id, + # must be a {...} or sequence, ending + contains_pattern=r'\{[\s\S]*}(?(load)(?:\s*,\s*\{[\s\S]*})*)', end_pattern=r'(?(load)\]|\))', + transform_source=transform_source, default=None) def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) + + # allow passing `transform_source` through to _find_jwplayer_data() + transform_source = kwargs.pop('transform_source', None) + kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {} + + jwplayer_data = self._find_jwplayer_data(webpage, video_id, **kwfind) + + return self._parse_jwplayer_data(jwplayer_data, video_id, *args, **kwargs) def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): @@ -3073,22 +3070,14 @@ class InfoExtractor(object): mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): - continue - if track_kind.lower() not in ('captions', 'subtitles'): - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) + for track in traverse_obj(video_data, ( + 'tracks', lambda _, t: t.get('kind').lower() in ('captions', 'subtitles'))): + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) entry = { 'id': this_video_id, -- 2.22.2