From 2fe1b5bd2add12d70717878704cd3f811af5d22c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 21 Apr 2015 03:18:38 +0800 Subject: [PATCH] [CSpan] Add detection for Senate ISVP. Closes #5302 --- youtube_dl/extractor/cspan.py | 18 +++++++++++++++++- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/senateisvp.py | 20 ++++++++++++++++++-- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 955119d40..7377ac7b9 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -7,7 +7,9 @@ from ..utils import ( int_or_none, unescapeHTML, find_xpath_attr, + smuggle_url, ) +from .senateisvp import SenateISVPIE class CSpanIE(InfoExtractor): @@ -40,6 +42,15 @@ class CSpanIE(InfoExtractor): 'title': 'General Motors Ignition Switch Recall', }, 'playlist_duration_sum': 14855, + }, { + # Video from senate.gov + 'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers', + 'md5': '7314c4b96dad66dd8e63dc3518ceaa6f', + 'info_dict': { + 'id': 'judiciary031715', + 'ext': 'flv', + 'title': 'Immigration Reforms Needed to Protect Skilled American Workers', + } }] def _real_extract(self, url): @@ -56,7 +67,7 @@ class CSpanIE(InfoExtractor): # present, otherwise this is a stripped version r'

(.*?)

' ], - webpage, 'description', flags=re.DOTALL) + webpage, 'description', flags=re.DOTALL, default=None) info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id data = self._download_json(info_url, video_id) @@ -68,6 +79,11 @@ class CSpanIE(InfoExtractor): title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) + files = data['video']['files'] entries = [{ diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e645d1bb3..ec4d0c210 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -35,6 +35,7 @@ from .rutv import RUTVIE from .smotri import SmotriIE from .condenast import CondeNastIE from .udn import UDNEmbedIE +from .senateisvp import SenateISVPIE class GenericIE(InfoExtractor): @@ -1365,6 +1366,11 @@ class GenericIE(InfoExtractor): return self.url_result( compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') + # Look for Senate ISVP iframe + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + return self.url_result(surl, 'SenateISVP') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index a93874cad..23e1cd944 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + unsmuggle_url, +) from ..compat import ( compat_parse_qs, compat_urlparse, @@ -73,12 +76,22 @@ class SenateISVPIE(InfoExtractor): } }] + @staticmethod + def _search_iframe_url(webpage): + mobj = re.search( + r"]+src=['\"](?Phttp://www\.senate\.gov/isvp/\?[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + def _get_info_for_comm(self, committee): for entry in self._COMM_MAP: if entry[0] == committee: return entry[1:] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs')) if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): raise ExtractorError('Invalid URL', expected=True) @@ -87,7 +100,10 @@ class SenateISVPIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'([^<]+)', webpage, video_id) + if smuggled_data.get('force_title'): + title = smuggled_data['force_title'] + else: + title = self._html_search_regex(r'([^<]+)', webpage, video_id) poster = qs.get('poster') if poster: thumbnail = poster[0] -- 2.22.2