youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit 0fe2ff78e68ec03d56bf3d9434eb612ffb683977
parent dc1eed93be7ac2afb3f52237ae0034d24715b4bd
Author: Yen Chi Hsuan <yan12125@gmail.com>
Date:   Mon,  4 May 2015 21:53:05 +0800

[NBC] Enhance embedURL extraction (closes #2549)

Diffstat:
M test/test_utils.py | 5 +++++
M youtube_dl/extractor/nbc.py | 11 +++++++++--
M youtube_dl/utils.py | 8 ++++++++
3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
@@ -53,6 +53,7 @@ from youtube_dl.utils import (
     unified_strdate,
     unsmuggle_url,
     uppercase_escape,
+    lowercase_escape,
     url_basename,
     urlencode_postdata,
     version_tuple,
@@ -418,6 +419,10 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(uppercase_escape('aä'), 'aä')
         self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
 
+    def test_lowercase_escape(self):
+        self.assertEqual(lowercase_escape('aä'), 'aä')
+        self.assertEqual(lowercase_escape('\\u0026'), '&')
+
     def test_limit_length(self):
         self.assertEqual(limit_length(None, 12), None)
         self.assertEqual(limit_length('foo', 12), 'foo')
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
@@ -10,6 +10,8 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     find_xpath_attr,
+    lowercase_escape,
+    unescapeHTML,
 )
 
 
@@ -46,18 +48,23 @@ class NBCIE(InfoExtractor):
                 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
             },
             'skip': 'Only works from US',
+        },
+        {
+            # This video has expired but with an escaped embedURL
+            'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
+            'skip': 'Expired'
         }
     ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        theplatform_url = self._search_regex(
+        theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
             [
                 r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
                 r'"embedURL"\s*:\s*"([^"]+)"'
             ],
-            webpage, 'theplatform url').replace('_no_endcard', '')
+            webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
         if theplatform_url.startswith('//'):
             theplatform_url = 'http:' + theplatform_url
         return self.url_result(theplatform_url)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
@@ -1486,6 +1486,14 @@ def uppercase_escape(s):
         s)
 
 
+def lowercase_escape(s):
+    unicode_escape = codecs.getdecoder('unicode_escape')
+    return re.sub(
+        r'\\u[0-9a-fA-F]{4}',
+        lambda m: unicode_escape(m.group(0))[0],
+        s)
+
+
 def escape_rfc3986(s):
     """Escape non-ASCII characters as suggested by RFC 3986"""
     if sys.version_info < (3, 0) and isinstance(s, compat_str):