youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit 55b2f099c0c820d6c4b46609b175a44a6d7f97bf
parent 9631a94fb5e5ee9b92135f938df00866535fc6c6
Author: Yen Chi Hsuan <yan12125@gmail.com>
Date:   Fri, 10 Jun 2016 15:11:55 +0800

[utils] Decode HTML5 entities

Used in test_Vporn_1. Also related to #9270

Diffstat:
M test/test_utils.py   |  2 ++
M youtube_dl/utils.py  | 12 ++++++++++--
2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
@@ -249,6 +249,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(unescapeHTML('&eacute;'), 'é')
         self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
+        # HTML5 entities
+        self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
 
     def test_date_from_str(self):
         self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
@@ -39,6 +39,7 @@ from .compat import (
     compat_chr,
     compat_etree_fromstring,
     compat_html_entities,
+    compat_html_entities_html5,
     compat_http_client,
     compat_kwargs,
     compat_parse_qs,
@@ -456,12 +457,19 @@ def orderedSet(iterable):
     return res
 
 
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity_with_semicolon):
     """Transforms an HTML entity to a character."""
+    entity = entity_with_semicolon[:-1]
+
     # Known non-numeric HTML entity
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
+    # TODO: HTML5 allows entities without a semicolon. For example,
+    # '&Eacuteric' should be decoded as 'Éric'.
+    if entity_with_semicolon in compat_html_entities_html5:
+        return compat_html_entities_html5[entity_with_semicolon]
+
     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
@@ -486,7 +494,7 @@ def unescapeHTML(s):
     assert type(s) == compat_str
 
     return re.sub(
-        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
+        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 
 
 def get_subprocess_encoding():