[utils] Decode HTML5 entities - youtube-dl - Another place where youtube-dl lives on

commit 55b2f099c0c820d6c4b46609b175a44a6d7f97bf
parent 9631a94fb5e5ee9b92135f938df00866535fc6c6
Author: Yen Chi Hsuan <yan12125@gmail.com>
Date:   Fri, 10 Jun 2016 15:11:55 +0800

[utils] Decode HTML5 entities

Used in test_Vporn_1. Also related to #9270

Diffstat:
M test/test_utils.py  | 2 ++
M youtube_dl/utils.py  | 12 ++++++++++--

2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -249,6 +249,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(unescapeHTML('&eacute;'), 'é')
         self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
+        # HTML5 entities
+        self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
 
     def test_date_from_str(self):
         self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
@@ -39,6 +39,7 @@ from .compat import (
     compat_chr,
     compat_etree_fromstring,
     compat_html_entities,
+    compat_html_entities_html5,
     compat_http_client,
     compat_kwargs,
     compat_parse_qs,
@@ -456,12 +457,19 @@ def orderedSet(iterable):
     return res
 
 
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity_with_semicolon):
     """Transforms an HTML entity to a character."""
+    entity = entity_with_semicolon[:-1]
+
     # Known non-numeric HTML entity
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
+    # TODO: HTML5 allows entities without a semicolon. For example,
+    # '&Eacuteric' should be decoded as 'Éric'.
+    if entity_with_semicolon in compat_html_entities_html5:
+        return compat_html_entities_html5[entity_with_semicolon]
+
     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
@@ -486,7 +494,7 @@ def unescapeHTML(s):
     assert type(s) == compat_str
 
     return re.sub(
-        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
+        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 
 
 def get_subprocess_encoding():

	youtube-dl Another place where youtube-dl lives on
	git clone git://git.oshgnacknak.de/youtube-dl.git
	Log | Files | Refs | README | LICENSE