encodeFilename,
escape_rfc3986,
escape_url,
+ extract_attributes,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
cli_bool_option,
)
from youtube_dl.compat import (
+ compat_chr,
compat_etree_fromstring,
)
on = js_to_json('{"abc": "def",}')
self.assertEqual(json.loads(on), {'abc': 'def'})
def test_extract_attributes(self):
    # Quoting styles: double-quoted, single-quoted and unquoted values.
    self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
    self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
    self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})

    # Character references must be decoded: numeric (decimal and hex) and
    # named. An unknown reference such as "&foo" is passed through verbatim.
    self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML
    self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
    self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'})  # HTML 3.2
    self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'})  # HTML 4.0
    self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
    self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
    self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})

    # Attributes without a value map to None; whitespace after "=" is
    # skipped; a repeated name keeps its last value.
    self.assertEqual(extract_attributes('<e x >'), {'x': None})
    self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
    self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})

    # Newlines between tokens are plain whitespace, but are preserved
    # inside a quoted value.
    self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
    self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
    self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
    self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})

    # Attribute names are case-insensitive and reported in lower case.
    self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'})  # Names lowercased
    self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
    self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
    self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})

    # Non-ASCII text, including a combining accent given as a numeric
    # reference (it must stay decomposed in the result).
    self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
    self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})

    # "Narrow" Python builds don't support unicode code points outside BMP.
    try:
        compat_chr(0x10000)
        supports_outside_bmp = True
    except ValueError:
        supports_outside_bmp = False
    if supports_outside_bmp:
        self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
+
def test_clean_html(self):
self.assertEqual(clean_html('a:\nb'), 'a: b')
self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
import zlib
from .compat import (
+ compat_HTMLParser,
compat_basestring,
compat_chr,
compat_etree_fromstring,
return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element.

    Feed it the markup of one start tag; afterwards ``attrs`` holds a dict
    mapping each attribute name to its value as reported by the underlying
    HTML parser. If more than one start tag is fed, the attributes of the
    last one replace those of the earlier ones.
    """

    def __init__(self):
        # Start with an empty result so ``attrs`` is always a dict, even
        # when the input contains no start tag at all.
        self.attrs = {}
        # Explicit base-class call rather than super() -- presumably kept
        # for compatibility with the older Python versions the parser
        # alias targets.
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Record the attributes of the start tag that was just parsed.

        ``attrs`` arrives as a list of (name, value) pairs; converting it
        to a dict means a repeated attribute name keeps its last value.
        """
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty="" noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    Attribute names are returned in lower case, character references in
    values are decoded, and an attribute given without a value maps to None.
    If a name occurs more than once, the last occurrence wins.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.attrs
def clean_html(html):
"""Clean an HTML snippet into a readable string"""