self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x="&">'), {'x': '&'}) # XML
self.assertEqual(extract_attributes('<e x=""">'), {'x': '"'})
- self.assertEqual(extract_attributes('<e x="£">'), {'x': '£'}) # HTML 3.2
- self.assertEqual(extract_attributes('<e x="λ">'), {'x': 'λ'}) # HTML 4.0
+ self.assertEqual(extract_attributes('<e x="£">'), {'x': '£'}) # HTML 3.2
+ self.assertEqual(extract_attributes('<e x="λ">'), {'x': 'λ'}) # HTML 4.0
self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
- self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased
+ self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased
self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
return unescapeHTML(res)
+
class HTMLAttributeParser(compat_HTMLParser):
"""Trivial HTML parser to gather the attributes for a single element"""
def __init__(self):
- self.attrs = { }
+ self.attrs = {}
compat_HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
self.attrs = dict(attrs)
+
def extract_attributes(html_element):
"""Given a string for an HTML element such as
<el
parser.close()
return parser.attrs
+
def clean_html(html):
"""Clean an HTML snippet into a readable string"""