youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit 83548824c29ccdf53a4659260aa3898939833882
parent 354dbbd8808dc5e835c7042f84c175eb56e0bcfc
Author: remitamine <remitamine@gmail.com>
Date:   Wed, 16 Mar 2016 13:16:27 +0100

Merge pull request #8092 from bpfoley/twitter-thumbnail

[utils] Add extract_attributes for extracting html tag attributes
Diffstat:
M test/test_utils.py | 40 ++++++++++++++++++++++++++++++++++++++++
M youtube_dl/compat.py | 6 ++++++
M youtube_dl/utils.py | 30 ++++++++++++++++++++++++++++++
3 files changed, 76 insertions(+), 0 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py @@ -28,6 +28,7 @@ from youtube_dl.utils import ( encodeFilename, escape_rfc3986, escape_url, + extract_attributes, ExtractorError, find_xpath_attr, fix_xml_ampersands, @@ -77,6 +78,7 @@ from youtube_dl.utils import ( cli_bool_option, ) from youtube_dl.compat import ( + compat_chr, compat_etree_fromstring, compat_urlparse, compat_parse_qs, @@ -629,6 +631,44 @@ class TestUtil(unittest.TestCase): on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + def test_extract_attributes(self): + self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'}) + self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'}) # XML + self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'}) + self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'}) + self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"}) + self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'}) + self.assertEqual(extract_attributes('<e x >'), {'x': None}) + self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'}) + self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'}) + 
self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'}) + self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'}) + self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'}) + self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'}) + self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'}) + self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + compat_chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py @@ -77,6 +77,11 @@ try: except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +try: + from html.parser import HTMLParser as compat_HTMLParser +except ImportError: # Python 2 + from HTMLParser import HTMLParser as compat_HTMLParser + try: from subprocess import DEVNULL @@ -543,6 +548,7 @@ else: from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ + 'compat_HTMLParser', 'compat_HTTPError', 'compat_basestring', 'compat_chr', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParser, compat_basestring, compat_chr, compat_etree_fromstring, @@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to 
gather the attributes for a single element""" + def __init__(self): + self.attrs = { } + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + +def extract_attributes(html_element): + """Given a string for an HTML element such as + <el + a="foo" B="bar" c="&98;az" d=boz + empty= noval entity="&amp;" + sq='"' dq="'" + > + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. + """ + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs def clean_html(html): """Clean an HTML snippet into a readable string"""