Deletes parse_xml from utils, because it also does it.
self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
def test_compat_etree_fromstring(self):
- xml = '<el foo="bar" spam="中文"></el>'
+ xml = '''
+ <root foo="bar" spam="中文">
+ <normal>foo</normal>
+ <chinese>中文</chinese>
+ <foo><bar>spam</bar></foo>
+ </root>
+ '''
doc = compat_etree_fromstring(xml.encode('utf-8'))
self.assertTrue(isinstance(doc.attrib['foo'], compat_str))
self.assertTrue(isinstance(doc.attrib['spam'], compat_str))
+ self.assertTrue(isinstance(doc.find('normal').text, compat_str))
+ self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
+ self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
if __name__ == '__main__':
unittest.main()
if sys.version_info[0] >= 3:
compat_etree_fromstring = xml.etree.ElementTree.fromstring
else:
- # on python 2.x the the attributes of a node aren't always unicode objects
+ # on python 2.x the attributes and text of a node aren't always unicode
+ # objects
etree = xml.etree.ElementTree
+ try:
+ _etree_iter = etree.Element.iter
+ except AttributeError: # Python <=2.6
+ def _etree_iter(root):
+ for el in root.findall('*'):
+ yield el
+ for sub in _etree_iter(el):
+ yield sub
+
# on 2.6 XML doesn't have a parser argument, function copied from CPython
# 2.7 source
def _XML(text, parser=None):
return el
def compat_etree_fromstring(text):
- return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+ doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+ for el in _etree_iter(doc):
+ if el.text is not None and isinstance(el.text, bytes):
+ el.text = el.text.decode('utf-8')
+ return doc
try:
from urllib.parse import parse_qs as compat_parse_qs
parse_duration,
unified_strdate,
xpath_text,
- parse_xml,
)
+from ..compat import compat_etree_fromstring
class ARDMediathekIE(InfoExtractor):
raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
if re.search(r'[\?&]rss($|[=&])', url):
- doc = parse_xml(webpage)
+ doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return GenericIE()._extract_rss(url, video_id, doc)
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
+ compat_etree_fromstring,
compat_urllib_parse_unquote,
compat_urllib_request,
compat_urlparse,
HEADRequest,
is_html,
orderedSet,
- parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
# Is it an RSS feed, a SMIL file or a XSPF playlist?
try:
- doc = parse_xml(webpage)
+ doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
-try:
- etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError: # Python <=2.6
- etree_iter = lambda n: n.findall('.//*')
-
-
-def parse_xml(s):
- class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
- def doctype(self, name, pubid, system):
- pass # Ignore doctypes
-
- parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
- kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
- tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
- # Fix up XML parser in Python 2.x
- if sys.version_info < (3, 0):
- for n in etree_iter(tree):
- if n.text is not None:
- if not isinstance(n.text, compat_str):
- n.text = n.text.decode('utf-8')
- return tree
-
-
US_RATINGS = {
'G': 0,
'PG': 10,