[compat] compat_etree_fromstring: also decode the text attribute

author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Mon, 26 Oct 2015 15:41:24 +0000 (16:41 +0100)

committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Mon, 26 Oct 2015 15:41:24 +0000 (16:41 +0100)
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Mon, 26 Oct 2015 15:41:24 +0000 (16:41 +0100)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Mon, 26 Oct 2015 15:41:24 +0000 (16:41 +0100)
diff --git a/test/test_compat.py b/test/test_compat.py

index 834f4bc55233765f653ee703ff6ea811edf3065e..b6bfad05e3c85c07854cc00c337a12caf493e849 100644 (file)
--- a/test/test_compat.py
+++ b/test/test_compat.py
@@ -74,10 +74,19 @@ class TestCompat(unittest.TestCase):
          self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
  
      def test_compat_etree_fromstring(self):
-        xml = '<el foo="bar" spam="中文"></el>'
+        xml = '''
+            <root foo="bar" spam="中文">
+                <normal>foo</normal>
+                <chinese>中文</chinese>
+                <foo><bar>spam</bar></foo>
+            </root>
+        '''
          doc = compat_etree_fromstring(xml.encode('utf-8'))
          self.assertTrue(isinstance(doc.attrib['foo'], compat_str))
          self.assertTrue(isinstance(doc.attrib['spam'], compat_str))
+        self.assertTrue(isinstance(doc.find('normal').text, compat_str))
+        self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
+        self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py

index f39d4e9a9ee1e35e45be35ae122778a0910e3238..2d43ec852ca2cfa41cf702fb7aac28fbee11bec6 100644 (file)
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -216,9 +216,19 @@ except ImportError:  # Python 2.6
  if sys.version_info[0] >= 3:
      compat_etree_fromstring = xml.etree.ElementTree.fromstring
  else:
-    # on python 2.x the the attributes of a node aren't always unicode objects
+    # on python 2.x the attributes and text of a node aren't always unicode
+    # objects
      etree = xml.etree.ElementTree
  
+    try:
+        _etree_iter = etree.Element.iter
+    except AttributeError:  # Python <=2.6
+        def _etree_iter(root):
+            for el in root.findall('*'):
+                yield el
+                for sub in _etree_iter(el):
+                    yield sub
+
      # on 2.6 XML doesn't have a parser argument, function copied from CPython
      # 2.7 source
      def _XML(text, parser=None):
@@ -235,7 +245,11 @@ else:
          return el
  
      def compat_etree_fromstring(text):
-        return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+        doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+        for el in _etree_iter(doc):
+            if el.text is not None and isinstance(el.text, bytes):
+                el.text = el.text.decode('utf-8')
+        return doc
  
  try:
      from urllib.parse import parse_qs as compat_parse_qs
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py

index 6f465789b497a6625776c383ff699a64b0b5c346..73be6d2040b7197a94939ddbe5f3d7f81a92b750 100644 (file)
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -14,8 +14,8 @@ from ..utils import (
      parse_duration,
      unified_strdate,
      xpath_text,
-    parse_xml,
  )
+from ..compat import compat_etree_fromstring
  
  
  class ARDMediathekIE(InfoExtractor):
@@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor):
              raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
  
          if re.search(r'[\?&]rss($|[=&])', url):
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
              if doc.tag == 'rss':
                  return GenericIE()._extract_rss(url, video_id, doc)
  
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index ca5fbafb2606ff096025d8f6a86bff598410d4d0..1de96b268c959490b17c966b7b334b8ab22b7fae 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -9,6 +9,7 @@ import sys
  from .common import InfoExtractor
  from .youtube import YoutubeIE
  from ..compat import (
+    compat_etree_fromstring,
      compat_urllib_parse_unquote,
      compat_urllib_request,
      compat_urlparse,
@@ -21,7 +22,6 @@ from ..utils import (
      HEADRequest,
      is_html,
      orderedSet,
-    parse_xml,
      smuggle_url,
      unescapeHTML,
      unified_strdate,
@@ -1237,7 +1237,7 @@ class GenericIE(InfoExtractor):
  
          # Is it an RSS feed, a SMIL file or a XSPF playlist?
          try:
-            doc = parse_xml(webpage)
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
              if doc.tag == 'rss':
                  return self._extract_rss(url, video_id, doc)
              elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 7d846d68085cdbf13e88f116b30b0ad9f2f21b0c..c761ea22a4580216bb2484bd08ec7e8de1d724e4 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'):
      return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
  
  
-try:
-    etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError:  # Python <=2.6
-    etree_iter = lambda n: n.findall('.//*')
-
-
-def parse_xml(s):
-    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
-        def doctype(self, name, pubid, system):
-            pass  # Ignore doctypes
-
-    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
-    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
-    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
-    # Fix up XML parser in Python 2.x
-    if sys.version_info < (3, 0):
-        for n in etree_iter(tree):
-            if n.text is not None:
-                if not isinstance(n.text, compat_str):
-                    n.text = n.text.decode('utf-8')
-    return tree
-
-
  US_RATINGS = {
      'G': 0,
      'PG': 10,
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Mon, 26 Oct 2015 15:41:24 +0000 (16:41 +0100)
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Mon, 26 Oct 2015 15:41:24 +0000 (16:41 +0100)
test/test_compat.py		patch | blob | history
youtube_dl/compat.py		patch | blob | history
youtube_dl/extractor/ard.py		patch | blob | history
youtube_dl/extractor/generic.py		patch | blob | history
youtube_dl/utils.py		patch | blob | history