[generic] Suppress warning about doctypes in RSS parser
author Philipp Hagemeister <phihag@phihag.de>
Mon, 10 Mar 2014 16:31:32 +0000 (17:31 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 10 Mar 2014 16:31:32 +0000 (17:31 +0100)
youtube_dl/extractor/generic.py
youtube_dl/utils.py

index 7a2e5dee0469e4e03ea0f97c7657f44c104cb266..7666cf2078177db0682aae81b01b4253362efdfd 100644 (file)
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 
 import os
 import re
-import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from .youtube import YoutubeIE
@@ -17,6 +16,7 @@ from ..utils import (
 
     ExtractorError,
     HEADRequest,
+    parse_xml,
     smuggle_url,
     unescapeHTML,
     unified_strdate,
@@ -274,7 +274,7 @@ class GenericIE(InfoExtractor):
 
         # Is it an RSS feed?
         try:
-            doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
+            doc = parse_xml(webpage)
             if doc.tag == 'rss':
                 return self._extract_rss(url, video_id, doc)
         except compat_xml_parse_error:
index d4abd403169aa624a47d622b250a5988691a742a..3943cc9c578fc7acd9dfd21cea7e2ad25d4b1410 100644 (file)
@@ -22,6 +22,7 @@ import struct
 import subprocess
 import sys
 import traceback
+import xml.etree.ElementTree
 import zlib
 
 try:
@@ -1267,3 +1268,13 @@ def read_batch_urls(batch_fd):
 
 def urlencode_postdata(*args, **kargs):
     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+
+
+def parse_xml(s):
+    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
+        def doctype(self, name, pubid, system):
+            pass  # Ignore doctypes
+
+    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
+    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
+    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)