youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit 5aafe895fce2a7be9595cb2e56b7bd73a748e6b6
parent b853d2e1555dbb4a09fe3d7857c6d2bc044646f4
Author: Philipp Hagemeister <phihag@phihag.de>
Date:   Mon, 20 Jan 2014 22:11:34 +0100

Correct XML ampersand fixup

Diffstat:
Mtest/test_utils.py | 14++++++++++++++
Myoutube_dl/extractor/clipsyndicate.py | 4++--
Myoutube_dl/extractor/metacritic.py | 4++--
Myoutube_dl/extractor/mtv.py | 6++----
Myoutube_dl/utils.py | 7+++++--
5 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py @@ -16,6 +16,7 @@ from youtube_dl.utils import ( DateRange, encodeFilename, find_xpath_attr, + fix_xml_ampersands, get_meta_content, orderedSet, parse_duration, @@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('9:12:43'), 33163) self.assertEqual(parse_duration('x:y'), None) + def test_fix_xml_ampersands(self): + self.assertEqual( + fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a') + self.assertEqual( + fix_xml_ampersands('"&amp;x=y&wrong;&z=a'), + '"&amp;x=y&amp;wrong;&amp;z=a') + self.assertEqual( + fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'), + '&amp;&apos;&gt;&lt;&quot;') + self.assertEqual( + fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;') + self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor from ..utils import ( find_xpath_attr, - fix_xml_all_ampersand, + fix_xml_ampersands ) @@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor): pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, video_id, u'Downloading video info', - transform_source=fix_xml_all_ampersand) + transform_source=fix_xml_ampersands) track_doc = pdoc.find('trackList/track') def find_param(name): diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( - fix_xml_all_ampersand, + fix_xml_ampersands, ) @@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # The xml is not well formatted, there are raw '&' info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, - video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand) + video_id, 'Downloading info xml', transform_source=fix_xml_ampersands) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse, ExtractorError, + fix_xml_ampersands, ) def _media_xml_tag(tag): @@ -83,12 +84,9 @@ class MTVServicesInfoExtractor(InfoExtractor): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) - def fix_ampersand(s): - """ Fix unencoded ampersand in XML """ - return s.replace(u'& ', '&amp; ') idoc = self._download_xml( self._FEED_URL + '?' + data, video_id, - u'Downloading info', transform_source=fix_ampersand) + u'Downloading info', transform_source=fix_xml_ampersands) return [self._get_video_info(item) for item in idoc.findall('.//item')] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py @@ -1092,9 +1092,12 @@ def month_by_name(name): return None -def fix_xml_all_ampersand(xml_str): +def fix_xml_ampersands(xml_str): """Replace all the '&' by '&amp;' in XML""" - return xml_str.replace(u'&', u'&amp;') + return re.sub( + r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', + u'&amp;', + xml_str) def setproctitle(title):