youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit 182583623583c8e71af9b4e24acf8c409fcff197
parent a0088bdf9342408a1fc5033a0f4599bae3b9aa0b
Author: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Date:   Tue, 10 Dec 2013 21:03:53 +0100

Use `_download_xml` in more extractors

Diffstat:
M youtube_dl/extractor/appletrailers.py | 23 +++++++++++------------
M youtube_dl/extractor/clipsyndicate.py | 10 ++++------
M youtube_dl/extractor/metacritic.py | 9 +++++----
M youtube_dl/utils.py | 5 +++++
4 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
@@ -1,5 +1,4 @@
 import re
-import xml.etree.ElementTree
 import json
 
 from .common import InfoExtractor
@@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):
         uploader_id = mobj.group('company')
 
         playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
-        playlist_snippet = self._download_webpage(playlist_url, movie)
-        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
-        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
-        # The ' in the onClick attributes are not escaped, it couldn't be parsed
-        # with xml.etree.ElementTree.fromstring
-        # like: http://trailers.apple.com/trailers/wb/gravity/
-        def _clean_json(m):
-            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
-        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
-        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+        def fix_html(s):
+            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
+            # The ' in the onClick attributes are not escaped, it couldn't be parsed
+            # like: http://trailers.apple.com/trailers/wb/gravity/
+            def _clean_json(m):
+                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+            s = re.sub(self._JSON_RE, _clean_json, s)
+            s = u'<html>' + s + u'</html>'
+            return s
+        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 
-        doc = xml.etree.ElementTree.fromstring(playlist_html)
         playlist = []
         for li in doc.findall('./div/ul/li'):
             on_click = li.find('.//a').attrib['onClick']
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
@@ -1,9 +1,9 @@
 import re
-import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
+    fix_xml_all_ampersand,
 )
 
 
@@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor):
         # it includes a required token
         flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
 
-        playlist_page = self._download_webpage(
+        pdoc = self._download_xml(
             'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
-            video_id, u'Downloading video info')
-        # Fix broken xml
-        playlist_page = re.sub('&', '&amp;', playlist_page)
-        pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
+            video_id, u'Downloading video info',
+            transform_source=fix_xml_all_ampersand)
 
         track_doc = pdoc.find('trackList/track')
         def find_param(name):
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
@@ -1,8 +1,10 @@
 import re
-import xml.etree.ElementTree
 import operator
 
 from .common import InfoExtractor
+from ..utils import (
+    fix_xml_all_ampersand,
+)
 
 
 class MetacriticIE(InfoExtractor):
@@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
         # The xml is not well formatted, there are raw '&'
-        info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
-            video_id, u'Downloading info xml').replace('&', '&amp;')
-        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+        info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
+            video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
 
         clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
         formats = []
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
@@ -1057,3 +1057,8 @@ def month_by_name(name):
         return ENGLISH_NAMES.index(name) + 1
     except ValueError:
         return None
+
+
+def fix_xml_all_ampersand(xml_str):
+    """Replace all the '&' by '&amp;' in XML"""
+    return xml_str.replace(u'&', u'&amp;')