From: Jeff Smith Date: Wed, 28 Aug 2013 19:00:59 +0000 (-0500) Subject: Fix MIT extractor for Python 2.6 X-Git-Url: http://git.oshgnacknak.de/?a=commitdiff_plain;h=b5ba7b9dcfed5ded96c841a0ebbbf12132de838f;p=youtube-dl Fix MIT extractor for Python 2.6 The HTML for the MIT page does not parse cleanly for Python 2.6 due to script tags within an actual script element. The offending piece is inside a comment block, so removing all such comment blocks fixes the parsing. --- diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index d09d03e36..52be9232f 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -25,23 +25,21 @@ class TechTVMITIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage( + raw_page = self._download_webpage( 'http://techtv.mit.edu/videos/%s' % video_id, video_id) - embed_page = self._download_webpage( - 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, - note=u'Downloading embed page') + clean_page = re.compile(u'', re.S).sub(u'', raw_page) base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', - embed_page, u'base url') - formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, + raw_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page, u'video formats') formats = json.loads(formats_json) formats = sorted(formats, key=lambda f: f['bitrate']) - title = get_element_by_id('edit-title', webpage) - description = clean_html(get_element_by_id('edit-description', webpage)) + title = get_element_by_id('edit-title', clean_page) + description = clean_html(get_element_by_id('edit-description', clean_page)) thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', - embed_page, u'thumbnail', flags=re.DOTALL) + raw_page, u'thumbnail', flags=re.DOTALL) return {'id': video_id, 'title': title,