youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit ab2d524780736249c8988313db021e83642c24d1
parent 85d61685f15bdc62709c699e849af512db78089f
Author: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Date:   Fri, 15 Nov 2013 12:24:54 +0100

Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE).

Diffstat:
Myoutube_dl/extractor/common.py | 14+++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py @@ -315,13 +315,17 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod - def _og_regex(prop): - return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop) + def _og_regexes(prop): + esc_prop = re.escape(prop) + return [ + r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop, + r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop, + ] def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop - escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) + escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) if escaped is None: return None return unescapeHTML(escaped) @@ -336,8 +340,8 @@ class InfoExtractor(object): return self._og_search_property('title', html, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): - regexes = [self._og_regex('video')] - if secure: regexes.insert(0, self._og_regex('video:secure_url')) + regexes = self._og_regexes('video') + if secure: regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) def _rta_search(self, html):