[InfoExtractor] Handle unquoted values in OpenGraph searches
author dirkf <fieldhouse@gmx.net>
Tue, 14 Feb 2023 02:47:09 +0000 (02:47 +0000)
committer dirkf <fieldhouse@gmx.net>
Tue, 14 Feb 2023 02:53:16 +0000 (02:53 +0000)
test/test_InfoExtractor.py
youtube_dl/extractor/common.py

index dd69a681bef4176b008e08bfc1196f6f599f7bc6..4db5c93f177d9829e37ac997f68cf862cacaac73 100644 (file)
@@ -62,6 +62,7 @@ class TestInfoExtractor(unittest.TestCase):
             <meta name="og:test1" content='foo > < bar'/>
             <meta name="og:test2" content="foo >//< bar"/>
             <meta property=og-test3 content='Ill-formatted opengraph'/>
+            <meta property=og:test4 content=unquoted-value/>
             '''
         self.assertEqual(ie._og_search_title(html), 'Foo')
         self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
@@ -74,6 +75,7 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
         self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
         self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
+        self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value')
 
     def test_html_search_meta(self):
         ie = self.ie
index a0a796d7b406f799462d2c6395a8f7b7d5aa7a10..7244e5df64bb803c9759b3854611d2108152b64f 100644 (file)
@@ -1087,7 +1087,7 @@ class InfoExtractor(object):
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
-        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                        % {'prop': re.escape(prop)})
         template = r'<meta[^>]+?%s[^>]+?%s'