Extract the original URL from the next_url parameter of a verify_age page before the actual extraction

author Witold Baryluk <baryluk@smp.if.uj.edu.pl>

Fri, 23 Mar 2012 05:17:29 +0000 (06:17 +0100)

committer Witold Baryluk <baryluk@smp.if.uj.edu.pl>

Fri, 23 Mar 2012 05:17:29 +0000 (06:17 +0100)
author Witold Baryluk <baryluk@smp.if.uj.edu.pl>
Fri, 23 Mar 2012 05:17:29 +0000 (06:17 +0100)
committer Witold Baryluk <baryluk@smp.if.uj.edu.pl>
Fri, 23 Mar 2012 05:17:29 +0000 (06:17 +0100)
diff --git a/youtube-dl b/youtube-dl

index b466c1570a0a90e2095f558585f6626cb8e9769e..d8b33e52c8bdee221d5532684d352527edd6e2c9 100755 (executable)
--- a/youtube-dl
+++ b/youtube-dl
@@ -1171,7 +1171,9 @@ class InfoExtractor(object):
  class YoutubeIE(InfoExtractor):
         """Information extractor for youtube.com."""
  
-       _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
+       _PREFIX = r'(?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)'
+       _VALID_URL = r'^('+_PREFIX+r'(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
+       _VALID_URL_WITH_AGE = r'^('+_PREFIX+')verify_age\?next_url=([^&]+)(?:.+)?$'
         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
@@ -1335,6 +1337,14 @@ class YoutubeIE(InfoExtractor):
                         return
  
         def _real_extract(self, url):
+               # Extract original video URL from URL with age verification, using next_url parameter
+               mobj = re.match(self._VALID_URL_WITH_AGE, url)
+               if mobj:
+                       urldecode = lambda x: re.sub(r'%([0-9a-hA-H][0-9a-hA-H])', lambda m: chr(int(m.group(1), 16)), x)
+                       # Keep the original domain. We could probably switch to www.youtube.com, but keeping it should not hurt.
+                       # Strip the leading slash from next_url so that the rebuilt URL does not contain a double slash ("//").
+                       url = mobj.group(1) + re.sub(r'^/', '', urldecode(mobj.group(2)))
+
                 # Extract video id from URL
                 mobj = re.match(self._VALID_URL, url)
                 if mobj is None:
author	Witold Baryluk <baryluk@smp.if.uj.edu.pl>
	Fri, 23 Mar 2012 05:17:29 +0000 (06:17 +0100)
committer	Witold Baryluk <baryluk@smp.if.uj.edu.pl>
	Fri, 23 Mar 2012 05:17:29 +0000 (06:17 +0100)