youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit 8bc7c3d858c9e2dcf6cf65a8306f3ed2bc9109f5
parent 36ed7177f01d278935ab5eac3d44fcea421c5325
Author: Filippo Valsorda <filippo.valsorda@gmail.com>
Date:   Mon, 17 Jun 2013 19:28:18 +0200

Merge branch 'search_regex' - PR #872 - closes #847

Diffstat:
M test/test_download.py | 23 +++++++++++++++++++++--
M test/tests.json | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M youtube_dl/InfoExtractors.py | 776 ++++++++++++++++++++++++++++++++-----------------------------------------------
M youtube_dl/utils.py | 9 ++++++++-
4 files changed, 496 insertions(+), 499 deletions(-)

diff --git a/test/test_download.py b/test/test_download.py @@ -7,8 +7,8 @@ import os import json import unittest import sys -import hashlib import socket +import binascii # Allow direct execution sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -38,11 +38,16 @@ def _try_rm(filename): if ose.errno != errno.ENOENT: raise +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() + class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen self.processed_info_dicts = [] return youtube_dl.FileDownloader.__init__(self, *args, **kwargs) + def report_warning(self, message): + # Don't accept warnings during tests + raise ExtractorError(message) def process_info(self, info_dict): self.processed_info_dicts.append(info_dict) return youtube_dl.FileDownloader.process_info(self, info_dict) @@ -121,7 +126,21 @@ def generator(test_case): with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, value) in tc.get('info_dict', {}).items(): - self.assertEqual(value, info_dict.get(info_field)) + if isinstance(value, compat_str) and value.startswith('md5:'): + self.assertEqual(value, 'md5:' + md5(info_dict.get(info_field))) + else: + self.assertEqual(value, info_dict.get(info_field)) + + # If checkable fields are missing from the test case, print the info_dict + test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) + for key, value in info_dict.items() + if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location')) + if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()): + sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=2) + u'\n') + + # Check for the presence of mandatory fields + for key in ('id', 'url', 'title', 'ext'): + self.assertTrue(key in info_dict.keys() and 
info_dict[key]) finally: for tc in test_cases: _try_rm(tc['file']) diff --git a/test/tests.json b/test/tests.json @@ -15,43 +15,76 @@ "name": "Dailymotion", "md5": "392c4b85a60a90dc4792da41ce3144eb", "url": "http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech", - "file": "x33vw9.mp4" + "file": "x33vw9.mp4", + "info_dict": { + "uploader": "Alex and Van .", + "title": "Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" + } }, { "name": "Metacafe", "add_ie": ["Youtube"], "url": "http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", - "file": "_aUehQsCQtM.flv" + "file": "_aUehQsCQtM.flv", + "info_dict": { + "upload_date": "20090102", + "title": "The Electric Company | \"Short I\" | PBS KIDS GO!", + "description": "md5:2439a8ef6d5a70e380c22f5ad323e5a8", + "uploader": "PBS", + "uploader_id": "PBS" + } }, { "name": "BlipTV", "md5": "b2d849efcf7ee18917e4b4d9ff37cafe", "url": "http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352", - "file": "5779306.m4v" + "file": "5779306.m4v", + "info_dict": { + "upload_date": "20111205", + "description": "md5:9bc31f227219cde65e47eeec8d2dc596", + "uploader": "Comic Book Resources - CBR TV", + "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3" + } }, { "name": "XVideos", "md5": "1d0c835822f0a71a7bf011855db929d0", "url": "http://www.xvideos.com/video939581/funny_porns_by_s_-1", - "file": "939581.flv" + "file": "939581.flv", + "info_dict": { + "title": "Funny Porns By >>>>S<<<<<< -1" + } }, { "name": "YouPorn", "md5": "c37ddbaaa39058c76a7e86c6813423c1", "url": "http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/", - "file": "505835.mp4" + "file": "505835.mp4", + "info_dict": { + "upload_date": "20101221", + "description": "Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", + "uploader": "Ask Dan And Jennifer", + "title": "Sex Ed: Is It Safe To Masturbate Daily?" 
+ } }, { "name": "Pornotube", "md5": "374dd6dcedd24234453b295209aa69b6", "url": "http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing", - "file": "1689755.flv" + "file": "1689755.flv", + "info_dict": { + "upload_date": "20090708", + "title": "Marilyn-Monroe-Bathing" + } }, { "name": "YouJizz", "md5": "07e15fa469ba384c7693fd246905547c", "url": "http://www.youjizz.com/videos/zeichentrick-1-2189178.html", - "file": "2189178.flv" + "file": "2189178.flv", + "info_dict": { + "title": "Zeichentrick 1" + } }, { "name": "Vimeo", @@ -70,61 +103,103 @@ "name": "Soundcloud", "md5": "ebef0a451b909710ed1d7787dddbf0d7", "url": "http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy", - "file": "62986583.mp3" + "file": "62986583.mp3", + "info_dict": { + "upload_date": "20121011", + "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", + "uploader": "E.T. ExTerrestrial Music", + "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" + } }, { "name": "StanfordOpenClassroom", "md5": "544a9468546059d4e80d76265b0443b8", "url": "http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100", - "file": "PracticalUnix_intro-environment.mp4" + "file": "PracticalUnix_intro-environment.mp4", + "info_dict": { + "title": "Intro Environment" + } }, { "name": "XNXX", "md5": "0831677e2b4761795f68d417e0b7b445", "url": "http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_", - "file": "1135332.flv" + "file": "1135332.flv", + "info_dict": { + "title": "lida » Naked Funny Actress (5)" + } }, { "name": "Youku", "url": "http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", "file": "XNDgyMDQ2NTQw_part00.flv", "md5": "ffe3f2e435663dc2d1eea34faeff5b5b", - "params": { "test": false } + "params": { "test": false }, + "info_dict": { + "title": "youtube-dl test video \"'/\\ä↭𝕐" + } }, { "name": "NBA", "url": 
"http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html", "file": "0021200253-okc-bkn-recap.nba.mp4", - "md5": "c0edcfc37607344e2ff8f13c378c88a4" + "md5": "c0edcfc37607344e2ff8f13c378c88a4", + "info_dict": { + "description": "Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.", + "title": "Thunder vs. Nets" + } }, { "name": "JustinTV", "url": "http://www.twitch.tv/thegamedevhub/b/296128360", "file": "296128360.flv", - "md5": "ecaa8a790c22a40770901460af191c9a" + "md5": "ecaa8a790c22a40770901460af191c9a", + "info_dict": { + "upload_date": "20110927", + "uploader_id": 25114803, + "uploader": "thegamedevhub", + "title": "Beginner Series - Scripting With Python Pt.1" + } }, { "name": "MyVideo", "url": "http://www.myvideo.de/watch/8229274/bowling_fail_or_win", "file": "8229274.flv", - "md5": "2d2753e8130479ba2cb7e0a37002053e" + "md5": "2d2753e8130479ba2cb7e0a37002053e", + "info_dict": { + "title": "bowling-fail-or-win" + } }, { "name": "Escapist", "url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate", "file": "6618-Breaking-Down-Baldurs-Gate.mp4", - "md5": "c6793dbda81388f4264c1ba18684a74d" + "md5": "c6793dbda81388f4264c1ba18684a74d", + "info_dict": { + "description": "Baldur's Gate: Original, Modded or Enhanced Edition? 
I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", + "uploader": "the-escapist-presents", + "title": "Breaking Down Baldur's Gate" + } }, { "name": "GooglePlus", "url": "https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH", - "file": "ZButuJc6CtH.flv" + "file": "ZButuJc6CtH.flv", + "info_dict": { + "upload_date": "20120613", + "uploader": "井上ヨシマサ", + "title": "嘆きの天使 降臨" + } }, { "name": "FunnyOrDie", "url": "http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version", "file": "0732f586d7.mp4", - "md5": "f647e9e90064b53b6e046e75d0241fbd" + "md5": "f647e9e90064b53b6e046e75d0241fbd", + "info_dict": { + "description": "Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. Scott.", + "title": "Heart-Shaped Box: Literal Video Version" + } }, { "name": "Steam", @@ -161,6 +236,7 @@ "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", "file": "12-jan-pythonthings.mp4", "info_dict": { + "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", "title": "A Few of My Favorite [Python] Things" }, "params": { @@ -173,7 +249,10 @@ "file": "422212.mp4", "md5": "4e2f5cb088a83cd8cdb7756132f9739d", "info_dict": { - "title": "thedailyshow-kristen-stewart part 1" + "upload_date": "20121214", + "description": "Kristen Stewart", + "uploader": "thedailyshow", + "title": "thedailyshow-kristen-stewart part 1" } }, { @@ -224,42 +303,48 @@ "file": "11885679.m4a", "md5": "d30b5b5f74217410f4689605c35d1fd7", "info_dict": { - "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad" + "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885680.m4a", "md5": 
"4eb0a669317cd725f6bbd336a29f923a", "info_dict": { - "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad" + "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885682.m4a", "md5": "1893e872e263a2705558d1d319ad19e8", "info_dict": { - "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad" + "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885683.m4a", "md5": "b673c46f47a216ab1741ae8836af5899", "info_dict": { - "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad" + "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885684.m4a", "md5": "1d74534e95df54986da7f5abf7d842b7", "info_dict": { - "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad" + "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885685.m4a", "md5": "f081f47af8f6ae782ed131d38b9cd1c0", "info_dict": { - "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad" + "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } } ] @@ -270,9 +355,9 @@ "file": "NODfbab.mp4", "md5": "9b0636f8c0f7614afa4ea5e4c6e57e83", "info_dict": { + "uploader": "ytdl", "title": "test chars: \"'/\\ä<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ." } - }, { "name": "TED", @@ -290,14 +375,19 @@ "file": "11741.mp4", "md5": "0b49f4844a068f8b33f4b7c88405862b", "info_dict": { - "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" + "description": "Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? 
Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", + "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" } }, { "name": "Generic", "url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html", "file": "13601338388002.mp4", - "md5": "85b90ccc9d73b4acd9138d3af4c27f89" + "md5": "85b90ccc9d73b4acd9138d3af4c27f89", + "info_dict": { + "uploader": "www.hodiho.fr", + "title": "Régis plante sa Jeep" + } }, { "name": "Spiegel", @@ -325,7 +415,7 @@ "file": "wshh6a7q1ny0G34ZwuIO.mp4", "md5": "9d04de741161603bf7071bbf4e883186", "info_dict": { - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick! " + "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } }, { @@ -355,42 +445,59 @@ "file":"30510138.mp3", "md5":"f9136bf103901728f29e419d2c70f55d", "info_dict": { - "title":"D-D-Dance" + "upload_date": "20111213", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "D-D-Dance" } }, { "file":"47127625.mp3", "md5":"09b6758a018470570f8fd423c9453dd8", "info_dict": { - "title":"The Royal Concept - Gimme Twice" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "The Royal Concept - Gimme Twice" } }, { "file":"47127627.mp3", "md5":"154abd4e418cea19c3b901f1e1306d9c", "info_dict": { - "title":"Goldrushed" + "upload_date": "20120521", + "uploader": "The Royal Concept", + "title": "Goldrushed" } }, { "file":"47127629.mp3", "md5":"2f5471edc79ad3f33a683153e96a79c1", "info_dict": { - "title":"In the End" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "In the End" } }, { 
"file":"47127631.mp3", "md5":"f9ba87aa940af7213f98949254f1c6e2", "info_dict": { - "title":"Knocked Up" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", + "uploader": "The Royal Concept", + "title": "Knocked Up" } }, { "file":"75206121.mp3", "md5":"f9d1fe9406717e302980c30de4af9353", "info_dict": { - "title":"World On Fire" + "upload_date": "20130116", + "description": "The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central). \r\nAs a gift to our fans we would like to offer you a free download of the track! ", + "uploader": "The Royal Concept", + "title": "World On Fire" } } ] @@ -419,8 +526,10 @@ "url": "http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0", "file": "zpsc0c3b9fa.mp4", "md5": "7dabfb92b0a31f6c16cebc0f8e60ff99", - "info_dict":{ - "title":"Tired of Link Building? Try BacklinkMyDomain.com!" + "info_dict": { + "upload_date": "20130504", + "uploader": "rachaneronas", + "title": "Tired of Link Building? Try BacklinkMyDomain.com!" 
} }, { @@ -488,8 +597,10 @@ "url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html", "file": "1509445.flv", "md5": "9f48e0e8d58e3076bb236ff412ab62fa", - "info_dict":{ - "title":"FemaleAgent Shy beauty takes the bait" + "info_dict": { + "upload_date": "20121014", + "uploader_id": "Ruseful2011", + "title": "FemaleAgent Shy beauty takes the bait" } }, { diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py @@ -191,6 +191,47 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info + def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + ExtractorError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) + else: + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: break + + if sys.stderr.isatty() and os.name != 'nt': + _name = u'\033[0;34m%s\033[0m' % name + else: + _name = name + + if mobj: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + elif default is not None: + return default + elif fatal: + raise ExtractorError(u'Unable to extract %s' % _name) + else: + self._downloader.report_warning(u'unable to extract %s; ' + u'please report this issue on GitHub.' % _name) + return None + + def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags) + if res: + return clean_html(res).strip() + else: + return res + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. 
@@ -964,18 +1005,13 @@ class PhotobucketIE(InfoExtractor): }] # We try looking in other parts of the webpage - mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - - video_url = mediaURL + video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />', + webpage, u'video URL') mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage) if mobj is None: raise ExtractorError(u'Unable to extract title') video_title = mobj.group(1).decode('utf-8') - video_uploader = mobj.group(2).decode('utf-8') return [{ @@ -1397,16 +1433,12 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - mobj = re.search(r'<title>(.*)</title>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex(r'<title>(.*)</title>', + webpage, u'video title') # video uploader is domain name - mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_uploader = mobj.group(1) + video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', + url, u'video uploader') return [{ 'id': video_id, @@ -1805,10 +1837,7 @@ class DepositFilesIE(InfoExtractor): file_extension = os.path.splitext(file_url)[1][1:] # Search for file title - mobj = re.search(r'<b title="(.*?)">', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - file_title = mobj.group(1).decode('utf-8') + file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title') return [{ 'id': file_id.decode('utf-8'), @@ -1902,10 +1931,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - m = 
re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage) - if not m: - raise ExtractorError(u'Cannot find title in webpage') - video_title = unescapeHTML(m.group(1)) + video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>', + webpage, u'title') info = { 'id': video_id, @@ -2067,15 +2094,10 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - mobj = re.search('<title>([^<]+)</title>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex('<title>([^<]+)</title>', + webpage, u'title') - mobj = re.search('[.](.+?)$', video_url) - if mobj is None: - raise ExtractorError(u'Unable to extract extention') - video_ext = mobj.group(1) + video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') return [{ 'id': video_id, @@ -2123,25 +2145,23 @@ class MyVideoIE(InfoExtractor): # extracting infos self.report_extraction(video_id) + video_url = None mobj = re.search('connectionurl=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract rtmpurl') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) - if 'myvideo2flash' in video_rtmpurl: - self._downloader.report_warning(u'forcing RTMPT ...') - video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://') - - # extract non rtmp videos - if (video_rtmpurl is None) or (video_rtmpurl == ''): + if mobj: + video_url = compat_urllib_parse.unquote(mobj.group(1)) + if 'myvideo2flash' in video_url: + self._downloader.report_warning(u'forcing RTMPT ...') + video_url = video_url.replace('rtmpe://', 'rtmpt://') + + if not video_url: + # extract non rtmp videos mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) if mobj is None: raise ExtractorError(u'unable to extract url') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) + video_url = 
compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) - mobj = re.search('source=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_file = compat_urllib_parse.unquote(mobj.group(1)) + video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file') + video_file = compat_urllib_parse.unquote(video_file) if not video_file.endswith('f4m'): ppath, prefix = video_file.split('.') @@ -2153,20 +2173,16 @@ class MyVideoIE(InfoExtractor): video_filepath + video_file ).replace('.f4m', '.m3u8') - mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_swfobj = compat_urllib_parse.unquote(mobj.group(1)) + video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') + video_swfobj = compat_urllib_parse.unquote(video_swfobj) - mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage) - if mobj is None: - raise ExtractorError(u'unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>", + webpage, u'title') return [{ 'id': video_id, - 'url': video_rtmpurl, - 'tc_url': video_rtmpurl, + 'url': video_url, + 'tc_url': video_url, 'uploader': None, 'upload_date': None, 'title': video_title, @@ -2177,6 +2193,7 @@ class MyVideoIE(InfoExtractor): 'player_url': video_swfobj, }] + class ComedyCentralIE(InfoExtractor): """Information extractor for The Daily Show and Colbert Report """ @@ -2358,19 +2375,25 @@ class EscapistIE(InfoExtractor): showName = mobj.group('showname') videoId = mobj.group('episode') - self.report_extraction(showName) - webPage = self._download_webpage(url, showName) + self.report_extraction(videoId) + webpage = self._download_webpage(url, videoId) - descMatch = re.search('<meta name="description" content="([^"]*)"', webPage) - description = unescapeHTML(descMatch.group(1)) - imgMatch = 
re.search('<meta property="og:image" content="([^"]*)"', webPage) - imgUrl = unescapeHTML(imgMatch.group(1)) - playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage) - playerUrl = unescapeHTML(playerUrlMatch.group(1)) - configUrlMatch = re.search('config=(.*)$', playerUrl) - configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1)) + videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"', + webpage, u'description', fatal=False) - configJSON = self._download_webpage(configUrl, showName, + imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"', + webpage, u'thumbnail', fatal=False) + + playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"', + webpage, u'player url') + + title = self._html_search_regex('<meta name="title" content="([^"]*)"', + webpage, u'player url').split(' : ')[-1] + + configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url') + configUrl = compat_urllib_parse.unquote(configUrl) + + configJSON = self._download_webpage(configUrl, videoId, u'Downloading configuration', u'unable to download configuration') @@ -2390,10 +2413,10 @@ class EscapistIE(InfoExtractor): 'url': videoUrl, 'uploader': showName, 'upload_date': None, - 'title': showName, + 'title': title, 'ext': 'mp4', 'thumbnail': imgUrl, - 'description': description, + 'description': videoDesc, 'player_url': playerUrl, } @@ -2478,26 +2501,17 @@ class XVideosIE(InfoExtractor): self.report_extraction(video_id) - # Extract video URL - mobj = re.search(r'flv_url=(.+?)&', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(mobj.group(1)) - + video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&', + webpage, u'video URL')) # Extract title - mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = 
mobj.group(1) - + video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID', + webpage, u'title') # Extract video thumbnail - mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = mobj.group(0) + video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', + webpage, u'thumbnail', fatal=False) info = { 'id': video_id, @@ -2654,16 +2668,12 @@ class InfoQIE(InfoExtractor): video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title - mobj = re.search(r'contentTitle = "(.*?)";', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) + video_title = self._search_regex(r'contentTitle = "(.*?)";', + webpage, u'title') # Extract description - video_description = u'No description available.' 
- mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage) - if mobj is not None: - video_description = mobj.group(1) + video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') @@ -2834,15 +2844,10 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - m = re.search('<h1>([^<]+)</h1>', coursepage) - if m: - info['title'] = unescapeHTML(m.group(1)) - else: - info['title'] = info['id'] + info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - m = re.search('<description>([^<]+)</description>', coursepage) - if m: - info['description'] = unescapeHTML(m.group(1)) + info['description'] = self._html_search_regex('<description>([^<]+)</description>', + coursepage, u'description', fatal=False) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2903,25 +2908,17 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract song name') - song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract performer') - performer = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - video_title = performer + ' - ' + song_name + song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + webpage, u'song name', fatal=False) - mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to mtvn_uri') - mtvn_uri = mobj.group(1) + video_title = 
self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + webpage, u'title') - mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract content id') - content_id = mobj.group(1) + mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + webpage, u'mtvn_uri', fatal=False) + + content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', + webpage, u'content id', fatal=False) videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri self.report_extraction(video_id) @@ -3069,20 +3066,15 @@ class XNXXIE(InfoExtractor): # Get webpage content webpage = self._download_webpage(url, video_id) - result = re.search(self.VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group(1)) + video_url = self._search_regex(self.VIDEO_URL_RE, + webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) - result = re.search(self.VIDEO_TITLE_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group(1) + video_title = self._html_search_regex(self.VIDEO_TITLE_RE, + webpage, u'title') - result = re.search(self.VIDEO_THUMB_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = result.group(1) + video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -3102,26 +3094,6 @@ class GooglePlusIE(InfoExtractor): _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)' IE_NAME = u'plus.google' - def report_extract_entry(self, url): - """Report downloading extry""" - self.to_screen(u'Downloading entry: %s' % url) - - def report_date(self, upload_date): - 
"""Report downloading extry""" - self.to_screen(u'Entry date: %s' % upload_date) - - def report_uploader(self, uploader): - """Report downloading extry""" - self.to_screen(u'Uploader: %s' % uploader) - - def report_title(self, video_title): - """Report downloading extry""" - self.to_screen(u'Title: %s' % video_title) - - def report_extract_vid_page(self, video_page): - """Report information extraction.""" - self.to_screen(u'Extracting video page: %s' % video_page) - def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) @@ -3134,47 +3106,31 @@ class GooglePlusIE(InfoExtractor): video_extension = 'flv' # Step 1, Retrieve post webpage to extract further information - self.report_extract_entry(post_url) webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage') + self.report_extraction(video_id) + # Extract update date - upload_date = None - pattern = 'title="Timestamp">(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - upload_date = mobj.group(1) + upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', + webpage, u'upload date', fatal=False) + if upload_date: # Convert timestring to a format suitable for filename upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") upload_date = upload_date.strftime('%Y%m%d') - self.report_date(upload_date) # Extract uploader - uploader = None - pattern = r'rel\="author".*?>(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - uploader = mobj.group(1) - self.report_uploader(uploader) + uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', + webpage, u'uploader', fatal=False) # Extract title # Get the first line for title - video_title = u'NA' - pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]' - mobj = re.search(pattern, webpage) - if mobj: - video_title = mobj.group(1) - self.report_title(video_title) + video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', + webpage, 
'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video - pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' - mobj = re.search(pattern, webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video page URL') - - video_page = mobj.group(1) + video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', + webpage, u'video page URL') webpage = self._download_webpage(video_page, video_id, u'Downloading video page') - self.report_extract_vid_page(video_page) - # Extract video links on video page """Extract video links of all sizes""" @@ -3207,7 +3163,7 @@ class GooglePlusIE(InfoExtractor): }] class NBAIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$' + _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' IE_NAME = u'nba' def _real_extract(self, url): @@ -3216,28 +3172,27 @@ class NBAIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) - if video_id.endswith('/index.html'): - video_id = video_id[:-len('/index.html')] webpage = self._download_webpage(url, video_id) video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' - def _findProp(rexp, default=None): - m = re.search(rexp, webpage) - if m: - return unescapeHTML(m.group(1)) - else: - return default shortened_video_id = video_id.rpartition('/')[2] - title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '') + title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', + webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') + + # It isn't there in the HTML it returns to us + # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) + + description = self._html_search_regex(r'<meta 
name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) + info = { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'), - 'description': _findProp(r'<div class="description">(.*?)</h1>'), + # 'uploader_date': uploader_date, + 'description': description, } return [info] @@ -3385,30 +3340,21 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL) - if not m: - raise ExtractorError(u'Unable to find video information') - video_url = unescapeHTML(m.group('url')) + video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + webpage, u'video URL', flags=re.DOTALL) - m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL) - if not m: - m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = clean_html(m.group('title')) + title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", + r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL) - m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) - if m: - desc = unescapeHTML(m.group('desc')) - else: - desc = None + video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', + webpage, u'description', fatal=False, flags=re.DOTALL) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - 'description': desc, + 'description': video_description, } return [info] @@ -3464,27 +3410,29 @@ class UstreamIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') + video_url = u'http://tcdn.ustream.tv/video/%s' % video_id webpage = 
self._download_webpage(url, video_id) + self.report_extraction(video_id) - try: - m = re.search(r'data-title="(?P<title>.+)"',webpage) - title = m.group('title') - m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', - webpage, re.DOTALL) - uploader = unescapeHTML(m.group('uploader').strip()) - m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage) - thumb = m.group('thumb') - except AttributeError: - raise ExtractorError(u'Unable to extract info') + + video_title = self._html_search_regex(r'data-title="(?P<title>.+)"', + webpage, u'title') + + uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', + webpage, u'uploader', fatal=False, flags=re.DOTALL) + + thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', + webpage, u'thumbnail', fatal=False) + info = { - 'id':video_id, - 'url':video_url, + 'id': video_id, + 'url': video_url, 'ext': 'flv', - 'title': title, + 'title': video_title, 'uploader': uploader, - 'thumbnail': thumb, - } + 'thumbnail': thumbnail, + } return info class WorldStarHipHopIE(InfoExtractor): @@ -3492,45 +3440,36 @@ class WorldStarHipHopIE(InfoExtractor): IE_NAME = u'WorldStarHipHop' def _real_extract(self, url): - _src_url = r'so\.addVariable\("file","(.*?)"\)' - m = re.match(self._VALID_URL, url) video_id = m.group('id') - webpage_src = self._download_webpage(url, video_id) + webpage_src = self._download_webpage(url, video_id) - mobj = re.search(_src_url, webpage_src) + video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)', + webpage_src, u'video URL') - if mobj is not None: - video_url = mobj.group(1) - if 'mp4' in video_url: - ext = 'mp4' - else: - ext = 'flv' + if 'mp4' in video_url: + ext = 'mp4' else: - raise ExtractorError(u'Cannot find video url for %s' % video_id) - - mobj = re.search(r"<title>(.*)</title>", webpage_src) + ext = 'flv' - if mobj is None: - raise ExtractorError(u'Cannot determine title') - title = 
mobj.group(1) + video_title = self._html_search_regex(r"<title>(.*)</title>", + webpage_src, u'title') - mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src) # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - if mobj is not None: - thumbnail = mobj.group(1) - else: + thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', + webpage_src, u'thumbnail', fatal=False) + + if not thumbnail: _title = r"""candytitles.*>(.*)</span>""" mobj = re.search(_title, webpage_src) if mobj is not None: - title = mobj.group(1) - thumbnail = None + video_title = mobj.group(1) results = [{ 'id': video_id, 'url' : video_url, - 'title' : title, + 'title' : video_title, 'thumbnail' : thumbnail, 'ext' : ext, }] @@ -3544,10 +3483,9 @@ class RBMARadioIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage) - if not m: - raise ExtractorError(u'Cannot find metadata') - json_data = m.group(1) + + json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', + webpage, u'json data') try: data = json.loads(json_data) @@ -3594,42 +3532,33 @@ class YouPornIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('videoid') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - # Get the video title - result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group('title').strip() - - # Get the video date - result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract video date') - upload_date = None - else: - upload_date = 
unified_strdate(result.group('date').strip()) + # Get JSON parameters + json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') + try: + params = json.loads(json_params) + except: + raise ExtractorError(u'Invalid JSON') - # Get the video uploader - result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract uploader') - video_uploader = None - else: - video_uploader = result.group('uploader').strip() - video_uploader = clean_html( video_uploader ) + self.report_extraction(video_id) + try: + video_title = params['title'] + upload_date = unified_strdate(params['release_date_f']) + video_description = params['description'] + video_uploader = params['submitted_by'] + thumbnail = params['thumbnails'][0]['image'] + except KeyError: + raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' - result = re.search(DOWNLOAD_LIST_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract download list') - download_list_html = result.group('download_list').strip() + download_list_html = self._search_regex(DOWNLOAD_LIST_RE, + webpage, u'download list').strip() # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' @@ -3653,19 +3582,18 @@ class YouPornIE(InfoExtractor): size = format[0] bitrate = format[1] format = "-".join( format ) - title = u'%s-%s-%s' % (video_title, size, bitrate) + # title = u'%s-%s-%s' % (video_title, size, bitrate) formats.append({ 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'upload_date': upload_date, - 'title': title, + 'title': video_title, 'ext': extension, 'format': format, - 'thumbnail': None, - 'description': None, - 'player_url': None + 'thumbnail': thumbnail, + 'description': video_description }) if 
self._downloader.params.get('listformats', None): @@ -3706,17 +3634,13 @@ class PornotubeIE(InfoExtractor): # Get the video URL VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - result = re.search(VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group('url')) + video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') + video_url = compat_urllib_parse.unquote(video_url) #Get the uploaded date VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - result = re.search(VIDEO_UPLOADED_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - upload_date = unified_strdate(result.group('date')) + upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, 'url': video_url, @@ -3743,10 +3667,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - result = re.search(r'<title>(?P<title>.*)</title>', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video title') - video_title = result.group('title').strip() + video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>', + webpage, u'title').strip() # Get the embed page result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage) @@ -3759,10 +3681,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL - result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video url') - video_url = result.group('source') + video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', + webpage, 
u'video URL') info = {'id': video_id, 'url': video_url, @@ -3785,10 +3705,7 @@ class EightTracksIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) - if not m: - raise ExtractorError(u'Cannot find trax information') - json_like = m.group(1) + json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL) data = json.loads(json_like) session = str(random.randint(0, 1000000000)) @@ -3824,18 +3741,22 @@ class KeekIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') + video_url = u'http://cdn.keek.com/keek/video/%s' % video_id thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage) - title = unescapeHTML(m.group('title')) - m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage) - uploader = clean_html(m.group('uploader')) + + video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', + webpage, u'title') + + uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', + webpage, u'uploader', fatal=False) + info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, + 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader } @@ -3982,10 +3903,9 @@ class SpiegelIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'<div class="module-title">(.*?)</div>', webpage) - if not m: - raise ExtractorError(u'Cannot find title') - video_title = unescapeHTML(m.group(1)) + + video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>', + webpage, u'title') xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = 
self._download_webpage(xml_url, video_id, @@ -4021,35 +3941,25 @@ class LiveLeakIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - m = re.search(r'file: "(.*?)",', webpage) - if not m: - raise ExtractorError(u'Unable to find video url') - video_url = m.group(1) + video_url = self._search_regex(r'file: "(.*?)",', + webpage, u'video URL') - m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip() + video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', + webpage, u'title').replace('LiveLeak.com -', '').strip() - m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) - if m: - desc = unescapeHTML(m.group('desc')) - else: - desc = None + video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', + webpage, u'description', fatal=False) - m = re.search(r'By:.*?(\w+)</a>', webpage) - if m: - uploader = clean_html(m.group(1)) - else: - uploader = None + video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>', + webpage, u'uploader', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, - 'description': desc, - 'uploader': uploader + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader } return [info] @@ -4165,23 +4075,23 @@ class TumblrIE(InfoExtractor): re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: - self.to_screen("No video found") - return [] + raise ExtractorError(u'Unable to extract video') video_url = video.group('video_url') ext = video.group('ext') - re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster - thumb = re.search(re_thumb, 
webpage).group('thumb').replace('\\', '') + video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22', + webpage, u'thumbnail', fatal=False) # We pick the first poster + if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - re_title = r'<title>(?P<title>.*?)</title>' - title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title')) + video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>', + webpage, u'title', flags=re.DOTALL) return [{'id': video_id, 'url': video_url, - 'title': title, - 'thumbnail': thumb, + 'title': video_title, + 'thumbnail': video_thumbnail, 'ext': ext }] @@ -4195,7 +4105,7 @@ class BandcampIE(InfoExtractor): # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: - raise ExtractorError(u'No free songs founded') + raise ExtractorError(u'No free songs found') download_link = m_download.group(1) id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', @@ -4223,10 +4133,10 @@ class BandcampIE(InfoExtractor): track_info = {'id':id, 'title' : info[u'title'], - 'ext' : 'mp3', - 'url' : final_url, + 'ext' : 'mp3', + 'url' : final_url, 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] + 'uploader' : info[u'artist'] } return [track_info] @@ -4243,17 +4153,14 @@ class RedTubeIE(InfoExtractor): video_id = mobj.group('id') video_extension = 'mp4' webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) - mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') + video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">', + webpage, u'video URL') - video_url = mobj.group(1) - mobj = re.search('<h1 class="videoTitle 
slidePanelMovable">(.+)</h1>',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', + webpage, u'title') return [{ 'id': video_id, @@ -4274,15 +4181,13 @@ class InaIE(InfoExtractor): video_extension = 'mp4' webpage = self._download_webpage(mrss_url, video_id) - mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - video_url = mobj.group(1) + self.report_extraction(video_id) - mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', + webpage, u'video URL') + + video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', + webpage, u'title') return [{ 'id': video_id, @@ -4304,27 +4209,17 @@ class HowcastIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - video_url = mobj.group(1) + video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', + webpage, u'video URL') - mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) or mobj.group(2) + video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', + webpage, u'title') - mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage) - if mobj is None: - self._downloader.report_warning(u'unable to extract 
description') - video_description = None - else: - video_description = mobj.group(1) or mobj.group(2) + video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', + webpage, u'description', fatal=False) - mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail') - thumbnail = mobj.group(1) + thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'', + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -4340,7 +4235,6 @@ class VineIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -4349,25 +4243,17 @@ class VineIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - video_url = mobj.group(1) + video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"', + webpage, u'video URL') - mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"', + webpage, u'title') - mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail') - thumbnail = mobj.group(1) + thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"', + webpage, u'thumbnail', fatal=False) - mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader') - uploader = mobj.group(1) + uploader = 
self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>', + webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ 'id': video_id, @@ -4390,18 +4276,13 @@ class FlickrIE(InfoExtractor): webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id webpage = self._download_webpage(webpage_url, video_id) - mobj = re.search(r"photo_secret: '(\w+)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video secret') - secret = mobj.group(1) + secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret') first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml) - if mobj is None: - raise ExtractorError(u'Unable to extract node_id') - node_id = mobj.group(1) + node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>', + first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') @@ -4413,22 +4294,14 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) or mobj.group(2) + video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', + webpage, u'video title') - mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage) - if mobj is None: - self._downloader.report_warning(u'unable to 
extract description') - video_description = None - else: - video_description = mobj.group(1) or mobj.group(2) + video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', + webpage, u'description', fatal=False) - mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail') - thumbnail = mobj.group(1) or mobj.group(2) + thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -4450,32 +4323,25 @@ class TeamcocoIE(InfoExtractor): url_title = mobj.group('url_title') webpage = self._download_webpage(url, url_title) - mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage) - video_id = mobj.group(1) + video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"', + webpage, u'video id') self.report_extraction(video_id) - mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"', + webpage, u'title') - mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail') - thumbnail = mobj.group(1) + thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"', + webpage, u'thumbnail', fatal=False) - mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract description') - description = mobj.group(1) + video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"', + webpage, u'description', fatal=False) data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id data = 
self._download_webpage(data_url, video_id, 'Downloading data webpage') - mobj = re.search(r'<file type="high".*?>(.*?)</file>', data) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - video_url = mobj.group(1) + + video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>', + data, u'video URL') return [{ 'id': video_id, @@ -4483,9 +4349,9 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, - 'description': description, + 'description': video_description, }] - + class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html' @@ -4494,8 +4360,9 @@ class XHamsterIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url='http://xhamster.com/movies/%s/.html' % video_id + mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id webpage = self._download_webpage(mrss_url, video_id) + mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') @@ -4505,39 +4372,33 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) + video_title = self._html_search_regex(r'<title>(?P<title>.+?) 
- xHamster\.com</title>', + webpage, u'title') - mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage) - if mobj is None: - video_description = u'' - else: - video_description = unescapeHTML(mobj.group('description')) + # Can't see the description anywhere in the UI + # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)', + # webpage, u'description', fatal=False) + # if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract upload date') - video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') - - mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage) - if mobj is None: - video_uploader_id = u'anonymous' + if mobj: + video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: - video_uploader_id = mobj.group('uploader_id') + video_upload_date = None + self._downloader.report_warning(u'Unable to extract upload date') - mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail URL') - video_thumbnail = mobj.group('thumbnail') + video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', + webpage, u'uploader id', default=u'anonymous') + + video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'', + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, 'url': video_url, 'ext': video_extension, 'title': video_title, - 'description': video_description, + # 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail @@ -4561,10 +4422,9 @@ class 
HypemIE(InfoExtractor): cookie = urlh.headers.get('Set-Cookie', '') self.report_extraction(track_id) - mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extrack tracks') - html_tracks = mobj.group(1).strip() + + html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>', + response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) track = track_list[u'tracks'][0] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py @@ -12,7 +12,7 @@ import sys import traceback import zlib import email.utils -import json +import socket import datetime try: @@ -154,6 +154,9 @@ def compat_ord(c): if type(c) is int: return c else: return ord(c) +# This is not clearly defined otherwise +compiled_regex_type = type(re.compile('')) + std_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', @@ -469,7 +472,11 @@ class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): """ tb, if given, is the original traceback (so that it can be printed out). """ + + if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + msg = msg + u'; please report this issue on GitHub.' super(ExtractorError, self).__init__(msg) + self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception