2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
# parse_qs was moved from the cgi module to the urlparse module in Python 2.6.
28 from urlparse import parse_qs
30 from cgi import parse_qs
    # Default HTTP headers sent with every request. They impersonate a
    # desktop Firefox 3.6 browser so sites serve their regular HTML pages.
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in "simplified" titles: ASCII letters and digits only
# (as unicode strings, hence the .decode('ascii') calls).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # NOTE(review): the surrounding try/except and fallback lines are
        # elided in this view; presumably falls back to a safe default
        # encoding when the locale lookup fails -- confirm.
        pref = locale.getpreferredencoding()
    # A generator is used so the locale lookup happens lazily, once.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal (&#160;) or hex (&#x20AC;)
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): the guard on mobj being non-None is elided in this view
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Prefix with '0' so 'x..' becomes '0x..', parseable by long()
        numstr = u'0%s' % numstr
    # NOTE(review): the base selection (10 vs 16) is elided in this view
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
83 def sanitize_title(utitle):
84 """Sanitizes a video title so it could be used as part of a filename."""
85 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
86 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the opening try statement and the '-' (stdout) check
    # are elided in this view.
        # On Windows, put stdout in binary mode so downloaded data is not
        # mangled by newline translation.
        if sys.platform == 'win32':
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Keep both byte counts so the caller can report the mismatch.
        self.downloaded = downloaded
        self.expected = expected
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    logtostderr: Log messages to stderr instead of stdout.
    """

    # Process exit code to return (0 = success); set properly in __init__.
    _download_retcode = None
    # Ordinal of the current download, used by the %(autonumber)s template.
    _num_downloads = None
    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): initialization of the IE/postprocessor lists and the
        # self.params assignment are elided in this view -- confirm upstream.
        self._download_retcode = 0
        self._num_downloads = 0
        # Route status messages to stderr when logtostderr is set (so stdout
        # can carry machine-readable output), stdout otherwise.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Build every proper prefix of the path: a/, a/b/, a/b/c/, ...
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # NOTE(review): the mkdir call is elided in this view
    def temp_name(filename):
        """Returns a temporary filename for the given filename."""
        # Never rename stdout ('-') or an existing non-regular file
        # (device, fifo, ...): download to it directly instead.
        if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
            # NOTE(review): returns filename unchanged here (line elided)
        return filename + u'.part'
    def format_bytes(bytes):
        """Format a byte count as a human-readable string (e.g. '10.00M')."""
        if type(bytes) is str:
            # NOTE(review): conversion of a string input is elided in this view
        # Largest power of 1024 not exceeding bytes selects the suffix:
        # b, k, M, G, T, ...
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)
    def calc_percent(byte_counter, data_len):
        """Return download progress as a fixed-width percentage string."""
        # NOTE(review): the guard for data_len being None is elided in this view
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
    def calc_eta(start, now, total, current):
        """Estimate the remaining download time as a MM:SS string."""
        # NOTE(review): dif = now - start is computed on an elided line
        if current == 0 or dif < 0.001: # One millisecond
            # NOTE(review): returns a placeholder ETA here (line elided)
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # NOTE(review): handling of very large ETAs is elided in this view
        return '%02d:%02d' % (eta_mins, eta_secs)
    def calc_speed(start, now, bytes):
        """Return the average download speed as a fixed-width string."""
        # NOTE(review): dif = now - start is computed on an elided line
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from the last block's throughput."""
        # Clamp the next block between half and double the previous size.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # NOTE(review): the fast-path return is elided in this view
        rate = bytes / elapsed_time
        # NOTE(review): the comparison of rate against the clamps and the
        # final return are elided in this view.
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # NOTE(review): the matchobj-is-None guard is elided in this view
        number = float(matchobj.group(1))
        # An empty suffix finds index 0 ('b') -> multiplier 1024**0 == 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # NOTE(review): the append to the internal IE list is elided here
        # Mutual registration: give the IE a back-reference to this downloader.
        ie.set_downloader(self)
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # NOTE(review): the append to the internal PP chain is elided here
        # Mutual registration, same as for InfoExtractors.
        pp.set_downloader(self)
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        # NOTE(review): the enclosing try statement is elided in this view
        if not self.params.get('quiet', False):
            # Suppress the trailing newline when updating a progress line.
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                # NOTE(review): the error is re-raised here (line elided)
330 def to_stderr(self, message):
331 """Print message to stderr."""
332 print >>sys.stderr, message.encode(preferredencoding())
334 def fixed_template(self):
335 """Checks if the output template is fixed."""
336 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are being ignored: remember the failure in the exit code.
        self._download_retcode = 1
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # NOTE(review): the early return is elided in this view
        # NOTE(review): now = time.time() and a zero-elapsed guard are
        # elided in this view.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep exactly long enough to fall back under the target rate.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
    def try_rename(self, old_filename, new_filename):
        """Rename the temporary .part file to its final name, best-effort."""
        # NOTE(review): the try statement is elided in this view
        if old_filename == new_filename:
            # Nothing to do: the download went straight to the final name.
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
372 def report_destination(self, filename):
373 """Report destination filename."""
374 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # NOTE(review): the early return is elided in this view
        # The leading \r rewinds to the line start so the progress line
        # is updated in place instead of scrolling.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
383 def report_resuming_byte(self, resume_len):
384 """Report attempt to resume at given byte."""
385 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
387 def report_retry(self, count, retries):
388 """Report retry in case of HTTP error 5xx"""
389 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(review): the try statement is elided in this view
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a generic message if the name cannot be encoded.
            self.to_screen(u'[download] The file has already been downloaded')
398 def report_unable_to_resume(self):
399 """Report it was impossible to resume download."""
400 self.to_screen(u'[download] Unable to resume')
    def report_finish(self):
        """Report download finished."""
        # With the progress bar disabled, print an explicit completion line.
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        # NOTE(review): the else branch (terminating the in-place progress
        # line) is elided in this view.
409 def increment_downloads(self):
410 """Increment the ordinal that assigns a number to each file."""
411 self._num_downloads += 1
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # NOTE(review): several try statements in this method are elided
        # in this view; the except clauses below belong to them.
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings: emit the requested fields to stdout even
            # though no download takes place.
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            # NOTE(review): the early return ending simulate mode is elided

        # Build the output filename from the template; %(epoch)s and
        # %(autonumber)s are provided in addition to the IE fields.
        template_dict = dict(info_dict)
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            # NOTE(review): the early return is elided in this view

        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # Filesystem errors mean the chosen format could not be saved.
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # NOTE(review): the success guard around postprocessing is elided
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template can only produce one filename, so multiple URLs
        # would all be written to the same file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the loops over url_list and over the registered
        # InfoExtractors are elided in this view.
            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    # NOTE(review): continue is elided in this view

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                # NOTE(review): the ie.extract(url) call is elided here

                # Suitable InfoExtractor had been found; go to next URL
                # NOTE(review): break is elided in this view
            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): copying ie_info into info and the loop over the
        # registered postprocessors are elided in this view.
        info['filepath'] = filename
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # NOTE(review): the try statement is elided in this view
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            # NOTE(review): the early return is elided in this view

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); pass -k 1 only after exit code 1.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # NOTE(review): loop-exit handling when no progress was made
                # is elided in this view.
        # NOTE(review): the success (retval == 0) check is elided in this view
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            # NOTE(review): returns True here (line elided)
        # Failure path
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            # NOTE(review): returns False here (line elided)
    def _do_download(self, filename, url, player_url):
        """Download url to filename over HTTP; delegate rtmp to rtmpdump."""
        # NOTE(review): several try statements in this method are elided in
        # this view; the except clauses below belong to them.
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename):
            self.report_file_already_downloaded(filename)
            # NOTE(review): returns True here (line elided)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # basic_request is kept without the Range header so the full length
        # can be re-checked when a resume attempt is rejected (HTTP 416).
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        # NOTE(review): the else branch (resume_len = 0) is elided

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        # NOTE(review): the count initialization is elided in this view
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
                data = urllib2.urlopen(request)
                # NOTE(review): break on success is elided in this view
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    # NOTE(review): re-raise is elided in this view
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            # NOTE(review): re-raise is elided in this view
                    # Examine the reported length
                    if (content_length is not None and
                        (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # NOTE(review): returns True here (line elided)
                    # The length does not match, we start the download over
                    self.report_unable_to_resume()
            # NOTE(review): count increment and retry bookkeeping are elided
                self.report_retry(count, retries)

        # NOTE(review): the retries-exhausted check is elided in this view
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            # NOTE(review): returns False here (line elided)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # The total size includes what is already on disk from a resume.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        # NOTE(review): block_size/start initialization and the download
        # loop header are elided in this view.
            data_block = data.read(block_size)
            # NOTE(review): before/after timing lines are elided in this view
            if len(data_block) == 0:
                # NOTE(review): break on EOF is elided in this view
            byte_counter += len(data_block)

            # Open file just in time
            # NOTE(review): the stream-is-None guard and try are elided here
                (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                self.report_destination(filename)
            except (OSError, IOError), err:
                self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                # NOTE(review): returns False here (line elided)
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                # NOTE(review): returns False here (line elided)
            # Adapt the next read size to the observed throughput.
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        # NOTE(review): stream close and report_finish are elided in this view
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
        # NOTE(review): returns True here (line elided)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): the ready-flag initialization is elided in this view
        self.set_downloader(downloader)

    # NOTE(review): the suitable() staticmethod definition line is elided
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the not-yet-ready guard is elided in this view
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the initialize() call is elided in this view
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches youtu.be short links, watch pages, /v/ embeds and bare IDs.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used to look up credentials in ~/.netrc.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        # NOTE(review): most format->extension entries are elided in this view
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): the suitable() staticmethod definition line is elided
        return (re.match(YoutubeIE._VALID_URL, url) is not None)
751 def report_lang(self):
752 """Report attempt to set language."""
753 self._downloader.to_screen(u'[youtube] Setting language')
755 def report_login(self):
756 """Report attempt to log in."""
757 self._downloader.to_screen(u'[youtube] Logging in')
759 def report_age_confirmation(self):
760 """Report attempt to confirm age."""
761 self._downloader.to_screen(u'[youtube] Confirming age')
763 def report_video_webpage_download(self, video_id):
764 """Report attempt to download video webpage."""
765 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
767 def report_video_info_webpage_download(self, video_id):
768 """Report attempt to download video info webpage."""
769 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
771 def report_information_extraction(self, video_id):
772 """Report attempt to extract video information."""
773 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
775 def report_unavailable_format(self, video_id, format):
776 """Report extracted video URL."""
777 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
779 def report_rtmp_download(self):
780 """Indicate the download will use the RTMP protocol."""
781 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        """Set site language to English, then log in and confirm age."""
        # NOTE(review): several try statements in this method are elided in
        # this view; the except clauses below belong to them.
        if self._downloader is None:
            # NOTE(review): the early return is elided in this view

        # NOTE(review): username/password defaults are elided in this view
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): unpacking of info and its None check are elided
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # NOTE(review): the early return is elided in this view

        # Force English so later regexes match the page text reliably.
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        # NOTE(review): the report_lang() call is elided in this view
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # NOTE(review): the early return is elided in this view

        # No authentication to be performed
        # NOTE(review): the username-is-None guard is elided in this view

        # NOTE(review): the login_form dict opening is elided in this view
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        # NOTE(review): the report_login() call is elided in this view
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                # NOTE(review): the early return is elided in this view
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # NOTE(review): the early return is elided in this view

        # NOTE(review): the age_form dict opening is elided in this view
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract video info from a YouTube URL and hand it to the downloader."""
        # NOTE(review): several try statements and guard-clause openings in
        # this method are elided in this view; the except clauses and
        # indented bodies below belong to them.
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the mobj-is-None guard opening is elided here
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            # NOTE(review): the early return is elided in this view
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # NOTE(review): the early return is elided in this view

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the mobj-is-not-None check is elided in this view
            # Unescape the JavaScript-escaped URL (\/ becomes /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # NOTE(review): the else branch (player_url = None) is elided

        # Try several 'el' variants of get_video_info until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url, None, std_headers)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    # NOTE(review): break is elided in this view
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                # NOTE(review): the early return is elided in this view
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # NOTE(review): the else branch opening is elided in this view
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            # NOTE(review): the early return is elided in this view

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # NOTE(review): the early return is elided in this view
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # NOTE(review): the early return is elided in this view
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: runs of disallowed chars collapse to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # NOTE(review): the empty-string fallback is elided in this view
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date, scraped from the watch page and normalized to YYYYMMDD.
        # NOTE(review): the upload_date default is elided in this view
        mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): the mobj-is-not-None guard is elided in this view
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y']
            for expression in format_expressions:
                # NOTE(review): the try/except around strptime is elided here
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description (only fetched when forced, to avoid extra parsing)
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            # NOTE(review): the mobj-is-not-None guard is elided in this view
                video_description = mobj.group(1)

        # Token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'fmt_url_map' in video_info:
            # fmt_url_map is a comma-separated list of "format|url" pairs.
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                # Only consider formats at or below the requested quality cap.
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # NOTE(review): the else branch opening is elided in this view
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # NOTE(review): the early return is elided in this view
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the else branch opening is elided in this view
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    # NOTE(review): the early return is elided in this view
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        # NOTE(review): the else branch opening is elided in this view
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
            # NOTE(review): the early return is elided in this view

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension (flv is the default container when unknown)
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn paramters
            # NOTE(review): the try statement is elided in this view
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': player_url,
            # NOTE(review): the dict/call closing line is elided in this view
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id, group 2 the URL-embedded simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # POST target used to accept the family-filter disclaimer.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1015 def __init__(self, youtube_ie, downloader=None):
1016 InfoExtractor.__init__(self, downloader)
1017 self._youtube_ie = youtube_ie
    # NOTE(review): the suitable() staticmethod definition line is elided
        return (re.match(MetacafeIE._VALID_URL, url) is not None)
1023 def report_disclaimer(self):
1024 """Report disclaimer retrieval."""
1025 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1027 def report_age_confirmation(self):
1028 """Report attempt to confirm age."""
1029 self._downloader.to_screen(u'[metacafe] Confirming age')
1031 def report_download_webpage(self, video_id):
1032 """Report webpage download."""
1033 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1035 def report_extraction(self, video_id):
1036 """Report information extraction."""
1037 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1039 def _real_initialize(self):
1040 # Retrieve disclaimer
1041 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1043 self.report_disclaimer()
1044 disclaimer = urllib2.urlopen(request).read()
1045 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1046 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1052 'submit': "Continue - I'm over 18",
1054 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1056 self.report_age_confirmation()
1057 disclaimer = urllib2.urlopen(request).read()
1058 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1059 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1062 def _real_extract(self, url):
1063 # Extract id and simplified title from URL
1064 mobj = re.match(self._VALID_URL, url)
1066 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1069 video_id = mobj.group(1)
1071 # Check if video comes from YouTube
1072 mobj2 = re.match(r'^yt-(.*)$', video_id)
1073 if mobj2 is not None:
1074 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1077 # At this point we have a new video
1078 self._downloader.increment_downloads()
1080 simple_title = mobj.group(2).decode('utf-8')
1082 # Retrieve video webpage to extract further information
1083 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1085 self.report_download_webpage(video_id)
1086 webpage = urllib2.urlopen(request).read()
1087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1088 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1091 # Extract URL, uploader and title from webpage
1092 self.report_extraction(video_id)
1093 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1094 if mobj is not None:
1095 mediaURL = urllib.unquote(mobj.group(1))
1096 video_extension = mediaURL[-3:]
1098 # Extract gdaKey if available
1099 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1101 video_url = mediaURL
1103 gdaKey = mobj.group(1)
1104 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1106 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1108 self._downloader.trouble(u'ERROR: unable to extract media URL')
1110 vardict = parse_qs(mobj.group(1))
1111 if 'mediaData' not in vardict:
1112 self._downloader.trouble(u'ERROR: unable to extract media URL')
1114 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1116 self._downloader.trouble(u'ERROR: unable to extract media URL')
1118 mediaURL = mobj.group(1).replace('\\/', '/')
1119 video_extension = mediaURL[-3:]
1120 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1122 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1124 self._downloader.trouble(u'ERROR: unable to extract title')
1126 video_title = mobj.group(1).decode('utf-8')
1127 video_title = sanitize_title(video_title)
1129 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1131 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1133 video_uploader = mobj.group(1)
1136 # Process video information
1137 self._downloader.process_info({
1138 'id': video_id.decode('utf-8'),
1139 'url': video_url.decode('utf-8'),
1140 'uploader': video_uploader.decode('utf-8'),
1141 'upload_date': u'NA',
1142 'title': video_title,
1143 'stitle': simple_title,
1144 'ext': video_extension.decode('utf-8'),
1148 except UnavailableVideoError:
1149 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
1152 class DailymotionIE(InfoExtractor):
1153 """Information Extractor for Dailymotion"""
# Group 1 = video id (part before the first underscore), group 2 = title slug.
1155 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1157 def __init__(self, downloader=None):
1158 InfoExtractor.__init__(self, downloader)
1162 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1164 def report_download_webpage(self, video_id):
1165 """Report webpage download."""
1166 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1168 def report_extraction(self, video_id):
1169 """Report information extraction."""
1170 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No session setup needed for Dailymotion (body elided in this listing).
1172 def _real_initialize(self):
1175 def _real_extract(self, url):
1176 # Extract id and simplified title from URL
1177 mobj = re.match(self._VALID_URL, url)
1179 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1182 # At this point we have a new video
1183 self._downloader.increment_downloads()
1184 video_id = mobj.group(1)
1186 simple_title = mobj.group(2).decode('utf-8')
# Dailymotion streams are assumed to be FLV here.
1187 video_extension = 'flv'
1189 # Retrieve video webpage to extract further information
1190 request = urllib2.Request(url)
1192 self.report_download_webpage(video_id)
1193 webpage = urllib2.urlopen(request).read()
1194 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): "unable retrieve" — missing "to" in this user-facing message.
1195 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1198 # Extract URL, uploader and title from webpage
1199 self.report_extraction(video_id)
# Media URL comes from the Flash player's addVariable("video", ...) call.
1200 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1202 self._downloader.trouble(u'ERROR: unable to extract media URL')
1204 mediaURL = urllib.unquote(mobj.group(1))
1206 # if needed add http://www.dailymotion.com/ if relative URL
1208 video_url = mediaURL
1210 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1211 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1213 self._downloader.trouble(u'ERROR: unable to extract title')
1215 video_title = mobj.group(1).decode('utf-8')
1216 video_title = sanitize_title(video_title)
1218 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1220 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1222 video_uploader = mobj.group(1)
1225 # Process video information
1226 self._downloader.process_info({
1227 'id': video_id.decode('utf-8'),
1228 'url': video_url.decode('utf-8'),
1229 'uploader': video_uploader.decode('utf-8'),
1230 'upload_date': u'NA',
1231 'title': video_title,
1232 'stitle': simple_title,
1233 'ext': video_extension.decode('utf-8'),
1237 except UnavailableVideoError:
1238 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
1240 class GoogleIE(InfoExtractor):
1241 """Information extractor for video.google.com."""
# Matches Google Video across its country TLDs; group 1 = docid.
1243 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1245 def __init__(self, downloader=None):
1246 InfoExtractor.__init__(self, downloader)
1250 return (re.match(GoogleIE._VALID_URL, url) is not None)
1252 def report_download_webpage(self, video_id):
1253 """Report webpage download."""
1254 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1256 def report_extraction(self, video_id):
1257 """Report information extraction."""
1258 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1260 def _real_initialize(self):
1263 def _real_extract(self, url):
1264 # Extract id from URL
1265 mobj = re.match(self._VALID_URL, url)
1267 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1270 # At this point we have a new video
1271 self._downloader.increment_downloads()
1272 video_id = mobj.group(1)
# Default to mp4; downgraded to flv below when only the Flash URL is found.
1274 video_extension = 'mp4'
1276 # Retrieve video webpage to extract further information
1277 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1279 self.report_download_webpage(video_id)
1280 webpage = urllib2.urlopen(request).read()
1281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1282 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1285 # Extract URL, uploader, and title from webpage
1286 self.report_extraction(video_id)
# Prefer the direct download_url; otherwise fall back to the escaped
# Flash videoUrl (\x3d / \x26 are '=' and '&').
1287 mobj = re.search(r"download_url:'([^']+)'", webpage)
1289 video_extension = 'flv'
1290 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1292 self._downloader.trouble(u'ERROR: unable to extract media URL')
1294 mediaURL = urllib.unquote(mobj.group(1))
1295 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1296 mediaURL = mediaURL.replace('\\x26', '\x26')
1298 video_url = mediaURL
1300 mobj = re.search(r'<title>(.*)</title>', webpage)
1302 self._downloader.trouble(u'ERROR: unable to extract title')
1304 video_title = mobj.group(1).decode('utf-8')
1305 video_title = sanitize_title(video_title)
# Collapse every run of non-filename-safe characters into a single '_'.
1306 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1308 # Extract video description
1309 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1311 self._downloader.trouble(u'ERROR: unable to extract video description')
1313 video_description = mobj.group(1).decode('utf-8')
1314 if not video_description:
1315 video_description = 'No description available.'
1317 # Extract video thumbnail
# Thumbnail requires a second page fetch, so it is only done on demand.
1318 if self._downloader.params.get('forcethumbnail', False):
1319 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1321 webpage = urllib2.urlopen(request).read()
1322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1323 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1325 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1327 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1329 video_thumbnail = mobj.group(1)
1330 else: # we need something to pass to process_info
1331 video_thumbnail = ''
1335 # Process video information
# NOTE(review): unlike the sibling extractors, no 'uploader' key is visible
# in this dict in the listing — it may be on one of the elided lines; verify
# against the full file before relying on it.
1336 self._downloader.process_info({
1337 'id': video_id.decode('utf-8'),
1338 'url': video_url.decode('utf-8'),
1340 'upload_date': u'NA',
1341 'title': video_title,
1342 'stitle': simple_title,
1343 'ext': video_extension.decode('utf-8'),
1347 except UnavailableVideoError:
1348 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
1351 class PhotobucketIE(InfoExtractor):
1352 """Information extractor for photobucket.com."""
# Group 1 = the .flv filename passed in the 'current' query parameter.
1354 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1356 def __init__(self, downloader=None):
1357 InfoExtractor.__init__(self, downloader)
1361 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1363 def report_download_webpage(self, video_id):
1364 """Report webpage download."""
1365 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1367 def report_extraction(self, video_id):
1368 """Report information extraction."""
1369 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1371 def _real_initialize(self):
1374 def _real_extract(self, url):
1375 # Extract id from URL
1376 mobj = re.match(self._VALID_URL, url)
1378 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1381 # At this point we have a new video
1382 self._downloader.increment_downloads()
1383 video_id = mobj.group(1)
# The URL pattern only matches .flv files, so the extension is fixed.
1385 video_extension = 'flv'
1387 # Retrieve video webpage to extract further information
1388 request = urllib2.Request(url)
1390 self.report_download_webpage(video_id)
1391 webpage = urllib2.urlopen(request).read()
1392 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1393 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1396 # Extract URL, uploader, and title from webpage
1397 self.report_extraction(video_id)
# Media URL is taken from the page's video_src <link> tag.
1398 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1400 self._downloader.trouble(u'ERROR: unable to extract media URL')
1402 mediaURL = urllib.unquote(mobj.group(1))
1404 video_url = mediaURL
# The <title> carries both the video title (group 1) and uploader (group 2).
1406 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1408 self._downloader.trouble(u'ERROR: unable to extract title')
1410 video_title = mobj.group(1).decode('utf-8')
1411 video_title = sanitize_title(video_title)
1412 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1414 video_uploader = mobj.group(2).decode('utf-8')
1417 # Process video information
1418 self._downloader.process_info({
1419 'id': video_id.decode('utf-8'),
1420 'url': video_url.decode('utf-8'),
1421 'uploader': video_uploader,
1422 'upload_date': u'NA',
1423 'title': video_title,
1424 'stitle': simple_title,
1425 'ext': video_extension.decode('utf-8'),
1429 except UnavailableVideoError:
1430 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
1433 class YahooIE(InfoExtractor):
1434 """Information extractor for video.yahoo.com."""
1436 # _VALID_URL matches all Yahoo! Video URLs
1437 # _VPAGE_URL matches only the extractable '/watch/' URLs
1438 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1439 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1441 def __init__(self, downloader=None):
1442 InfoExtractor.__init__(self, downloader)
1446 return (re.match(YahooIE._VALID_URL, url) is not None)
1448 def report_download_webpage(self, video_id):
1449 """Report webpage download."""
1450 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1452 def report_extraction(self, video_id):
1453 """Report information extraction."""
1454 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1456 def _real_initialize(self):
# new_video=False marks the single recursive retry after URL rewriting,
# preventing infinite recursion.
1459 def _real_extract(self, url, new_video=True):
1460 # Extract ID from URL
1461 mobj = re.match(self._VALID_URL, url)
1463 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1466 # At this point we have a new video
1467 self._downloader.increment_downloads()
1468 video_id = mobj.group(2)
1469 video_extension = 'flv'
1471 # Rewrite valid but non-extractable URLs as
1472 # extractable English language /watch/ URLs
1473 if re.match(self._VPAGE_URL, url) is None:
1474 request = urllib2.Request(url)
1476 webpage = urllib2.urlopen(request).read()
1477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Pull the canonical id/vid pair out of the page and retry once with the
# rewritten /watch/ URL.
1481 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1483 self._downloader.trouble(u'ERROR: Unable to extract id field')
1485 yahoo_id = mobj.group(1)
1487 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1489 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1491 yahoo_vid = mobj.group(1)
1493 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1494 return self._real_extract(url, new_video=False)
1496 # Retrieve video webpage to extract further information
1497 request = urllib2.Request(url)
1499 self.report_download_webpage(video_id)
1500 webpage = urllib2.urlopen(request).read()
1501 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1502 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1505 # Extract uploader and title from webpage
1506 self.report_extraction(video_id)
1507 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1509 self._downloader.trouble(u'ERROR: unable to extract video title')
1511 video_title = mobj.group(1).decode('utf-8')
1512 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1514 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1516 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# BUG(review): group(1) of the regex above captures the literal
# "people"/"profile" path segment; the uploader name is in group(2).
# As written, 'uploader' will be "people" or "profile", not the name.
1518 video_uploader = mobj.group(1).decode('utf-8')
1520 # Extract video thumbnail
1521 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1523 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1525 video_thumbnail = mobj.group(1).decode('utf-8')
1527 # Extract video description
1528 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1530 self._downloader.trouble(u'ERROR: unable to extract video description')
1532 video_description = mobj.group(1).decode('utf-8')
1533 if not video_description: video_description = 'No description available.'
1535 # Extract video height and width
# Height/width feed the playlist request below; the server 401s without them.
1536 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1538 self._downloader.trouble(u'ERROR: unable to extract video height')
1540 yv_video_height = mobj.group(1)
1542 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1544 self._downloader.trouble(u'ERROR: unable to extract video width')
1546 yv_video_width = mobj.group(1)
1548 # Retrieve video playlist to extract media URL
1549 # I'm not completely sure what all these options are, but we
1550 # seem to need most of them, otherwise the server sends a 401.
1551 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1552 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1553 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1554 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1555 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1557 self.report_download_webpage(video_id)
1558 webpage = urllib2.urlopen(request).read()
1559 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1560 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1563 # Extract media URL from playlist XML
1564 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1566 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1568 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Decode HTML entities (&amp; etc.) embedded in the playlist URL.
1569 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1572 # Process video information
1573 self._downloader.process_info({
1574 'id': video_id.decode('utf-8'),
1576 'uploader': video_uploader,
1577 'upload_date': u'NA',
1578 'title': video_title,
1579 'stitle': simple_title,
1580 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; in Python the later entries silently overwrite the earlier ones.
# The duplicates should be removed (kept verbatim in this comments-only edit).
1581 'thumbnail': video_thumbnail.decode('utf-8'),
1582 'description': video_description,
1583 'thumbnail': video_thumbnail,
1584 'description': video_description,
1587 except UnavailableVideoError:
1588 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
1591 class GenericIE(InfoExtractor):
1592 """Generic last-resort information extractor."""
1594 def __init__(self, downloader=None):
1595 InfoExtractor.__init__(self, downloader)
1601 def report_download_webpage(self, video_id):
1602 """Report webpage download."""
# Warn explicitly: this extractor is only reached when nothing else matched.
1603 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1604 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1606 def report_extraction(self, video_id):
1607 """Report information extraction."""
1608 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1610 def _real_initialize(self):
1613 def _real_extract(self, url):
1614 # At this point we have a new video
1615 self._downloader.increment_downloads()
# Provisional id = last path segment; replaced below once the real media
# URL is known.
1617 video_id = url.split('/')[-1]
1618 request = urllib2.Request(url)
1620 self.report_download_webpage(video_id)
1621 webpage = urllib2.urlopen(request).read()
1622 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1623 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1625 except ValueError, err:
1626 # since this is the last-resort InfoExtractor, if
1627 # this error is thrown, it'll be thrown here
1628 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1631 self.report_extraction(video_id)
1632 # Start with something easy: JW Player in SWFObject
1633 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1635 # Broaden the search a little bit
1636 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1638 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1641 # It's possible that one of the regexes
1642 # matched, but returned an empty group:
1643 if mobj.group(1) is None:
1644 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1647 video_url = urllib.unquote(mobj.group(1))
1648 video_id = os.path.basename(video_url)
1650 # here's a fun little line of code for you:
# Extension = suffix without the dot; id = basename without the suffix.
1651 video_extension = os.path.splitext(video_id)[1][1:]
1652 video_id = os.path.splitext(video_id)[0]
1654 # it's tempting to parse this further, but you would
1655 # have to take into account all the variations like
1656 # Video Title - Site Name
1657 # Site Name | Video Title
1658 # Video Title - Tagline | Site Name
1659 # and so on and so forth; it's just not practical
1660 mobj = re.search(r'<title>(.*)</title>', webpage)
1662 self._downloader.trouble(u'ERROR: unable to extract title')
1664 video_title = mobj.group(1).decode('utf-8')
1665 video_title = sanitize_title(video_title)
1666 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1668 # video uploader is domain name
1669 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this failure message says "title" but the step extracts the
# uploader (domain name) — misleading user-facing string, kept verbatim here.
1671 self._downloader.trouble(u'ERROR: unable to extract title')
1673 video_uploader = mobj.group(1).decode('utf-8')
1676 # Process video information
1677 self._downloader.process_info({
1678 'id': video_id.decode('utf-8'),
1679 'url': video_url.decode('utf-8'),
1680 'uploader': video_uploader,
1681 'upload_date': u'NA',
1682 'title': video_title,
1683 'stitle': simple_title,
1684 'ext': video_extension.decode('utf-8'),
1688 except UnavailableVideoError, err:
1689 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
1692 class YoutubeSearchIE(InfoExtractor):
1693 """Information Extractor for YouTube search queries."""
# Accepts "ytsearch:Q", "ytsearchN:Q" and "ytsearchall:Q".
1694 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1695 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1696 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1697 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1699 _max_youtube_results = 1000
# Delegates actual extraction of each found id to the supplied YoutubeIE.
1701 def __init__(self, youtube_ie, downloader=None):
1702 InfoExtractor.__init__(self, downloader)
1703 self._youtube_ie = youtube_ie
1707 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1709 def report_download_page(self, query, pagenum):
1710 """Report attempt to download playlist page with given number."""
1711 query = query.decode(preferredencoding())
1712 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1714 def _real_initialize(self):
1715 self._youtube_ie.initialize()
# Parses the prefix to decide how many results to fetch: no number = 1,
# "all" = the hard cap, otherwise the given count (clamped to the cap).
1717 def _real_extract(self, query):
1718 mobj = re.match(self._VALID_QUERY, query)
1720 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1723 prefix, query = query.split(':')
1725 query = query.encode('utf-8')
1727 self._download_n_results(query, 1)
1729 elif prefix == 'all':
1730 self._download_n_results(query, self._max_youtube_results)
1736 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1738 elif n > self._max_youtube_results:
1739 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1740 n = self._max_youtube_results
1741 self._download_n_results(query, n)
1743 except ValueError: # parsing prefix as integer fails
1744 self._download_n_results(query, 1)
1747 def _download_n_results(self, query, n):
1748 """Downloads a specified number of results for a query"""
1751 already_seen = set()
1755 self.report_download_page(query, pagenum)
1756 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1757 request = urllib2.Request(result_url, None, std_headers)
1759 page = urllib2.urlopen(request).read()
1760 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1761 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1764 # Extract video identifiers
1765 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): fragile parsing — slices the raw match text and splits on
# '=' to pull the id out of href="/watch?v=..."; the unrelated search IEs
# below use a capture group instead, which would be more robust here too.
1766 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1767 if video_id not in already_seen:
1768 video_ids.append(video_id)
1769 already_seen.add(video_id)
1770 if len(video_ids) == n:
1771 # Specified n videos reached
1772 for id in video_ids:
1773 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link means the last results page has been consumed.
1776 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1777 for id in video_ids:
1778 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1781 pagenum = pagenum + 1
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
# Structure parallels YoutubeSearchIE: parse "gvsearch[N|all]:Q", page through
# results, delegate each docid to the supplied GoogleIE.
1783 class GoogleSearchIE(InfoExtractor):
1784 """Information Extractor for Google Video search queries."""
1785 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1786 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1787 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1788 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1790 _max_google_results = 1000
1792 def __init__(self, google_ie, downloader=None):
1793 InfoExtractor.__init__(self, downloader)
1794 self._google_ie = google_ie
1798 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1800 def report_download_page(self, query, pagenum):
1801 """Report attempt to download playlist page with given number."""
1802 query = query.decode(preferredencoding())
1803 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1805 def _real_initialize(self):
1806 self._google_ie.initialize()
# Prefix parsing: no number = 1 result, "all" = cap, else clamp to cap.
1808 def _real_extract(self, query):
1809 mobj = re.match(self._VALID_QUERY, query)
1811 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1814 prefix, query = query.split(':')
1816 query = query.encode('utf-8')
1818 self._download_n_results(query, 1)
1820 elif prefix == 'all':
1821 self._download_n_results(query, self._max_google_results)
1827 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1829 elif n > self._max_google_results:
1830 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1831 n = self._max_google_results
1832 self._download_n_results(query, n)
1834 except ValueError: # parsing prefix as integer fails
1835 self._download_n_results(query, 1)
1838 def _download_n_results(self, query, n):
1839 """Downloads a specified number of results for a query"""
1842 already_seen = set()
1846 self.report_download_page(query, pagenum)
1847 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1848 request = urllib2.Request(result_url, None, std_headers)
1850 page = urllib2.urlopen(request).read()
1851 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1852 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1855 # Extract video identifiers
1856 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1857 video_id = mobj.group(1)
1858 if video_id not in already_seen:
1859 video_ids.append(video_id)
1860 already_seen.add(video_id)
1861 if len(video_ids) == n:
1862 # Specified n videos reached
1863 for id in video_ids:
1864 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker means the final results page was reached.
1867 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1868 for id in video_ids:
1869 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1872 pagenum = pagenum + 1
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
# Structure parallels the other search IEs: parse "yvsearch[N|all]:Q", page
# through results, delegate each found watch-path to the supplied YahooIE.
1874 class YahooSearchIE(InfoExtractor):
1875 """Information Extractor for Yahoo! Video search queries."""
1876 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1877 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1878 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1879 _MORE_PAGES_INDICATOR = r'\s*Next'
1881 _max_yahoo_results = 1000
1883 def __init__(self, yahoo_ie, downloader=None):
1884 InfoExtractor.__init__(self, downloader)
1885 self._yahoo_ie = yahoo_ie
1889 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1891 def report_download_page(self, query, pagenum):
1892 """Report attempt to download playlist page with given number."""
1893 query = query.decode(preferredencoding())
1894 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1896 def _real_initialize(self):
1897 self._yahoo_ie.initialize()
# Prefix parsing: no number = 1 result, "all" = cap, else clamp to cap.
1899 def _real_extract(self, query):
1900 mobj = re.match(self._VALID_QUERY, query)
1902 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1905 prefix, query = query.split(':')
1907 query = query.encode('utf-8')
1909 self._download_n_results(query, 1)
1911 elif prefix == 'all':
1912 self._download_n_results(query, self._max_yahoo_results)
1918 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1920 elif n > self._max_yahoo_results:
1921 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1922 n = self._max_yahoo_results
1923 self._download_n_results(query, n)
1925 except ValueError: # parsing prefix as integer fails
1926 self._download_n_results(query, 1)
1929 def _download_n_results(self, query, n):
1930 """Downloads a specified number of results for a query"""
1933 already_seen = set()
1937 self.report_download_page(query, pagenum)
1938 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1939 request = urllib2.Request(result_url, None, std_headers)
1941 page = urllib2.urlopen(request).read()
1942 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1943 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1946 # Extract video identifiers
1947 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Captured group is the "<node>/<vid>" path pair, not a single numeric id.
1948 video_id = mobj.group(1)
1949 if video_id not in already_seen:
1950 video_ids.append(video_id)
1951 already_seen.add(video_id)
1952 if len(video_ids) == n:
1953 # Specified n videos reached
1954 for id in video_ids:
1955 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1958 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1959 for id in video_ids:
1960 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1963 pagenum = pagenum + 1
# NOTE(review): elided numbered listing — `try:` openers, `if mobj is None:`
# guards and `return` statements are missing from this dump and indentation
# is flattened. Code kept verbatim; this edit adds comments only.
1965 class YoutubePlaylistIE(InfoExtractor):
1966 """Information Extractor for YouTube playlists."""
# Group 1 = playlist id from view_play_list/my_playlists/user URLs.
1968 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1969 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1970 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1971 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1974 def __init__(self, youtube_ie, downloader=None):
1975 InfoExtractor.__init__(self, downloader)
1976 self._youtube_ie = youtube_ie
1980 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1982 def report_download_page(self, playlist_id, pagenum):
1983 """Report attempt to download playlist page with given number."""
1984 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1986 def _real_initialize(self):
1987 self._youtube_ie.initialize()
1989 def _real_extract(self, url):
1990 # Extract playlist id
1991 mobj = re.match(self._VALID_URL, url)
1993 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1996 # Download playlist pages
1997 playlist_id = mobj.group(1)
2002 self.report_download_page(playlist_id, pagenum)
2003 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
2005 page = urllib2.urlopen(request).read()
2006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2010 # Extract video identifiers
# ids_in_page de-duplicates within a page while preserving playlist order.
2012 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2013 if mobj.group(1) not in ids_in_page:
2014 ids_in_page.append(mobj.group(1))
2015 video_ids.extend(ids_in_page)
2017 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2019 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window (1-based options
# converted to a 0-based slice).
2021 playliststart = self._downloader.params.get('playliststart', 1) - 1
2022 playlistend = self._downloader.params.get('playlistend', -1)
# BUG(review): with the default playlistend of -1 this slice is
# video_ids[0:-1], which silently drops the playlist's final video; the
# "download everything" case needs an open-ended slice instead.
2023 video_ids = video_ids[playliststart:playlistend]
2025 for id in video_ids:
2026 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2029 class YoutubeUserIE(InfoExtractor):
2030 """Information Extractor for YouTube users."""
2032 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2033 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2034 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2037 def __init__(self, youtube_ie, downloader=None):
2038 InfoExtractor.__init__(self, downloader)
2039 self._youtube_ie = youtube_ie
2043 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2045 def report_download_page(self, username):
2046 """Report attempt to download user page."""
2047 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2049 def _real_initialize(self):
2050 self._youtube_ie.initialize()
2052 def _real_extract(self, url):
2054 mobj = re.match(self._VALID_URL, url)
2056 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2059 # Download user page
2060 username = mobj.group(1)
2064 self.report_download_page(username)
2065 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2067 page = urllib2.urlopen(request).read()
2068 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2069 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2072 # Extract video identifiers
2075 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2076 if mobj.group(1) not in ids_in_page:
2077 ids_in_page.append(mobj.group(1))
2078 video_ids.extend(ids_in_page)
2080 playliststart = self._downloader.params.get('playliststart', 1) - 1
2081 playlistend = self._downloader.params.get('playlistend', -1)
2082 video_ids = video_ids[playliststart:playlistend]
2084 for id in video_ids:
2085 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2088 class DepositFilesIE(InfoExtractor):
2089 """Information extractor for depositfiles.com"""
2091 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2093 def __init__(self, downloader=None):
2094 InfoExtractor.__init__(self, downloader)
2098 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2100 def report_download_webpage(self, file_id):
2101 """Report webpage download."""
2102 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2104 def report_extraction(self, file_id):
2105 """Report information extraction."""
2106 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2108 def _real_initialize(self):
2111 def _real_extract(self, url):
2112 # At this point we have a new file
2113 self._downloader.increment_downloads()
2115 file_id = url.split('/')[-1]
2116 # Rebuild url in english locale
2117 url = 'http://depositfiles.com/en/files/' + file_id
2119 # Retrieve file webpage with 'Free download' button pressed
2120 free_download_indication = { 'gateway_result' : '1' }
2121 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2123 self.report_download_webpage(file_id)
2124 webpage = urllib2.urlopen(request).read()
2125 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2126 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2129 # Search for the real file URL
2130 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2131 if (mobj is None) or (mobj.group(1) is None):
2132 # Try to figure out reason of the error.
2133 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2134 if (mobj is not None) and (mobj.group(1) is not None):
2135 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2136 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2138 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2141 file_url = mobj.group(1)
2142 file_extension = os.path.splitext(file_url)[1][1:]
2144 # Search for file title
2145 mobj = re.search(r'<b title="(.*?)">', webpage)
2147 self._downloader.trouble(u'ERROR: unable to extract title')
2149 file_title = mobj.group(1).decode('utf-8')
2152 # Process file information
2153 self._downloader.process_info({
2154 'id': file_id.decode('utf-8'),
2155 'url': file_url.decode('utf-8'),
2157 'upload_date': u'NA',
2158 'title': file_title,
2159 'stitle': file_title,
2160 'ext': file_extension.decode('utf-8'),
2164 except UnavailableVideoError, err:
2165 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Base class for post-download processing steps.

	A PostProcessor is registered on a downloader through its
	add_post_processor() method.  After each successful download the
	downloader walks its chain of PostProcessors, invoking run() on every
	one of them: the first call receives an initial information argument
	and each later call receives whatever the previous processor returned.

	Walking stops as soon as a processor returns None, or once the end of
	the chain is reached.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The "information" argument is an InfoExtractor-style dictionary
		extended with a "filepath" field naming the downloaded file.

		Returning None stops the post-processing chain; returning an
		information dictionary (possibly with some fields changed)
		forwards it to the next processor in the chain.  A processor may
		also raise a PostProcessingError exception, which the downloader
		takes into account.
		"""
		return information # by default, do nothing
2213 ### MAIN PROGRAM ###
2214 if __name__ == '__main__':
2216 # Modules needed only when running the main program
2220 # Function to update the program file with the latest version from bitbucket.org
2221 def update_self(downloader, filename):
2222 # Note: downloader only used for options
2223 if not os.access (filename, os.W_OK):
2224 sys.exit('ERROR: no write permissions on %s' % filename)
2226 downloader.to_screen('Updating to latest stable version...')
2227 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2228 latest_version = urllib.urlopen(latest_url).read().strip()
2229 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2230 newcontent = urllib.urlopen(prog_url).read()
2231 stream = open(filename, 'w')
2232 stream.write(newcontent)
2234 downloader.to_screen('Updated to version %s' % latest_version)
2236 # Parse command line
2237 parser = optparse.OptionParser(
2238 usage='Usage: %prog [options] url...',
2239 version='2010.12.09',
2240 conflict_handler='resolve',
2243 parser.add_option('-h', '--help',
2244 action='help', help='print this help text and exit')
2245 parser.add_option('-v', '--version',
2246 action='version', help='print program version and exit')
2247 parser.add_option('-U', '--update',
2248 action='store_true', dest='update_self', help='update this program to latest stable version')
2249 parser.add_option('-i', '--ignore-errors',
2250 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2251 parser.add_option('-r', '--rate-limit',
2252 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2253 parser.add_option('-R', '--retries',
2254 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2255 parser.add_option('--playlist-start',
2256 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2257 parser.add_option('--playlist-end',
2258 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2260 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2261 authentication.add_option('-u', '--username',
2262 dest='username', metavar='USERNAME', help='account username')
2263 authentication.add_option('-p', '--password',
2264 dest='password', metavar='PASSWORD', help='account password')
2265 authentication.add_option('-n', '--netrc',
2266 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2267 parser.add_option_group(authentication)
2269 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2270 video_format.add_option('-f', '--format',
2271 action='store', dest='format', metavar='FORMAT', help='video format code')
2272 video_format.add_option('--all-formats',
2273 action='store_const', dest='format', help='download all available video formats', const='-1')
2274 video_format.add_option('--max-quality',
2275 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2276 parser.add_option_group(video_format)
2278 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2279 verbosity.add_option('-q', '--quiet',
2280 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2281 verbosity.add_option('-s', '--simulate',
2282 action='store_true', dest='simulate', help='do not download video', default=False)
2283 verbosity.add_option('-g', '--get-url',
2284 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2285 verbosity.add_option('-e', '--get-title',
2286 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2287 verbosity.add_option('--get-thumbnail',
2288 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2289 verbosity.add_option('--get-description',
2290 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2291 verbosity.add_option('--no-progress',
2292 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2293 parser.add_option_group(verbosity)
2295 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2296 filesystem.add_option('-t', '--title',
2297 action='store_true', dest='usetitle', help='use title in file name', default=False)
2298 filesystem.add_option('-l', '--literal',
2299 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2300 filesystem.add_option('-A', '--auto-number',
2301 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2302 filesystem.add_option('-o', '--output',
2303 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2304 filesystem.add_option('-a', '--batch-file',
2305 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2306 filesystem.add_option('-w', '--no-overwrites',
2307 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2308 filesystem.add_option('-c', '--continue',
2309 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2310 filesystem.add_option('--cookies',
2311 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2312 parser.add_option_group(filesystem)
2314 (opts, args) = parser.parse_args()
2316 # Open appropriate CookieJar
2317 if opts.cookiefile is None:
2318 jar = cookielib.CookieJar()
2321 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2322 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2324 except (IOError, OSError), err:
2325 sys.exit(u'ERROR: unable to open cookie file')
2327 # General configuration
2328 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2329 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2330 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2331 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2333 # Batch file verification
2335 if opts.batchfile is not None:
2337 if opts.batchfile == '-':
2340 batchfd = open(opts.batchfile, 'r')
2341 batchurls = batchfd.readlines()
2342 batchurls = [x.strip() for x in batchurls]
2343 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2345 sys.exit(u'ERROR: batch file could not be read')
2346 all_urls = batchurls + args
2348 # Conflicting, missing and erroneous options
2349 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2350 parser.error(u'using .netrc conflicts with giving username/password')
2351 if opts.password is not None and opts.username is None:
2352 parser.error(u'account username missing')
2353 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2354 parser.error(u'using output template conflicts with using title, literal title or auto number')
2355 if opts.usetitle and opts.useliteral:
2356 parser.error(u'using title conflicts with using literal title')
2357 if opts.username is not None and opts.password is None:
2358 opts.password = getpass.getpass(u'Type account password and press return:')
2359 if opts.ratelimit is not None:
2360 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2361 if numeric_limit is None:
2362 parser.error(u'invalid rate limit specified')
2363 opts.ratelimit = numeric_limit
2364 if opts.retries is not None:
2366 opts.retries = long(opts.retries)
2367 except (TypeError, ValueError), err:
2368 parser.error(u'invalid retry count specified')
2370 opts.playliststart = long(opts.playliststart)
2371 if opts.playliststart <= 0:
2373 except (TypeError, ValueError), err:
2374 parser.error(u'invalid playlist start number specified')
2376 opts.playlistend = long(opts.playlistend)
2377 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2379 except (TypeError, ValueError), err:
2380 parser.error(u'invalid playlist end number specified')
2382 # Information extractors
2383 youtube_ie = YoutubeIE()
2384 metacafe_ie = MetacafeIE(youtube_ie)
2385 dailymotion_ie = DailymotionIE()
2386 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2387 youtube_user_ie = YoutubeUserIE(youtube_ie)
2388 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2389 google_ie = GoogleIE()
2390 google_search_ie = GoogleSearchIE(google_ie)
2391 photobucket_ie = PhotobucketIE()
2392 yahoo_ie = YahooIE()
2393 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2394 deposit_files_ie = DepositFilesIE()
2395 generic_ie = GenericIE()
2398 fd = FileDownloader({
2399 'usenetrc': opts.usenetrc,
2400 'username': opts.username,
2401 'password': opts.password,
2402 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2403 'forceurl': opts.geturl,
2404 'forcetitle': opts.gettitle,
2405 'forcethumbnail': opts.getthumbnail,
2406 'forcedescription': opts.getdescription,
2407 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2408 'format': opts.format,
2409 'format_limit': opts.format_limit,
2410 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2411 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2412 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2413 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2414 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2415 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2416 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2417 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2418 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2419 or u'%(id)s.%(ext)s'),
2420 'ignoreerrors': opts.ignoreerrors,
2421 'ratelimit': opts.ratelimit,
2422 'nooverwrites': opts.nooverwrites,
2423 'retries': opts.retries,
2424 'continuedl': opts.continue_dl,
2425 'noprogress': opts.noprogress,
2426 'playliststart': opts.playliststart,
2427 'playlistend': opts.playlistend,
2428 'logtostderr': opts.outtmpl == '-',
2430 fd.add_info_extractor(youtube_search_ie)
2431 fd.add_info_extractor(youtube_pl_ie)
2432 fd.add_info_extractor(youtube_user_ie)
2433 fd.add_info_extractor(metacafe_ie)
2434 fd.add_info_extractor(dailymotion_ie)
2435 fd.add_info_extractor(youtube_ie)
2436 fd.add_info_extractor(google_ie)
2437 fd.add_info_extractor(google_search_ie)
2438 fd.add_info_extractor(photobucket_ie)
2439 fd.add_info_extractor(yahoo_ie)
2440 fd.add_info_extractor(yahoo_search_ie)
2441 fd.add_info_extractor(deposit_files_ie)
2443 # This must come last since it's the
2444 # fallback if none of the others work
2445 fd.add_info_extractor(generic_ie)
2448 if opts.update_self:
2449 update_self(fd, sys.argv[0])
2452 if len(all_urls) < 1:
2453 if not opts.update_self:
2454 parser.error(u'you must provide at least one URL')
2457 retcode = fd.download(all_urls)
2459 # Dump cookie jar if requested
2460 if opts.cookiefile is not None:
2463 except (IOError, OSError), err:
2464 sys.exit(u'ERROR: unable to save cookie jar')
2468 except DownloadError:
2470 except SameFileError:
2471 sys.exit(u'ERROR: fixed output name but more than one file to download')
2472 except KeyboardInterrupt:
2473 sys.exit(u'\nERROR: Interrupted by user')