Take into account resume_len when calculating speed and ETA
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
8 import cookielib
9 import datetime
10 import htmlentitydefs
11 import httplib
12 import locale
13 import math
14 import netrc
15 import os
16 import os.path
17 import re
18 import socket
19 import string
20 import subprocess
21 import sys
22 import time
23 import urllib
24 import urllib2
25
26 # parse_qs was moved from the cgi module to the urlparse module recently.
27 try:
28         from urlparse import parse_qs
29 except ImportError:
30         from cgi import parse_qs
31
# Default HTTP headers sent with every request. The User-Agent imitates a
# regular desktop Firefox build so sites serve the same pages they would
# serve to an ordinary browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
}
38
39 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
40
def preferredencoding():
        """Get preferred encoding.

        Returns locale.getpreferredencoding() when it names a codec that can
        actually encode text, falling back to 'UTF-8' otherwise.
        """
        try:
                pref = locale.getpreferredencoding()
                # Probe the codec: a misconfigured locale may report a name
                # that the codecs machinery does not recognize.
                u'TEST'.encode(pref)
        except:
                pref = 'UTF-8'
        return pref
56
57 def htmlentity_transform(matchobj):
58         """Transforms an HTML entity to a Unicode character.
59         
60         This function receives a match object and is intended to be used with
61         the re.sub() function.
62         """
63         entity = matchobj.group(1)
64
65         # Known non-numeric HTML entity
66         if entity in htmlentitydefs.name2codepoint:
67                 return unichr(htmlentitydefs.name2codepoint[entity])
68
69         # Unicode character
70         mobj = re.match(ur'(?u)#(x?\d+)', entity)
71         if mobj is not None:
72                 numstr = mobj.group(1)
73                 if numstr.startswith(u'x'):
74                         base = 16
75                         numstr = u'0%s' % numstr
76                 else:
77                         base = 10
78                 return unichr(long(numstr, base))
79
80         # Unknown entity in name, return its literal representation
81         return (u'&%s;' % entity)
82
83 def sanitize_title(utitle):
84         """Sanitizes a video title so it could be used as part of a filename."""
85         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
86         return utitle.replace(unicode(os.sep), u'%')
87
88 def sanitize_open(filename, open_mode):
89         """Try to open the given filename, and slightly tweak it if this fails.
90
91         Attempts to open the given filename. If this fails, it tries to change
92         the filename slightly, step by step, until it's either able to open it
93         or it fails and raises a final exception, like the standard open()
94         function.
95
96         It returns the tuple (stream, definitive_file_name).
97         """
98         try:
99                 if filename == u'-':
100                         if sys.platform == 'win32':
101                                 import msvcrt
102                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
103                         return (sys.stdout, filename)
104                 stream = open(filename, open_mode)
105                 return (stream, filename)
106         except (IOError, OSError), err:
107                 # In case of error, try to remove win32 forbidden chars
108                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
109
110                 # An exception here should be caught in the caller
111                 stream = open(filename, open_mode)
112                 return (stream, filename)
113
class DownloadError(Exception):
        """Signals a fatal download problem.

        FileDownloader objects raise this exception when an error occurs
        and they have not been configured to continue on errors. The
        exception message describes what went wrong.
        """
        pass
122
class SameFileError(Exception):
        """Signals that several downloads would collide on one file.

        FileDownloader objects raise this when a fixed output template
        would force multiple videos to be written to the same path on disk.
        """
        pass
130
class PostProcessingError(Exception):
        """Signals a failure inside a postprocessing step.

        A PostProcessor's .run() method may raise this to report that its
        postprocessing task could not be completed.
        """
        pass
138
class UnavailableVideoError(Exception):
        """Signals a request for a format the video does not offer.

        Raised when a video is requested in a format that is not available
        for that particular video.
        """
        pass
146
class ContentTooShortError(Exception):
        """Signals that the server delivered less data than announced.

        FileDownloader objects raise this when a downloaded file turns out
        smaller than the size the server reported, which usually means the
        connection was interrupted mid-transfer.
        """
        # Byte counts: what actually arrived vs. what was announced.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.expected = expected
                self.downloaded = downloaded
161
162 class FileDownloader(object):
163         """File Downloader class.
164
165         File downloader objects are the ones responsible of downloading the
166         actual video file and writing it to disk if the user has requested
167         it, among some other tasks. In most cases there should be one per
168         program. As, given a video URL, the downloader doesn't know how to
169         extract all the needed information, task that InfoExtractors do, it
170         has to pass the URL to one of them.
171
172         For this, file downloader objects have a method that allows
173         InfoExtractors to be registered in a given order. When it is passed
174         a URL, the file downloader handles it to the first InfoExtractor it
175         finds that reports being able to handle it. The InfoExtractor extracts
176         all the information about the video or videos the URL refers to, and
177         asks the FileDownloader to process the video information, possibly
178         downloading the video.
179
180         File downloaders accept a lot of parameters. In order not to saturate
181         the object constructor with arguments, it receives a dictionary of
182         options instead. These options are available through the params
183         attribute for the InfoExtractors to use. The FileDownloader also
184         registers itself as the downloader in charge for the InfoExtractors
185         that are added to it, so this is a "mutual registration".
186
187         Available options:
188
189         username:         Username for authentication purposes.
190         password:         Password for authentication purposes.
191         usenetrc:         Use netrc for authentication instead.
192         quiet:            Do not print messages to stdout.
193         forceurl:         Force printing final URL.
194         forcetitle:       Force printing title.
195         forcethumbnail:   Force printing thumbnail URL.
196         forcedescription: Force printing description.
197         simulate:         Do not download the video files.
198         format:           Video format code.
199         format_limit:     Highest quality format to try.
200         outtmpl:          Template for output names.
201         ignoreerrors:     Do not stop on download errors.
202         ratelimit:        Download speed limit, in bytes/sec.
203         nooverwrites:     Prevent overwriting files.
204         retries:          Number of times to retry for HTTP error 5xx
205         continuedl:       Try to continue downloads if possible.
206         noprogress:       Do not print the progress bar.
207         playliststart:    Playlist item to start at.
208         playlistend:      Playlist item to end at.
209         logtostderr:      Log messages to stderr instead of stdout.
210         """
211
212         params = None
213         _ies = []
214         _pps = []
215         _download_retcode = None
216         _num_downloads = None
217         _screen_file = None
218
219         def __init__(self, params):
220                 """Create a FileDownloader object with the given options."""
221                 self._ies = []
222                 self._pps = []
223                 self._download_retcode = 0
224                 self._num_downloads = 0
225                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
226                 self.params = params
227         
228         @staticmethod
229         def pmkdir(filename):
230                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
231                 components = filename.split(os.sep)
232                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
233                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
234                 for dir in aggregate:
235                         if not os.path.exists(dir):
236                                 os.mkdir(dir)
237         
238         @staticmethod
239         def temp_name(filename):
240                 """Returns a temporary filename for the given filename."""
241                 if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
242                         return filename
243                 return filename + u'.part'
244         
245         @staticmethod
246         def format_bytes(bytes):
247                 if bytes is None:
248                         return 'N/A'
249                 if type(bytes) is str:
250                         bytes = float(bytes)
251                 if bytes == 0.0:
252                         exponent = 0
253                 else:
254                         exponent = long(math.log(bytes, 1024.0))
255                 suffix = 'bkMGTPEZY'[exponent]
256                 converted = float(bytes) / float(1024**exponent)
257                 return '%.2f%s' % (converted, suffix)
258
259         @staticmethod
260         def calc_percent(byte_counter, data_len):
261                 if data_len is None:
262                         return '---.-%'
263                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
264
265         @staticmethod
266         def calc_eta(start, now, total, current):
267                 if total is None:
268                         return '--:--'
269                 dif = now - start
270                 if current == 0 or dif < 0.001: # One millisecond
271                         return '--:--'
272                 rate = float(current) / dif
273                 eta = long((float(total) - float(current)) / rate)
274                 (eta_mins, eta_secs) = divmod(eta, 60)
275                 if eta_mins > 99:
276                         return '--:--'
277                 return '%02d:%02d' % (eta_mins, eta_secs)
278
279         @staticmethod
280         def calc_speed(start, now, bytes):
281                 dif = now - start
282                 if bytes == 0 or dif < 0.001: # One millisecond
283                         return '%10s' % '---b/s'
284                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
285
286         @staticmethod
287         def best_block_size(elapsed_time, bytes):
288                 new_min = max(bytes / 2.0, 1.0)
289                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
290                 if elapsed_time < 0.001:
291                         return long(new_max)
292                 rate = bytes / elapsed_time
293                 if rate > new_max:
294                         return long(new_max)
295                 if rate < new_min:
296                         return long(new_min)
297                 return long(rate)
298
299         @staticmethod
300         def parse_bytes(bytestr):
301                 """Parse a string indicating a byte quantity into a long integer."""
302                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
303                 if matchobj is None:
304                         return None
305                 number = float(matchobj.group(1))
306                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
307                 return long(round(number * multiplier))
308
309         def add_info_extractor(self, ie):
310                 """Add an InfoExtractor object to the end of the list."""
311                 self._ies.append(ie)
312                 ie.set_downloader(self)
313         
314         def add_post_processor(self, pp):
315                 """Add a PostProcessor object to the end of the chain."""
316                 self._pps.append(pp)
317                 pp.set_downloader(self)
318         
319         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
320                 """Print message to stdout if not in quiet mode."""
321                 try:
322                         if not self.params.get('quiet', False):
323                                 terminator = [u'\n', u''][skip_eol]
324                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
325                         self._screen_file.flush()
326                 except (UnicodeEncodeError), err:
327                         if not ignore_encoding_errors:
328                                 raise
329         
330         def to_stderr(self, message):
331                 """Print message to stderr."""
332                 print >>sys.stderr, message.encode(preferredencoding())
333         
334         def fixed_template(self):
335                 """Checks if the output template is fixed."""
336                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
337
338         def trouble(self, message=None):
339                 """Determine action to take when a download problem appears.
340
341                 Depending on if the downloader has been configured to ignore
342                 download errors or not, this method may throw an exception or
343                 not when errors are found, after printing the message.
344                 """
345                 if message is not None:
346                         self.to_stderr(message)
347                 if not self.params.get('ignoreerrors', False):
348                         raise DownloadError(message)
349                 self._download_retcode = 1
350
351         def slow_down(self, start_time, byte_counter):
352                 """Sleep if the download speed is over the rate limit."""
353                 rate_limit = self.params.get('ratelimit', None)
354                 if rate_limit is None or byte_counter == 0:
355                         return
356                 now = time.time()
357                 elapsed = now - start_time
358                 if elapsed <= 0.0:
359                         return
360                 speed = float(byte_counter) / elapsed
361                 if speed > rate_limit:
362                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
363         
364         def try_rename(self, old_filename, new_filename):
365                 try:
366                         if old_filename == new_filename:
367                                 return
368                         os.rename(old_filename, new_filename)
369                 except (IOError, OSError), err:
370                         self.trouble(u'ERROR: unable to rename file')
371
372         def report_destination(self, filename):
373                 """Report destination filename."""
374                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
375         
376         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
377                 """Report download progress."""
378                 if self.params.get('noprogress', False):
379                         return
380                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
381                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
382
383         def report_resuming_byte(self, resume_len):
384                 """Report attempt to resume at given byte."""
385                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
386         
387         def report_retry(self, count, retries):
388                 """Report retry in case of HTTP error 5xx"""
389                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
390         
391         def report_file_already_downloaded(self, file_name):
392                 """Report file has already been fully downloaded."""
393                 try:
394                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
395                 except (UnicodeEncodeError), err:
396                         self.to_screen(u'[download] The file has already been downloaded')
397         
398         def report_unable_to_resume(self):
399                 """Report it was impossible to resume download."""
400                 self.to_screen(u'[download] Unable to resume')
401         
402         def report_finish(self):
403                 """Report download finished."""
404                 if self.params.get('noprogress', False):
405                         self.to_screen(u'[download] Download completed')
406                 else:
407                         self.to_screen(u'')
408         
409         def increment_downloads(self):
410                 """Increment the ordinal that assigns a number to each file."""
411                 self._num_downloads += 1
412
413         def process_info(self, info_dict):
414                 """Process a single dictionary returned by an InfoExtractor."""
415                 # Do nothing else if in simulate mode
416                 if self.params.get('simulate', False):
417                         # Forced printings
418                         if self.params.get('forcetitle', False):
419                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
420                         if self.params.get('forceurl', False):
421                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
422                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
423                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
424                         if self.params.get('forcedescription', False) and 'description' in info_dict:
425                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
426
427                         return
428                         
429                 try:
430                         template_dict = dict(info_dict)
431                         template_dict['epoch'] = unicode(long(time.time()))
432                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
433                         filename = self.params['outtmpl'] % template_dict
434                 except (ValueError, KeyError), err:
435                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
436                         return
437                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
438                         self.to_stderr(u'WARNING: file exists and will be skipped')
439                         return
440
441                 try:
442                         self.pmkdir(filename)
443                 except (OSError, IOError), err:
444                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
445                         return
446
447                 try:
448                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
449                 except (OSError, IOError), err:
450                         raise UnavailableVideoError
451                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
452                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
453                         return
454                 except (ContentTooShortError, ), err:
455                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
456                         return
457
458                 if success:
459                         try:
460                                 self.post_process(filename, info_dict)
461                         except (PostProcessingError), err:
462                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
463                                 return
464
465         def download(self, url_list):
466                 """Download a given list of URLs."""
467                 if len(url_list) > 1 and self.fixed_template():
468                         raise SameFileError(self.params['outtmpl'])
469
470                 for url in url_list:
471                         suitable_found = False
472                         for ie in self._ies:
473                                 # Go to next InfoExtractor if not suitable
474                                 if not ie.suitable(url):
475                                         continue
476
477                                 # Suitable InfoExtractor found
478                                 suitable_found = True
479
480                                 # Extract information from URL and process it
481                                 ie.extract(url)
482
483                                 # Suitable InfoExtractor had been found; go to next URL
484                                 break
485
486                         if not suitable_found:
487                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
488
489                 return self._download_retcode
490
491         def post_process(self, filename, ie_info):
492                 """Run the postprocessing chain on the given file."""
493                 info = dict(ie_info)
494                 info['filepath'] = filename
495                 for pp in self._pps:
496                         info = pp.run(info)
497                         if info is None:
498                                 break
499         
500         def _download_with_rtmpdump(self, filename, url, player_url):
501                 self.report_destination(filename)
502                 tmpfilename = self.temp_name(filename)
503
504                 # Check for rtmpdump first
505                 try:
506                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
507                 except (OSError, IOError):
508                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
509                         return False
510
511                 # Download using rtmpdump. rtmpdump returns exit code 2 when
512                 # the connection was interrumpted and resuming appears to be
513                 # possible. This is part of rtmpdump's normal usage, AFAIK.
514                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
515                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
516                 while retval == 2 or retval == 1:
517                         prevsize = os.path.getsize(tmpfilename)
518                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
519                         time.sleep(5.0) # This seems to be needed
520                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
521                         cursize = os.path.getsize(tmpfilename)
522                         if prevsize == cursize and retval == 1:
523                                 break
524                 if retval == 0:
525                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
526                         self.try_rename(tmpfilename, filename)
527                         return True
528                 else:
529                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
530                         return False
531
532         def _do_download(self, filename, url, player_url):
533                 # Check file already present
534                 if self.params.get('continuedl', False) and os.path.isfile(filename):
535                         self.report_file_already_downloaded(filename)
536                         return True
537
538                 # Attempt to download using rtmpdump
539                 if url.startswith('rtmp'):
540                         return self._download_with_rtmpdump(filename, url, player_url)
541
542                 tmpfilename = self.temp_name(filename)
543                 stream = None
544                 open_mode = 'wb'
545                 basic_request = urllib2.Request(url, None, std_headers)
546                 request = urllib2.Request(url, None, std_headers)
547
548                 # Establish possible resume length
549                 if os.path.isfile(tmpfilename):
550                         resume_len = os.path.getsize(tmpfilename)
551                 else:
552                         resume_len = 0
553
554                 # Request parameters in case of being able to resume
555                 if self.params.get('continuedl', False) and resume_len != 0:
556                         self.report_resuming_byte(resume_len)
557                         request.add_header('Range','bytes=%d-' % resume_len)
558                         open_mode = 'ab'
559
560                 count = 0
561                 retries = self.params.get('retries', 0)
562                 while count <= retries:
563                         # Establish connection
564                         try:
565                                 data = urllib2.urlopen(request)
566                                 break
567                         except (urllib2.HTTPError, ), err:
568                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
569                                         # Unexpected HTTP error
570                                         raise
571                                 elif err.code == 416:
572                                         # Unable to resume (requested range not satisfiable)
573                                         try:
574                                                 # Open the connection again without the range header
575                                                 data = urllib2.urlopen(basic_request)
576                                                 content_length = data.info()['Content-Length']
577                                         except (urllib2.HTTPError, ), err:
578                                                 if err.code < 500 or err.code >= 600:
579                                                         raise
580                                         else:
581                                                 # Examine the reported length
582                                                 if (content_length is not None and
583                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
584                                                         # The file had already been fully downloaded.
585                                                         # Explanation to the above condition: in issue #175 it was revealed that
586                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
587                                                         # changing the file size slightly and causing problems for some users. So
588                                                         # I decided to implement a suggested change and consider the file
589                                                         # completely downloaded if the file size differs less than 100 bytes from
590                                                         # the one in the hard drive.
591                                                         self.report_file_already_downloaded(filename)
592                                                         self.try_rename(tmpfilename, filename)
593                                                         return True
594                                                 else:
595                                                         # The length does not match, we start the download over
596                                                         self.report_unable_to_resume()
597                                                         open_mode = 'wb'
598                                                         break
599                         # Retry
600                         count += 1
601                         if count <= retries:
602                                 self.report_retry(count, retries)
603
604                 if count > retries:
605                         self.trouble(u'ERROR: giving up after %s retries' % retries)
606                         return False
607
608                 data_len = data.info().get('Content-length', None)
609                 if data_len is not None:
610                         data_len = long(data_len) + resume_len
611                 data_len_str = self.format_bytes(data_len)
612                 byte_counter = 0 + resume_len
613                 block_size = 1024
614                 start = time.time()
615                 while True:
616                         # Download and write
617                         before = time.time()
618                         data_block = data.read(block_size)
619                         after = time.time()
620                         if len(data_block) == 0:
621                                 break
622                         byte_counter += len(data_block)
623
624                         # Open file just in time
625                         if stream is None:
626                                 try:
627                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
628                                         self.report_destination(filename)
629                                 except (OSError, IOError), err:
630                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
631                                         return False
632                         try:
633                                 stream.write(data_block)
634                         except (IOError, OSError), err:
635                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
636                                 return False
637                         block_size = self.best_block_size(after - before, len(data_block))
638
639                         # Progress message
640                         percent_str = self.calc_percent(byte_counter, data_len)
641                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
642                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
643                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
644
645                         # Apply rate limit
646                         self.slow_down(start, byte_counter - resume_len)
647
648                 stream.close()
649                 self.report_finish()
650                 if data_len is not None and byte_counter != data_len:
651                         raise ContentTooShortError(byte_counter, long(data_len))
652                 self.try_rename(tmpfilename, filename)
653                 return True
654
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor (IE) takes a URL and produces, for each
	video that URL refers to, a dictionary of metadata which is handed
	to the FileDownloader. The FileDownloader then acts on that
	information (possibly downloading the video to the file system,
	among other outcomes). Every dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Two more fields are optional; their primary purpose is to let
	youtube-dl serve as the backend of a video search tool (such as the
	one in youtube2mp3), and they are only used by the respective forced
	printing functions:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors should redefine _real_initialize(),
	_real_extract() and the suitable() static method, and are typically
	instantiated and registered with the main downloader.
	"""

	# Set to True by initialize() once _real_initialize() has run.
	_ready = False
	# The FileDownloader this IE reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True when this IE can handle the given URL."""
		return False

	def set_downloader(self, downloader):
		"""Attach the downloader this IE should report to."""
		self._downloader = downloader

	def initialize(self):
		"""Run one-time setup (authentication, etc.) at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract information for url."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Actual initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Actual extraction process. Redefine in subclasses."""
		pass
725
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 is the optional URL prefix; group 2 is the video id.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a format code to the filename extension used for it.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True when url matches a known YouTube URL shape."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set language preference and optionally log in and confirm age.

		All failures here are soft: warnings are printed and the method
		returns, leaving extraction to proceed unauthenticated.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video metadata from a YouTube URL and hand each
		selected format to the downloader via process_info()."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Un-escape the JS-escaped URL (e.g. "\\/" -> "/").
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info
		# Try several 'el' variants; stop at the first response that carries a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			# Normalize separators to single spaces before date parsing.
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y']
			# NOTE(review): there is no break after a successful parse; this
			# relies on the already-reformatted '%Y%m%d' string failing to
			# parse under the remaining expressions.
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# description
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		# NOTE(review): video_token is extracted but not referenced again in this method.
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video')
1005
1006
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used to delegate 'yt-' prefixed video ids.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Keeps the YoutubeIE used for delegated videos."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True when url matches a Metacafe watch URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and post the family-filter form.

		Failures are reported via trouble() and the method returns.
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video metadata from a Metacafe watch page and hand
		it to the downloader via process_info()."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		# 'yt-<id>' ids are YouTube embeds; delegate them to the YouTube IE.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fall back to the flashvars blob when mediaURL is absent.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1150
1151
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1 is the video id, group 2 the simplified title.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True when url matches a Dailymotion video URL."""
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""No initialization needed for Dailymotion."""
		return

	def _real_extract(self, url):
		"""Extract video metadata from a Dailymotion page and hand it
		to the downloader via process_info()."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1239
1240 class GoogleIE(InfoExtractor):
1241         """Information extractor for video.google.com."""
1242
1243         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1244
1245         def __init__(self, downloader=None):
1246                 InfoExtractor.__init__(self, downloader)
1247
1248         @staticmethod
1249         def suitable(url):
1250                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1251
1252         def report_download_webpage(self, video_id):
1253                 """Report webpage download."""
1254                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1255
1256         def report_extraction(self, video_id):
1257                 """Report information extraction."""
1258                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1259
1260         def _real_initialize(self):
1261                 return
1262
1263         def _real_extract(self, url):
1264                 # Extract id from URL
1265                 mobj = re.match(self._VALID_URL, url)
1266                 if mobj is None:
1267                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1268                         return
1269
1270                 # At this point we have a new video
1271                 self._downloader.increment_downloads()
1272                 video_id = mobj.group(1)
1273
1274                 video_extension = 'mp4'
1275
1276                 # Retrieve video webpage to extract further information
1277                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1278                 try:
1279                         self.report_download_webpage(video_id)
1280                         webpage = urllib2.urlopen(request).read()
1281                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1282                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1283                         return
1284
1285                 # Extract URL, uploader, and title from webpage
1286                 self.report_extraction(video_id)
1287                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1288                 if mobj is None:
1289                         video_extension = 'flv'
1290                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1291                 if mobj is None:
1292                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1293                         return
1294                 mediaURL = urllib.unquote(mobj.group(1))
1295                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1296                 mediaURL = mediaURL.replace('\\x26', '\x26')
1297
1298                 video_url = mediaURL
1299
1300                 mobj = re.search(r'<title>(.*)</title>', webpage)
1301                 if mobj is None:
1302                         self._downloader.trouble(u'ERROR: unable to extract title')
1303                         return
1304                 video_title = mobj.group(1).decode('utf-8')
1305                 video_title = sanitize_title(video_title)
1306                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1307
1308                 # Extract video description
1309                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1310                 if mobj is None:
1311                         self._downloader.trouble(u'ERROR: unable to extract video description')
1312                         return
1313                 video_description = mobj.group(1).decode('utf-8')
1314                 if not video_description:
1315                         video_description = 'No description available.'
1316
1317                 # Extract video thumbnail
1318                 if self._downloader.params.get('forcethumbnail', False):
1319                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1320                         try:
1321                                 webpage = urllib2.urlopen(request).read()
1322                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1323                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1324                                 return
1325                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1326                         if mobj is None:
1327                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1328                                 return
1329                         video_thumbnail = mobj.group(1)
1330                 else:   # we need something to pass to process_info
1331                         video_thumbnail = ''
1332
1333
1334                 try:
1335                         # Process video information
1336                         self._downloader.process_info({
1337                                 'id':           video_id.decode('utf-8'),
1338                                 'url':          video_url.decode('utf-8'),
1339                                 'uploader':     u'NA',
1340                                 'upload_date':  u'NA',
1341                                 'title':        video_title,
1342                                 'stitle':       simple_title,
1343                                 'ext':          video_extension.decode('utf-8'),
1344                                 'format':       u'NA',
1345                                 'player_url':   None,
1346                         })
1347                 except UnavailableVideoError:
1348                         self._downloader.trouble(u'ERROR: unable to download video')
1349
1350
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com.

	Matches URLs carrying a 'current=<something>.flv' query parameter;
	the captured file name doubles as the video id.
	"""

	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		return

	def _real_extract(self, url):
		"""Extract media URL, title and uploader from a Photobucket page
		and hand the result to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# The media URL is percent-encoded in the 'file=' parameter of the
		# video_src link.
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# The page title has the form '<title> video by <uploader> - Photobucket'
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader,
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1431
1432
1433 class YahooIE(InfoExtractor):
1434         """Information extractor for video.yahoo.com."""
1435
1436         # _VALID_URL matches all Yahoo! Video URLs
1437         # _VPAGE_URL matches only the extractable '/watch/' URLs
1438         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1439         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1440
1441         def __init__(self, downloader=None):
1442                 InfoExtractor.__init__(self, downloader)
1443
1444         @staticmethod
1445         def suitable(url):
1446                 return (re.match(YahooIE._VALID_URL, url) is not None)
1447
1448         def report_download_webpage(self, video_id):
1449                 """Report webpage download."""
1450                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1451
1452         def report_extraction(self, video_id):
1453                 """Report information extraction."""
1454                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1455
1456         def _real_initialize(self):
1457                 return
1458
1459         def _real_extract(self, url, new_video=True):
1460                 # Extract ID from URL
1461                 mobj = re.match(self._VALID_URL, url)
1462                 if mobj is None:
1463                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1464                         return
1465
1466                 # At this point we have a new video
1467                 self._downloader.increment_downloads()
1468                 video_id = mobj.group(2)
1469                 video_extension = 'flv'
1470
1471                 # Rewrite valid but non-extractable URLs as
1472                 # extractable English language /watch/ URLs
1473                 if re.match(self._VPAGE_URL, url) is None:
1474                         request = urllib2.Request(url)
1475                         try:
1476                                 webpage = urllib2.urlopen(request).read()
1477                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1479                                 return
1480
1481                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1482                         if mobj is None:
1483                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1484                                 return
1485                         yahoo_id = mobj.group(1)
1486
1487                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1488                         if mobj is None:
1489                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1490                                 return
1491                         yahoo_vid = mobj.group(1)
1492
1493                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1494                         return self._real_extract(url, new_video=False)
1495
1496                 # Retrieve video webpage to extract further information
1497                 request = urllib2.Request(url)
1498                 try:
1499                         self.report_download_webpage(video_id)
1500                         webpage = urllib2.urlopen(request).read()
1501                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1502                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1503                         return
1504
1505                 # Extract uploader and title from webpage
1506                 self.report_extraction(video_id)
1507                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1508                 if mobj is None:
1509                         self._downloader.trouble(u'ERROR: unable to extract video title')
1510                         return
1511                 video_title = mobj.group(1).decode('utf-8')
1512                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1513
1514                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1515                 if mobj is None:
1516                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1517                         return
1518                 video_uploader = mobj.group(1).decode('utf-8')
1519
1520                 # Extract video thumbnail
1521                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1522                 if mobj is None:
1523                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1524                         return
1525                 video_thumbnail = mobj.group(1).decode('utf-8')
1526
1527                 # Extract video description
1528                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1529                 if mobj is None:
1530                         self._downloader.trouble(u'ERROR: unable to extract video description')
1531                         return
1532                 video_description = mobj.group(1).decode('utf-8')
1533                 if not video_description: video_description = 'No description available.'
1534
1535                 # Extract video height and width
1536                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1537                 if mobj is None:
1538                         self._downloader.trouble(u'ERROR: unable to extract video height')
1539                         return
1540                 yv_video_height = mobj.group(1)
1541
1542                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1543                 if mobj is None:
1544                         self._downloader.trouble(u'ERROR: unable to extract video width')
1545                         return
1546                 yv_video_width = mobj.group(1)
1547
1548                 # Retrieve video playlist to extract media URL
1549                 # I'm not completely sure what all these options are, but we
1550                 # seem to need most of them, otherwise the server sends a 401.
1551                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1552                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1553                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1554                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1555                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1556                 try:
1557                         self.report_download_webpage(video_id)
1558                         webpage = urllib2.urlopen(request).read()
1559                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1560                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1561                         return
1562
1563                 # Extract media URL from playlist XML
1564                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1565                 if mobj is None:
1566                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1567                         return
1568                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1569                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1570
1571                 try:
1572                         # Process video information
1573                         self._downloader.process_info({
1574                                 'id':           video_id.decode('utf-8'),
1575                                 'url':          video_url,
1576                                 'uploader':     video_uploader,
1577                                 'upload_date':  u'NA',
1578                                 'title':        video_title,
1579                                 'stitle':       simple_title,
1580                                 'ext':          video_extension.decode('utf-8'),
1581                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1582                                 'description':  video_description,
1583                                 'thumbnail':    video_thumbnail,
1584                                 'description':  video_description,
1585                                 'player_url':   None,
1586                         })
1587                 except UnavailableVideoError:
1588                         self._downloader.trouble(u'ERROR: unable to download video')
1589
1590
1591 class GenericIE(InfoExtractor):
1592         """Generic last-resort information extractor."""
1593
1594         def __init__(self, downloader=None):
1595                 InfoExtractor.__init__(self, downloader)
1596
1597         @staticmethod
1598         def suitable(url):
1599                 return True
1600
1601         def report_download_webpage(self, video_id):
1602                 """Report webpage download."""
1603                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1604                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1605
1606         def report_extraction(self, video_id):
1607                 """Report information extraction."""
1608                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1609
1610         def _real_initialize(self):
1611                 return
1612
1613         def _real_extract(self, url):
1614                 # At this point we have a new video
1615                 self._downloader.increment_downloads()
1616
1617                 video_id = url.split('/')[-1]
1618                 request = urllib2.Request(url)
1619                 try:
1620                         self.report_download_webpage(video_id)
1621                         webpage = urllib2.urlopen(request).read()
1622                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1623                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1624                         return
1625                 except ValueError, err:
1626                         # since this is the last-resort InfoExtractor, if
1627                         # this error is thrown, it'll be thrown here
1628                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1629                         return
1630
1631                 self.report_extraction(video_id)
1632                 # Start with something easy: JW Player in SWFObject
1633                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1634                 if mobj is None:
1635                         # Broaden the search a little bit
1636                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1637                 if mobj is None:
1638                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1639                         return
1640
1641                 # It's possible that one of the regexes
1642                 # matched, but returned an empty group:
1643                 if mobj.group(1) is None:
1644                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1645                         return
1646
1647                 video_url = urllib.unquote(mobj.group(1))
1648                 video_id  = os.path.basename(video_url)
1649
1650                 # here's a fun little line of code for you:
1651                 video_extension = os.path.splitext(video_id)[1][1:]
1652                 video_id        = os.path.splitext(video_id)[0]
1653
1654                 # it's tempting to parse this further, but you would
1655                 # have to take into account all the variations like
1656                 #   Video Title - Site Name
1657                 #   Site Name | Video Title
1658                 #   Video Title - Tagline | Site Name
1659                 # and so on and so forth; it's just not practical
1660                 mobj = re.search(r'<title>(.*)</title>', webpage)
1661                 if mobj is None:
1662                         self._downloader.trouble(u'ERROR: unable to extract title')
1663                         return
1664                 video_title = mobj.group(1).decode('utf-8')
1665                 video_title = sanitize_title(video_title)
1666                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1667
1668                 # video uploader is domain name
1669                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1670                 if mobj is None:
1671                         self._downloader.trouble(u'ERROR: unable to extract title')
1672                         return
1673                 video_uploader = mobj.group(1).decode('utf-8')
1674
1675                 try:
1676                         # Process video information
1677                         self._downloader.process_info({
1678                                 'id':           video_id.decode('utf-8'),
1679                                 'url':          video_url.decode('utf-8'),
1680                                 'uploader':     video_uploader,
1681                                 'upload_date':  u'NA',
1682                                 'title':        video_title,
1683                                 'stitle':       simple_title,
1684                                 'ext':          video_extension.decode('utf-8'),
1685                                 'format':       u'NA',
1686                                 'player_url':   None,
1687                         })
1688                 except UnavailableVideoError, err:
1689                         self._downloader.trouble(u'ERROR: unable to download video')
1690
1691
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	'ytsearch:q' downloads the first result, 'ytsearchN:q' the first N
	results, and 'ytsearchall:q' up to _max_youtube_results results.
	Each hit is delegated to the wrapped YoutubeIE instance.
	"""
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given query string."""
		return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		"""Parse the 'ytsearch[N|all]:' prefix and dispatch to
		_download_n_results with the requested result count."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		# Strip the literal 'ytsearch' (8 chars); leaves '', 'all' or digits
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_youtube_results:
					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
					n = self._max_youtube_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# From 'href="/watch?v=ID"', take the part after the
				# second '=' and drop the trailing quote character.
				video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
						return

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No 'Next' link: results exhausted before reaching n;
				# download everything collected so far.
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
				return

			pagenum = pagenum + 1
1782
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries.

	'gvsearch:q' downloads the first result, 'gvsearchN:q' the first N
	results, and 'gvsearchall:q' up to _max_google_results results.
	Each hit is delegated to the wrapped GoogleIE instance.
	"""
	_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
	_google_ie = None
	_max_google_results = 1000

	def __init__(self, google_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._google_ie = google_ie

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given query string."""
		return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._google_ie.initialize()

	def _real_extract(self, query):
		"""Parse the 'gvsearch[N|all]:' prefix and dispatch to
		_download_n_results with the requested result count."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		# Strip the literal 'gvsearch' (8 chars); leaves '', 'all' or digits
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_google_results:
					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
					n = self._max_google_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return
1837
1838         def _download_n_results(self, query, n):
1839                 """Downloads a specified number of results for a query"""
1840
1841                 video_ids = []
1842                 already_seen = set()
1843                 pagenum = 1
1844
1845                 while True:
1846                         self.report_download_page(query, pagenum)
1847                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1848                         request = urllib2.Request(result_url, None, std_headers)
1849                         try:
1850                                 page = urllib2.urlopen(request).read()
1851                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1852                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1853                                 return
1854
1855                         # Extract video identifiers
1856                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1857                                 video_id = mobj.group(1)
1858                                 if video_id not in already_seen:
1859                                         video_ids.append(video_id)
1860                                         already_seen.add(video_id)
1861                                         if len(video_ids) == n:
1862                                                 # Specified n videos reached
1863                                                 for id in video_ids:
1864                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1865                                                 return
1866
1867                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1868                                 for id in video_ids:
1869                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1870                                 return
1871
1872                         pagenum = pagenum + 1
1873
1874 class YahooSearchIE(InfoExtractor):
1875         """Information Extractor for Yahoo! Video search queries."""
1876         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1877         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1878         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1879         _MORE_PAGES_INDICATOR = r'\s*Next'
1880         _yahoo_ie = None
1881         _max_yahoo_results = 1000
1882
1883         def __init__(self, yahoo_ie, downloader=None):
1884                 InfoExtractor.__init__(self, downloader)
1885                 self._yahoo_ie = yahoo_ie
1886         
1887         @staticmethod
1888         def suitable(url):
1889                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1890
1891         def report_download_page(self, query, pagenum):
1892                 """Report attempt to download playlist page with given number."""
1893                 query = query.decode(preferredencoding())
1894                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1895
1896         def _real_initialize(self):
1897                 self._yahoo_ie.initialize()
1898         
1899         def _real_extract(self, query):
1900                 mobj = re.match(self._VALID_QUERY, query)
1901                 if mobj is None:
1902                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1903                         return
1904
1905                 prefix, query = query.split(':')
1906                 prefix = prefix[8:]
1907                 query  = query.encode('utf-8')
1908                 if prefix == '':
1909                         self._download_n_results(query, 1)
1910                         return
1911                 elif prefix == 'all':
1912                         self._download_n_results(query, self._max_yahoo_results)
1913                         return
1914                 else:
1915                         try:
1916                                 n = long(prefix)
1917                                 if n <= 0:
1918                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1919                                         return
1920                                 elif n > self._max_yahoo_results:
1921                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1922                                         n = self._max_yahoo_results
1923                                 self._download_n_results(query, n)
1924                                 return
1925                         except ValueError: # parsing prefix as integer fails
1926                                 self._download_n_results(query, 1)
1927                                 return
1928
1929         def _download_n_results(self, query, n):
1930                 """Downloads a specified number of results for a query"""
1931
1932                 video_ids = []
1933                 already_seen = set()
1934                 pagenum = 1
1935
1936                 while True:
1937                         self.report_download_page(query, pagenum)
1938                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1939                         request = urllib2.Request(result_url, None, std_headers)
1940                         try:
1941                                 page = urllib2.urlopen(request).read()
1942                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1943                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1944                                 return
1945
1946                         # Extract video identifiers
1947                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1948                                 video_id = mobj.group(1)
1949                                 if video_id not in already_seen:
1950                                         video_ids.append(video_id)
1951                                         already_seen.add(video_id)
1952                                         if len(video_ids) == n:
1953                                                 # Specified n videos reached
1954                                                 for id in video_ids:
1955                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1956                                                 return
1957
1958                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1959                                 for id in video_ids:
1960                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1961                                 return
1962
1963                         pagenum = pagenum + 1
1964
1965 class YoutubePlaylistIE(InfoExtractor):
1966         """Information Extractor for YouTube playlists."""
1967
1968         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1969         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1970         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1971         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1972         _youtube_ie = None
1973
1974         def __init__(self, youtube_ie, downloader=None):
1975                 InfoExtractor.__init__(self, downloader)
1976                 self._youtube_ie = youtube_ie
1977         
1978         @staticmethod
1979         def suitable(url):
1980                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1981
1982         def report_download_page(self, playlist_id, pagenum):
1983                 """Report attempt to download playlist page with given number."""
1984                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1985
1986         def _real_initialize(self):
1987                 self._youtube_ie.initialize()
1988         
1989         def _real_extract(self, url):
1990                 # Extract playlist id
1991                 mobj = re.match(self._VALID_URL, url)
1992                 if mobj is None:
1993                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1994                         return
1995
1996                 # Download playlist pages
1997                 playlist_id = mobj.group(1)
1998                 video_ids = []
1999                 pagenum = 1
2000
2001                 while True:
2002                         self.report_download_page(playlist_id, pagenum)
2003                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
2004                         try:
2005                                 page = urllib2.urlopen(request).read()
2006                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2008                                 return
2009
2010                         # Extract video identifiers
2011                         ids_in_page = []
2012                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2013                                 if mobj.group(1) not in ids_in_page:
2014                                         ids_in_page.append(mobj.group(1))
2015                         video_ids.extend(ids_in_page)
2016
2017                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2018                                 break
2019                         pagenum = pagenum + 1
2020
2021                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2022                 playlistend = self._downloader.params.get('playlistend', -1)
2023                 video_ids = video_ids[playliststart:playlistend]
2024
2025                 for id in video_ids:
2026                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2027                 return
2028
2029 class YoutubeUserIE(InfoExtractor):
2030         """Information Extractor for YouTube users."""
2031
2032         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2033         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2034         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2035         _youtube_ie = None
2036
2037         def __init__(self, youtube_ie, downloader=None):
2038                 InfoExtractor.__init__(self, downloader)
2039                 self._youtube_ie = youtube_ie
2040         
2041         @staticmethod
2042         def suitable(url):
2043                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2044
2045         def report_download_page(self, username):
2046                 """Report attempt to download user page."""
2047                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2048
2049         def _real_initialize(self):
2050                 self._youtube_ie.initialize()
2051         
2052         def _real_extract(self, url):
2053                 # Extract username
2054                 mobj = re.match(self._VALID_URL, url)
2055                 if mobj is None:
2056                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2057                         return
2058
2059                 # Download user page
2060                 username = mobj.group(1)
2061                 video_ids = []
2062                 pagenum = 1
2063
2064                 self.report_download_page(username)
2065                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2066                 try:
2067                         page = urllib2.urlopen(request).read()
2068                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2069                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2070                         return
2071
2072                 # Extract video identifiers
2073                 ids_in_page = []
2074
2075                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2076                         if mobj.group(1) not in ids_in_page:
2077                                 ids_in_page.append(mobj.group(1))
2078                 video_ids.extend(ids_in_page)
2079
2080                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2081                 playlistend = self._downloader.params.get('playlistend', -1)
2082                 video_ids = video_ids[playliststart:playlistend]
2083
2084                 for id in video_ids:
2085                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2086                 return
2087
2088 class DepositFilesIE(InfoExtractor):
2089         """Information extractor for depositfiles.com"""
2090
2091         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2092
2093         def __init__(self, downloader=None):
2094                 InfoExtractor.__init__(self, downloader)
2095
2096         @staticmethod
2097         def suitable(url):
2098                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2099
2100         def report_download_webpage(self, file_id):
2101                 """Report webpage download."""
2102                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2103
2104         def report_extraction(self, file_id):
2105                 """Report information extraction."""
2106                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2107
2108         def _real_initialize(self):
2109                 return
2110
2111         def _real_extract(self, url):
2112                 # At this point we have a new file
2113                 self._downloader.increment_downloads()
2114
2115                 file_id = url.split('/')[-1]
2116                 # Rebuild url in english locale
2117                 url = 'http://depositfiles.com/en/files/' + file_id
2118
2119                 # Retrieve file webpage with 'Free download' button pressed
2120                 free_download_indication = { 'gateway_result' : '1' }
2121                 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2122                 try:
2123                         self.report_download_webpage(file_id)
2124                         webpage = urllib2.urlopen(request).read()
2125                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2126                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2127                         return
2128
2129                 # Search for the real file URL
2130                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2131                 if (mobj is None) or (mobj.group(1) is None):
2132                         # Try to figure out reason of the error.
2133                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2134                         if (mobj is not None) and (mobj.group(1) is not None):
2135                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2136                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2137                         else:
2138                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2139                         return
2140
2141                 file_url = mobj.group(1)
2142                 file_extension = os.path.splitext(file_url)[1][1:]
2143
2144                 # Search for file title
2145                 mobj = re.search(r'<b title="(.*?)">', webpage)
2146                 if mobj is None:
2147                         self._downloader.trouble(u'ERROR: unable to extract title')
2148                         return
2149                 file_title = mobj.group(1).decode('utf-8')
2150
2151                 try:
2152                         # Process file information
2153                         self._downloader.process_info({
2154                                 'id':           file_id.decode('utf-8'),
2155                                 'url':          file_url.decode('utf-8'),
2156                                 'uploader':     u'NA',
2157                                 'upload_date':  u'NA',
2158                                 'title':        file_title,
2159                                 'stitle':       file_title,
2160                                 'ext':          file_extension.decode('utf-8'),
2161                                 'format':       u'NA',
2162                                 'player_url':   None,
2163                         })
2164                 except UnavailableVideoError, err:
2165                         self._downloader.trouble(u'ERROR: unable to download file')
2166
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is attached to a downloader through the
	downloader's add_post_processor() method. Once a download finishes
	successfully, the downloader walks its internal list of
	PostProcessors and invokes run() on each of them, feeding the
	first one an initial argument and every later one the value
	returned by its predecessor.

	The chain stops as soon as one of them returns None, or when there
	are no more PostProcessors left.

	Like InfoExtractor objects, PostProcessors register themselves
	with the downloader in a "mutual registration" fashion.
	"""

	# The downloader this post processor is registered with.
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary in the same format as the ones
		composed by InfoExtractors, extended with an extra "filepath"
		key that points at the downloaded file.

		Returning None stops the postprocessing chain. Returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next postprocessing object
		in the chain. This method may also raise a PostProcessingError
		exception, which will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
2212         
2213 ### MAIN PROGRAM ###
2214 if __name__ == '__main__':
2215         try:
2216                 # Modules needed only when running the main program
2217                 import getpass
2218                 import optparse
2219
2220                 # Function to update the program file with the latest version from bitbucket.org
2221                 def update_self(downloader, filename):
2222                         # Note: downloader only used for options
2223                         if not os.access (filename, os.W_OK):
2224                                 sys.exit('ERROR: no write permissions on %s' % filename)
2225
2226                         downloader.to_screen('Updating to latest stable version...')
2227                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2228                         latest_version = urllib.urlopen(latest_url).read().strip()
2229                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2230                         newcontent = urllib.urlopen(prog_url).read()
2231                         stream = open(filename, 'w')
2232                         stream.write(newcontent)
2233                         stream.close()
2234                         downloader.to_screen('Updated to version %s' % latest_version)
2235
2236                 # Parse command line
2237                 parser = optparse.OptionParser(
2238                         usage='Usage: %prog [options] url...',
2239                         version='2010.12.09',
2240                         conflict_handler='resolve',
2241                 )
2242
2243                 parser.add_option('-h', '--help',
2244                                 action='help', help='print this help text and exit')
2245                 parser.add_option('-v', '--version',
2246                                 action='version', help='print program version and exit')
2247                 parser.add_option('-U', '--update',
2248                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2249                 parser.add_option('-i', '--ignore-errors',
2250                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2251                 parser.add_option('-r', '--rate-limit',
2252                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2253                 parser.add_option('-R', '--retries',
2254                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2255                 parser.add_option('--playlist-start',
2256                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2257                 parser.add_option('--playlist-end',
2258                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2259
2260                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2261                 authentication.add_option('-u', '--username',
2262                                 dest='username', metavar='USERNAME', help='account username')
2263                 authentication.add_option('-p', '--password',
2264                                 dest='password', metavar='PASSWORD', help='account password')
2265                 authentication.add_option('-n', '--netrc',
2266                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2267                 parser.add_option_group(authentication)
2268
2269                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2270                 video_format.add_option('-f', '--format',
2271                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2272                 video_format.add_option('--all-formats',
2273                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2274                 video_format.add_option('--max-quality',
2275                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2276                 parser.add_option_group(video_format)
2277
2278                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2279                 verbosity.add_option('-q', '--quiet',
2280                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2281                 verbosity.add_option('-s', '--simulate',
2282                                 action='store_true', dest='simulate', help='do not download video', default=False)
2283                 verbosity.add_option('-g', '--get-url',
2284                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2285                 verbosity.add_option('-e', '--get-title',
2286                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2287                 verbosity.add_option('--get-thumbnail',
2288                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2289                 verbosity.add_option('--get-description',
2290                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2291                 verbosity.add_option('--no-progress',
2292                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2293                 parser.add_option_group(verbosity)
2294
2295                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2296                 filesystem.add_option('-t', '--title',
2297                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2298                 filesystem.add_option('-l', '--literal',
2299                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2300                 filesystem.add_option('-A', '--auto-number',
2301                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2302                 filesystem.add_option('-o', '--output',
2303                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2304                 filesystem.add_option('-a', '--batch-file',
2305                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2306                 filesystem.add_option('-w', '--no-overwrites',
2307                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2308                 filesystem.add_option('-c', '--continue',
2309                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2310                 filesystem.add_option('--cookies',
2311                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2312                 parser.add_option_group(filesystem)
2313
2314                 (opts, args) = parser.parse_args()
2315
2316                 # Open appropriate CookieJar
2317                 if opts.cookiefile is None:
2318                         jar = cookielib.CookieJar()
2319                 else:
2320                         try:
2321                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2322                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2323                                         jar.load()
2324                         except (IOError, OSError), err:
2325                                 sys.exit(u'ERROR: unable to open cookie file')
2326
2327                 # General configuration
2328                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2329                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2330                 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2331                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2332
2333                 # Batch file verification
2334                 batchurls = []
2335                 if opts.batchfile is not None:
2336                         try:
2337                                 if opts.batchfile == '-': # '-' selects stdin as the batch source
2338                                         batchfd = sys.stdin
2339                                 else:
2340                                         batchfd = open(opts.batchfile, 'r')
2341                                 batchurls = batchfd.readlines()
2342                                 batchurls = [x.strip() for x in batchurls]
2343                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] # drop blanks and lines starting with #, / or ; (comments)
2344                         except IOError:
2345                                 sys.exit(u'ERROR: batch file could not be read')
2346                 all_urls = batchurls + args # batch-file URLs first, then positional arguments
2347
2348                 # Conflicting, missing and erroneous options
2349                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2350                         parser.error(u'using .netrc conflicts with giving username/password')
2351                 if opts.password is not None and opts.username is None:
2352                         parser.error(u'account username missing')
2353                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2354                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2355                 if opts.usetitle and opts.useliteral:
2356                         parser.error(u'using title conflicts with using literal title')
2357                 if opts.username is not None and opts.password is None:
2358                         opts.password = getpass.getpass(u'Type account password and press return:') # prompt so the password need not appear on the command line
2359                 if opts.ratelimit is not None:
2360                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) # parse human-readable size (e.g. '50k'); returns None on failure
2361                         if numeric_limit is None:
2362                                 parser.error(u'invalid rate limit specified')
2363                         opts.ratelimit = numeric_limit
2364                 if opts.retries is not None:
2365                         try:
2366                                 opts.retries = long(opts.retries)
2367                         except (TypeError, ValueError), err:
2368                                 parser.error(u'invalid retry count specified')
2369                 try:
2370                         opts.playliststart = long(opts.playliststart)
2371                         if opts.playliststart <= 0: # must be strictly positive
2372                                 raise ValueError
2373                 except (TypeError, ValueError), err:
2374                         parser.error(u'invalid playlist start number specified')
2375                 try:
2376                         opts.playlistend = long(opts.playlistend)
2377                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): # -1 means "to the end of the playlist"
2378                                 raise ValueError
2379                 except (TypeError, ValueError), err:
2380                         parser.error(u'invalid playlist end number specified')
2381
2382                 # Information extractors
2383                 youtube_ie = YoutubeIE()
2384                 metacafe_ie = MetacafeIE(youtube_ie) # takes youtube_ie, presumably to delegate YouTube-hosted clips
2385                 dailymotion_ie = DailymotionIE()
2386                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2387                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2388                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2389                 google_ie = GoogleIE()
2390                 google_search_ie = GoogleSearchIE(google_ie)
2391                 photobucket_ie = PhotobucketIE()
2392                 yahoo_ie = YahooIE()
2393                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2394                 deposit_files_ie = DepositFilesIE()
2395                 generic_ie = GenericIE()
2396
2397                 # File downloader
2398                 fd = FileDownloader({
2399                         'usenetrc': opts.usenetrc,
2400                         'username': opts.username,
2401                         'password': opts.password,
2402                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription), # the get* options emit their own output, so suppress normal chatter
2403                         'forceurl': opts.geturl,
2404                         'forcetitle': opts.gettitle,
2405                         'forcethumbnail': opts.getthumbnail,
2406                         'forcedescription': opts.getdescription,
2407                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription), # the get* options imply no actual download
2408                         'format': opts.format,
2409                         'format_limit': opts.format_limit,
2410                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2411                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2412                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2413                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2414                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2415                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2416                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2417                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2418                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2419                                 or u'%(id)s.%(ext)s'),
2420                         'ignoreerrors': opts.ignoreerrors,
2421                         'ratelimit': opts.ratelimit,
2422                         'nooverwrites': opts.nooverwrites,
2423                         'retries': opts.retries,
2424                         'continuedl': opts.continue_dl,
2425                         'noprogress': opts.noprogress,
2426                         'playliststart': opts.playliststart,
2427                         'playlistend': opts.playlistend,
2428                         'logtostderr': opts.outtmpl == '-', # output file is stdout ('-'), so keep log messages off it
2429                         })
2430                 fd.add_info_extractor(youtube_search_ie)
2431                 fd.add_info_extractor(youtube_pl_ie)
2432                 fd.add_info_extractor(youtube_user_ie)
2433                 fd.add_info_extractor(metacafe_ie)
2434                 fd.add_info_extractor(dailymotion_ie)
2435                 fd.add_info_extractor(youtube_ie)
2436                 fd.add_info_extractor(google_ie)
2437                 fd.add_info_extractor(google_search_ie)
2438                 fd.add_info_extractor(photobucket_ie)
2439                 fd.add_info_extractor(yahoo_ie)
2440                 fd.add_info_extractor(yahoo_search_ie)
2441                 fd.add_info_extractor(deposit_files_ie)
2442
2443                 # This must come last since it's the
2444                 # fallback if none of the others work
2445                 fd.add_info_extractor(generic_ie)
2446
2447                 # Update version
2448                 if opts.update_self:
2449                         update_self(fd, sys.argv[0]) # perform self-update before handling any URLs
2450
2451                 # Maybe do nothing
2452                 if len(all_urls) < 1:
2453                         if not opts.update_self:
2454                                 parser.error(u'you must provide at least one URL')
2455                         else:
2456                                 sys.exit() # --update-self alone is a valid invocation
2457                 retcode = fd.download(all_urls) # non-zero if any download failed; propagated to sys.exit below
2458
2459                 # Dump cookie jar if requested
2460                 if opts.cookiefile is not None:
2461                         try:
2462                                 jar.save() # persist the cookie jar to disk
2463                         except (IOError, OSError), err:
2464                                 sys.exit(u'ERROR: unable to save cookie jar')
2465
2466                 sys.exit(retcode)
2467
2468         except DownloadError:
2469                 sys.exit(1)
2470         except SameFileError:
2471                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2472         except KeyboardInterrupt:
2473                 sys.exit(u'\nERROR: Interrupted by user')