youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

http.py (16160B)


      1 from __future__ import unicode_literals
      2 
      3 import errno
      4 import os
      5 import socket
      6 import time
      7 import random
      8 import re
      9 
     10 from .common import FileDownloader
     11 from ..compat import (
     12     compat_str,
     13     compat_urllib_error,
     14 )
     15 from ..utils import (
     16     ContentTooShortError,
     17     encodeFilename,
     18     int_or_none,
     19     sanitize_open,
     20     sanitized_Request,
     21     write_xattr,
     22     XAttrMetadataError,
     23     XAttrUnavailableError,
     24 )
     25 
     26 
     27 class HttpFD(FileDownloader):
     28     def real_download(self, filename, info_dict):
     29         url = info_dict['url']
     30 
     31         class DownloadContext(dict):
     32             __getattr__ = dict.get
     33             __setattr__ = dict.__setitem__
     34             __delattr__ = dict.__delitem__
     35 
     36         ctx = DownloadContext()
     37         ctx.filename = filename
     38         ctx.tmpfilename = self.temp_name(filename)
     39         ctx.stream = None
     40 
     41         # Do not include the Accept-Encoding header
     42         headers = {'Youtubedl-no-compression': 'True'}
     43         add_headers = info_dict.get('http_headers')
     44         if add_headers:
     45             headers.update(add_headers)
     46 
     47         is_test = self.params.get('test', False)
     48         chunk_size = self._TEST_FILE_SIZE if is_test else (
     49             info_dict.get('downloader_options', {}).get('http_chunk_size')
     50             or self.params.get('http_chunk_size') or 0)
     51 
     52         ctx.open_mode = 'wb'
     53         ctx.resume_len = 0
     54         ctx.data_len = None
     55         ctx.block_size = self.params.get('buffersize', 1024)
     56         ctx.start_time = time.time()
     57         ctx.chunk_size = None
     58 
     59         if self.params.get('continuedl', True):
     60             # Establish possible resume length
     61             if os.path.isfile(encodeFilename(ctx.tmpfilename)):
     62                 ctx.resume_len = os.path.getsize(
     63                     encodeFilename(ctx.tmpfilename))
     64 
     65         ctx.is_resume = ctx.resume_len > 0
     66 
     67         count = 0
     68         retries = self.params.get('retries', 0)
     69 
     70         class SucceedDownload(Exception):
     71             pass
     72 
     73         class RetryDownload(Exception):
     74             def __init__(self, source_error):
     75                 self.source_error = source_error
     76 
     77         class NextFragment(Exception):
     78             pass
     79 
     80         def set_range(req, start, end):
     81             range_header = 'bytes=%d-' % start
     82             if end:
     83                 range_header += compat_str(end)
     84             req.add_header('Range', range_header)
     85 
     86         def establish_connection():
     87             ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size)
     88                               if not is_test and chunk_size else chunk_size)
     89             if ctx.resume_len > 0:
     90                 range_start = ctx.resume_len
     91                 if ctx.is_resume:
     92                     self.report_resuming_byte(ctx.resume_len)
     93                 ctx.open_mode = 'ab'
     94             elif ctx.chunk_size > 0:
     95                 range_start = 0
     96             else:
     97                 range_start = None
     98             ctx.is_resume = False
     99             range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None
    100             if range_end and ctx.data_len is not None and range_end >= ctx.data_len:
    101                 range_end = ctx.data_len - 1
    102             has_range = range_start is not None
    103             ctx.has_range = has_range
    104             request = sanitized_Request(url, None, headers)
    105             if has_range:
    106                 set_range(request, range_start, range_end)
    107             # Establish connection
    108             try:
    109                 try:
    110                     ctx.data = self.ydl.urlopen(request)
    111                 except (compat_urllib_error.URLError, ) as err:
    112                     # reason may not be available, e.g. for urllib2.HTTPError on python 2.6
    113                     reason = getattr(err, 'reason', None)
    114                     if isinstance(reason, socket.timeout):
    115                         raise RetryDownload(err)
    116                     raise err
    117                 # When trying to resume, Content-Range HTTP header of response has to be checked
    118                 # to match the value of requested Range HTTP header. This is due to a webservers
    119                 # that don't support resuming and serve a whole file with no Content-Range
    120                 # set in response despite of requested Range (see
    121                 # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799)
    122                 if has_range:
    123                     content_range = ctx.data.headers.get('Content-Range')
    124                     if content_range:
    125                         content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range)
    126                         # Content-Range is present and matches requested Range, resume is possible
    127                         if content_range_m:
    128                             if range_start == int(content_range_m.group(1)):
    129                                 content_range_end = int_or_none(content_range_m.group(2))
    130                                 content_len = int_or_none(content_range_m.group(3))
    131                                 accept_content_len = (
    132                                     # Non-chunked download
    133                                     not ctx.chunk_size
    134                                     # Chunked download and requested piece or
    135                                     # its part is promised to be served
    136                                     or content_range_end == range_end
    137                                     or content_len < range_end)
    138                                 if accept_content_len:
    139                                     ctx.data_len = content_len
    140                                     return
    141                     # Content-Range is either not present or invalid. Assuming remote webserver is
    142                     # trying to send the whole file, resume is not possible, so wiping the local file
    143                     # and performing entire redownload
    144                     self.report_unable_to_resume()
    145                     ctx.resume_len = 0
    146                     ctx.open_mode = 'wb'
    147                 ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None))
    148                 return
    149             except (compat_urllib_error.HTTPError, ) as err:
    150                 if err.code == 416:
    151                     # Unable to resume (requested range not satisfiable)
    152                     try:
    153                         # Open the connection again without the range header
    154                         ctx.data = self.ydl.urlopen(
    155                             sanitized_Request(url, None, headers))
    156                         content_length = ctx.data.info()['Content-Length']
    157                     except (compat_urllib_error.HTTPError, ) as err:
    158                         if err.code < 500 or err.code >= 600:
    159                             raise
    160                     else:
    161                         # Examine the reported length
    162                         if (content_length is not None
    163                                 and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
    164                             # The file had already been fully downloaded.
    165                             # Explanation to the above condition: in issue #175 it was revealed that
    166                             # YouTube sometimes adds or removes a few bytes from the end of the file,
    167                             # changing the file size slightly and causing problems for some users. So
    168                             # I decided to implement a suggested change and consider the file
    169                             # completely downloaded if the file size differs less than 100 bytes from
    170                             # the one in the hard drive.
    171                             self.report_file_already_downloaded(ctx.filename)
    172                             self.try_rename(ctx.tmpfilename, ctx.filename)
    173                             self._hook_progress({
    174                                 'filename': ctx.filename,
    175                                 'status': 'finished',
    176                                 'downloaded_bytes': ctx.resume_len,
    177                                 'total_bytes': ctx.resume_len,
    178                             })
    179                             raise SucceedDownload()
    180                         else:
    181                             # The length does not match, we start the download over
    182                             self.report_unable_to_resume()
    183                             ctx.resume_len = 0
    184                             ctx.open_mode = 'wb'
    185                             return
    186                 elif err.code < 500 or err.code >= 600:
    187                     # Unexpected HTTP error
    188                     raise
    189                 raise RetryDownload(err)
    190             except socket.error as err:
    191                 if err.errno != errno.ECONNRESET:
    192                     # Connection reset is no problem, just retry
    193                     raise
    194                 raise RetryDownload(err)
    195 
    196         def download():
    197             data_len = ctx.data.info().get('Content-length', None)
    198 
    199             # Range HTTP header may be ignored/unsupported by a webserver
    200             # (e.g. extractor/scivee.py, extractor/bambuser.py).
    201             # However, for a test we still would like to download just a piece of a file.
    202             # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
    203             # block size when downloading a file.
    204             if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
    205                 data_len = self._TEST_FILE_SIZE
    206 
    207             if data_len is not None:
    208                 data_len = int(data_len) + ctx.resume_len
    209                 min_data_len = self.params.get('min_filesize')
    210                 max_data_len = self.params.get('max_filesize')
    211                 if min_data_len is not None and data_len < min_data_len:
    212                     self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
    213                     return False
    214                 if max_data_len is not None and data_len > max_data_len:
    215                     self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
    216                     return False
    217 
    218             byte_counter = 0 + ctx.resume_len
    219             block_size = ctx.block_size
    220             start = time.time()
    221 
    222             # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
    223             now = None  # needed for slow_down() in the first loop run
    224             before = start  # start measuring
    225 
    226             def retry(e):
    227                 to_stdout = ctx.tmpfilename == '-'
    228                 if ctx.stream is not None:
    229                     if not to_stdout:
    230                         ctx.stream.close()
    231                     ctx.stream = None
    232                 ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename))
    233                 raise RetryDownload(e)
    234 
    235             while True:
    236                 try:
    237                     # Download and write
    238                     data_block = ctx.data.read(block_size if data_len is None else min(block_size, data_len - byte_counter))
    239                 # socket.timeout is a subclass of socket.error but may not have
    240                 # errno set
    241                 except socket.timeout as e:
    242                     retry(e)
    243                 except socket.error as e:
    244                     # SSLError on python 2 (inherits socket.error) may have
    245                     # no errno set but this error message
    246                     if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out':
    247                         retry(e)
    248                     raise
    249 
    250                 byte_counter += len(data_block)
    251 
    252                 # exit loop when download is finished
    253                 if len(data_block) == 0:
    254                     break
    255 
    256                 # Open destination file just in time
    257                 if ctx.stream is None:
    258                     try:
    259                         ctx.stream, ctx.tmpfilename = sanitize_open(
    260                             ctx.tmpfilename, ctx.open_mode)
    261                         assert ctx.stream is not None
    262                         ctx.filename = self.undo_temp_name(ctx.tmpfilename)
    263                         self.report_destination(ctx.filename)
    264                     except (OSError, IOError) as err:
    265                         self.report_error('unable to open for writing: %s' % str(err))
    266                         return False
    267 
    268                     if self.params.get('xattr_set_filesize', False) and data_len is not None:
    269                         try:
    270                             write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
    271                         except (XAttrUnavailableError, XAttrMetadataError) as err:
    272                             self.report_error('unable to set filesize xattr: %s' % str(err))
    273 
    274                 try:
    275                     ctx.stream.write(data_block)
    276                 except (IOError, OSError) as err:
    277                     self.to_stderr('\n')
    278                     self.report_error('unable to write data: %s' % str(err))
    279                     return False
    280 
    281                 # Apply rate limit
    282                 self.slow_down(start, now, byte_counter - ctx.resume_len)
    283 
    284                 # end measuring of one loop run
    285                 now = time.time()
    286                 after = now
    287 
    288                 # Adjust block size
    289                 if not self.params.get('noresizebuffer', False):
    290                     block_size = self.best_block_size(after - before, len(data_block))
    291 
    292                 before = after
    293 
    294                 # Progress message
    295                 speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
    296                 if ctx.data_len is None:
    297                     eta = None
    298                 else:
    299                     eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len)
    300 
    301                 self._hook_progress({
    302                     'status': 'downloading',
    303                     'downloaded_bytes': byte_counter,
    304                     'total_bytes': ctx.data_len,
    305                     'tmpfilename': ctx.tmpfilename,
    306                     'filename': ctx.filename,
    307                     'eta': eta,
    308                     'speed': speed,
    309                     'elapsed': now - ctx.start_time,
    310                 })
    311 
    312                 if data_len is not None and byte_counter == data_len:
    313                     break
    314 
    315             if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len:
    316                 ctx.resume_len = byte_counter
    317                 # ctx.block_size = block_size
    318                 raise NextFragment()
    319 
    320             if ctx.stream is None:
    321                 self.to_stderr('\n')
    322                 self.report_error('Did not get any data blocks')
    323                 return False
    324             if ctx.tmpfilename != '-':
    325                 ctx.stream.close()
    326 
    327             if data_len is not None and byte_counter != data_len:
    328                 err = ContentTooShortError(byte_counter, int(data_len))
    329                 if count <= retries:
    330                     retry(err)
    331                 raise err
    332 
    333             self.try_rename(ctx.tmpfilename, ctx.filename)
    334 
    335             # Update file modification time
    336             if self.params.get('updatetime', True):
    337                 info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))
    338 
    339             self._hook_progress({
    340                 'downloaded_bytes': byte_counter,
    341                 'total_bytes': byte_counter,
    342                 'filename': ctx.filename,
    343                 'status': 'finished',
    344                 'elapsed': time.time() - ctx.start_time,
    345             })
    346 
    347             return True
    348 
    349         while count <= retries:
    350             try:
    351                 establish_connection()
    352                 return download()
    353             except RetryDownload as e:
    354                 count += 1
    355                 if count <= retries:
    356                     self.report_retry(e.source_error, count, retries)
    357                 continue
    358             except NextFragment:
    359                 continue
    360             except SucceedDownload:
    361                 return True
    362 
    363         self.report_error('giving up after %s retries' % retries)
    364         return False