[youtube] Improve cache and add an option to print the extracted signatures
author: Philipp Hagemeister <phihag@phihag.de>
Sun, 22 Sep 2013 08:30:02 +0000 (10:30 +0200)
committer: Philipp Hagemeister <phihag@phihag.de>
Sun, 22 Sep 2013 08:30:02 +0000 (10:30 +0200)
youtube_dl/FileDownloader.py
youtube_dl/__init__.py
youtube_dl/extractor/youtube.py

index 1eb71a80e64d644f3d99a023ef259b89954ac93d..6047141345b74851d193bbcb898d340527b96438 100644 (file)
@@ -40,7 +40,7 @@ class FileDownloader(object):
     min_filesize:      Skip files smaller than this size
     max_filesize:      Skip files larger than this size
     cachedir:          Location of the cache files in the filesystem.
-                       False to disable filesystem cache.
+                       "NONE" to disable filesystem cache.
     """
 
     params = None
index 1ed30aae39ade326a2a0dd440d187fa482f35c1d..072f69f2e44918a37a552a7131a7c8916de17aa5 100644 (file)
@@ -167,6 +167,7 @@ def parseOpts(overrideArguments=None):
             help='Output descriptions of all supported extractors', default=False)
     general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL')
     general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
+    general.add_option('--cache-dir', dest='cachedir', default=u'~/.youtube-dl/cache', help='Location in the filesystem where youtube-dl can store downloaded information permanently. NONE to disable filesystem caching, %default by default')
 
 
     selection.add_option('--playlist-start',
@@ -272,6 +273,10 @@ def parseOpts(overrideArguments=None):
     verbosity.add_option('--dump-intermediate-pages',
             action='store_true', dest='dump_intermediate_pages', default=False,
             help='print downloaded pages to debug problems(very verbose)')
+    verbosity.add_option('--youtube-print-sig-code',
+            action='store_true', dest='youtube_print_sig_code', default=False,
+            help=optparse.SUPPRESS_HELP)
+
 
     filesystem.add_option('-t', '--title',
             action='store_true', dest='usetitle', help='use title in file name (default)', default=False)
@@ -613,6 +618,7 @@ def _real_main(argv=None):
         'min_filesize': opts.min_filesize,
         'max_filesize': opts.max_filesize,
         'daterange': date,
+        'youtube_print_sig_code': opts.youtube_print_sig_code
         })
 
     if opts.verbose:
index 63f59ae8fc63e01ee1ecfb59834889999911d4f2..4200f987e315ad633b7b6394c4e1ae0bb8981ab9 100644 (file)
@@ -1,13 +1,13 @@
 # coding: utf-8
 
 import collections
+import errno
 import itertools
 import io
 import json
 import operator
 import os.path
 import re
-import shutil
 import socket
 import string
 import struct
@@ -17,6 +17,7 @@ import zlib
 from .common import InfoExtractor, SearchInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
+    compat_chr,
     compat_http_client,
     compat_parse_qs,
     compat_urllib_error,
@@ -30,6 +31,7 @@ from ..utils import (
     unescapeHTML,
     unified_strdate,
     orderedSet,
+    write_json_file,
 )
 
 class YoutubeBaseInfoExtractor(InfoExtractor):
@@ -433,18 +435,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         # Read from filesystem cache
         func_id = '%s_%s_%d' % (player_type, player_id, slen)
         assert os.path.basename(func_id) == func_id
-        cache_dir = self.downloader.params.get('cachedir',
-                                               u'~/.youtube-dl/cache')
+        cache_dir = self._downloader.params.get('cachedir',
+                                                u'~/.youtube-dl/cache')
 
-        if cache_dir is not False:
+        if cache_dir != u'NONE':
             cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                     u'youtube-sigfuncs',
                                     func_id + '.json')
             try:
-                with io.open(cache_fn, '', encoding='utf-8') as cachef:
+                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                     cache_spec = json.load(cachef)
                 return lambda s: u''.join(s[i] for i in cache_spec)
-            except OSError:
+            except IOError:
                 pass  # No cache available
 
         if player_type == 'js':
@@ -464,13 +466,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             assert False, 'Invalid player type %r' % player_type
 
         if cache_dir is not False:
-            cache_res = res(map(compat_chr, range(slen)))
-            cache_spec = [ord(c) for c in cache_res]
-            shutil.makedirs(os.path.dirname(cache_fn))
-            write_json_file(cache_spec, cache_fn)
+            try:
+                cache_res = res(map(compat_chr, range(slen)))
+                cache_spec = [ord(c) for c in cache_res]
+                try:
+                    os.makedirs(os.path.dirname(cache_fn))
+                except OSError as ose:
+                    if ose.errno != errno.EEXIST:
+                        raise
+                write_json_file(cache_spec, cache_fn)
+            except Exception as e:
+                tb = traceback.format_exc()
+                self._downloader.report_warning(
+                    u'Writing cache to %r failed: %s' % (cache_fn, tb))
 
         return res
 
+    def _print_sig_code(self, func, slen):
+        """Print Python source equivalent to func for signatures of length slen.
+
+        func is called once with a test string whose characters are the code
+        points 0..slen-1.  Assuming func only selects and reorders characters
+        of its input, the code point of every output character is the input
+        index it was taken from.  Those indices are rendered as a sum of index
+        and slice expressions on ``s`` and printed with self.to_screen.
+        """
+        def gen_sig_code(idxs):
+            # Yield one u's[...]' expression per single index or per run of
+            # consecutive (step +1 / -1) indices in idxs.
+            def _genslice(start, end, step):
+                starts = u'' if start == 0 else str(start)
+                # The stop bound is exclusive, hence end+step.  For a
+                # descending run that reaches index 0 this would be -1, which
+                # Python interprets relative to the end of the sequence and
+                # which therefore yields an empty slice; leave the bound open
+                # in that case.
+                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
+                steps = u'' if step == 1 else (u':%d' % step)
+                return u's[%s%s%s]' % (starts, ends, steps)
+
+            if not idxs:
+                return  # nothing to describe
+            # Pre-seed i so that a single-element list reaches the final
+            # yield even though the loop below never runs.
+            i = idxs[0]
+            step = None
+            for i, prev in zip(idxs[1:], idxs[:-1]):
+                if step is not None:
+                    if i - prev == step:
+                        continue  # still inside the current run
+                    # Run ended at prev; i is handled as prev next iteration.
+                    yield _genslice(start, prev, step)
+                    step = None
+                    continue
+                if i - prev in [-1, 1]:
+                    # prev opens a new ascending or descending run.
+                    step = i - prev
+                    start = prev
+                else:
+                    yield u's[%d]' % prev
+            if step is None:
+                yield u's[%d]' % i
+            else:
+                yield _genslice(start, i, step)
+
+        # Real signatures are strings, so probe with a string: a bare map()
+        # is a non-subscriptable iterator on Python 3.
+        test_string = u''.join(map(compat_chr, range(slen)))
+        cache_res = func(test_string)
+        cache_spec = [ord(c) for c in cache_res]
+        expr_code = u' + '.join(gen_sig_code(cache_spec))
+        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
+        self.to_screen(u'Extracted signature:\n' + code)
+
     def _parse_sig_js(self, jscode):
         funcname = self._search_regex(
             r'signature=([a-zA-Z]+)', jscode,
@@ -1007,7 +1051,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                         video_id, player_url, len(s)
                     )
                     self._player_cache[player_url] = func
-                return self._player_cache[player_url](s)
+                func = self._player_cache[player_url]
+                if self._downloader.params.get('youtube_print_sig_code'):
+                    self._print_sig_code(func, len(s))
+                return func(s)
             except Exception as e:
                 tb = traceback.format_exc()
                 self._downloader.report_warning(