youtube-dl

Another place where youtube-dl lives on
git clone git://git.oshgnacknak.de/youtube-dl.git
Log | Files | Refs | README | LICENSE

commit 347de4931cb3e496fb7b1dfb0314c213f44cce6b
parent 88296505131f4b91ff91eaa0af34318664d892c9
Author: Philipp Hagemeister <phihag@phihag.de>
Date:   Tue, 10 Feb 2015 03:32:21 +0100

[YoutubeDL] Add generic video filtering (Fixes #4916)

This functionality is intended to eventually encompass the current format filtering.

Diffstat:
Mtest/test_utils.py | 32++++++++++++++++++++++++++++++++
Myoutube_dl/YoutubeDL.py | 14+++++++++++++-
Myoutube_dl/__init__.py | 7++++++-
Myoutube_dl/options.py | 19+++++++++++++++++++
Myoutube_dl/utils.py | 77+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py @@ -53,6 +53,7 @@ from youtube_dl.utils import ( version_tuple, xpath_with_ns, render_table, + match_str, ) @@ -459,6 +460,37 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') '123 4\n' '9999 51') + def test_match_str(self): + self.assertRaises(ValueError, match_str, 'xy>foobar', {}) + self.assertFalse(match_str('xy', {'x': 1200})) + self.assertTrue(match_str('!xy', {'x': 1200})) + self.assertTrue(match_str('x', {'x': 1200})) + self.assertFalse(match_str('!x', {'x': 1200})) + self.assertTrue(match_str('x', {'x': 0})) + self.assertFalse(match_str('x>0', {'x': 0})) + self.assertFalse(match_str('x>0', {})) + self.assertTrue(match_str('x>?0', {})) + self.assertTrue(match_str('x>1K', {'x': 1200})) + self.assertFalse(match_str('x>2K', {'x': 1200})) + self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200})) + self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200})) + self.assertFalse(match_str('y=a212', {'y': 'foobar42'})) + self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'})) + self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'})) + self.assertTrue(match_str('y!=foobar2', {'y': 'foobar42'})) + self.assertFalse(match_str( + 'like_count > 100 & dislike_count <? 50 & description', + {'like_count': 90, 'description': 'foo'})) + self.assertTrue(match_str( + 'like_count > 100 & dislike_count <? 50 & description', + {'like_count': 190, 'description': 'foo'})) + self.assertFalse(match_str( + 'like_count > 100 & dislike_count <? 50 & description', + {'like_count': 190, 'dislike_count': 60, 'description': 'foo'})) + self.assertFalse(match_str( + 'like_count > 100 & dislike_count <? 50 & description', + {'like_count': 190, 'dislike_count': 10})) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py @@ -228,6 +228,11 @@ class YoutubeDL(object): external_downloader: Executable of the external downloader to call. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. + match_filter: A function that gets called with the info_dict of + every video. + If it returns a message, the video is ignored. + If it returns None, the video is downloaded. + match_filter_func in utils.py is one example for this. The following parameters are not used by YoutubeDL itself, they are used by @@ -583,9 +588,16 @@ class YoutubeDL(object): if max_views is not None and view_count > max_views: return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): - return 'Skipping "%s" because it is age restricted' % title + return 'Skipping "%s" because it is age restricted' % video_title if self.in_download_archive(info_dict): return '%s has already been recorded in archive' % video_title + + match_filter = self.params.get('match_filter') + if match_filter is not None: + ret = match_filter(info_dict) + if ret is not None: + return ret + return None @staticmethod diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py @@ -23,9 +23,10 @@ from .compat import ( ) from .utils import ( DateRange, - DEFAULT_OUTTMPL, decodeOption, + DEFAULT_OUTTMPL, DownloadError, + match_filter_func, MaxDownloadsReached, preferredencoding, read_batch_urls, @@ -247,6 +248,9 @@ def _real_main(argv=None): xattr # Confuse flake8 except ImportError: parser.error('setting filesize xattr requested but python-xattr is not available') + match_filter = ( + None if opts.match_filter is None + else match_filter_func(opts.match_filter)) ydl_opts = { 'usenetrc': opts.usenetrc, @@ -344,6 +348,7 @@ def _real_main(argv=None): 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, + 'match_filter': match_filter, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/options.py b/youtube_dl/options.py @@ -245,6 +245,25 @@ def parseOpts(overrideArguments=None): metavar='COUNT', dest='max_views', default=None, type=int, help='Do not download any videos with more than COUNT views') selection.add_option( + '--match-filter', + metavar='FILTER', dest='match_filter', default=None, + help=( + '(Experimental) Generic video filter. ' + 'Specify any key (see help for -o for a list of available keys) to' + ' match if the key is present, ' + '!key to check if the key is not present,' + 'key > NUMBER (like "comment_count > 12", also works with ' + '>=, <, <=, !=, =) to compare against a number, and ' + '& to require multiple matches. ' + 'Values which are not known are excluded unless you' + ' put a question mark (?) after the operator.' + 'For example, to only match videos that have been liked more than ' + '100 times and disliked less than 50 times (or the dislike ' + 'functionality is not available at the given service), but who ' + 'also have a description, use --match-filter ' + '"like_count > 100 & dislike_count <? 50 & description" .' + )) + selection.add_option( '--no-playlist', action='store_true', dest='noplaylist', default=False, help='If the URL refers to a video and a playlist, download only the video.') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py @@ -17,6 +17,7 @@ import io import json import locale import math +import operator import os import pipes import platform @@ -1678,3 +1679,79 @@ def render_table(header_row, data): max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)] format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s' return '\n'.join(format_str % tuple(row) for row in table) + + +def _match_one(filter_part, dct): + COMPARISON_OPERATORS = { + '<': operator.lt, + '<=': operator.le, + '>': operator.gt, + '>=': operator.ge, + '=': operator.eq, + '!=': operator.ne, + } + operator_rex = re.compile(r'''(?x)\s* + (?P<key>[a-z_]+) + \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* + (?: + (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| + (?P<strval>(?![0-9.])[a-z0-9A-Z]*) + ) + \s*$ + ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) + m = operator_rex.search(filter_part) + if m: + op = COMPARISON_OPERATORS[m.group('op')] + if m.group('strval') is not None: + if m.group('op') not in ('=', '!='): + raise ValueError( + 'Operator %s does not support string values!' % m.group('op')) + comparison_value = m.group('strval') + else: + try: + comparison_value = int(m.group('intval')) + except ValueError: + comparison_value = parse_filesize(m.group('intval')) + if comparison_value is None: + comparison_value = parse_filesize(m.group('intval') + 'B') + if comparison_value is None: + raise ValueError( + 'Invalid integer value %r in filter part %r' % ( + m.group('intval'), filter_part)) + actual_value = dct.get(m.group('key')) + if actual_value is None: + return m.group('none_inclusive') + return op(actual_value, comparison_value) + + UNARY_OPERATORS = { + '': lambda v: v is not None, + '!': lambda v: v is None, + } + operator_rex = re.compile(r'''(?x)\s* + (?P<op>%s)\s*(?P<key>[a-z_]+) + \s*$ + ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys()))) + m = operator_rex.search(filter_part) + if m: + op = UNARY_OPERATORS[m.group('op')] + actual_value = dct.get(m.group('key')) + return op(actual_value) + + raise ValueError('Invalid filter part %r' % filter_part) + + +def match_str(filter_str, dct): + """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """ + + return all( + _match_one(filter_part, dct) for filter_part in filter_str.split('&')) + + +def match_filter_func(filter_str): + def _match_func(info_dict): + if match_str(filter_str, info_dict): + return None + else: + video_title = info_dict.get('title', info_dict.get('id', 'video')) + return '%s does not pass filter %s, skipping ..' % (video_title, filter_str) + return _match_func