check-porn.py (1923B)
1 #!/usr/bin/env python 2 from __future__ import unicode_literals 3 4 """ 5 This script employs a VERY basic heuristic ('porn' in webpage.lower()) to check 6 if we are not 'age_limit' tagging some porn site 7 8 A second approach implemented relies on a list of porn domains, to activate it 9 pass the list filename as the only argument 10 """ 11 12 # Allow direct execution 13 import os 14 import sys 15 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 16 17 from test.helper import gettestcases 18 from youtube_dl.utils import compat_urllib_parse_urlparse 19 from youtube_dl.utils import compat_urllib_request 20 21 if len(sys.argv) > 1: 22 METHOD = 'LIST' 23 LIST = open(sys.argv[1]).read().decode('utf8').strip() 24 else: 25 METHOD = 'EURISTIC' 26 27 for test in gettestcases(): 28 if METHOD == 'EURISTIC': 29 try: 30 webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() 31 except Exception: 32 print('\nFail: {0}'.format(test['name'])) 33 continue 34 35 webpage = webpage.decode('utf8', 'replace') 36 37 RESULT = 'porn' in webpage.lower() 38 39 elif METHOD == 'LIST': 40 domain = compat_urllib_parse_urlparse(test['url']).netloc 41 if not domain: 42 print('\nFail: {0}'.format(test['name'])) 43 continue 44 domain = '.'.join(domain.split('.')[-2:]) 45 46 RESULT = ('.' + domain + '\n' in LIST or '\n' + domain + '\n' in LIST) 47 48 if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] 49 or test['info_dict']['age_limit'] != 18): 50 print('\nPotential missing age_limit check: {0}'.format(test['name'])) 51 52 elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] 53 and test['info_dict']['age_limit'] == 18): 54 print('\nPotential false negative: {0}'.format(test['name'])) 55 56 else: 57 sys.stdout.write('.') 58 sys.stdout.flush() 59 60 print()