Files
appchk-web/src/index_domains.py
2020-09-21 16:18:12 +02:00

139 lines
3.9 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
import common_lib as mylib
import bundle_combine # get_evaluated
import download_tracker # is_tracker
def fname_all():
    """Return the location of the index file covering all domains.

    The location is whatever `mylib.path_data_index` resolves for the
    file name 'domains_all.json'.
    """
    index_name = 'domains_all.json'
    return mylib.path_data_index(index_name)
def fname_tracker():
    """Return the location of the index file limited to tracker domains.

    The location is whatever `mylib.path_data_index` resolves for the
    file name 'domains_tracker.json'.
    """
    index_name = 'domains_tracker.json'
    return mylib.path_data_index(index_name)
def load_json_from_disk(index_file):
    """Read a domain index from `index_file`, or start a fresh one.

    Returns the result of `mylib.json_read` when the file exists.
    Otherwise returns a new, empty index: an empty 'bundle' list plus
    empty 'pardom' and 'subdom' dicts.
    """
    if not mylib.file_exists(index_file):
        return {'bundle': [], 'pardom': {}, 'subdom': {}}
    return mylib.json_read(index_file)
def delete_from_index(index, bundle_ids, deleteOnly=False):
    """Remove every reference to the given bundles from the domain index.

    `index` is modified in place. Its 'pardom' and 'subdom' tables map a
    domain to a list of positions into `index['bundle']`. All occurrences
    of a deleted bundle's position are dropped from those lists, and a
    domain whose list becomes empty is removed from its table.

    Args:
        index: dict with the keys 'bundle', 'pardom' and 'subdom'.
        bundle_ids: iterable of bundle ids to remove; ids that are not
            in `index['bundle']` are skipped.
        deleteOnly: when true, the bundle's slot in `index['bundle']` is
            overwritten with the placeholder '_'. When false, the slot
            keeps its bundle id.

    Returns:
        True if at least one of `bundle_ids` was found in the index,
        False otherwise (the index is then left untouched).
    """
    bundles = index['bundle']
    ids_to_delete = set()
    for bid in bundle_ids:
        try:
            i = bundles.index(bid)
        except ValueError:  # bundle is not part of the index
            continue
        ids_to_delete.add(i)
        if deleteOnly:
            # Overwrite instead of popping: the domain lists store list
            # positions, so every other bundle must keep its position.
            bundles[i] = '_'

    if not ids_to_delete:
        return False

    for key in ['pardom', 'subdom']:
        table = index[key]
        # Iterate over a snapshot of the keys; entries are deleted below.
        for domain in list(table):
            # One filtering pass removes *all* occurrences of a deleted
            # position, so duplicates cannot leave stale references.
            remaining = [i for i in table[domain] if i not in ids_to_delete]
            if remaining:
                table[domain][:] = remaining  # keep the same list object
            else:
                del table[domain]
    return True
def insert_in_index(index, bundle_ids):
    """Add the domains of each bundle to the domain index, in place.

    A bundle that is not yet listed in `index['bundle']` is appended
    there. Its list position is then appended to the entry of every
    domain found under 'pardom' and 'subdom' in the record returned by
    `bundle_combine.get_evaluated`. No check is made for a position that
    is already listed; `process` removes a bundle's old references
    before calling this function.

    Returns:
        True if at least one domain reference was added, else False.
    """
    has_changes = False
    for bid in bundle_ids:
        try:
            position = index['bundle'].index(bid)
        except ValueError:  # unknown bundle: give it the next free slot
            position = len(index['bundle'])
            index['bundle'].append(bid)

        evaluated = bundle_combine.get_evaluated(bid)
        # The evaluated record is assumed to use the same two keys as
        # the index.
        for key in ('pardom', 'subdom'):
            # Each entry unpacks into three values; only the first, the
            # domain, is needed here.
            for domain, _, _ in evaluated[key]:
                index[key].setdefault(domain, []).append(position)
                has_changes = True
    return has_changes
def filter_tracker_only(index):
    """Reduce `index`, in place, to the domains flagged as trackers.

    `index['subdom']` keeps only the entries whose domain passes
    `download_tracker.is_tracker`. `index['pardom']` is rebuilt from
    those entries: ids are grouped under `mylib.parent_domain(domain)`
    and duplicates are dropped. `index['bundle']` is left unchanged.
    """
    tracker_subdomains = {}
    ids_by_parent = {}
    for domain, ids in index['subdom'].items():
        if not download_tracker.is_tracker(domain):
            continue
        tracker_subdomains[domain] = ids
        parent = mylib.parent_domain(domain)
        # Collect into a set so ids shared by sibling subdomains are
        # stored once per parent domain.
        ids_by_parent.setdefault(parent, set()).update(ids)

    index['subdom'] = tracker_subdomains
    # Convert the sets back to lists before storing them.
    index['pardom'] = {dom: list(ids) for dom, ids in ids_by_parent.items()}
def load(tracker=False):
    """Load a domain index from disk.

    Reads the tracker-only index when `tracker` is true, otherwise the
    index of all domains. A missing file yields an empty index (see
    `load_json_from_disk`).
    """
    if tracker:
        source = fname_tracker()
    else:
        source = fname_all()
    return load_json_from_disk(source)
def number_of_apps(index):
    """Count the entries of `index['bundle']` that are not the '_' placeholder."""
    live_bundles = [bid for bid in index['bundle'] if bid != '_']
    return len(live_bundles)
def enrich_with_bundle_ids(index):
    """Replace, in place, each stored bundle position by its bundle id.

    Every list under 'pardom' and 'subdom' is swapped for a new list in
    which each position is looked up in `index['bundle']`.
    """
    for key in ('pardom', 'subdom'):
        table = index[key]
        # Only existing keys are reassigned, so the dict keeps its size
        # while it is being iterated.
        for dom in table:
            table[dom] = [index['bundle'][pos] for pos in table[dom]]
def all_bundles_containing(list_of_domains):
    """Return the bundle ids indexed under any of the given domains.

    The all-domains index is loaded from disk. Every 'pardom' and
    'subdom' entry whose reversed domain is found by
    `mylib.bintree_lookup` in the sorted, reversed `list_of_domains`
    contributes its bundles. Each bundle id appears once; the order of
    the returned list is not defined.
    """
    index = load_json_from_disk(fname_all())
    # Both sides are compared as reversed strings.
    # NOTE(review): presumably so the lookup can match on domain
    # suffixes; confirm against mylib.bintree_lookup.
    reversed_domains = sorted(dom[::-1] for dom in list_of_domains)
    positions = set()
    for key in ('pardom', 'subdom'):
        for dom, ids in index[key].items():
            if mylib.bintree_lookup(reversed_domains, dom[::-1]):
                positions.update(ids)
    return [index['bundle'][pos] for pos in positions]
def process(bundle_ids, deleteOnly=False):
    """Update the on-disk domain indexes for the given bundles.

    Args:
        bundle_ids: list of bundle ids; the exact list ['*'] removes the
            existing all-domains index file first, so the index is
            rebuilt from an empty one.
        deleteOnly: when true, the bundles are only removed from the
            index; when false, they are removed and then re-inserted.

    Both index files are written only if something changed: first the
    all-domains index, then the same index reduced to tracker domains.
    """
    print('writing index: domains ...')
    index_path = fname_all()
    if bundle_ids == ['*']:
        print(' full reset')
        mylib.rm_file(index_path)  # rebuild from ground up

    index = load_json_from_disk(index_path)
    # NOTE(review): `app_ids` is iterated by the delete step and again
    # by the insert step; this assumes appids_in_data returns a
    # re-iterable sequence, not a one-shot iterator. Confirm.
    app_ids = mylib.appids_in_data(bundle_ids)
    did_change = delete_from_index(index, app_ids, deleteOnly=deleteOnly)
    if not deleteOnly:
        inserted = insert_in_index(index, app_ids)
        did_change = did_change or inserted

    if not did_change:
        print(' no change')
    else:
        mylib.json_write(index_path, index, pretty=False)
        # filter_tracker_only mutates `index`, so the full index has to
        # be written before this call.
        filter_tracker_only(index)
        mylib.json_write(fname_tracker(), index, pretty=False)
    print('')
if __name__ == '__main__':
    # Command-line entry point: every argument is passed on as a bundle id.
    args = sys.argv[1:]
    if not args:
        # Without arguments only the usage text is shown; the full
        # rebuild below is intentionally left disabled.
        # process(['*'], deleteOnly=False)
        mylib.usage(__file__, '[bundle_id] [...]')
    else:
        process(args)