From 8f88f7097729b0f64fbe045646334e3612e89833 Mon Sep 17 00:00:00 2001 From: relikd Date: Mon, 28 Sep 2020 14:28:35 +0200 Subject: [PATCH] Highly-used domains --- src/html_index_domains.py | 12 ++++++-- src/html_root.py | 5 ++-- src/index_domains.py | 63 ++++++++++++++++++++++++++++----------- 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/src/html_index_domains.py b/src/html_index_domains.py index 1b7f37a..d947035 100755 --- a/src/html_index_domains.py +++ b/src/html_index_domains.py @@ -82,7 +82,7 @@ def gen_html_trinity(idx_dir, app_count, json, title, symlink): # Full list (by count) list1.sort(key=lambda x: -x[1]) list2.sort(key=lambda x: -x[1]) - write_index('by_count.html', title='{} (most apps)'.format(title), + write_index('by_count.html', title='{} (by count)'.format(title), button='Full list (by count)') # Top 10 gen_html_top_10(idx_dir, list2[:25], app_count, 'Top 25 {}'.format(title)) @@ -113,7 +113,7 @@ def gen_lookup(html_dir, doms_dict, names_dict, title): def process(): # bundle_combine assures domain name is [a-zA-Z0-9.-] print('generating html: domain-index ...') - json = index_domains.load() + json = index_domains.loadAll() app_count = index_domains.number_of_apps(json) dom_count = len(json['subdom']) @@ -133,8 +133,14 @@ def process(): print(' Trackers Only') gen_html_trinity(mylib.path_out('index', 'domains', 'tracker'), app_count, - json=index_domains.load(tracker=True), title='Tracker', + json=index_domains.loadTracker(), title='Tracker', symlink=index_domains.fname_tracker()) + + print(' Highly Used') + gen_html_trinity(mylib.path_out('index', 'domains', 'highly-used'), + app_count, json=index_domains.loadNonTracker(), + title='Highly Used Domains', + symlink=index_domains.fname_no_tracker()) print('') return app_count, dom_count diff --git a/src/html_root.py b/src/html_root.py index ecad5e7..52cbe30 100755 --- a/src/html_root.py +++ b/src/html_root.py @@ -90,8 +90,9 @@ def gen_results(base_dir, c_apps, c_domains, title):

Or compare similar application via custom comparison Lists.

'''.format(title, c_apps, c_domains, c_recs, c_logs), title=title) diff --git a/src/index_domains.py b/src/index_domains.py index 4030d67..eab8141 100755 --- a/src/index_domains.py +++ b/src/index_domains.py @@ -14,11 +14,27 @@ def fname_tracker(): return mylib.path_data_index('domains_tracker.json') +def fname_no_tracker(): + return mylib.path_data_index('domains_no_tracker.json') + + def load_json_from_disk(index_file): return mylib.json_safe_read( index_file, fallback={'bundle': [], 'pardom': {}, 'subdom': {}}) +def loadAll(): + return load_json_from_disk(fname_all()) + + +def loadTracker(): + return load_json_from_disk(fname_tracker()) + + +def loadNonTracker(): + return load_json_from_disk(fname_no_tracker()) + + def delete_from_index(index, bundle_ids, deleteOnly=False): ids_to_delete = set() for bid in bundle_ids: @@ -55,34 +71,43 @@ def insert_in_index(index, bundle_ids): index['bundle'].append(bid) json = bundle_combine.get_evaluated(bid) for key in ['pardom', 'subdom']: # assuming keys are identical - for domain, _, _ in json[key]: + for domain, _, is_trkr in json[key]: try: index[key][domain].append(i) except KeyError: - index[key][domain] = [i] + index[key][domain] = [is_trkr, i] has_changes = True return has_changes -def filter_tracker_only(index): - sub_trkr = {} - par_trkr = {} - for domain, ids in filter(lambda x: download_tracker.is_tracker(x[0]), - index['subdom'].items()): - sub_trkr[domain] = ids +def split_trackers(index): + ret = {'trkr': {'bundle': index['bundle'], 'subdom': {}, 'pardom': {}}, + 'no-trkr': {'bundle': index['bundle'], 'subdom': {}, 'pardom': {}}} + for domain, [is_trkr, *ids] in index['subdom'].items(): + key = 'trkr' if is_trkr else 'no-trkr' + ret[key]['subdom'][domain] = ids pardom = mylib.parent_domain(domain) try: - par_trkr[pardom].update(ids) + ret[key]['pardom'][pardom].update(ids) except KeyError: - par_trkr[pardom] = set(ids) - for dom, ids in par_trkr.items(): - par_trkr[dom] = list(ids) - index['subdom'] = sub_trkr - index['pardom'] = par_trkr + ret[key]['pardom'][pardom] = set(ids) + for dic in ret.values(): + for dom, ids in dic['pardom'].items(): + dic['pardom'][dom] = list(ids) + return ret['trkr'], ret['no-trkr'] -def load(tracker=False): - return load_json_from_disk(fname_tracker() if tracker else fname_all()) +def filter_list_at_least(index, min_count): + sub = {} + par = {} + for domain, ids in index['subdom'].items(): + if len(ids) >= min_count: + sub[domain] = ids + for domain, ids in index['pardom'].items(): + if len(ids) >= min_count: + par[domain] = ids + index['subdom'] = sub + index['pardom'] = par def number_of_apps(index): @@ -120,8 +145,10 @@ def process(bundle_ids, deleteOnly=False): did_change |= insert_in_index(index, ids) if did_change: mylib.json_write(fname, index, pretty=False) - filter_tracker_only(index) - mylib.json_write(fname_tracker(), index, pretty=False) + dict_trkr, dict_no_trkr = split_trackers(index) + mylib.json_write(fname_tracker(), dict_trkr, pretty=False) + filter_list_at_least(dict_no_trkr, 5) # or 0.1 * len(ids) + mylib.json_write(fname_no_tracker(), dict_no_trkr, pretty=False) else: print(' no change') print('')