From a2aaf55cd98d781ff6dcdfbbcfe8e6e3045e39a7 Mon Sep 17 00:00:00 2001 From: relikd Date: Sat, 19 Sep 2020 14:01:20 +0200 Subject: [PATCH] Refactor domain index, introducing tracker index --- out/static/style.css | 4 +- src/html_reverse_domains.py | 77 ++++++++++++++----------- src/index_reverse_domains.py | 108 ++++++++++++++++++++--------------- templates/base.html | 6 +- 4 files changed, 111 insertions(+), 84 deletions(-) diff --git a/out/static/style.css b/out/static/style.css index 78f9b7c..5b1e845 100644 --- a/out/static/style.css +++ b/out/static/style.css @@ -66,14 +66,14 @@ footer .links { cursor: pointer; } .dropdown { display: inline-block; position: relative; } -.dropdown div { +.dropdown nav { display: none; position: absolute; width: max-content; box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); z-index: 1; } -.dropdown:hover div { display: block; } +.dropdown:hover nav { display: block; } .dropdown a { display: block; padding: 0.5em 1em; } .dropdown a:hover { background-color: #eee; } diff --git a/src/html_reverse_domains.py b/src/html_reverse_domains.py index da1559b..2272ac4 100755 --- a/src/html_reverse_domains.py +++ b/src/html_reverse_domains.py @@ -24,11 +24,11 @@ def dropdown_choose(button): ''' @@ -54,7 +54,7 @@ def gen_html_index(l1, l2, fname, title, button): title=title)) -def gen_html_top_domains(subset, fname, total, title): +def gen_html_top_10(subset, fname, total, title): def div_loadbar(percent): return '{0}%'.format(percent) @@ -64,7 +64,7 @@ def gen_html_top_domains(subset, fname, total, title):

{ title }

''' for dom, ids in subset: - dom_str = div_dom(dom, len(ids), 'subdomain') + dom_str = div_dom(dom, len(ids), 'domain') pct_bar = div_loadbar(round(len(ids) / total * 100)) txt += f'\n

{dom_str} {pct_bar}

' fp.write(mylib.template_with_base(txt + ''' @@ -72,10 +72,28 @@ def gen_html_top_domains(subset, fname, total, title): sorted by Occurrence frequency or in Alphabetical order.

-

Download: json

+

Download: json

''', title=title)) +def gen_html_trinity(json, idx_dir, app_count, title): + # Full list (A–Z) + list1 = sorted(json['subdom'].items(), key=lambda x: x[0]) + list2 = sorted(json['pardom'].items(), key=lambda x: x[0]) + gen_html_index(list1, list2, mylib.path_add(idx_dir, 'by_name.html'), + title='{} (A–Z)'.format(title), + button='Full list (A–Z)') + # Full list (by count) + list1.sort(key=lambda x: -len(x[1])) + list2.sort(key=lambda x: -len(x[1])) + gen_html_index(list1, list2, mylib.path_add(idx_dir, 'by_count.html'), + title='{} (most apps)'.format(title), + button='Full list (by count)') + # Top 10 + gen_html_top_10(list2[:25], mylib.path_add(idx_dir, 'index.html'), + app_count, title='Top 25 {}'.format(title)) + + def gen_html_lookup(html_dir, json, key, title): mylib.mkdir(html_dir) names = [[x, index_bundle_names.get_name(x)] for x in json['bundle']] @@ -97,43 +115,34 @@ def gen_html_lookup(html_dir, json, key, title): def process(): # bundle_combine assures domain name is [a-zA-Z0-9.-] print('generating reverse-domain-index ...') - idx_dir = mylib.path_out('index', 'domains') - mylib.mkdir(idx_dir) # Data export - mylib.symlink(mylib.path_data_index('reverse_domains.json'), - mylib.path_out_app(idx_dir, 'data.json')) - - par_arr = list(index_reverse_domains.enumerate('pardom')) - sub_arr = list(index_reverse_domains.enumerate('subdom')) - - # Full list (A–Z) - sub_arr.sort(key=lambda x: x[0]) - par_arr.sort(key=lambda x: x[0]) - gen_html_index(sub_arr, par_arr, mylib.path_add(idx_dir, 'by_name.html'), - title='Requested Domains (A–Z)', - button='Full list (A–Z)') - - # Full list (by count) - sub_arr.sort(key=lambda x: -len(x[1])) - par_arr.sort(key=lambda x: -len(x[1])) - gen_html_index(sub_arr, par_arr, mylib.path_add(idx_dir, 'by_count.html'), - title='Requested Domains (most apps)', - button='Full list (by count)') - - # Top 10 - del(sub_arr[20:]) - del(par_arr) - total = index_reverse_domains.number_of_apps() - gen_html_top_domains(sub_arr, mylib.path_add(idx_dir, 'index.html'), - total, 'Top 20 Requested Domains') + all_dom_dir = mylib.path_out('index', 'domains', 'all') + trkr_dir = mylib.path_out('index', 'domains', 'tracker') + mylib.mkdir(all_dom_dir) + mylib.mkdir(trkr_dir) + mylib.symlink(index_reverse_domains.fname_all(), + mylib.path_out_app(all_dom_dir, 'data.json')) + mylib.symlink(index_reverse_domains.fname_tracker(), + mylib.path_out_app(trkr_dir, 'data.json')) + # Load + json = index_reverse_domains.load() + app_count = index_reverse_domains.number_of_apps(json) # Lookup - json = index_reverse_domains.raw() gen_html_lookup(mylib.path_out('domain'), json, 'pardom', title='Domain Lookup') gen_html_lookup(mylib.path_out('subdomain'), json, 'subdom', title='Subdomain Lookup') + # All domains + index_reverse_domains.enrich_with_bundle_ids(json) + gen_html_trinity(json, all_dom_dir, app_count, + title='Requested Domains') + # Tacker only + json = index_reverse_domains.load(tracker=True) + index_reverse_domains.enrich_with_bundle_ids(json) + gen_html_trinity(json, trkr_dir, app_count, + title='Tracker') print('') diff --git a/src/index_reverse_domains.py b/src/index_reverse_domains.py index 5c80281..c2a8be0 100755 --- a/src/index_reverse_domains.py +++ b/src/index_reverse_domains.py @@ -2,104 +2,122 @@ import sys import common_lib as mylib - -_reverse_domain_dict = None +import bundle_combine +import tracker_download -def index_fname(): - return mylib.path_data_index('reverse_domains.json') +def fname_all(): + return mylib.path_data_index('all_domains.json') -def load_json_if_not_already(): - global _reverse_domain_dict - if not _reverse_domain_dict: - index_file = index_fname() - if mylib.file_exists(index_file): - _reverse_domain_dict = mylib.json_read(index_file) - else: - _reverse_domain_dict = {'bundle': [], 'pardom': {}, 'subdom': {}} +def fname_tracker(): + return mylib.path_data_index('tracker_domains.json') -def write_json_to_disk(): - mylib.json_write(index_fname(), _reverse_domain_dict, pretty=False) +def index_fname(tracker_only=False): + return mylib.path_data_index( + 'tracker_domains.json' if tracker_only else 'all_domains.json') -def delete_from_index(bundle_ids, deleteOnly=False): - global _reverse_domain_dict +def load_json_from_disk(index_file): + if mylib.file_exists(index_file): + return mylib.json_read(index_file) + else: + return {'bundle': [], 'pardom': {}, 'subdom': {}} + + +def delete_from_index(index, bundle_ids, deleteOnly=False): ids_to_delete = set() for bid in bundle_ids: try: - i = _reverse_domain_dict['bundle'].index(bid) + i = index['bundle'].index(bid) except ValueError: # index not found continue ids_to_delete.add(i) if deleteOnly: - _reverse_domain_dict['bundle'][i] = '_' + index['bundle'][i] = '_' if len(ids_to_delete) == 0: return False for key in ['pardom', 'subdom']: - for domain in list(_reverse_domain_dict[key].keys()): + for domain in list(index[key].keys()): for i in ids_to_delete: try: - _reverse_domain_dict[key][domain].remove(i) + index[key][domain].remove(i) except ValueError: # ignore if not present continue - if not _reverse_domain_dict[key][domain]: - del(_reverse_domain_dict[key][domain]) + if not index[key][domain]: + del(index[key][domain]) return True -def insert_in_index(bundle_ids): - global _reverse_domain_dict +def insert_in_index(index, bundle_ids): has_changes = False for bid in bundle_ids: try: - i = _reverse_domain_dict['bundle'].index(bid) + i = index['bundle'].index(bid) except ValueError: # index not found - i = len(_reverse_domain_dict['bundle']) - _reverse_domain_dict['bundle'].append(bid) + i = len(index['bundle']) + index['bundle'].append(bid) json, _ = mylib.json_read_evaluated(bid) for key in ['pardom', 'subdom']: # assuming keys are identical for domain, _, _ in json[key]: try: - _reverse_domain_dict[key][domain].append(i) + index[key][domain].append(i) except KeyError: - _reverse_domain_dict[key][domain] = [i] + index[key][domain] = [i] has_changes = True return has_changes -def raw(): - load_json_if_not_already() - return _reverse_domain_dict +def filter_tracker_only(index): + sub_trkr = {} + par_trkr = {} + for domain, ids in filter(lambda x: tracker_download.is_tracker(x[0]), + index['subdom'].items()): + sub_trkr[domain] = ids + pardom = bundle_combine.get_parent_domain(domain) + try: + par_trkr[pardom].update(ids) + except KeyError: + par_trkr[pardom] = set(ids) + for dom, ids in par_trkr.items(): + par_trkr[dom] = list(ids) + index['subdom'] = sub_trkr + index['pardom'] = par_trkr -def number_of_apps(): - load_json_if_not_already() - return sum(1 for x in _reverse_domain_dict['bundle'] if x != '_') +def load(tracker=False): + return load_json_from_disk(fname_tracker() if tracker else fname_all()) -def enumerate(key): - load_json_if_not_already() - for dom, bundles in _reverse_domain_dict[key].items(): - yield [dom, [_reverse_domain_dict['bundle'][i] for i in bundles]] +def number_of_apps(index): + return sum(1 for x in index['bundle'] if x != '_') + + +def enrich_with_bundle_ids(index): + for key in ['pardom', 'subdom']: + for dom, ids in index[key].items(): + index[key][dom] = [index['bundle'][i] for i in ids] def process(bundle_ids, deleteOnly=False): print('writing index: reverse domains ...') + fname = fname_all() if bundle_ids == ['*']: bundle_ids = list(mylib.enum_data_appids()) print(' full reset') - mylib.rm_file(index_fname()) # rebuild from ground up + mylib.rm_file(fname) # rebuild from ground up - load_json_if_not_already() - did_change = delete_from_index(bundle_ids, deleteOnly=deleteOnly) + index = load_json_from_disk(fname) + did_change = delete_from_index(index, bundle_ids, deleteOnly=deleteOnly) if not deleteOnly: - did_change |= insert_in_index(bundle_ids) + did_change |= insert_in_index(index, bundle_ids) if did_change: - write_json_to_disk() + mylib.json_write(fname, index, pretty=False) + filter_tracker_only(index) + mylib.json_write(fname_tracker(), index, pretty=False) else: print(' no change') print('') @@ -110,5 +128,5 @@ if __name__ == '__main__': if len(args) > 0: process(args) else: - # process(['*'], deleteOnly=False) + process(['*'], deleteOnly=False) mylib.usage(__file__, '[bundle_id] [...]') diff --git a/templates/base.html b/templates/base.html index fc4b958..7dc9b82 100644 --- a/templates/base.html +++ b/templates/base.html @@ -3,9 +3,9 @@ - + #_TITLE_#AppCheck: Privacy Monitor - + @@ -22,7 +22,7 @@