Refactor domain index, introducing tracker index
This commit is contained in:
@@ -66,14 +66,14 @@ footer .links {
|
||||
cursor: pointer;
|
||||
}
|
||||
.dropdown { display: inline-block; position: relative; }
|
||||
.dropdown div {
|
||||
.dropdown nav {
|
||||
display: none;
|
||||
position: absolute;
|
||||
width: max-content;
|
||||
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
|
||||
z-index: 1;
|
||||
}
|
||||
.dropdown:hover div { display: block; }
|
||||
.dropdown:hover nav { display: block; }
|
||||
.dropdown a { display: block; padding: 0.5em 1em; }
|
||||
.dropdown a:hover { background-color: #eee; }
|
||||
|
||||
|
||||
@@ -24,11 +24,11 @@ def dropdown_choose(button):
|
||||
<label for="dropdown">Choose list:</label>
|
||||
<div class="dropdown" name="dropdown">
|
||||
<button class="bg1 border">{button}</button>
|
||||
<div class="bg1 no_ul_all">
|
||||
<nav class="bg1 no_ul_all">
|
||||
<a href="index.html">Most frequent</a>
|
||||
<a href="by_name.html">Full list (A–Z)</a>
|
||||
<a href="by_count.html">Full list (by count)</a>
|
||||
</div>
|
||||
</nav>
|
||||
</div>'''
|
||||
|
||||
|
||||
@@ -54,7 +54,7 @@ def gen_html_index(l1, l2, fname, title, button):
|
||||
title=title))
|
||||
|
||||
|
||||
def gen_html_top_domains(subset, fname, total, title):
|
||||
def gen_html_top_10(subset, fname, total, title):
|
||||
|
||||
def div_loadbar(percent):
|
||||
return '<span class="loadbar"><span style="width: {0}%">{0}%</span></span>'.format(percent)
|
||||
@@ -64,7 +64,7 @@ def gen_html_top_domains(subset, fname, total, title):
|
||||
<div id="dom-top10" class="found-in">
|
||||
<h2>{ title }</h2>'''
|
||||
for dom, ids in subset:
|
||||
dom_str = div_dom(dom, len(ids), 'subdomain')
|
||||
dom_str = div_dom(dom, len(ids), 'domain')
|
||||
pct_bar = div_loadbar(round(len(ids) / total * 100))
|
||||
txt += f'\n<p>{dom_str} {pct_bar}</p>'
|
||||
fp.write(mylib.template_with_base(txt + '''
|
||||
@@ -72,10 +72,28 @@ def gen_html_top_domains(subset, fname, total, title):
|
||||
sorted by <a class="snd" href="by_count.html">Occurrence frequency</a>
|
||||
or in <a class="snd" href="by_name.html">Alphabetical order</a>.</p>
|
||||
</div>
|
||||
<p class="right snd">Download: <a href="data.json" download="appcheck_domains_full.json">json</a></p>
|
||||
<p class="right snd">Download: <a href="data.json" download="domains.json">json</a></p>
|
||||
''', title=title))
|
||||
|
||||
|
||||
def gen_html_trinity(json, idx_dir, app_count, title):
|
||||
# Full list (A–Z)
|
||||
list1 = sorted(json['subdom'].items(), key=lambda x: x[0])
|
||||
list2 = sorted(json['pardom'].items(), key=lambda x: x[0])
|
||||
gen_html_index(list1, list2, mylib.path_add(idx_dir, 'by_name.html'),
|
||||
title='{} (A–Z)'.format(title),
|
||||
button='Full list (A–Z)')
|
||||
# Full list (by count)
|
||||
list1.sort(key=lambda x: -len(x[1]))
|
||||
list2.sort(key=lambda x: -len(x[1]))
|
||||
gen_html_index(list1, list2, mylib.path_add(idx_dir, 'by_count.html'),
|
||||
title='{} (most apps)'.format(title),
|
||||
button='Full list (by count)')
|
||||
# Top 25
|
||||
gen_html_top_10(list2[:25], mylib.path_add(idx_dir, 'index.html'),
|
||||
app_count, title='Top 25 {}'.format(title))
|
||||
|
||||
|
||||
def gen_html_lookup(html_dir, json, key, title):
|
||||
mylib.mkdir(html_dir)
|
||||
names = [[x, index_bundle_names.get_name(x)] for x in json['bundle']]
|
||||
@@ -97,43 +115,34 @@ def gen_html_lookup(html_dir, json, key, title):
|
||||
def process():
|
||||
# bundle_combine ensures domain name is [a-zA-Z0-9.-]
|
||||
print('generating reverse-domain-index ...')
|
||||
idx_dir = mylib.path_out('index', 'domains')
|
||||
mylib.mkdir(idx_dir)
|
||||
|
||||
# Data export
|
||||
mylib.symlink(mylib.path_data_index('reverse_domains.json'),
|
||||
mylib.path_out_app(idx_dir, 'data.json'))
|
||||
|
||||
par_arr = list(index_reverse_domains.enumerate('pardom'))
|
||||
sub_arr = list(index_reverse_domains.enumerate('subdom'))
|
||||
|
||||
# Full list (A–Z)
|
||||
sub_arr.sort(key=lambda x: x[0])
|
||||
par_arr.sort(key=lambda x: x[0])
|
||||
gen_html_index(sub_arr, par_arr, mylib.path_add(idx_dir, 'by_name.html'),
|
||||
title='Requested Domains (A–Z)',
|
||||
button='Full list (A–Z)')
|
||||
|
||||
# Full list (by count)
|
||||
sub_arr.sort(key=lambda x: -len(x[1]))
|
||||
par_arr.sort(key=lambda x: -len(x[1]))
|
||||
gen_html_index(sub_arr, par_arr, mylib.path_add(idx_dir, 'by_count.html'),
|
||||
title='Requested Domains (most apps)',
|
||||
button='Full list (by count)')
|
||||
|
||||
# Top 10
|
||||
del(sub_arr[20:])
|
||||
del(par_arr)
|
||||
total = index_reverse_domains.number_of_apps()
|
||||
gen_html_top_domains(sub_arr, mylib.path_add(idx_dir, 'index.html'),
|
||||
total, 'Top 20 Requested Domains')
|
||||
all_dom_dir = mylib.path_out('index', 'domains', 'all')
|
||||
trkr_dir = mylib.path_out('index', 'domains', 'tracker')
|
||||
mylib.mkdir(all_dom_dir)
|
||||
mylib.mkdir(trkr_dir)
|
||||
mylib.symlink(index_reverse_domains.fname_all(),
|
||||
mylib.path_out_app(all_dom_dir, 'data.json'))
|
||||
mylib.symlink(index_reverse_domains.fname_tracker(),
|
||||
mylib.path_out_app(trkr_dir, 'data.json'))
|
||||
|
||||
# Load
|
||||
json = index_reverse_domains.load()
|
||||
app_count = index_reverse_domains.number_of_apps(json)
|
||||
# Lookup
|
||||
json = index_reverse_domains.raw()
|
||||
gen_html_lookup(mylib.path_out('domain'), json, 'pardom',
|
||||
title='Domain Lookup')
|
||||
gen_html_lookup(mylib.path_out('subdomain'), json, 'subdom',
|
||||
title='Subdomain Lookup')
|
||||
# All domains
|
||||
index_reverse_domains.enrich_with_bundle_ids(json)
|
||||
gen_html_trinity(json, all_dom_dir, app_count,
|
||||
title='Requested Domains')
|
||||
# Tracker only
|
||||
json = index_reverse_domains.load(tracker=True)
|
||||
index_reverse_domains.enrich_with_bundle_ids(json)
|
||||
gen_html_trinity(json, trkr_dir, app_count,
|
||||
title='Tracker')
|
||||
print('')
|
||||
|
||||
|
||||
|
||||
@@ -2,104 +2,122 @@
|
||||
|
||||
import sys
|
||||
import common_lib as mylib
|
||||
|
||||
_reverse_domain_dict = None
|
||||
import bundle_combine
|
||||
import tracker_download
|
||||
|
||||
|
||||
def index_fname():
|
||||
return mylib.path_data_index('reverse_domains.json')
|
||||
def fname_all():
|
||||
return mylib.path_data_index('all_domains.json')
|
||||
|
||||
|
||||
def load_json_if_not_already():
|
||||
global _reverse_domain_dict
|
||||
if not _reverse_domain_dict:
|
||||
index_file = index_fname()
|
||||
if mylib.file_exists(index_file):
|
||||
_reverse_domain_dict = mylib.json_read(index_file)
|
||||
else:
|
||||
_reverse_domain_dict = {'bundle': [], 'pardom': {}, 'subdom': {}}
|
||||
def fname_tracker():
|
||||
return mylib.path_data_index('tracker_domains.json')
|
||||
|
||||
|
||||
def write_json_to_disk():
|
||||
mylib.json_write(index_fname(), _reverse_domain_dict, pretty=False)
|
||||
def index_fname(tracker_only=False):
|
||||
return mylib.path_data_index(
|
||||
'tracker_domains.json' if tracker_only else 'all_domains.json')
|
||||
|
||||
|
||||
def delete_from_index(bundle_ids, deleteOnly=False):
|
||||
global _reverse_domain_dict
|
||||
def load_json_from_disk(index_file):
|
||||
if mylib.file_exists(index_file):
|
||||
return mylib.json_read(index_file)
|
||||
else:
|
||||
return {'bundle': [], 'pardom': {}, 'subdom': {}}
|
||||
|
||||
|
||||
def delete_from_index(index, bundle_ids, deleteOnly=False):
|
||||
ids_to_delete = set()
|
||||
for bid in bundle_ids:
|
||||
try:
|
||||
i = _reverse_domain_dict['bundle'].index(bid)
|
||||
i = index['bundle'].index(bid)
|
||||
except ValueError: # index not found
|
||||
continue
|
||||
ids_to_delete.add(i)
|
||||
if deleteOnly:
|
||||
_reverse_domain_dict['bundle'][i] = '_'
|
||||
index['bundle'][i] = '_'
|
||||
|
||||
if len(ids_to_delete) == 0:
|
||||
return False
|
||||
|
||||
for key in ['pardom', 'subdom']:
|
||||
for domain in list(_reverse_domain_dict[key].keys()):
|
||||
for domain in list(index[key].keys()):
|
||||
for i in ids_to_delete:
|
||||
try:
|
||||
_reverse_domain_dict[key][domain].remove(i)
|
||||
index[key][domain].remove(i)
|
||||
except ValueError: # ignore if not present
|
||||
continue
|
||||
if not _reverse_domain_dict[key][domain]:
|
||||
del(_reverse_domain_dict[key][domain])
|
||||
if not index[key][domain]:
|
||||
del(index[key][domain])
|
||||
return True
|
||||
|
||||
|
||||
def insert_in_index(bundle_ids):
|
||||
global _reverse_domain_dict
|
||||
def insert_in_index(index, bundle_ids):
|
||||
has_changes = False
|
||||
for bid in bundle_ids:
|
||||
try:
|
||||
i = _reverse_domain_dict['bundle'].index(bid)
|
||||
i = index['bundle'].index(bid)
|
||||
except ValueError: # index not found
|
||||
i = len(_reverse_domain_dict['bundle'])
|
||||
_reverse_domain_dict['bundle'].append(bid)
|
||||
i = len(index['bundle'])
|
||||
index['bundle'].append(bid)
|
||||
json, _ = mylib.json_read_evaluated(bid)
|
||||
for key in ['pardom', 'subdom']: # assuming keys are identical
|
||||
for domain, _, _ in json[key]:
|
||||
try:
|
||||
_reverse_domain_dict[key][domain].append(i)
|
||||
index[key][domain].append(i)
|
||||
except KeyError:
|
||||
_reverse_domain_dict[key][domain] = [i]
|
||||
index[key][domain] = [i]
|
||||
has_changes = True
|
||||
return has_changes
|
||||
|
||||
|
||||
def raw():
|
||||
load_json_if_not_already()
|
||||
return _reverse_domain_dict
|
||||
def filter_tracker_only(index):
|
||||
sub_trkr = {}
|
||||
par_trkr = {}
|
||||
for domain, ids in filter(lambda x: tracker_download.is_tracker(x[0]),
|
||||
index['subdom'].items()):
|
||||
sub_trkr[domain] = ids
|
||||
pardom = bundle_combine.get_parent_domain(domain)
|
||||
try:
|
||||
par_trkr[pardom].update(ids)
|
||||
except KeyError:
|
||||
par_trkr[pardom] = set(ids)
|
||||
for dom, ids in par_trkr.items():
|
||||
par_trkr[dom] = list(ids)
|
||||
index['subdom'] = sub_trkr
|
||||
index['pardom'] = par_trkr
|
||||
|
||||
|
||||
def number_of_apps():
|
||||
load_json_if_not_already()
|
||||
return sum(1 for x in _reverse_domain_dict['bundle'] if x != '_')
|
||||
def load(tracker=False):
|
||||
return load_json_from_disk(fname_tracker() if tracker else fname_all())
|
||||
|
||||
|
||||
def enumerate(key):
|
||||
load_json_if_not_already()
|
||||
for dom, bundles in _reverse_domain_dict[key].items():
|
||||
yield [dom, [_reverse_domain_dict['bundle'][i] for i in bundles]]
|
||||
def number_of_apps(index):
|
||||
return sum(1 for x in index['bundle'] if x != '_')
|
||||
|
||||
|
||||
def enrich_with_bundle_ids(index):
|
||||
for key in ['pardom', 'subdom']:
|
||||
for dom, ids in index[key].items():
|
||||
index[key][dom] = [index['bundle'][i] for i in ids]
|
||||
|
||||
|
||||
def process(bundle_ids, deleteOnly=False):
|
||||
print('writing index: reverse domains ...')
|
||||
fname = fname_all()
|
||||
if bundle_ids == ['*']:
|
||||
bundle_ids = list(mylib.enum_data_appids())
|
||||
print(' full reset')
|
||||
mylib.rm_file(index_fname()) # rebuild from ground up
|
||||
mylib.rm_file(fname) # rebuild from ground up
|
||||
|
||||
load_json_if_not_already()
|
||||
did_change = delete_from_index(bundle_ids, deleteOnly=deleteOnly)
|
||||
index = load_json_from_disk(fname)
|
||||
did_change = delete_from_index(index, bundle_ids, deleteOnly=deleteOnly)
|
||||
if not deleteOnly:
|
||||
did_change |= insert_in_index(bundle_ids)
|
||||
did_change |= insert_in_index(index, bundle_ids)
|
||||
if did_change:
|
||||
write_json_to_disk()
|
||||
mylib.json_write(fname, index, pretty=False)
|
||||
filter_tracker_only(index)
|
||||
mylib.json_write(fname_tracker(), index, pretty=False)
|
||||
else:
|
||||
print(' no change')
|
||||
print('')
|
||||
@@ -110,5 +128,5 @@ if __name__ == '__main__':
|
||||
if len(args) > 0:
|
||||
process(args)
|
||||
else:
|
||||
# process(['*'], deleteOnly=False)
|
||||
process(['*'], deleteOnly=False)
|
||||
mylib.usage(__file__, '[bundle_id] [...]')
|
||||
|
||||
@@ -3,9 +3,9 @@
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=0.75" />
|
||||
<script type="text/javascript" src="/static/script.js?7"></script>
|
||||
<script type="text/javascript" src="/static/script.js?8"></script>
|
||||
<title>#_TITLE_#AppCheck: Privacy Monitor</title>
|
||||
<link rel="stylesheet" type="text/css" href="/static/style.css?7">
|
||||
<link rel="stylesheet" type="text/css" href="/static/style.css?8">
|
||||
<link rel="stylesheet" type="text/css" href="/static/fonts/font.css">
|
||||
<link rel="apple-touch-icon" sizes="180x180" href="/static/favicon/apple-touch-icon.png">
|
||||
<link rel="icon" type="image/png" sizes="32x32" href="/static/favicon/favicon-32x32.png">
|
||||
@@ -22,7 +22,7 @@
|
||||
<nav>
|
||||
<ul id="main-nav">
|
||||
<li><a href="/index/page/1/">All Apps</a></li>
|
||||
<li><a href="/index/domains/">Domains</a></li>
|
||||
<li><a href="/index/domains/all/">Domains</a></li>
|
||||
<li><a href="/help/">Help needed</a></li>
|
||||
<li><a class="no-ul" href="https://github.com/relikd/appcheck" target="_blank"><img src="/static/github.svg" alt="GitHub"></a></li>
|
||||
</ul>
|
||||
|
||||
Reference in New Issue
Block a user