Highly-used domains
This commit is contained in:
@@ -82,7 +82,7 @@ def gen_html_trinity(idx_dir, app_count, json, title, symlink):
|
|||||||
# Full list (by count)
|
# Full list (by count)
|
||||||
list1.sort(key=lambda x: -x[1])
|
list1.sort(key=lambda x: -x[1])
|
||||||
list2.sort(key=lambda x: -x[1])
|
list2.sort(key=lambda x: -x[1])
|
||||||
write_index('by_count.html', title='{} (most apps)'.format(title),
|
write_index('by_count.html', title='{} (by count)'.format(title),
|
||||||
button='Full list (by count)')
|
button='Full list (by count)')
|
||||||
# Top 10
|
# Top 10
|
||||||
gen_html_top_10(idx_dir, list2[:25], app_count, 'Top 25 {}'.format(title))
|
gen_html_top_10(idx_dir, list2[:25], app_count, 'Top 25 {}'.format(title))
|
||||||
@@ -113,7 +113,7 @@ def gen_lookup(html_dir, doms_dict, names_dict, title):
|
|||||||
def process():
|
def process():
|
||||||
# bundle_combine assures domain name is [a-zA-Z0-9.-]
|
# bundle_combine assures domain name is [a-zA-Z0-9.-]
|
||||||
print('generating html: domain-index ...')
|
print('generating html: domain-index ...')
|
||||||
json = index_domains.load()
|
json = index_domains.loadAll()
|
||||||
app_count = index_domains.number_of_apps(json)
|
app_count = index_domains.number_of_apps(json)
|
||||||
dom_count = len(json['subdom'])
|
dom_count = len(json['subdom'])
|
||||||
|
|
||||||
@@ -133,8 +133,14 @@ def process():
|
|||||||
|
|
||||||
print(' Trackers Only')
|
print(' Trackers Only')
|
||||||
gen_html_trinity(mylib.path_out('index', 'domains', 'tracker'), app_count,
|
gen_html_trinity(mylib.path_out('index', 'domains', 'tracker'), app_count,
|
||||||
json=index_domains.load(tracker=True), title='Tracker',
|
json=index_domains.loadTracker(), title='Tracker',
|
||||||
symlink=index_domains.fname_tracker())
|
symlink=index_domains.fname_tracker())
|
||||||
|
|
||||||
|
print(' Highly Used')
|
||||||
|
gen_html_trinity(mylib.path_out('index', 'domains', 'highly-used'),
|
||||||
|
app_count, json=index_domains.loadNonTracker(),
|
||||||
|
title='Highly Used Domains',
|
||||||
|
symlink=index_domains.fname_no_tracker())
|
||||||
print('')
|
print('')
|
||||||
return app_count, dom_count
|
return app_count, dom_count
|
||||||
|
|
||||||
|
|||||||
@@ -90,8 +90,9 @@ def gen_results(base_dir, c_apps, c_domains, title):
|
|||||||
<ul>
|
<ul>
|
||||||
<li>List of <a href="/index/apps/">Apps</a></li>
|
<li>List of <a href="/index/apps/">Apps</a></li>
|
||||||
<li>List of <a href="/category/">Categories</a></li>
|
<li>List of <a href="/category/">Categories</a></li>
|
||||||
<li>List of <a href="/index/domains/all/">Requested Domains</a></li>
|
<li>List of <a href="/index/domains/all/">All Domains</a>,
|
||||||
<li>List of <a href="/index/domains/tracker/">Trackers</a></li>
|
only <a href="/index/domains/tracker/">Trackers</a>,
|
||||||
|
or <a href="/index/domains/highly-used/">Highly-used Domains</a> which appear in at least 5 apps but are not considered tracker <i>yet</i>.</li>
|
||||||
</ul>
|
</ul>
|
||||||
<p>Or compare similar application via custom comparison <a href="/lists/">Lists</a>.</p>
|
<p>Or compare similar application via custom comparison <a href="/lists/">Lists</a>.</p>
|
||||||
'''.format(title, c_apps, c_domains, c_recs, c_logs), title=title)
|
'''.format(title, c_apps, c_domains, c_recs, c_logs), title=title)
|
||||||
|
|||||||
@@ -14,11 +14,27 @@ def fname_tracker():
|
|||||||
return mylib.path_data_index('domains_tracker.json')
|
return mylib.path_data_index('domains_tracker.json')
|
||||||
|
|
||||||
|
|
||||||
|
def fname_no_tracker():
|
||||||
|
return mylib.path_data_index('domains_no_tracker.json')
|
||||||
|
|
||||||
|
|
||||||
def load_json_from_disk(index_file):
|
def load_json_from_disk(index_file):
|
||||||
return mylib.json_safe_read(
|
return mylib.json_safe_read(
|
||||||
index_file, fallback={'bundle': [], 'pardom': {}, 'subdom': {}})
|
index_file, fallback={'bundle': [], 'pardom': {}, 'subdom': {}})
|
||||||
|
|
||||||
|
|
||||||
|
def loadAll():
|
||||||
|
return load_json_from_disk(fname_all())
|
||||||
|
|
||||||
|
|
||||||
|
def loadTracker():
|
||||||
|
return load_json_from_disk(fname_tracker())
|
||||||
|
|
||||||
|
|
||||||
|
def loadNonTracker():
|
||||||
|
return load_json_from_disk(fname_no_tracker())
|
||||||
|
|
||||||
|
|
||||||
def delete_from_index(index, bundle_ids, deleteOnly=False):
|
def delete_from_index(index, bundle_ids, deleteOnly=False):
|
||||||
ids_to_delete = set()
|
ids_to_delete = set()
|
||||||
for bid in bundle_ids:
|
for bid in bundle_ids:
|
||||||
@@ -55,34 +71,43 @@ def insert_in_index(index, bundle_ids):
|
|||||||
index['bundle'].append(bid)
|
index['bundle'].append(bid)
|
||||||
json = bundle_combine.get_evaluated(bid)
|
json = bundle_combine.get_evaluated(bid)
|
||||||
for key in ['pardom', 'subdom']: # assuming keys are identical
|
for key in ['pardom', 'subdom']: # assuming keys are identical
|
||||||
for domain, _, _ in json[key]:
|
for domain, _, is_trkr in json[key]:
|
||||||
try:
|
try:
|
||||||
index[key][domain].append(i)
|
index[key][domain].append(i)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
index[key][domain] = [i]
|
index[key][domain] = [is_trkr, i]
|
||||||
has_changes = True
|
has_changes = True
|
||||||
return has_changes
|
return has_changes
|
||||||
|
|
||||||
|
|
||||||
def filter_tracker_only(index):
|
def split_trackers(index):
|
||||||
sub_trkr = {}
|
ret = {'trkr': {'bundle': index['bundle'], 'subdom': {}, 'pardom': {}},
|
||||||
par_trkr = {}
|
'no-trkr': {'bundle': index['bundle'], 'subdom': {}, 'pardom': {}}}
|
||||||
for domain, ids in filter(lambda x: download_tracker.is_tracker(x[0]),
|
for domain, [is_trkr, *ids] in index['subdom'].items():
|
||||||
index['subdom'].items()):
|
key = 'trkr' if is_trkr else 'no-trkr'
|
||||||
sub_trkr[domain] = ids
|
ret[key]['subdom'][domain] = ids
|
||||||
pardom = mylib.parent_domain(domain)
|
pardom = mylib.parent_domain(domain)
|
||||||
try:
|
try:
|
||||||
par_trkr[pardom].update(ids)
|
ret[key]['pardom'][pardom].update(ids)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
par_trkr[pardom] = set(ids)
|
ret[key]['pardom'][pardom] = set(ids)
|
||||||
for dom, ids in par_trkr.items():
|
for dic in ret.values():
|
||||||
par_trkr[dom] = list(ids)
|
for dom, ids in dic['pardom'].items():
|
||||||
index['subdom'] = sub_trkr
|
dic['pardom'][dom] = list(ids)
|
||||||
index['pardom'] = par_trkr
|
return ret['trkr'], ret['no-trkr']
|
||||||
|
|
||||||
|
|
||||||
def load(tracker=False):
|
def filter_list_at_least(index, min_count):
|
||||||
return load_json_from_disk(fname_tracker() if tracker else fname_all())
|
sub = {}
|
||||||
|
par = {}
|
||||||
|
for domain, ids in index['subdom'].items():
|
||||||
|
if len(ids) >= min_count:
|
||||||
|
sub[domain] = ids
|
||||||
|
for domain, ids in index['pardom'].items():
|
||||||
|
if len(ids) >= min_count:
|
||||||
|
par[domain] = ids
|
||||||
|
index['subdom'] = sub
|
||||||
|
index['pardom'] = par
|
||||||
|
|
||||||
|
|
||||||
def number_of_apps(index):
|
def number_of_apps(index):
|
||||||
@@ -120,8 +145,10 @@ def process(bundle_ids, deleteOnly=False):
|
|||||||
did_change |= insert_in_index(index, ids)
|
did_change |= insert_in_index(index, ids)
|
||||||
if did_change:
|
if did_change:
|
||||||
mylib.json_write(fname, index, pretty=False)
|
mylib.json_write(fname, index, pretty=False)
|
||||||
filter_tracker_only(index)
|
dict_trkr, dict_no_trkr = split_trackers(index)
|
||||||
mylib.json_write(fname_tracker(), index, pretty=False)
|
mylib.json_write(fname_tracker(), dict_trkr, pretty=False)
|
||||||
|
filter_list_at_least(dict_no_trkr, 5) # or 0.1 * len(ids)
|
||||||
|
mylib.json_write(fname_no_tracker(), dict_no_trkr, pretty=False)
|
||||||
else:
|
else:
|
||||||
print(' no change')
|
print(' no change')
|
||||||
print('')
|
print('')
|
||||||
|
|||||||
Reference in New Issue
Block a user