diff --git a/src/README.md b/src/README.md
index 9ea28b0..a0982d2 100644
--- a/src/README.md
+++ b/src/README.md
@@ -49,7 +49,7 @@ digraph Dependency {
   index_meta -> html_index_domains
   "." -> bundle_combine
   bundle_combine -> index_meta
-  bundle_combine -> html_bundle
+  index_meta -> html_bundle
   bundle_combine -> index_domains
   index_domains -> html_index_domains
   "." -> tracker_download
diff --git a/src/bundle_combine.py b/src/bundle_combine.py
index 6c73637..e5ad53b 100755
--- a/src/bundle_combine.py
+++ b/src/bundle_combine.py
@@ -4,33 +4,25 @@ import os
 import re
 import sys
 import common_lib as mylib
-import tracker_download
+import tracker_download  # is_tracker
 
 THRESHOLD_PERCENT_OF_LOGS = 0.33  # domain appears in % recordings
 THRESHOLD_MIN_AVG_LOGS = 0.4  # at least x times in total (after %-thresh)
-level3_doms = None
 re_domain = re.compile(r'[^a-zA-Z0-9.-]')
 
 
-def dom_in_3rd_domain(needle):
-    global level3_doms
-    if not level3_doms:
-        level3_doms = mylib.read_list('3rd-domains.txt')
-    return mylib.bintree_lookup(level3_doms, needle)
+def fname_combined(bundle_id):
+    return mylib.path_data_app(bundle_id, 'combined.json')
 
 
-def get_parent_domain(subdomain):
-    parts = subdomain.split('.')
-    if len(parts) < 3:
-        return subdomain
-    elif parts[-1].isdigit():
-        return subdomain  # ip address
-    elif dom_in_3rd_domain(parts[-1] + '.' + parts[-2]):
-        return '.'.join(parts[-3:])
-    else:
-        return '.'.join(parts[-2:])
+def fname_evaluated(bundle_id):
+    return mylib.path_data_app(bundle_id, 'evaluated.json')
+
+
+def get_evaluated(bundle_id):
+    return mylib.json_read(fname_evaluated(bundle_id))
 
 
 def cleanup_domain_name(domain):
@@ -66,7 +58,7 @@ def json_combine(bundle_id):
             occurs = len(logs[subdomain])
             subdomain = cleanup_domain_name(subdomain)
             inc_dic(subdom, subdomain, occurs)
-            par_dom = get_parent_domain(subdomain)
+            par_dom = mylib.parent_domain(subdomain)
             try:
                 uniq_par[par_dom] += occurs
             except KeyError:
@@ -81,6 +73,24 @@ def json_combine(bundle_id):
     return res
 
 
+def evaluate_domains(ddic, number_of_recordings):
+    res = []
+    c_sum = 0
+    c_trkr = 0
+    for name, (is_tracker, counts) in ddic.items():
+        rec_percent = len(counts) / number_of_recordings
+        if rec_percent < THRESHOLD_PERCENT_OF_LOGS:
+            continue
+        avg = sum(counts) / number_of_recordings  # len(counts)
+        if avg < THRESHOLD_MIN_AVG_LOGS:
+            continue
+        res.append([name, round(avg + 0.001), is_tracker])
+        c_sum += avg
+        c_trkr += avg if is_tracker else 0
+    res.sort(key=lambda x: (-x[1], x[0]))  # sort by count desc, then name
+    return res, c_trkr, c_sum
+
+
 def json_evaluate_inplace(obj):
     def float3(val):
         return int(val * 1000) / 1000
@@ -93,26 +103,8 @@ def json_evaluate_inplace(obj):
     obj['sum_logs_pm'] = float3(obj['sum_logs'] / (time_total or 1) * 60)
     obj['sum_time'] = time_total
     obj['avg_time'] = float3(time_total / rec_count)
-
-    def transform(ddic):
-        res = []
-        c_sum = 0
-        c_trkr = 0
-        for name, (is_tracker, counts) in ddic.items():
-            rec_percent = len(counts) / rec_count
-            if rec_percent < THRESHOLD_PERCENT_OF_LOGS:
-                continue
-            avg = sum(counts) / rec_count  # len(counts)
-            if avg < THRESHOLD_MIN_AVG_LOGS:
-                continue
-            res.append([name, round(avg + 0.001), is_tracker])
-            c_sum += avg
-            c_trkr += avg if is_tracker else 0
-        res.sort(key=lambda x: (-x[1], x[0]))  # sort by count desc, then name
-        return res, c_trkr, c_sum
-
-    obj['pardom'], p_t, p_c = transform(obj['pardom'])
-    obj['subdom'], s_t, s_c = transform(obj['subdom'])
+    obj['pardom'], p_t, p_c = evaluate_domains(obj['pardom'], rec_count)
+    obj['subdom'], s_t, s_c = evaluate_domains(obj['subdom'], rec_count)
     obj['tracker_percent'] = float3(s_t / (s_c or 1))
     obj['avg_logs'] = float3(s_c)
     obj['avg_logs_pm'] = float3(s_c / (obj['avg_time'] or 1) * 60)
@@ -137,9 +129,9 @@ def process(bundle_ids, where=None):
                 break
         if should_update:
             print(' ' + bid)
-            mylib.json_write_combined(bid, obj)
+            mylib.json_write(fname_combined(bid), obj, pretty=False)
             json_evaluate_inplace(obj)
-            mylib.json_write_evaluated(bid, obj)
+            mylib.json_write(fname_evaluated(bid), obj, pretty=False)
             affected_ids.append(bid)
     print('')
     return affected_ids
diff --git a/src/common_lib.py b/src/common_lib.py
index 47f2479..9c59a78 100755
--- a/src/common_lib.py
+++ b/src/common_lib.py
@@ -94,6 +94,9 @@ def printf(msg):
 
 # Binary Tree Search
 
+_list_TLD = None
+
+
 def read_list(list_name):
     path = path_root('src', 'lists', list_name)
     if not file_exists(path):
@@ -118,6 +121,24 @@ def bintree_lookup(tree, needle):
     return False  # -1
 
 
+def parent_domain(subdomain):
+    def is_third_level(needle):
+        global _list_TLD
+        if not _list_TLD:
+            _list_TLD = read_list('3rd-domains.txt')
+        return bintree_lookup(_list_TLD, needle)
+
+    parts = subdomain.split('.')
+    if len(parts) < 3:
+        return subdomain
+    elif parts[-1].isdigit():
+        return subdomain  # ip address
+    elif is_third_level(parts[-1] + '.' + parts[-2]):
+        return '.'.join(parts[-3:])
+    else:
+        return '.'.join(parts[-2:])
+
+
 # Filesystem
 
 def mkdir(path):
@@ -252,27 +273,6 @@ def json_read(path):
         return json.load(fp)
 
 
-def json_read_combined(bundle_id):
-    return json_read(path_data_app(bundle_id, 'combined.json'))
-
-
-def json_read_evaluated(bundle_id):
-    pth = path_data_app(bundle_id, 'evaluated.json')
-    return json_read(pth), pth
-
-
-# JSON write
-
 def json_write(path, obj, pretty=False):
     with open(path, 'w') as fp:
         json.dump(obj, fp, indent=2 if pretty else None, sort_keys=pretty)
-
-
-def json_write_combined(bundle_id, obj):
-    fname = path_data_app(bundle_id, 'combined.json')
-    json_write(fname, obj, pretty=False)
-
-
-def json_write_evaluated(bundle_id, obj):
-    fname = path_data_app(bundle_id, 'evaluated.json')
-    json_write(fname, obj, pretty=False)
diff --git a/src/html_bundle.py b/src/html_bundle.py
index e28c500..341d599 100755
--- a/src/html_bundle.py
+++ b/src/html_bundle.py
@@ -4,9 +4,10 @@ import sys
 import time
 import math
 import common_lib as mylib
-import bundle_download
-import index_app_names
-import index_meta
+import bundle_download  # get_genres
+import bundle_combine  # get_evaluated, fname_evaluated
+import index_app_names  # get_name
+import index_meta  # get_rank
 
 
 def gen_dotgraph(sorted_arr):
@@ -155,17 +156,18 @@ def gen_html(bundle_id, obj):
 
 
 def process(bundle_ids):
-    print('generating html pages ...')
+    print('generating html: apps ...')
     if bundle_ids == ['*']:
         bundle_ids = list(mylib.enum_appids())
     for bid in bundle_ids:
         print(' ' + bid)
-        json, json_data_path = mylib.json_read_evaluated(bid)
+        json = bundle_combine.get_evaluated(bid)
         mylib.mkdir_out_app(bid)
         with open(mylib.path_out_app(bid, 'index.html'), 'w') as fp:
             fp.write(gen_html(bid, json))
-        mylib.symlink(json_data_path, mylib.path_out_app(bid, 'data.json'))
+        mylib.symlink(bundle_combine.fname_evaluated(bid),
+                      mylib.path_out_app(bid, 'data.json'))
     print('')
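Two notes on the refactorings above, with behavior sketches (not part of the patch).

get_parent_domain moves to common_lib.parent_domain so that other modules no longer
import bundle_combine just for domain parsing. A usage sketch, assuming the list file
src/lists/3rd-domains.txt stores the reversed label pair (e.g. 'uk.co') to match the
parts[-1] + '.' + parts[-2] needle:

    import common_lib as mylib

    mylib.parent_domain('tracker.example.com')  # -> 'example.com'
    mylib.parent_domain('shop.amazon.co.uk')    # -> 'amazon.co.uk' (3rd-level zone)
    mylib.parent_domain('192.168.0.1')          # -> '192.168.0.1'  (numeric last label: IP)
    mylib.parent_domain('localhost')            # -> 'localhost'    (fewer than 3 labels)

The transform closure becomes the module-level evaluate_domains; the only change is
that the recording count is now a parameter. A worked example of the two thresholds
(sample data invented):

    # 10 recordings; each counts entry is the occurrences in one recording
    ddic = {
        'ads.example.com':  (True,  [2, 1, 3, 2]),  # seen in 4 of 10 recordings
        'rare.example.com': (False, [5]),           # seen in 1 of 10 recordings
    }
    rows, c_trkr, c_sum = evaluate_domains(ddic, 10)
    # 'rare.example.com' fails THRESHOLD_PERCENT_OF_LOGS (0.10 < 0.33);
    # 'ads.example.com' passes both checks (0.40 >= 0.33, avg 0.8 >= 0.4).
    # rows == [['ads.example.com', 1, True]], c_trkr == c_sum == 0.8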
diff --git a/src/html_index_apps.py b/src/html_index_apps.py
index a8257fc..3835c03 100755
--- a/src/html_index_apps.py
+++ b/src/html_index_apps.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 import common_lib as mylib
-import index_app_names
+import index_app_names  # get_name
 
 
 def gen_obj(bundle_id):
@@ -60,7 +60,7 @@ def gen_page(arr, base, page_id=1, total=1):
 
 
 def process(per_page=60):
-    print('generating app-index ...')
+    print('generating html: app-index ...')
     index_dir = mylib.path_out('index', 'apps')
     mylib.rm_dir(index_dir)
     mylib.mkdir(index_dir)
diff --git a/src/html_index_domains.py b/src/html_index_domains.py
index 5714a4f..c228739 100755
--- a/src/html_index_domains.py
+++ b/src/html_index_domains.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 
 import common_lib as mylib
-import index_app_names
+import index_app_names  # get_name
 import index_domains
-import index_meta
+import index_meta  # get_total_counts
 
 
 def a_app(bundle_id):
@@ -117,6 +117,10 @@ def gen_html_lookup(html_dir, json, key, title):
 
 def gen_html_stats(c_apps, c_domains):
     [c_recordings, c_logs] = index_meta.get_total_counts()
+    print(' {} apps'.format(c_apps))
+    print(' {} domains'.format(c_domains))
+    print(' {} recordings'.format(c_recordings))
+    print(' {} logs'.format(c_logs))
     title = 'Statistics'
     mylib.mkdir(mylib.path_out('stats'))
     with open(mylib.path_out('stats', 'index.html'), 'w') as fp:
@@ -137,7 +141,7 @@ def gen_html_stats(c_apps, c_domains):
 
 def process():
     # bundle_combine assures domain name is [a-zA-Z0-9.-]
-    print('generating domain-index ...')
+    print('generating html: domain-index ...')
     # Data export
     all_dom_dir = mylib.path_out('index', 'domains', 'all')
     trkr_dir = mylib.path_out('index', 'domains', 'tracker')
diff --git a/src/index_app_names.py b/src/index_app_names.py
index 9ef0453..ba27347 100755
--- a/src/index_app_names.py
+++ b/src/index_app_names.py
@@ -2,6 +2,7 @@
 import sys
 
 import common_lib as mylib
+import bundle_download  # app_names
 
 _bundle_name_dict = None
 
@@ -25,7 +26,7 @@ def load_json_if_not_already():
 
 
 def write_json_to_disk():
-    mylib.json_write(index_fname(), _bundle_name_dict, pretty=True)
+    mylib.json_write(index_fname(), _bundle_name_dict, pretty=False)
 
 
 def get_name(bundle_id, langs=['us', 'de'], fallback='< App-Name >'):
@@ -39,7 +40,7 @@ def get_name(bundle_id, langs=['us', 'de'], fallback='< App-Name >'):
 
 
 def process(bundle_ids):
-    print('writing index: bundle name ...')
+    print('writing index: app names ...')
     if bundle_ids == ['*']:
         bundle_ids = list(mylib.enum_data_appids())
         print(' full reset')
@@ -48,7 +49,7 @@ def process(bundle_ids):
     load_json_if_not_already()
     did_change = False
     for bid in bundle_ids:
-        names = mylib.app_names(bid)
+        names = bundle_download.app_names(bid)
        if not names:
             mylib.err('index-app-names', 'could not load: {}'.format(bid))
             continue
diff --git a/src/index_domains.py b/src/index_domains.py
index 128d980..b53b236 100755
--- a/src/index_domains.py
+++ b/src/index_domains.py
@@ -2,8 +2,8 @@
 import sys
 
 import common_lib as mylib
-import bundle_combine
-import tracker_download
+import bundle_combine  # get_evaluated
+import tracker_download  # is_tracker
 
 
 def fname_all():
@@ -55,7 +55,7 @@ def insert_in_index(index, bundle_ids):
         except ValueError:  # index not found
             i = len(index['bundle'])
             index['bundle'].append(bid)
-        json, _ = mylib.json_read_evaluated(bid)
+        json = bundle_combine.get_evaluated(bid)
         for key in ['pardom', 'subdom']:  # assuming keys are identical
             for domain, _, _ in json[key]:
                 try:
@@ -72,7 +72,7 @@ def filter_tracker_only(index):
     for domain, ids in filter(lambda x: tracker_download.is_tracker(x[0]),
                               index['subdom'].items()):
         sub_trkr[domain] = ids
-        pardom = bundle_combine.get_parent_domain(domain)
+        pardom = mylib.parent_domain(domain)
         try:
             par_trkr[pardom].update(ids)
         except KeyError:
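For context, the index that insert_in_index fills and filter_tracker_only consumes
has roughly this shape (inferred from the hunks above; the sample values are invented):

    index = {
        'bundle': ['com.app.one', 'com.app.two'],   # list position serves as app id
        'pardom': {'example.com': {0, 1}},          # domain -> set of app positions
        'subdom': {'ads.example.com': {0, 1},
                   'cdn.example.com': {0}},
    }
    # filter_tracker_only keeps the subdom entries for which
    # tracker_download.is_tracker(domain) is true and re-groups their ids
    # under mylib.parent_domain(domain) -- the call that previously required
    # importing bundle_combine here.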
diff --git a/src/index_meta.py b/src/index_meta.py
index 9175873..dfb9cb0 100755
--- a/src/index_meta.py
+++ b/src/index_meta.py
@@ -2,6 +2,7 @@
 import sys
 
 import common_lib as mylib
+import bundle_combine  # get_evaluated
 
 _rank_dict = None
 
@@ -65,8 +66,7 @@ def write_summary_index(index, bundle_ids, deleteOnly=False):
         if deleteOnly:
             continue
         # set new value
-        evaluated_json, _ = mylib.json_read_evaluated(bid)
-        index[bid] = json_to_list(evaluated_json)
+        index[bid] = json_to_list(bundle_combine.get_evaluated(bid))
 
     # sum of counts
     try_del(index, ['_sum'])
@@ -82,26 +82,27 @@ def write_rank_index(index):
     try_del(index, ['_sum', '_ranks', '_min', '_max'])
     mins = []
     maxs = []
-    for i in range(11):  # equal to number of array entries
-        tmp = {}
-        # make temporary reverse index
-        for bid, val in index.items():
-            try:
-                tmp[val[i]].append(bid)
-            except KeyError:
-                tmp[val[i]] = [bid]
-        # read index position from temp reverse index
-        r = 1
-        ordered = sorted(tmp.items(), reverse=i in [0, 3, 6, 7])
-        for idx, (_, ids) in enumerate(ordered):
-            for bid in ids:
-                index[bid][i] = r
-            r += len(ids)
-        mins.append(ordered[0][0])
-        maxs.append(ordered[-1][0])
+    if len(index) > 0:
+        for i in range(11):  # equal to number of array entries
+            tmp = {}
+            # make temporary reverse index
+            for bid, val in index.items():
+                try:
+                    tmp[val[i]].append(bid)
+                except KeyError:
+                    tmp[val[i]] = [bid]
+            # read index position from temp reverse index
+            r = 1
+            ordered = sorted(tmp.items(), reverse=i in [0, 3, 6, 7])
+            for idx, (_, ids) in enumerate(ordered):
+                for bid in ids:
+                    index[bid][i] = r
+                r += len(ids)
+            mins.append(ordered[0][0])
+            maxs.append(ordered[-1][0])
+    index['_ranks'] = len(index)
     index['_min'] = mins
     index['_max'] = maxs
-    index['_ranks'] = len(index)
     mylib.json_write(fname_app_rank(), index, pretty=False)
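The new len(index) > 0 guard matters because an empty index made ordered[0][0] raise
an IndexError; _ranks, _min and _max are now written in every case. The per-column
rank assignment itself is unchanged. A standalone sketch of its tie handling, reduced
to one column instead of eleven:

    index = {'a': [5], 'b': [3], 'c': [3]}
    tmp = {}
    for bid, val in index.items():
        tmp.setdefault(val[0], []).append(bid)  # reverse index: value -> ids
    r = 1
    for _, ids in sorted(tmp.items()):          # ascending; some columns sort reversed
        for bid in ids:
            index[bid][0] = r                   # tied apps share a rank
        r += len(ids)                           # next rank skips the tie group
    assert index == {'a': [3], 'b': [1], 'c': [1]}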
diff --git a/src/main.py b/src/main.py
index 5c544af..2a47b54 100755
--- a/src/main.py
+++ b/src/main.py
@@ -35,18 +35,10 @@ def rebuild_app_index_html(inclRoot=False):
 
 
 def rebuild_domain_index(bundle_ids, deleteOnly=False):
-    index_meta.process(bundle_ids, deleteOnly=deleteOnly)
     index_domains.process(bundle_ids, deleteOnly=deleteOnly)
     html_index_domains.process()
 
 
-def rebuild_name_index(new_ids):
-    if index_app_names.missing():
-        index_app_names.process(['*'])
-    elif len(new_ids) > 0:
-        index_app_names.process(new_ids)  # after bundle_download
-
-
 def del_id(bundle_ids):
     print('removing apps from website:')
     if bundle_ids == ['*']:
@@ -60,30 +52,35 @@ def del_id(bundle_ids):
             mylib.rm_dir(dest)
             update_app_index = True
     print('')
+    index_meta.process(bundle_ids, deleteOnly=True)
     rebuild_domain_index(bundle_ids, deleteOnly=True)
     if update_app_index:
         rebuild_app_index_html(inclRoot=True)
 
 
 def combine_and_update(bundle_ids, where=None):
+    def star_reset(ids):
+        # special case needed. '*' will force rebuilt index
+        return ['*'] if not where and bundle_ids == ['*'] else ids
     # 1. download meta data from iTunes store, incl. app icons
     new_ids = bundle_download.process(bundle_ids)
+    new_ids = star_reset(new_ids)
     # 2. if new apps, update bundle name index
-    rebuild_name_index(new_ids)  # after bundle_download
+    if len(new_ids) > 0:
+        index_app_names.process(new_ids)  # after bundle_download
     # 3. re-calculate combined.json and evaluated.json files
     affected = bundle_combine.process(bundle_ids, where=where)
-    # special case needed for domain index. '*' will force rebuilt index
-    if not where and bundle_ids == ['*']:
-        affected = ['*']
+    affected = star_reset(affected)
     # 4. was any json updated? if so, make html and update domain index
     if len(affected) > 0:
-        rebuild_domain_index(affected)  # after bundle_combine
+        index_meta.process(bundle_ids)  # after bundle_combine
         html_bundle.process(affected)  # after index_app_names
+        rebuild_domain_index(affected)  # after bundle_combine
     else:
         print('no bundle affected by tracker, not generating bundle html')
     # 5. make all apps index
     if len(new_ids) > 0:
-        rebuild_app_index_html()  # must be called after bundle_combine
+        rebuild_app_index_html()  # after bundle_combine
     else:
         print('no new bundle, not rebuilding index')
@@ -135,6 +132,7 @@ try:
         if bundle_download.download_missing_icons(force=False):
             rebuild_app_index_html()
     elif cmd == 'index':
+        index_meta.process(['*'])
         rebuild_domain_index(['*'])
         rebuild_app_index_html(inclRoot=True)
     elif cmd == 'run':
@@ -145,6 +143,8 @@ try:
         if len(params) == 0:
             print_usage_and_exit()
         del_id(params)  # ['_manually']
+    else:
+        print_usage_and_exit()
 except Exception:
     mylib.err('critical', traceback.format_exc(), logOnly=True)
     raise
diff --git a/src/z_dependency.svg b/src/z_dependency.svg
index d7bcaab..32b4bfa 100644
--- a/src/z_dependency.svg
+++ b/src/z_dependency.svg
[SVG diff omitted: z_dependency.svg is the Graphviz rendering of the dot graph in
src/README.md and was regenerated for this change. The only semantic difference
mirrors the README edit above: the edge bundle_combine -> html_bundle is replaced
by index_meta -> html_bundle; the remaining markup changes are re-rendering churn.]
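Closing note: the star_reset helper in main.py centralizes a special case that
previously applied only to the domain index. A '*' run without a where-filter must
propagate '*' downstream, because each pipeline step returns only the ids it actually
touched. A standalone sketch of the rule (parameter names invented):

    def star_reset(ids, bundle_ids, where):
        # '*' without a filter forces a full downstream rebuild
        return ['*'] if not where and bundle_ids == ['*'] else ids

    assert star_reset(['com.a'], ['*'], None) == ['*']
    assert star_reset(['com.a'], ['*'], lambda obj: True) == ['com.a']
    assert star_reset(['com.a'], ['com.a'], None) == ['com.a']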