diff --git a/src/bundle_combine.py b/src/bundle_combine.py index 131a9ab..89b75fb 100755 --- a/src/bundle_combine.py +++ b/src/bundle_combine.py @@ -38,18 +38,17 @@ def json_combine(bundle_id): except KeyError: ddic[key] = (tracker.is_tracker(key), [num]) - res = dict({'rec_len': [], 'name': mylib.app_name(bundle_id)}) - pardom = dict() - subdom = dict() + res = {'rec_len': []} + pardom = {} + subdom = {} latest = 0 for fname, jdata in mylib.enum_jsons(bundle_id): + # TODO: load combined and append newest only, then update evaluated latest = max(latest, os.path.getmtime(fname)) # or getctime - # if not res['name']: - # res['name'] = jdata['app-name'] res['rec_len'].append(jdata['duration']) try: logs = jdata['logs'] - uniq_par = dict() + uniq_par = {} for subdomain in logs: occurs = len(logs[subdomain]) inc_dic(subdom, subdomain, occurs) @@ -69,8 +68,6 @@ def json_combine(bundle_id): def json_evaluate_inplace(obj): - if not obj['name']: - obj['name'] = '< App-Name >' rec_count = len(obj['rec_len']) time_total = sum(obj['rec_len']) del(obj['rec_len']) @@ -81,7 +78,7 @@ def json_evaluate_inplace(obj): obj['avg_time'] = time_total / rec_count def transform(ddic): - res = list() + res = [] c_sum = 0 c_trkr = 0 for name, (is_tracker, counts) in ddic.items(): diff --git a/src/common_lib.py b/src/common_lib.py index 8a6fdd9..a25e68e 100755 --- a/src/common_lib.py +++ b/src/common_lib.py @@ -32,6 +32,12 @@ def path_data_app(bundle_id, filename=None): return path_add(pth, filename) if filename else pth +def path_data_index(filename): + pth = path_root('data', '_eval') + mkdir(pth) + return path_add(pth, filename) + + def path_out(*path_components): return path_root('out', *path_components) @@ -76,17 +82,18 @@ def valid_bundle_id(bundle_id): return regex_bundle_id.match(bundle_id) -def app_name(bundle_id, fallback=None): +def app_names(bundle_id): def name_for(lang): try: return json_read_meta(bundle_id, lang)['trackCensoredName'] except Exception: return None + ret = {} for lang in ['us', 'de']: name = name_for(lang) if name: - return name - return fallback + ret[lang] = name + return ret def err(scope, msg, logOnly=False): diff --git a/src/html_bundle.py b/src/html_bundle.py index dd3247a..2c1aaa6 100755 --- a/src/html_bundle.py +++ b/src/html_bundle.py @@ -5,6 +5,7 @@ import sys import time import math import common_lib as mylib +import index_bundle_names def seconds_to_time(seconds): @@ -80,9 +81,10 @@ def gen_dom_tags(sorted_arr, onlyTrackers=False): def gen_html(bundle_id, obj): + name = index_bundle_names.get_name(bundle_id) obj['tracker'] = list(filter(lambda x: x[2], obj['subdom'])) return mylib.template_with_base(f''' -

{obj['name']}

+

{name}

Bundle-id:{ bundle_id }

@@ -122,7 +124,7 @@ def gen_html(bundle_id, obj): { gen_dotgraph(obj['subdom']) } { gen_dom_tags(obj['subdom']) }
-

Download: json

''', title=obj['name']) +

Download: json

''', title=name) def process(bundle_ids): diff --git a/src/html_index.py b/src/html_index.py index 0e4b2ea..03ad290 100755 --- a/src/html_index.py +++ b/src/html_index.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import common_lib as mylib +import index_bundle_names def gen_obj(bundle_id): @@ -10,7 +11,7 @@ def gen_obj(bundle_id): icon = '/static/app-template.svg' return { 'id': bundle_id, - 'name': mylib.app_name(bundle_id, '< App-Name >'), + 'name': index_bundle_names.get_name(bundle_id), 'img': icon } diff --git a/src/index_bundle_names.py b/src/index_bundle_names.py new file mode 100755 index 0000000..d2aba8f --- /dev/null +++ b/src/index_bundle_names.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import sys +import common_lib as mylib + +_bundle_name_dict = None + + +def index_fname(): + return mylib.path_data_index('bundle_names.json') + + +def load_json_if_not_already(): + global _bundle_name_dict + if not _bundle_name_dict: + index_file = index_fname() + if mylib.file_exists(index_file): + _bundle_name_dict = mylib.json_read(index_file) + else: + _bundle_name_dict = {} + + +def write_json_to_disk(): + mylib.json_write(index_fname(), _bundle_name_dict, pretty=True) + + +def get_name(bundle_id, langs=['us', 'de']): + load_json_if_not_already() + for lang in langs: + try: + return _bundle_name_dict[bundle_id][lang] + except KeyError: + continue + return '< App-Name >' # None + + +def process(bundle_ids): + print('writing index: bundle name ...') + if bundle_ids == ['*']: + bundle_ids = list(mylib.enum_data_appids()) + print(' full reset') + mylib.rm_file(index_fname()) # rebuild from ground up + + load_json_if_not_already() + did_change = False + for bid in bundle_ids: + names = mylib.app_names(bid) + if not names: + mylib.err('index-bundle-names', 'could not load: {}'.format(bid)) + continue + _bundle_name_dict[bid] = names + did_change = True + if did_change: + write_json_to_disk() + else: + print(' no change') + print('') + + +if __name__ == '__main__': + args = sys.argv[1:] + if len(args) > 0: + process(args) + else: + # process(['*']) + mylib.usage(__file__, '[bundle_id] [...]') diff --git a/src/json_reverse_index.py b/src/index_reverse_domains.py similarity index 90% rename from src/json_reverse_index.py rename to src/index_reverse_domains.py index 5cad9d9..640d813 100755 --- a/src/json_reverse_index.py +++ b/src/index_reverse_domains.py @@ -4,12 +4,6 @@ import sys import common_lib as mylib -def get_index_path(): - pth = mylib.path_root('data', '_eval') - mylib.mkdir(pth) - return mylib.path_add(pth, 'reverse_index.json') - - def load_index_json(file_path): if mylib.file_exists(file_path): json = mylib.json_read(file_path) @@ -52,7 +46,10 @@ def insert_in_index(index, bundle_ids): except ValueError: # index not found i = len(index['bundle']) index['bundle'].append(bid) - json, _ = mylib.json_read_evaluated(bid) + try: + json, _ = mylib.json_read_evaluated(bid) + except FileNotFoundError: + continue for key in ['pardom', 'subdom']: # assuming keys are identical for domain, _, _ in json[key]: try: @@ -64,10 +61,11 @@ def insert_in_index(index, bundle_ids): def process(bundle_ids, deleteOnly=False): - print('writing reverse index ...') - index_file = get_index_path() + print('writing index: reverse domains ...') + index_file = mylib.path_data_index('reverse_domains.json') if bundle_ids == ['*']: bundle_ids = list(mylib.enum_data_appids()) + print(' full reset') mylib.rm_file(index_file) # rebuild from ground up # load previous index json = load_index_json(index_file) diff --git a/src/main.py b/src/main.py index e931f69..872886f 100755 --- a/src/main.py +++ b/src/main.py @@ -8,7 +8,8 @@ import bundle_download import html_root import html_index import html_bundle -import json_reverse_index +import index_bundle_names +import index_reverse_domains import tracker_download @@ -44,23 +45,31 @@ def del_id(bundle_ids): mylib.rm_dir(dest) update_index = True print('') - json_reverse_index.process(bundle_ids, deleteOnly=True) + index_reverse_domains.process(bundle_ids, deleteOnly=True) if update_index: rebuild_index() def combine_and_update(bundle_ids, where=None): + # 1. download meta data from iTunes store, incl. app icons new_ids = bundle_download.process(bundle_ids) + # 2. if new apps, update bundle name index + if len(new_ids) > 0: + index_bundle_names.process(new_ids) + # 3. re-calculate combined.json and evaluated.json files affected = bundle_combine.process(bundle_ids, where=where) + # special case needed for reverse index. '*' will force rebuilt index if not where and bundle_ids == ['*']: affected = ['*'] + # 4. was any json updated? if so, make html and update reverse index if len(affected) > 0: - json_reverse_index.process(affected) + index_reverse_domains.process(affected) html_bundle.process(affected) else: print('no bundle affected by tracker, not generating bundle html') + # 5. make all apps index if len(new_ids) > 0: - rebuild_index() + rebuild_index() # must be called after bundle_combine else: print('no new bundle, not rebuilding index') diff --git a/src/tracker_download.py b/src/tracker_download.py index 82111b9..725de0d 100755 --- a/src/tracker_download.py +++ b/src/tracker_download.py @@ -6,6 +6,13 @@ import common_lib as mylib known_trackers = None +def is_tracker(domain): + global known_trackers + if not known_trackers: + known_trackers = mylib.read_list('tracker_all.txt') + return mylib.bintree_lookup(known_trackers, domain[::-1]) + + # def md5(fname): # hash_md5 = hashlib.md5() # with open(fname, 'rb') as f: @@ -110,13 +117,6 @@ def exodus(fname): return save_list(res, fname, binary=False) -def is_tracker(domain): - global known_trackers - if not known_trackers: - known_trackers = mylib.read_list('tracker_all.txt') - return mylib.bintree_lookup(known_trackers, domain[::-1]) - - def combine_all(changes): final = mylib.path_root('src', 'lists', 'tracker_all.txt') if changes or not mylib.file_exists(final):