From 5f1f6e41244fe4f7e846e5fdb4f4534d6d3fa61a Mon Sep 17 00:00:00 2001 From: relikd Date: Mon, 7 Sep 2020 17:11:34 +0200 Subject: [PATCH] Refactor combined.json --- out/static/style.css | 20 +++---- src/bundle_combine.py | 61 ++++++++------------ src/bundle_download.py | 34 ++++++++--- src/common_lib.py | 23 +++++++- src/html_bundle.py | 125 +++++++++++++++++++++-------------------- src/html_index.py | 10 ++-- src/main.py | 14 ++--- 7 files changed, 158 insertions(+), 129 deletions(-) diff --git a/out/static/style.css b/out/static/style.css index 23a011f..d748268 100644 --- a/out/static/style.css +++ b/out/static/style.css @@ -138,16 +138,16 @@ p.trckr { font-size: 0.8em; } margin: 1px 1px; } /* color-bind friendly color palette */ -.c0{color:#63ACBE} .cb0{background:#63ACBE} -.c1{color:#601A4A} .cb1{background:#601A4A} -.c2{color:#09F4EC} .cb2{background:#09F4EC} -.c3{color:#1F77B4} .cb3{background:#1F77B4} -.c4{color:#EE442F} .cb4{background:#EE442F} -.c5{color:#7F7F7F} .cb5{background:#7F7F7F} -.c6{color:#0F2080} .cb6{background:#0F2080} -.c7{color:#3b9f35} .cb7{background:#3b9f35} -.c8{color:#F5793A} .cb8{background:#F5793A} -.c9{color:#AC66FB} .cb9{background:#AC66FB} +.cb0{background:#63ACBE} +.cb1{background:#601A4A} +.cb2{background:#09F4EC} +.cb3{background:#1F77B4} +.cb4{background:#EE442F} +.cb5{background:#7F7F7F} +.cb6{background:#0F2080} +.cb7{background:#3b9f35} +.cb8{background:#F5793A} +.cb9{background:#AC66FB} .cs0{stroke:#3AE48C} .cs1{stroke:#D11} diff --git a/src/bundle_combine.py b/src/bundle_combine.py index ee45723..71ed236 100755 --- a/src/bundle_combine.py +++ b/src/bundle_combine.py @@ -26,54 +26,39 @@ def get_parent_domain(subdomain): return '.'.join(parts[-2:]) -def dict_increment(ddic, key, num): - try: - ddic[key] - except KeyError: - ddic[key] = 0 - ddic[key] += num - - def json_combine(bundle_id): - res = dict({'#rec': 0, '#logs': 0}) - domA = dict() # unique sub domains - domB = dict() # total sub domains - domC = dict() # unique parent domains - domD = dict() # total parent domains + def inc_dic(ddic, key, num): + try: + ddic[key][1].append(num) + except KeyError: + ddic[key] = (tracker.is_tracker(key), [num]) + + res = dict({'rec_len': [], 'name': mylib.app_name(bundle_id)}) + pardom = dict() + subdom = dict() latest = 0 for fname, jdata in mylib.enum_jsons(bundle_id): latest = max(latest, os.path.getmtime(fname)) # or getctime - res['name'] = jdata['app-name'] - res['#rec'] += 1 - dict_increment(res, 'rec-total', jdata['duration']) + # if not res['name']: + # res['name'] = jdata['app-name'] + res['rec_len'].append(jdata['duration']) try: logs = jdata['logs'] - uniq_par = set() + uniq_par = dict() for subdomain in logs: occurs = len(logs[subdomain]) - sub_tracker = tracker.is_tracker(subdomain) - res['#logs'] += 1 - dict_increment(domA, subdomain, 1) - dict_increment(domB, subdomain, occurs) + inc_dic(subdom, subdomain, occurs) par_dom = get_parent_domain(subdomain) - uniq_par.add(par_dom) - dict_increment(domD, par_dom, occurs) - for par in uniq_par: - dict_increment(domC, par, 1) + try: + uniq_par[par_dom] += occurs + except KeyError: + uniq_par[par_dom] = occurs + for name, val in uniq_par.items(): + inc_dic(pardom, name, val) except KeyError: mylib.err('bundle-combine', 'skip: ' + fname) - res['uniq_subdom'] = domA - res['uniq_pardom'] = domC - res['total_subdom'] = domB - res['total_pardom'] = domD - sub_tracker = dict() - par_tracker = dict() - for x in domA: - sub_tracker[x] = tracker.is_tracker(x) - for x in domC: - par_tracker[x] = tracker.is_tracker(x) - res['tracker_subdom'] = sub_tracker - res['tracker_pardom'] = par_tracker + res['pardom'] = pardom + res['subdom'] = subdom res['last_date'] = latest return res @@ -91,7 +76,7 @@ def process(bundle_ids, where=None): if not haystack: should_update = True else: - for x in obj['uniq_subdom']: + for x in obj['subdom']: if mylib.bintree_lookup(haystack, x[::-1]): should_update = True break diff --git a/src/bundle_download.py b/src/bundle_download.py index a4c32c3..3f66ab7 100755 --- a/src/bundle_download.py +++ b/src/bundle_download.py @@ -20,10 +20,14 @@ def download_info(bundle_id, lang, force=False): mylib.json_write_meta(bundle_id, json, lang) -def download_icon(bundle_id, force=False, langs=['us', 'de']): - # icon_file = mylib.path_data_app(bundle_id, 'icon.png') +def needs_icon_path(bundle_id): icon_file = mylib.path_out_app(bundle_id, 'icon.png') - if force or not mylib.file_exists(icon_file): + return (mylib.file_exists(icon_file), icon_file) + + +def download_icon(bundle_id, force=False, langs=['us', 'de']): + exists, icon_file = needs_icon_path(bundle_id) + if force or not exists: json = None for lang in langs: if not json: @@ -31,13 +35,18 @@ def download_icon(bundle_id, force=False, langs=['us', 'de']): json = mylib.json_read_meta(bundle_id, lang) except Exception: continue - mylib.download_file(json['artworkUrl100'], icon_file) + image_url = json['artworkUrl100'] # fail early on KeyError + is_new = mylib.mkdir_out_app(bundle_id) + mylib.download_file(image_url, icon_file) + return is_new + return False def download_missing_icons(force=False, langs=['us', 'de']): didAny = False for bid in mylib.enum_appids(): - if not mylib.file_exists(mylib.path_out_app(bid, 'icon.png')): + exists, _ = needs_icon_path(bid) + if not exists: if not didAny: print('downloading missing icons ...') didAny = True @@ -51,7 +60,11 @@ def download_missing_icons(force=False, langs=['us', 'de']): def download(bundle_id, force=False): if not mylib.valid_bundle_id(bundle_id): mylib.err('apple-download', 'invalid id: ' + bundle_id) - return + return False + + exists, _ = needs_icon_path(bundle_id) + if exists and not force: + return False mylib.printf(' {} => '.format(bundle_id)) for lang in ['us', 'de']: @@ -65,12 +78,14 @@ def download(bundle_id, force=False): lang.upper(), bundle_id), logOnly=True) try: mylib.printf('icon') - download_icon(bundle_id, force=force) + index_needs_update = download_icon(bundle_id, force=force) mylib.printf('[✔] ') except Exception: + index_needs_update = False mylib.printf('[✘] ') mylib.err('apple-download', 'img for ' + bundle_id, logOnly=True) print('') # end printf line + return index_needs_update def process(bundle_ids, force=False): @@ -78,9 +93,12 @@ def process(bundle_ids, force=False): if bundle_ids == ['*']: bundle_ids = list(mylib.enum_data_appids()) + newly_created = set() for bid in bundle_ids: - download(bid, force=force) + if download(bid, force=force): + newly_created.add(bid) print('') + return newly_created if __name__ == "__main__": diff --git a/src/common_lib.py b/src/common_lib.py index 261c864..21c39ed 100755 --- a/src/common_lib.py +++ b/src/common_lib.py @@ -77,6 +77,19 @@ def valid_bundle_id(bundle_id): return regex_bundle_id.match(bundle_id) +def app_name(bundle_id, fallback=None): + def name_for(lang): + try: + return json_read_meta(bundle_id, lang)['trackCensoredName'] + except Exception: + return None + for lang in ['us', 'de']: + name = name_for(lang) + if name: + return name + return fallback + + def err(scope, msg, logOnly=False): if isinstance(msg, Exception): msg = traceback.format_exc() @@ -146,6 +159,14 @@ def meta_json_exists(bundle_id, lang): return file_exists(path_data_app(bundle_id, 'info_{}.json'.format(lang))) +def mkdir_out_app(bundle_id): + out_dir = path_out_app(bundle_id) + if not dir_exists(out_dir): + mkdir(out_dir) + return True + return False + + def next_path(path_pattern): i = 1 while os.path.exists(path_pattern % i): @@ -249,7 +270,7 @@ def json_write(path, obj, pretty=False): def json_write_combined(bundle_id, obj): fname = path_data_app(bundle_id, 'combined.json') - json_write(fname, obj, pretty=True) + json_write(fname, obj, pretty=False) def json_write_meta(bundle_id, obj, lang): diff --git a/src/html_bundle.py b/src/html_bundle.py index f8abdf6..643c519 100755 --- a/src/html_bundle.py +++ b/src/html_bundle.py @@ -12,31 +12,15 @@ def seconds_to_time(seconds): return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds) -def gen_dom_tags(unsorted_dict, trackers=None, additionalClasses=None): - sorted_arr = sorted(unsorted_dict, key=lambda x: (-x[1], x[0])) +def gen_dotgraph(sorted_arr): txt = '' - anyMark = False - for i, (x, y) in enumerate(sorted_arr): - mark = trackers[x] if trackers else True - title = x if y == 1 else '{} ({})'.format(x, y) - txt += '{} '.format(' class="trckr"' if mark else '', title) - anyMark |= mark - if txt: - note = '

known tracker

' - return '
{}{}
'.format( - additionalClasses or '', txt, note if anyMark else '') - else: - return '– None –' - - -def gen_dotgraph(count_dict): - txt = '' - sorted_count = sorted(count_dict.items(), key=lambda x: (-x[1], x[0])) - for i, (name, count) in enumerate(sorted_count): - # TODO: use average not total count - txt += '

{0} ({1})

'.format(name, count) - for x in range(count): - txt += ''.format(i % 10) + for i, (name, count, mark) in enumerate(sorted_arr): + title = '{} ({})'.format(name, count) if count > 1 else name + clss = 'cb{}'.format(i % 10) + if mark: + clss += ' trckr' + txt += '

{1}

'.format(clss, title) + txt += '' * count txt += '
' return '
{}
'.format(txt) @@ -71,20 +55,53 @@ def gen_pie_chart(parts, classes, stroke=0.6): def gen_radial_graph(obj): - total = 0 - tracker = 0 - for name, count in obj['total_subdom'].items(): - total += count - if obj['tracker_subdom'][name]: - tracker += count - percent = tracker / total + percent = obj['#logs_tracker'] / obj['#logs_total'] return '
{}
'.format( gen_pie_chart([1 - percent, percent], ['cs0', 'cs1'])) +def gen_dom_tags(sorted_arr, onlyTrackers=False): + txt = '' + anyMark = False + for i, (name, count, mark) in enumerate(sorted_arr): + title = '{} ({})'.format(name, count) if count > 1 else name + clss = ' class="trckr"' if mark and not onlyTrackers else '' + txt += '{} '.format(clss, title) + anyMark |= mark + if txt: + note = '

known tracker

' + return '
{}{}
'.format( + 'trckr ' if onlyTrackers else '', txt, note if anyMark else '') + else: + return '– None –' + + +def prepare_json(obj): + def calc_sum(arr): + # TODO: use average or median, not total count + return sum(arr) + + def transform(ddic): + res = list() + for name, (is_tracker, counts) in ddic.items(): + res.append([name, calc_sum(counts), is_tracker]) + res.sort(key=lambda x: (-x[1], x[0])) # sort by count desc, then name + return res + + if not obj['name']: + obj['name'] = '< App-Name >' + obj['#rec'] = len(obj['rec_len']) + obj['rec_len'] = sum(obj['rec_len']) + obj['pardom'] = transform(obj['pardom']) + obj['subdom'] = transform(obj['subdom']) + # do this after the transformation: + obj['tracker'] = list(filter(lambda x: x[2], obj['subdom'])) + obj['#logs_total'] = sum(map(lambda x: x[1], obj['pardom'])) + obj['#logs_tracker'] = sum(map(lambda x: x[1], obj['tracker'])) + + def gen_html(bundle_id, obj): - track_dom = [(dom, obj['total_subdom'][dom]) - for dom, known in obj['tracker_subdom'].items() if known] + prepare_json(obj) return mylib.template_with_base(f'''

{obj['name']}

@@ -100,13 +117,13 @@ def gen_html(bundle_id, obj): obj['#rec'] } Total number of logs:{ - obj['#logs'] + obj['#logs_total'] } Cumulative recording time:{ - seconds_to_time(obj['rec-total']) + seconds_to_time(obj['rec_len']) } Average recording time:{ - round(obj['rec-total'] / obj['#rec'], 1) + round(obj['rec_len'] / obj['#rec'], 1) } s Last updated: