Domain index

This commit is contained in:
relikd
2020-09-18 23:56:15 +02:00
parent 0148106a56
commit ba8091268d
10 changed files with 351 additions and 59 deletions

View File

@@ -166,6 +166,11 @@ def file_exists(path):
return os.path.isfile(path) and os.path.getsize(path) > 0
def symlink(source, target):
    """Create a symbolic link at `target` that points to `source`.

    Nothing happens if `target` already exists, with one exception: a
    dangling symlink (one whose destination is gone) is replaced by a
    fresh link to `source`.

    The check looks at the directory entry itself instead of the content
    behind it. A check that follows the link and requires a non-empty
    file reports "missing" for dangling links and for links to empty
    files, and the subsequent os.symlink() call then fails with
    FileExistsError.
    """
    # islink() inspects the entry itself, exists() follows the link
    if os.path.islink(target) and not os.path.exists(target):
        os.remove(target)
    # lexists() is true for any existing entry, including broken links
    if not os.path.lexists(target):
        os.symlink(source, target)
def meta_json_exists(bundle_id, lang):
    """Tell whether the `info_<lang>.json` file of an app passes file_exists().

    The file is looked up via path_data_app() for the given bundle id.
    """
    filename = 'info_{}.json'.format(lang)
    meta_path = path_data_app(bundle_id, filename)
    return file_exists(meta_path)

View File

@@ -1,6 +1,5 @@
#!/usr/bin/env python3
import os
import sys
import time
import math
@@ -138,9 +137,7 @@ def process(bundle_ids):
mylib.mkdir_out_app(bid)
with open(mylib.path_out_app(bid, 'index.html'), 'w') as fp:
fp.write(gen_html(bid, json))
download_link = mylib.path_out_app(bid, 'data.json')
if not mylib.file_exists(download_link):
os.symlink(json_data_path, download_link)
mylib.symlink(json_data_path, mylib.path_out_app(bid, 'data.json'))
print('')

View File

@@ -41,7 +41,7 @@ def gen_pager(current, total):
links += mklink(i, i, active=i == current)
# if current < total:
# links += mklink(current + 1, 'Next')
return '<div id="pagination">{}</div>'.format(links)
return '<div id="pagination" class="no_ul_all">{}</div>'.format(links)
def gen_page(arr, base, page_id=1, total=1):
@@ -52,7 +52,7 @@ def gen_page(arr, base, page_id=1, total=1):
pagination = gen_pager(page_id, total) # if total > 1 else ''
fp.write(mylib.template_with_base('''
<h2>List of app recordings (AZ)</h2>
<div id="app-toc">
<div id="app-toc" class="center no_ul_all">
{}
</div>
{}'''.format(content, pagination), title="Index"))

141
src/html_reverse_domains.py Executable file
View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
import common_lib as mylib
import index_bundle_names
import index_reverse_domains
def a_app(bundle_id):
    """Return an HTML anchor to `/app/<bundle_id>/`.

    The link text is whatever index_bundle_names.get_name() returns for
    the bundle id; it is inserted as-is.
    """
    app_name = index_bundle_names.get_name(bundle_id)
    return '<a href="/app/{}/">{}</a>'.format(bundle_id, app_name)
def a_dom(domain, key):
    """Return an HTML anchor to `/<key>/#<domain>` labelled with the domain.

    `key` is the first path component of the link and `domain` is used
    both as URL fragment and as link text.
    """
    return f'<a href="/{key}/#{domain}">{domain}</a>'
def div_dom(domain, count, key):
    """Return a domain link followed by a "found in N app(s)" note.

    domain -- domain name, passed on to a_dom()
    count  -- number of apps to display
    key    -- link prefix, passed on to a_dom()
    """
    link = a_dom(domain, key)
    # Singular only for exactly one app; zero is plural as well ("0 apps").
    noun = 'app' if count == 1 else 'apps'
    return '{} <span>found in {} {}</span>'.format(link, count, noun)
def dropdown_choose(button):
    """Return the HTML of the "Choose list" dropdown.

    `button` is the caption shown on the dropdown button. The menu
    always offers the same three pages: index.html, by_name.html and
    by_count.html.
    """
    html_lines = (
        '',  # the markup starts with a newline
        '<label for="dropdown">Choose list:</label>',
        '<div class="dropdown" name="dropdown">',
        f'<button class="bg1 border">{button}</button>',
        '<div class="bg1 no_ul_all">',
        '<a href="index.html">Most frequent</a>',
        '<a href="by_name.html">Full list (AZ)</a>',
        '<a href="by_count.html">Full list (by count)</a>',
        '</div>',
        '</div>',
    )
    return '\n'.join(html_lines)
def duo_list(list1, list2):
    """Return the HTML block holding the subdomain and the domain list.

    Both arguments are sequences of `(name, ids)` pairs. `list1` fills
    the "Subdomains" section, `list2` the "Domains" section. Each entry
    is rendered by div_dom() with `len(ids)` as count, and each heading
    shows the number of entries in its list.
    """
    def render(entries, key):
        # one line per entry, separated by an HTML line break
        return '<br>\n'.join([div_dom(name, len(ids), key)
                              for name, ids in entries])

    sub_html = render(list1, 'subdomain')
    dom_html = render(list2, 'domain')
    template = '''
<div id="dom-toc" class="found-in">
<div id="subdomains">
<h3>Subdomains ({sub_count}) <a class="snd mg_lr" href="#domains">go to Domains</a></h3>
{sub_html}
</div><div id="domains">
<h3>Domains ({dom_count}) <a class="snd mg_lr" href="#subdomains">go to Subdomains</a></h3>
{dom_html}
</div>
</div>'''
    return template.format(sub_count=len(list1), sub_html=sub_html,
                           dom_count=len(list2), dom_html=dom_html)
def gen_html_index(l1, l2, fname, title, button):
    """Write a full-list page (heading, dropdown, both lists) to `fname`.

    l1, l2 -- entry lists handed to duo_list()
    fname  -- output file, opened for writing
    title  -- used for the <h2> heading and as template title
    button -- caption handed to dropdown_choose()
    """
    with open(fname, 'w') as fp:
        body = ''.join([
            f'<h2>{title}</h2>',
            dropdown_choose(button),
            duo_list(l1, l2),
        ])
        page = mylib.template_with_base(body, title=title)
        fp.write(page)
def gen_html_top_domains(subset, fname, total, title):
    """Write the "most frequent domains" page to `fname`.

    subset -- sequence of `(domain, ids)` pairs, one paragraph each
    fname  -- output file, opened for writing
    total  -- divisor for the percentage bar (`len(ids) / total`)
    title  -- used for the <h2> heading and as template title
    """
    def loadbar(percent):
        # the percentage serves both as bar width and as its label
        return (f'<span class="loadbar"><span style="width: {percent}%">'
                f'{percent}%</span></span>')

    header = f'''
<div id="dom-top10" class="found-in">
<h2>{title}</h2>'''
    footer = '''
<p class="mg_top">Get full list
sorted by <a class="snd" href="by_count.html">Occurrence frequency</a>
or in <a class="snd" href="by_name.html">Alphabetical order</a>.</p>
</div>
<p class="right snd">Download: <a href="data.json" download="appcheck_domains_full.json">json</a></p>
'''
    with open(fname, 'w') as fp:
        rows = []
        for dom, ids in subset:
            link = div_dom(dom, len(ids), 'subdomain')
            bar = loadbar(round(len(ids) / total * 100))
            rows.append(f'\n<p>{link} {bar}</p>')
        content = ''.join([header, *rows, footer])
        fp.write(mylib.template_with_base(content, title=title))
def gen_html_lookup(html_dir, json, key, title):
    """Write the lookup page and its two data files into `html_dir`.

    html_dir -- output directory, created via mylib.mkdir()
    json     -- index dict; `json['bundle']` and `json[key]` are exported
    key      -- which entry of `json` is written to doms.json
    title    -- template title of the generated index.html

    apps.json receives `[bundle_id, name]` pairs, in the order of
    `json['bundle']`. The page itself is static; it loads both JSON
    files through the lookup-domain.js script.
    """
    mylib.mkdir(html_dir)
    app_names = []
    for bundle_id in json['bundle']:
        app_names.append([bundle_id, index_bundle_names.get_name(bundle_id)])
    mylib.json_write(mylib.path_add(html_dir, 'apps.json'), app_names)
    mylib.json_write(mylib.path_add(html_dir, 'doms.json'), json[key])

    page_body = '''
<h2 id="name"></h2>
<p>Present in: <b id="num_apps">… applications</b></p>
<h3>Apps containing this domain:</h3>
<div id="app_list">loading…</div>
<script type="text/javascript" src="/static/lookup-domain.js?1"></script>
<script type="text/javascript">
lookup_domain_fragment('doms.json', 'apps.json', 'name', 'num_apps', 'app_list');
</script>
'''
    with open(mylib.path_add(html_dir, 'index.html'), 'w') as fp:
        fp.write(mylib.template_with_base(page_body, title=title))
def process():
    """Generate all HTML pages of the reverse-domain index.

    Inside the `index/domains` output directory this creates:
      data.json     -- symlink to the reverse_domains.json index file
      by_name.html  -- full lists, sorted alphabetically
      by_count.html -- full lists, sorted by number of apps (descending)
      index.html    -- the 20 most frequent subdomains
    Additionally the lookup pages are written to the `domain` and
    `subdomain` output directories.
    """
    # bundle_combine assures domain name is [a-zA-Z0-9.-]
    print('generating reverse-domain-index ...')
    idx_dir = mylib.path_out('index', 'domains')
    mylib.mkdir(idx_dir)

    # Data export
    # idx_dir is a directory path, not a bundle id, so the link target is
    # built with path_add() like every other file in this directory.
    mylib.symlink(mylib.path_data_index('reverse_domains.json'),
                  mylib.path_add(idx_dir, 'data.json'))

    par_arr = list(index_reverse_domains.enumerate('pardom'))
    sub_arr = list(index_reverse_domains.enumerate('subdom'))

    # Full list (AZ)
    sub_arr.sort(key=lambda x: x[0])
    par_arr.sort(key=lambda x: x[0])
    gen_html_index(sub_arr, par_arr, mylib.path_add(idx_dir, 'by_name.html'),
                   title='Requested Domains (AZ)',
                   button='Full list (AZ)')

    # Full list (by count)
    # list.sort() is stable, so entries with equal count stay in AZ order
    sub_arr.sort(key=lambda x: -len(x[1]))
    par_arr.sort(key=lambda x: -len(x[1]))
    gen_html_index(sub_arr, par_arr, mylib.path_add(idx_dir, 'by_count.html'),
                   title='Requested Domains (most apps)',
                   button='Full list (by count)')

    # Top 20 (subdomains only; sub_arr is still sorted by count)
    del sub_arr[20:]
    del par_arr
    total = index_reverse_domains.number_of_apps()
    gen_html_top_domains(sub_arr, mylib.path_add(idx_dir, 'index.html'),
                         total, 'Top 20 Requested Domains')

    # Lookup
    json = index_reverse_domains.raw()
    gen_html_lookup(mylib.path_out('domain'), json, 'pardom',
                    title='Domain Lookup')
    gen_html_lookup(mylib.path_out('subdomain'), json, 'subdom',
                    title='Subdomain Lookup')
    print('')
if __name__ == '__main__':
process()

View File

@@ -24,14 +24,14 @@ def write_json_to_disk():
mylib.json_write(index_fname(), _bundle_name_dict, pretty=True)
def get_name(bundle_id, langs=['us', 'de']):
def get_name(bundle_id, langs=['us', 'de'], fallback='&lt; App-Name &gt;'):
load_json_if_not_already()
for lang in langs:
try:
return _bundle_name_dict[bundle_id][lang]
except KeyError:
continue
return '&lt; App-Name &gt;' # None
return fallback # None
def process(bundle_ids):

View File

@@ -3,79 +3,103 @@
import sys
import common_lib as mylib
def load_index_json(file_path):
    """Return the index stored at `file_path`, or an empty index.

    The empty index has the keys 'bundle' (list), 'pardom' (dict) and
    'subdom' (dict). It is used whenever mylib.file_exists() reports
    that the file is not available.
    """
    if not mylib.file_exists(file_path):
        return {'bundle': [], 'pardom': {}, 'subdom': {}}
    return mylib.json_read(file_path)
_reverse_domain_dict = None
def delete_from_index(index, bundle_ids, deleteOnly=False):
def index_fname():
    """Return the path of the reverse-domain index file."""
    filename = 'reverse_domains.json'
    return mylib.path_data_index(filename)
def load_json_if_not_already():
    """Fill the module-level index cache unless it is already populated.

    The cache is read from the index file if mylib.file_exists() accepts
    it, otherwise it starts as an empty index with the keys 'bundle',
    'pardom' and 'subdom'. The check is a truthiness test, so None and
    an empty container both count as "not loaded".
    """
    global _reverse_domain_dict
    if _reverse_domain_dict:
        return  # already loaded, keep in-memory state
    index_file = index_fname()
    if not mylib.file_exists(index_file):
        _reverse_domain_dict = {'bundle': [], 'pardom': {}, 'subdom': {}}
    else:
        _reverse_domain_dict = mylib.json_read(index_file)
def write_json_to_disk():
    """Write the in-memory index to the index file (non-pretty JSON)."""
    target = index_fname()
    mylib.json_write(target, _reverse_domain_dict, pretty=False)
def delete_from_index(bundle_ids, deleteOnly=False):
global _reverse_domain_dict
ids_to_delete = set()
for bid in bundle_ids:
try:
i = index['bundle'].index(bid)
i = _reverse_domain_dict['bundle'].index(bid)
except ValueError: # index not found
continue
ids_to_delete.add(i)
if deleteOnly:
index['bundle'][i] = '_'
_reverse_domain_dict['bundle'][i] = '_'
if len(ids_to_delete) == 0:
return False
for key in ['pardom', 'subdom']:
for domain in list(index[key].keys()):
for domain in list(_reverse_domain_dict[key].keys()):
for i in ids_to_delete:
try:
index[key][domain].remove(i)
_reverse_domain_dict[key][domain].remove(i)
except ValueError: # ignore if not present
continue
if not index[key][domain]:
del(index[key][domain])
if not _reverse_domain_dict[key][domain]:
del(_reverse_domain_dict[key][domain])
return True
def insert_in_index(index, bundle_ids):
def insert_in_index(bundle_ids):
global _reverse_domain_dict
has_changes = False
for bid in bundle_ids:
try:
i = index['bundle'].index(bid)
i = _reverse_domain_dict['bundle'].index(bid)
except ValueError: # index not found
i = len(index['bundle'])
index['bundle'].append(bid)
try:
json, _ = mylib.json_read_evaluated(bid)
except FileNotFoundError:
continue
i = len(_reverse_domain_dict['bundle'])
_reverse_domain_dict['bundle'].append(bid)
json, _ = mylib.json_read_evaluated(bid)
for key in ['pardom', 'subdom']: # assuming keys are identical
for domain, _, _ in json[key]:
try:
index[key][domain].append(i)
_reverse_domain_dict[key][domain].append(i)
except KeyError:
index[key][domain] = [i]
_reverse_domain_dict[key][domain] = [i]
has_changes = True
return has_changes
def raw():
    """Return the complete index dict, loading it first if necessary.

    The returned object is the module-level cache itself, not a copy.
    """
    load_json_if_not_already()
    index = _reverse_domain_dict
    return index
def number_of_apps():
    """Return how many entries of the bundle list are not the '_' marker."""
    load_json_if_not_already()
    listed = [bid for bid in _reverse_domain_dict['bundle'] if bid != '_']
    return len(listed)
def enumerate(key):
    """Yield `[domain, bundle_ids]` for every domain stored under `key`.

    The index stores positions into the bundle list for each domain;
    these are resolved to the bundle-list entries here. This is a
    generator, so the index is loaded when iteration starts.

    Note: inside this module the name shadows the builtin `enumerate`.
    """
    load_json_if_not_already()
    for domain, positions in _reverse_domain_dict[key].items():
        resolved = [_reverse_domain_dict['bundle'][pos] for pos in positions]
        yield [domain, resolved]
def process(bundle_ids, deleteOnly=False):
print('writing index: reverse domains ...')
index_file = mylib.path_data_index('reverse_domains.json')
if bundle_ids == ['*']:
bundle_ids = list(mylib.enum_data_appids())
print(' full reset')
mylib.rm_file(index_file) # rebuild from ground up
# load previous index
json = load_index_json(index_file)
# delete previous index entries
did_change = delete_from_index(json, bundle_ids, deleteOnly=deleteOnly)
# write new index to disk
mylib.rm_file(index_fname()) # rebuild from ground up
load_json_if_not_already()
did_change = delete_from_index(bundle_ids, deleteOnly=deleteOnly)
if not deleteOnly:
did_change |= insert_in_index(json, bundle_ids)
did_change |= insert_in_index(bundle_ids)
if did_change:
mylib.json_write(index_file, json, pretty=False)
write_json_to_disk()
else:
print(' no change')
print('')