Fix invalid chars in domain name, especially (null)

This commit is contained in:
relikd
2020-09-16 13:43:38 +02:00
parent 6d33baa0db
commit 0148106a56

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import re
import sys import sys
import common_lib as mylib import common_lib as mylib
import tracker_download as tracker import tracker_download as tracker
@@ -10,6 +11,7 @@ THRESHOLD_PERCENT_OF_LOGS = 0.33 # domain appears in % recordings
THRESHOLD_MIN_AVG_LOGS = 0.4 # at least x times in total (after %-thresh) THRESHOLD_MIN_AVG_LOGS = 0.4 # at least x times in total (after %-thresh)
level3_doms = None level3_doms = None
# Matches every character that is NOT an ASCII letter, a digit, '.' or '-'.
# cleanup_domain_name() substitutes these matches with '' to sanitize names.
re_domain = re.compile(r'[^a-zA-Z0-9.-]')
def dom_in_3rd_domain(needle): def dom_in_3rd_domain(needle):
@@ -31,6 +33,17 @@ def get_parent_domain(subdomain):
return '.'.join(parts[-2:]) return '.'.join(parts[-2:])
def cleanup_domain_name(domain):
    """Return `domain` with characters outside [a-zA-Z0-9.-] removed.

    A name that is already clean is returned unchanged and nothing is
    reported. Otherwise the offending name is reported through
    `mylib.err`. Literal '(null)' markers are dropped as a whole before
    the remainder is checked again, so their letters do not end up in the
    result; the re-check may report the shortened name once more.
    """
    current = domain
    while True:
        stripped = re_domain.sub('', current)
        if stripped == current:
            # Nothing had to be removed: the name is already safe.
            return current
        mylib.err('bundle-combine',
                  'invalid character in domain name: ' + current)
        if '(null)' not in current:
            # No placeholder involved; plain character stripping suffices.
            return stripped
        # Remove the placeholder text entirely, then validate again.
        current = current.replace('(null)', '')
def json_combine(bundle_id): def json_combine(bundle_id):
def inc_dic(ddic, key, num): def inc_dic(ddic, key, num):
try: try:
@@ -51,6 +64,7 @@ def json_combine(bundle_id):
uniq_par = {} uniq_par = {}
for subdomain in logs: for subdomain in logs:
occurs = len(logs[subdomain]) occurs = len(logs[subdomain])
subdomain = cleanup_domain_name(subdomain)
inc_dic(subdom, subdomain, occurs) inc_dic(subdom, subdomain, occurs)
par_dom = get_parent_domain(subdomain) par_dom = get_parent_domain(subdomain)
try: try: