Files
appchk-web/src/tracker_download.py
2020-09-19 01:44:07 +02:00

164 lines
5.0 KiB
Python
Executable File

#!/usr/bin/env python3
import common_lib as mylib
# import hashlib
# Lazily populated by is_tracker() with the content of 'tracker_all.txt'.
known_trackers = None


def is_tracker(domain):
    """Look up `domain` in the combined tracker list.

    The list is read from 'tracker_all.txt' on first use (and again on
    every call for as long as the loaded list is empty). The entries in
    that file are stored reversed by combine_all(), which is why the
    domain is reversed before it is handed to the lookup.

    Returns whatever `mylib.bintree_lookup` returns for the reversed
    domain.
    """
    global known_trackers
    known_trackers = known_trackers or mylib.read_list('tracker_all.txt')
    reversed_domain = domain[::-1]
    return mylib.bintree_lookup(known_trackers, reversed_domain)
# def md5(fname):
# hash_md5 = hashlib.md5()
# with open(fname, 'rb') as f:
# for chunk in iter(lambda: f.read(4096), b''):
# hash_md5.update(chunk)
# return hash_md5.hexdigest()
def save_list(result_set, fname, binary=False):
    """Write `result_set` sorted, one entry per line, to the tracker list file.

    The list is first written to a temporary file ('<target>_tmp'), then
    compared to the existing file with `mylib.diff_files`, and finally
    moved over the target with `mylib.mv`.

    Parameters:
        result_set: collection of domains; `bytes` if `binary` is true,
            `str` otherwise.
        fname: list name; the file is stored as 'tracker_' + fname inside
            the 'src/lists' directory resolved by `mylib.path_root`.
        binary: whether the entries are `bytes` (file opened in 'wb' mode).

    Returns:
        An empty list if `result_set` is empty (the existing file is left
        untouched), otherwise the changes reported by `mylib.diff_files`.
        If the diff fails, all entries are reported as changes, as `str`.
    """
    if not result_set:
        # Nothing to write; keep whatever list is currently on disk.
        return []

    out = mylib.path_root('src', 'lists', 'tracker_' + fname)
    tmp = out + '_tmp'
    end = b'\n' if binary else '\n'
    domains = sorted(result_set)

    with open(tmp, 'wb' if binary else 'w') as fp:
        fp.writelines(domain + end for domain in domains)

    try:
        changes = mylib.diff_files(out, tmp)
    except Exception:
        # NOTE(review): presumably raised when no previous list exists;
        # treat every entry as new. Bytes must be decoded here: str() on
        # a bytes object would produce "b'example.com'" instead of the
        # plain domain.
        if binary:
            changes = [x.decode('utf-8', 'replace') for x in domains]
        else:
            changes = list(domains)
    mylib.mv(tmp, out)

    if changes:
        print(' updating: ' + fname)
    else:
        print(' no-change: ' + fname)
    return changes
def enum_lines(url, ignore=None):
    """Download `url` and yield its non-empty lines as `bytes`.

    The payload is split on b'\\n'. Empty lines are skipped, as are lines
    starting with the `ignore` prefix (when one is given). Any exception
    raised while downloading or iterating is reported through `mylib.err`
    and ends the generator instead of propagating.
    """
    try:
        payload = mylib.download(url)
        for raw in payload.split(b'\n'):
            skipped = ignore and raw.startswith(ignore)
            if raw and not skipped:
                yield raw
    except Exception as err:
        mylib.err('tracker-download', str(err) + ' in ' + url)
def github(path):
    """Return the raw.githubusercontent.com URL for the repository `path`."""
    base = 'https://raw.githubusercontent.com/'
    return ''.join((base, path))
def lockdown(fname, urlname):
    """Download the Lockdown iOS list `urlname` and store it as `fname`.

    Every non-empty line of the downloaded file is taken as one entry.
    Returns the changes reported by save_list().
    """
    base = github('confirmedcode/lockdown-ios/master/LockdowniOS/')
    entries = set(enum_lines(base + urlname))
    return save_list(entries, fname, binary=True)
def customlist(fname):
    """Download the custom tracker list from the appchk API into `fname`.

    Returns the changes reported by save_list().
    """
    # The 'list.txt' file could be read directly on this server, but
    # then the api could no longer be separated from the website.
    entries = set(enum_lines('https://appchk.de/api/v1/trackers/'))
    return save_list(entries, fname, binary=True)
def easylist(fname, urlname):
    """Extract the domains of an EasyList file and store them as `fname`.

    Only rules starting with b'||' are considered. The domain is the text
    after that prefix, cut at the first b'^' (or, if the rule contains no
    b'^', at the first b'$') and then at the first b'/'.

    Returns the changes reported by save_list().
    """
    source = github('easylist/easylist/master/') + urlname
    domains = set()
    for rule in enum_lines(source):
        if rule.startswith(b'||'):
            body = rule[2:]
            pieces = body.split(b'^')
            if len(pieces) == 1:
                # no '^' separator present, fall back to the options marker
                pieces = body.split(b'$')
            host, _, _ = pieces[0].partition(b'/')
            domains.add(host)
    return save_list(domains, fname, binary=True)
def lowe(fname):
    """Download the pgl.yoyo.org ad server list and store it as `fname`.

    The list is requested in hosts format; lines starting with b'#' are
    ignored. Each remaining line must consist of exactly two
    whitespace-separated fields, of which the second is kept. Other lines
    are reported through `mylib.err` and skipped.

    Returns the changes reported by save_list().
    """
    url = ('https://pgl.yoyo.org/adservers/serverlist.php'
           '?hostformat=hosts&mimetype=plaintext')
    hosts = set()
    for line in enum_lines(url, b'#'):
        fields = line.split()
        if len(fields) == 2:
            hosts.add(fields[1])
        else:
            mylib.err('tracker-list', 'Lowe: parsing error')
    return save_list(hosts, fname, binary=True)
def exodus(fname):
    """Download the Exodus Privacy tracker export and store its domains.

    Each entry of the 'trackers' array may carry a 'network_signature':
    a '|'-separated list of escaped domain patterns. The escapes '\\.'
    and '\\-' are resolved, alternatives ending in '/' or '.' are
    dropped, and a single leading '\\' or '.' is stripped.

    Download and parsing errors are reported through `mylib.err` (the
    same way enum_lines() does) instead of aborting the whole update;
    whatever was collected up to that point is still saved.

    Returns the changes reported by save_list().
    """
    url = 'https://etip.exodus-privacy.eu.org/trackers/export'
    res = set()
    try:
        data = mylib.download(url, isJSON=True)
        for entry in data['trackers']:
            net = entry['network_signature']
            if not net:
                continue
            net = net.replace('\\.', '.').replace('\\-', '-')
            for dom in net.split('|'):
                # An empty alternative (e.g. 'a||b' or a trailing '|')
                # would make the index access below raise IndexError.
                if not dom or dom[-1] in '/.':
                    continue
                if dom[0] in '\\.':
                    dom = dom[1:]
                if dom:  # stripping the prefix may leave nothing behind
                    res.add(dom)
    except Exception as e:
        mylib.err('tracker-download', str(e) + ' in ' + url)
    return save_list(res, fname, binary=False)
def combine_all(changes=('_',)):
    """Merge all individual tracker lists into 'tracker_all.txt'.

    Parameters:
        changes: collection of changes from the download step. The
            combined list is rebuilt if it is non-empty or if
            'tracker_all.txt' does not exist yet. The default is a
            non-empty placeholder, so calling without arguments always
            rebuilds.

    Every domain is written reversed, one per line and sorted, which is
    the format is_tracker() looks domains up in.
    """
    final = mylib.path_root('src', 'lists', 'tracker_all.txt')
    if changes or not mylib.file_exists(final):
        print(' updating: tracker_all.txt')
    else:
        print(' no-change: tracker_all.txt')
        return

    # These names must match the ones process() saves the lists under;
    # the easyprivacy entries used to lack the '.txt' suffix, so those
    # two lists never made it into the combined file.
    sources = [
        'custom.txt', 'lowe.txt',
        'easylist.txt', 'easyprivacy.txt', 'easyprivacy_int.txt',
        'exodus.txt', 'lockdown_clickbait.txt',
        'lockdown_marketing.txt', 'lockdown_game_ads.txt',
    ]
    res = set()
    for fname in sources:
        for dom in mylib.read_list('tracker_' + fname):
            if dom == 'google.com':
                continue  # added by exodus, not a tracker per se
            res.add(dom[::-1])  # reverse for bintree lookup

    with open(final, 'w') as fp:
        fp.writelines(domain + '\n' for domain in sorted(res))
def process():
    """Download every tracker list and rebuild the combined list.

    Returns the concatenated changes of all individual lists.
    """
    print('downloading tracker domains ...')
    privacy = 'easyprivacy/easyprivacy_trackingservers'
    jobs = [
        (customlist, ('custom.txt',)),
        (lowe, ('lowe.txt',)),
        (easylist, ('easylist.txt', 'easylist/easylist_adservers.txt')),
        (easylist, ('easyprivacy.txt', privacy + '.txt')),
        (easylist, ('easyprivacy_int.txt', privacy + '_international.txt')),
        (exodus, ('exodus.txt',)),
        # Lockdown lists are currently disabled:
        # (lockdown, ('lockdown_clickbait.txt', 'clickbait.txt')),
        # (lockdown, ('lockdown_marketing.txt', 'marketing.txt')),
        # (lockdown, ('lockdown_game_ads.txt', 'game_ads.txt')),
    ]
    changes = []
    for download, args in jobs:
        changes += download(*args)
    combine_all(changes)
    print('')
    return changes
# Script entry point: download all lists and rebuild the combined one.
if __name__ == '__main__':
    # combine_all()
    process()