Files
appchk-web/src/lib_common.py
2020-09-24 12:03:54 +02:00

297 lines
6.9 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import bisect
import glob
import json
import logging
import os
import re
import shutil
import urllib.request as curl
from pathlib import Path
# Repository root: one level above the directory containing this file,
# with symlinks resolved.
_src_dir = os.path.dirname(os.path.realpath(__file__))
base_dir = os.path.realpath(os.path.join(_src_dir, os.pardir))
# Paths
def path_add(base, *components):
    """Append any number of path components to base."""
    return os.path.join(base, *components)
def path_root(*components):
    """Absolute path below the repository root."""
    return path_add(base_dir, *components)
def path_data(*components):
    """Absolute path below <root>/data."""
    return os.path.join(path_root('data'), *components)
def path_data_app(bundle_id, filename=None):
    """Data directory of an app (dots in the bundle-id become nested dirs),
    or a file inside that directory when filename is given."""
    app_dir = path_root('data', *bundle_id.split('.'))
    if filename:
        return path_add(app_dir, filename)
    return app_dir
def path_data_index(filename):
    """Path of filename inside data/_eval, creating that directory on demand."""
    eval_dir = path_root('data', '_eval')
    mkdir(eval_dir)
    return path_add(eval_dir, filename)
def path_out(*components):
    """Absolute path below <root>/out."""
    return os.path.join(path_root('out'), *components)
def path_out_app(bundle_id, filename=None):
    """Output directory of an app below out/app/, or a file inside it."""
    app_dir = path_root('out', 'app', bundle_id)
    if filename:
        return path_add(app_dir, filename)
    return app_dir
def path_len(path, isDir=True):
    """String length of path, counting a trailing separator for directories.

    Used to compute the offset for stripping a parent directory prefix
    (including its separator) from child paths.
    """
    sep_len = len(os.sep) if isDir else 0
    return len(path) + sep_len
# Templates
def template(html_file):
    """Absolute path of html_file inside the templates/ directory."""
    return os.path.join(base_dir, 'templates', html_file)
def template_with_base(content, title=None):
    """Embed content into templates/base.html, filling the title and
    content placeholders."""
    with open(template('base.html'), 'r') as fp:
        page = fp.read()
    page = page.replace('#_TITLE_#', title + ' ' if title else '')
    return page.replace('#_CONTENT_#', content)
# Other
# same regex as in `api/v1/contribute/index.php`
# bundle-id: 1-155 characters, letters / digits / dots / hyphens only
regex_bundle_id = re.compile(r'^[A-Za-z0-9\.\-]{1,155}$')
# Append errors to <root>/error.log. No explicit level is set, so the
# root logger stays at WARNING -- sufficient, since only error() is used.
logging.basicConfig(filename=path_root('error.log'),
                    format='%(asctime)s %(message)s',
                    filemode='a')
logger = logging.getLogger()  # root logger, shared by err() below
def usage(_file_, params=''):
    """Print a usage hint: the script's basename followed by its parameters."""
    script = os.path.basename(_file_)
    print(f' usage: {script} {params}')
def valid_bundle_id(bundle_id):
    """Return the regex match (truthy) when bundle_id is well-formed,
    otherwise None."""
    match = regex_bundle_id.match(bundle_id)
    return match
def err(scope, msg, logOnly=False):
    """Write an error to the log file under the given scope; also echo it
    to the console unless logOnly is set."""
    logger.error('[{}] {}'.format(scope, msg))
    if logOnly:
        return
    print(' [ERROR] ' + msg)
def printf(msg):
    """Print msg without a trailing newline and flush immediately
    (for inline progress output)."""
    print(msg, end='', flush=True)
# Binary Tree Search
# Lazy cache of src/3rd-domains.txt lines (loaded in parent_domain).
# Entries must be sorted, since bintree_lookup binary-searches them.
_list_TLD = None
def bintree_lookup(tree, needle):
    """Search the sorted list `tree` for `needle`.

    Returns True when needle is an exact member, or when needle is a
    dotted extension of the entry just before its insertion point
    (i.e. needle starts with '<entry>.'). Returns False otherwise.
    """
    # stdlib bisect replaces the hand-rolled binary search; the insertion
    # point it returns matches the final `lo` of the original loop.
    i = bisect.bisect_left(tree, needle)
    if i < len(tree) and tree[i] == needle:
        return True
    if i > 0 and needle.startswith(tree[i - 1] + '.'):
        return True
    return False
def parent_domain(subdomain):
    """Reduce a (sub)domain to its registrable parent domain.

    Keeps three labels when the last two labels form a known second-level
    TLD, otherwise two. Strings with fewer than three labels, and IP
    addresses, are returned unchanged.
    """
    def is_third_level(needle):
        # Lazily load and cache the TLD list on first use.
        # NOTE(review): the lookup key is built label-reversed
        # ('uk.co' for 'co.uk'), so 3rd-domains.txt presumably stores
        # entries in that reversed, sorted form -- confirm file format.
        global _list_TLD
        if not _list_TLD:
            with open(path_root('src', '3rd-domains.txt'), 'r') as fp:
                _list_TLD = [x.strip() for x in fp.readlines()]
        return bintree_lookup(_list_TLD, needle)
    parts = subdomain.split('.')
    if len(parts) < 3:
        return subdomain
    elif parts[-1].isdigit():
        return subdomain  # ip address
    elif is_third_level(parts[-1] + '.' + parts[-2]):
        return '.'.join(parts[-3:])
    else:
        return '.'.join(parts[-2:])
# Filesystem
def mkdir(path):
    """Create path including missing parents; an existing directory is fine."""
    os.makedirs(path, exist_ok=True)
def mv(path, to, printOmitPrefix=None):
    """Rename/move path to `to`.

    When printOmitPrefix is set, echo the operation with that many
    leading characters stripped from both paths.
    """
    if printOmitPrefix:
        print(' mv {} -> {}'.format(path[printOmitPrefix:], to[printOmitPrefix:]))
    os.rename(path, to)
def rm_file(file_path):
    """Delete a file; a missing file is silently ignored."""
    try:
        os.unlink(file_path)
    except FileNotFoundError:
        pass
def rm_dir(path):
    """Best-effort recursive delete of a directory tree.

    Missing paths and other filesystem errors are ignored; non-OS errors
    (e.g. a bad argument type) now propagate instead of being swallowed
    by the previous blanket `except Exception`.
    """
    try:
        shutil.rmtree(path)
    except OSError:
        pass  # deliberate best-effort: ignore FS-level failures
def dir_exists(path):
    """True when path is an existing directory (symlinks are followed)."""
    return Path(path).is_dir()
def file_exists(path):
    """True when path is an existing regular file with non-zero size."""
    if not os.path.isfile(path):
        return False
    return os.path.getsize(path) > 0
def symlink(source, target):
    """Create a symlink at target pointing to source, unless a non-empty
    regular target already exists."""
    if file_exists(target):
        return
    # file_exists() is False for a broken symlink, so clear any leftover
    # before creating the new link
    rm_file(target)
    os.symlink(source, target)
def mkdir_out_app(bundle_id):
    """Ensure the app's output directory exists.

    Returns True only when the directory was newly created.
    """
    out_dir = path_out_app(bundle_id)
    if dir_exists(out_dir):
        return False
    mkdir(out_dir)
    return True
def next_path(path_pattern):
    """First free path of a numbered series, e.g. 'file-%d.txt'.

    Probes 1, 2, 4, 8, ... until a free slot is found, then bisects the
    interval (half, bound] -- O(log n) existence checks instead of O(n).
    """
    hi = 1
    while os.path.exists(path_pattern % hi):
        hi *= 2
    lo = hi // 2
    while lo + 1 < hi:
        mid = (lo + hi) // 2
        if os.path.exists(path_pattern % mid):
            lo = mid
        else:
            hi = mid
    return path_pattern % hi
def diff_files(fileA, fileB):
    """Symmetric difference of two SORTED line-based files.

    Walks both files in lockstep (merge-style) and collects the stripped
    lines that appear in only one of the two. Relies on readline()
    returning '' only at EOF, and on both files sharing the same sort
    order -- unsorted input will produce spurious differences.
    """
    with open(fileA, 'r') as fpA:
        with open(fileB, 'r') as fpB:
            a = '_'  # sentinel: "not yet at EOF"
            b = '_'
            diff = []
            while a != '' and b != '':
                a = fpA.readline()
                b = fpB.readline()
                if a == b:
                    continue  # identical line on both sides -> no diff
                # advance whichever file has the smaller line until the
                # two streams re-align (or one side hits EOF)
                while a != b:
                    if a == '' or b == '':
                        break  # one file exhausted; drained below
                    if a < b:
                        diff.append(a.strip())
                        a = fpA.readline()
                    elif b < a:
                        diff.append(b.strip())
                        b = fpB.readline()
            # everything left in either file is unique to that file
            while a != '':
                diff.append(a.strip())
                a = fpA.readline()
            while b != '':
                diff.append(b.strip())
                b = fpB.readline()
            return diff
# Download
def download(url, isJSON=False):
    """GET url with a browser-like User-Agent.

    Returns the raw response bytes, or the parsed JSON document when
    isJSON is set.
    """
    request = curl.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with curl.urlopen(request) as response:
        payload = response.read()
    if isJSON:
        return json.loads(payload.decode())
    return payload
def download_file(url, path):
    """Download url directly into the local file at path."""
    curl.urlretrieve(url, path)
# Enumerator
_all_data_bundle_ids = None  # cache for appids_in_data(), filled on first call
def enum_newly_added():
    """Yield (path, bundle_id) for every pending 'in_<bundle-id>' file
    in data/_in."""
    for path in glob.glob(path_data('_in', 'in_*')):
        bundle_id = os.path.basename(path)[len('in_'):]
        yield path, bundle_id
def enum_jsons(bundle_id):
    """Yield (path, parsed_json) for every id_*.json file of an app."""
    for path in glob.glob(path_data_app(bundle_id, 'id_*.json')):
        with open(path, 'r') as fp:
            yield path, json.load(fp)
def appids_in_out(selection=None):
    """Bundle-ids to process: the explicit selection, or every app
    directory found under out/app/."""
    if selection and selection != ['*']:
        return selection
    pattern = path_out_app('*')
    return [os.path.basename(p) for p in glob.glob(pattern)]
def appids_in_data(selection=None):
    """Bundle-ids to process: the explicit selection, or (cached) every
    directory under data/ that contains a combined.json.

    Directory paths are mapped back to bundle-ids by replacing path
    separators with dots.
    """
    if selection and selection != ['*']:
        return selection
    global _all_data_bundle_ids
    if not _all_data_bundle_ids:
        root = path_data()
        offset = path_len(root)
        found = []
        for dirpath, _dirs, files in os.walk(root):
            if 'combined.json' in files:
                found.append(dirpath[offset:].replace(os.sep, '.'))
        _all_data_bundle_ids = found
    return _all_data_bundle_ids
# JSON
def try_del(index, keys):
    """Delete every key in keys from index; missing keys are ignored.

    Returns True when at least one entry was actually removed.
    """
    removed = False
    for key in keys:
        try:
            del index[key]
        except KeyError:
            continue
        removed = True
    return removed
def json_read(path):
    """Parse and return the JSON document stored at path."""
    return json.loads(Path(path).read_text())
def json_safe_read(path, fallback=None):
    """Like json_read, but returns fallback for a missing or empty file."""
    if not file_exists(path):
        return fallback
    return json_read(path)
def json_write(path, obj, pretty=False):
    """Serialize obj as JSON to path; pretty adds indentation and sorts keys."""
    indent = 2 if pretty else None
    with open(path, 'w') as fp:
        json.dump(obj, fp, indent=indent, sort_keys=pretty)