Files
LiberPrayground/LP/NGrams.py
2021-02-12 00:36:01 +01:00

77 lines
2.5 KiB
Python
Executable File

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import re
from Alphabet import RUNES
from IOReader import re_norune
from RuneText import RuneText
from LPath import LPath
#########################################
# NGrams : loads and writes ngrams, also: translate english text to runes
#########################################
class NGrams(object):
    """Create, store and load rune n-gram frequency tables.

    Also translates English text to runes (through ``RuneText``) so that a
    baseline corpus can be counted. All files are read and written as UTF-8
    because they contain runic (non-ASCII) characters.
    """

    @staticmethod
    def translate(infile, outfile, stream=False):  # takes 10s
        """Translate the text in *infile* to runes and write them to *outfile*.

        The input is upper-cased and every character outside ``A-Z`` is
        either dropped (``stream=True``) or replaced by a single space
        (``stream=False``). The result is fed to ``RuneText.from_text``.
        Items whose ``kind`` is ``'r'`` are written as their ``rune``; each
        run of consecutive other items is written as one newline.

        NOTE(review): presumably ``kind == 'r'`` marks a rune and spaces come
        back as non-rune items, giving one word per line when ``stream`` is
        false -- confirm against ``RuneText``.

        Args:
            infile: path of the text file to read.
            outfile: path of the rune file to (over)write.
            stream: if true, remove separators instead of keeping them as
                spaces.
        """
        with open(infile, 'r', encoding='utf-8') as f:
            # The substitution already removes newlines (they are not A-Z),
            # so no further clean-up is needed in stream mode.
            src = re.sub(r'[^A-Z]', '' if stream else ' ', f.read().upper())

        with open(outfile, 'w', encoding='utf-8') as f:
            in_gap = False  # True while inside a run of non-rune items
            for r in RuneText.from_text(src):
                if r.kind != 'r':
                    # Collapse the whole run into a single line break.
                    if not in_gap:
                        f.write('\n')
                        in_gap = True
                    continue
                f.write(r.rune)
                in_gap = False

    @staticmethod
    def _count(data, gramsize, seed=()):
        """Count all overlapping n-grams of length *gramsize* in *data*.

        Args:
            data: the sequence (normally a string) to scan.
            gramsize: n-gram length, must be at least 1.
            seed: keys that should appear in the result with count 0 even if
                they never occur in *data*.

        Returns:
            dict mapping n-gram to count; seeded keys come first, then
            n-grams in order of first appearance.

        Raises:
            ValueError: if *gramsize* is smaller than 1.
        """
        if gramsize < 1:
            raise ValueError(f'gramsize must be >= 1, got {gramsize}')
        counts = {key: 0 for key in seed}
        # An input shorter than gramsize gives an empty range, so no n-grams.
        for i in range(len(data) - gramsize + 1):
            ngram = data[i:i + gramsize]
            counts[ngram] = counts.get(ngram, 0) + 1
        return counts

    @staticmethod
    def make(gramsize, infile, outfile):
        """Count the n-grams of *infile* and write the table to *outfile*.

        Everything matched by ``re_norune`` is removed from the input before
        counting. For ``gramsize == 1`` every entry of ``RUNES`` is listed,
        with count 0 if it does not occur. The output has one
        ``"<ngram> <count>"`` line per n-gram, highest count first; ties keep
        their insertion order because the sort is stable.

        Args:
            gramsize: n-gram length, must be at least 1.
            infile: path of the rune text to analyse.
            outfile: path of the n-gram table to (over)write.

        Raises:
            ValueError: if *gramsize* is smaller than 1.
        """
        with open(infile, 'r', encoding='utf-8') as f:
            data = re_norune.sub('', f.read())

        counts = NGrams._count(data, gramsize,
                               RUNES if gramsize == 1 else ())

        with open(outfile, 'w', encoding='utf-8') as f:
            for ngram, count in sorted(counts.items(),
                                       key=lambda item: -item[1]):
                f.write(f'{ngram} {count}\n')

    @staticmethod
    def load(ngram=1, prefix=''):
        """Load an n-gram table from the data file ``p{prefix}-{ngram}gram``.

        The file is located with ``LPath.data`` and is expected to contain
        whitespace-separated ``"<ngram> <count>"`` lines, as written by
        ``make``. Blank lines are ignored.

        Args:
            ngram: n-gram length used in the file name.
            prefix: text inserted between ``p`` and ``-{ngram}gram`` in the
                file name (for example ``'-solved'``).

        Returns:
            dict mapping each n-gram string to its integer count.

        Raises:
            ValueError: if a non-blank line does not consist of exactly two
                fields or the count is not an integer.
        """
        ret = {}
        path = LPath.data(f'p{prefix}-{ngram}gram')
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate empty / trailing blank lines
                r, v = line.split()
                ret[r] = int(v)
        return ret
if __name__ == '__main__':
    def make_translation(stream=False):  # if true, ignore spaces / word bounds
        """Translate the 'baseline-text' data file into 'baseline-rune'."""
        source = LPath.data('baseline-text')
        target = LPath.data('baseline-rune')
        NGrams.translate(source, target, stream)

    def make_ngrams(max_ngram=1):
        """Write the 1..max_ngram gram tables for all three corpora."""
        for size in range(1, max_ngram + 1):
            print(f'generate {size}-gram file')
            # (path resolver, input name, infix of the output file name)
            for locate, source_name, label in (
                (LPath.data, 'baseline-rune-words', ''),
                (LPath.root, '_solved.txt', '-solved'),
                (LPath.data, 'baseline-rune-no-e', '-no-e'),
            ):
                NGrams.make(size, infile=locate(source_name),
                            outfile=LPath.data(f'p{label}-{size}gram'))

    # Uncomment the step(s) that should be run:
    # make_translation(stream=False)
    # make_ngrams(5)