remove load_indices()

2021-02-13 00:41:19 +01:00
parent 686afb6f26
commit e4b4ed4498
5 changed files with 32 additions and 52 deletions
--- a/LP/IOReader.py
+++ b/LP/IOReader.py
@@ -1,32 +1,5 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 import re  # load_indices
 from Alphabet import RUNES
 from RuneText import RuneText
 re_norune = re.compile('[^' + ''.join(RUNES) + ']')
 #########################################
 #  load page and convert to indices for faster access
 #########################################
 def load_indices(fname, interrupt, maxinterrupt=None, minlen=None, limit=None):
    """Load a page file and return its runes as an index sequence.

    The file is read in full, every character matched by ``re_norune``
    (i.e. anything that is not one of ``RUNES``) is removed, and the
    remainder is passed through ``RuneText(...).index_no_white``.
    NOTE(review): ``index_no_white`` is presumably a list of integer rune
    indices -- confirm against RuneText.

    Args:
        fname: Path of the file to read.
        interrupt: Value compared against each element of the index
            sequence to detect an interrupt.
        maxinterrupt: If not None, the number of interrupts to keep; the
            result is cut right before the next interrupt after that many
            have been seen. None disables the cut.
        minlen: If truthy, a cut position smaller than ``minlen`` is
            rejected and the search moves on to a later interrupt.
        limit: Upper slice bound applied to the index sequence before any
            interrupt handling (None keeps everything).

    Returns:
        The (possibly truncated) index sequence.
    """
    with open(fname, 'r') as f:
        data = RuneText(re_norune.sub('', f.read())).index_no_white[:limit]
    if maxinterrupt is not None:
        # incl. everything up to but not including next interrupt
        # e.g., maxinterrupt = 0 will return text until first interrupt
        for i, x in enumerate(data):
            if x != interrupt:
                continue
            if maxinterrupt == 0:
                # Budget is used up: cut here, unless the prefix would be
                # shorter than minlen. In that case the counter stays at 0,
                # so the cut happens at the next interrupt that is far enough.
                if minlen and i < minlen:
                    continue
                return data[:i]
            maxinterrupt -= 1
    # Fewer interrupts than allowed (or no cut requested): return everything.
    return data
 #########################################
 #  find the longest chunk in a list of indices, which does not include an irp
--- a/LP/InterruptDB.py
+++ b/LP/InterruptDB.py
@@ -2,8 +2,9 @@
 # -*- coding: UTF-8 -*-
 import os
 from InterruptSearch import InterruptSearch
 from InterruptIndices import InterruptIndices
 from Probability import Probability
-from IOReader import load_indices
+from RuneText import RuneTextFile
 from LPath import FILES_ALL, FILES_UNSOLVED, LPath
@@ -82,15 +83,23 @@ class InterruptDB(object):
 #  helper functions
 #########################################
-def create_initial_db(dbname, fn_score, klset=range(1, 33),
+def get_db(fname, irp, max_irp):
-                      max_irp=20, irpset=range(29)):
+    T = False  # inverse
    _, Z = InterruptIndices().consider(fname, 28 - irp if T else irp, max_irp)
    data = RuneTextFile(LPath.page(fname)).index_no_white[:Z]
    if T:
        data = [28 - x for x in data]
    return InterruptDB(data, irp)
 def create_primary(dbname, fn_score, klset=range(1, 33),
                   max_irp=20, irpset=range(29)):
    oldDB = InterruptDB.load(dbname)
    oldValues = {k: set((a, b, c) for a, _, b, c, _ in v)
                 for k, v in oldDB.items()}
    for irp in irpset:  # interrupt rune index
        for name in FILES_ALL:
-            data = load_indices(LPath.page(name), irp, maxinterrupt=max_irp)
+            db = get_db(name, irp, max_irp)
            db = InterruptDB(data, irp)
            print('load:', name, 'interrupt:', irp, 'count:', db.irp_count)
            for keylen in klset:  # key length
                if (db.irp_count, irp, keylen) in oldValues.get(name, []):
@@ -100,8 +109,7 @@ def create_initial_db(dbname, fn_score, klset=range(1, 33),
                print(f'{keylen}: {score:.4f}, solutions: {len(interrupts)}')
-def find_secondary_solutions(db_in, db_out, fn_score,
+def create_secondary(db_in, db_out, fn_score, threshold=0.75, max_irp=20):
                             threshold=0.75, max_irp=20):
    oldDB = InterruptDB.load(db_in)
    search_set = set()
    for name, arr in oldDB.items():
@@ -114,16 +122,15 @@ def find_secondary_solutions(db_in, db_out, fn_score,
    print('searching through', len(search_set), 'files.')
    for name, irp, kl in search_set:
        print('load:', name, 'interrupt:', irp, 'keylen:', kl)
-        data = load_indices(LPath.page(name), irp, maxinterrupt=max_irp)
+        db = get_db(name, irp, max_irp)
        db = InterruptDB(data, irp)
        c = db.make_secondary(db_out, name, kl, fn_score, threshold)
        print('found', c, 'additional solutions')
 if __name__ == '__main__':
-    create_initial_db('db_high', Probability.IC_w_keylen, max_irp=20)
+    create_primary('db_high', Probability.IC_w_keylen, max_irp=20)
-    create_initial_db('db_norm', Probability.target_diff, max_irp=20)
+    create_primary('db_norm', Probability.target_diff, max_irp=20)
-    # find_secondary_solutions('db_high', 'db_high_secondary',
+    # create_secondary('db_high', 'db_high_secondary',
-    #                          Probability.IC_w_keylen, threshold=1.4)
+    #                  Probability.IC_w_keylen, threshold=1.4)
-    # find_secondary_solutions('db_norm', 'db_norm_secondary',
+    # create_secondary('db_norm', 'db_norm_secondary',
-    #                          Probability.target_diff, threshold=0.55)
+    #                  Probability.target_diff, threshold=0.55)
--- a/LP/InterruptIndices.py
+++ b/LP/InterruptIndices.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 from IOReader import load_indices
 from LPath import FILES_ALL, LPath
 from RuneText import RuneTextFile
 #########################################
@@ -20,18 +20,18 @@ class InterruptIndices(object):
    def total(self, name):
        """Return the ``'total'`` entry stored for ``name`` in ``self.pos``.

        NOTE(review): presumably the total number of runes in that file --
        confirm against the code that populates ``self.pos``.
        Raises KeyError if ``name`` (or its ``'total'`` entry) is missing.
        """
        return self.pos[name]['total']
-    def longest_no_interrupt(self, name, irp, irpmax=0):
+    # def longest_no_interrupt(self, name, irp, irpmax=0):
-        irpmax += 1
+    #     irpmax += 1
-        nums = self.pos[name]['pos'][irp] + [self.pos[name]['total']] * irpmax
+    #     nums = self.pos[name]['pos'][irp] + [self.pos[name]['total']] * irpmax
-        ret = [(y - x, x) for x, y in zip(nums, nums[irpmax:])]
+    #     ret = [(y - x, x) for x, y in zip(nums, nums[irpmax:])]
-        return sorted(ret, reverse=True)
+    #     return sorted(ret, reverse=True)
    @staticmethod
    def write(dbname='db_indices'):
        with open(LPath.db(dbname), 'w') as f:
            f.write('# file | total runes in file | interrupt | indices\n')
            for name in FILES_ALL:
-                data = load_indices(LPath.page(name), 0)
+                data = RuneTextFile(LPath.page(name)).index_no_white
                total = len(data)
                nums = [[] for x in range(29)]
                for idx, rune in enumerate(data):
--- a/LP/NGrams.py
+++ b/LP/NGrams.py
@@ -2,7 +2,6 @@
 # -*- coding: UTF-8 -*-
 import re
 from Alphabet import RUNES
 from IOReader import re_norune
 from RuneText import RuneText
 from LPath import LPath
@@ -33,7 +32,7 @@ class NGrams(object):
    @staticmethod
    def make(gramsize, infile, outfile):
        with open(infile, 'r') as f:
-            data = re_norune.sub('', f.read())
+            data = re.sub('[^' + ''.join(RUNES) + ']', '', f.read())
        res = {x: 0 for x in RUNES} if gramsize == 1 else {}
        for i in range(len(data) - gramsize + 1):
@@ -74,3 +73,4 @@ if __name__ == '__main__':
    # make_translation(stream=False)
    # make_ngrams(5)
    print(NGrams.load(2))
--- a/LP/init.py
+++ b/LP/init.py
@@ -10,7 +10,7 @@ from Alphabet import RUNES, alphabet
 from Rune import Rune
 from RuneText import RuneText, RuneTextFile
-from IOReader import load_indices, longest_no_interrupt
+from IOReader import longest_no_interrupt
 from IOWriter import IOWriter
 from RuneSolver import SequenceSolver, VigenereSolver, AffineSolver, AutokeySolver