remove load_indices()

2021-02-13 00:41:19 +01:00
parent 686afb6f26
commit e4b4ed4498
5 changed files with 32 additions and 52 deletions
--- a/LP/IOReader.py
+++ b/LP/IOReader.py
@@ -1,32 +1,5 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-import re  # load_indices
-from Alphabet import RUNES
-from RuneText import RuneText
-
-re_norune = re.compile('[^' + ''.join(RUNES) + ']')
-
-
-#########################################
-#  load page and convert to indices for faster access
-#########################################
-
-def load_indices(fname, interrupt, maxinterrupt=None, minlen=None, limit=None):
-    with open(fname, 'r') as f:
-        data = RuneText(re_norune.sub('', f.read())).index_no_white[:limit]
-    if maxinterrupt is not None:
-        # incl. everything up to but not including next interrupt
-        # e.g., maxinterrupt = 0 will return text until first interrupt
-        for i, x in enumerate(data):
-            if x != interrupt:
-                continue
-            if maxinterrupt == 0:
-                if minlen and i < minlen:
-                    continue
-                return data[:i]
-            maxinterrupt -= 1
-    return data
-

 #########################################
 #  find the longest chunk in a list of indices, which does not include an irp
--- a/LP/InterruptDB.py
+++ b/LP/InterruptDB.py
@@ -2,8 +2,9 @@
 # -*- coding: UTF-8 -*-
 import os
 from InterruptSearch import InterruptSearch
+from InterruptIndices import InterruptIndices
 from Probability import Probability
-from IOReader import load_indices
+from RuneText import RuneTextFile
 from LPath import FILES_ALL, FILES_UNSOLVED, LPath


@@ -82,15 +83,23 @@ class InterruptDB(object):
 #  helper functions
 #########################################

-def create_initial_db(dbname, fn_score, klset=range(1, 33),
-                      max_irp=20, irpset=range(29)):
+def get_db(fname, irp, max_irp):
+    T = False  # inverse mode: if True, look up 28 - irp and map x -> 28 - x
+    _, Z = InterruptIndices().consider(fname, 28 - irp if T else irp, max_irp)
+    data = RuneTextFile(LPath.page(fname)).index_no_white[:Z]  # first Z only
+    if T:
+        data = [28 - x for x in data]  # flip every index when inverse is on
+    return InterruptDB(data, irp)
+
+
+def create_primary(dbname, fn_score, klset=range(1, 33),
+                   max_irp=20, irpset=range(29)):
    oldDB = InterruptDB.load(dbname)
    oldValues = {k: set((a, b, c) for a, _, b, c, _ in v)
                 for k, v in oldDB.items()}
    for irp in irpset:  # interrupt rune index
        for name in FILES_ALL:
-            data = load_indices(LPath.page(name), irp, maxinterrupt=max_irp)
-            db = InterruptDB(data, irp)
+            db = get_db(name, irp, max_irp)
            print('load:', name, 'interrupt:', irp, 'count:', db.irp_count)
            for keylen in klset:  # key length
                if (db.irp_count, irp, keylen) in oldValues.get(name, []):
@@ -100,8 +109,7 @@ def create_initial_db(dbname, fn_score, klset=range(1, 33),
                print(f'{keylen}: {score:.4f}, solutions: {len(interrupts)}')


-def find_secondary_solutions(db_in, db_out, fn_score,
-                             threshold=0.75, max_irp=20):
+def create_secondary(db_in, db_out, fn_score, threshold=0.75, max_irp=20):
    oldDB = InterruptDB.load(db_in)
    search_set = set()
    for name, arr in oldDB.items():
@@ -114,16 +122,15 @@ def find_secondary_solutions(db_in, db_out, fn_score,
    print('searching through', len(search_set), 'files.')
    for name, irp, kl in search_set:
        print('load:', name, 'interrupt:', irp, 'keylen:', kl)
-        data = load_indices(LPath.page(name), irp, maxinterrupt=max_irp)
-        db = InterruptDB(data, irp)
+        db = get_db(name, irp, max_irp)
        c = db.make_secondary(db_out, name, kl, fn_score, threshold)
        print('found', c, 'additional solutions')


 if __name__ == '__main__':
-    create_initial_db('db_high', Probability.IC_w_keylen, max_irp=20)
-    create_initial_db('db_norm', Probability.target_diff, max_irp=20)
-    # find_secondary_solutions('db_high', 'db_high_secondary',
-    #                          Probability.IC_w_keylen, threshold=1.4)
-    # find_secondary_solutions('db_norm', 'db_norm_secondary',
-    #                          Probability.target_diff, threshold=0.55)
+    create_primary('db_high', Probability.IC_w_keylen, max_irp=20)
+    create_primary('db_norm', Probability.target_diff, max_irp=20)
+    # create_secondary('db_high', 'db_high_secondary',
+    #                  Probability.IC_w_keylen, threshold=1.4)
+    # create_secondary('db_norm', 'db_norm_secondary',
+    #                  Probability.target_diff, threshold=0.55)
--- a/LP/InterruptIndices.py
+++ b/LP/InterruptIndices.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-from IOReader import load_indices
 from LPath import FILES_ALL, LPath
+from RuneText import RuneTextFile


 #########################################
@@ -20,18 +20,18 @@ class InterruptIndices(object):
    def total(self, name):
        return self.pos[name]['total']

-    def longest_no_interrupt(self, name, irp, irpmax=0):
-        irpmax += 1
-        nums = self.pos[name]['pos'][irp] + [self.pos[name]['total']] * irpmax
-        ret = [(y - x, x) for x, y in zip(nums, nums[irpmax:])]
-        return sorted(ret, reverse=True)
+    # def longest_no_interrupt(self, name, irp, irpmax=0):
+    #     irpmax += 1
+    #     nums = self.pos[name]['pos'][irp] + [self.pos[name]['total']] * irpmax
+    #     ret = [(y - x, x) for x, y in zip(nums, nums[irpmax:])]
+    #     return sorted(ret, reverse=True)

    @staticmethod
    def write(dbname='db_indices'):
        with open(LPath.db(dbname), 'w') as f:
            f.write('# file | total runes in file | interrupt | indices\n')
            for name in FILES_ALL:
-                data = load_indices(LPath.page(name), 0)
+                data = RuneTextFile(LPath.page(name)).index_no_white
                total = len(data)
                nums = [[] for x in range(29)]
                for idx, rune in enumerate(data):
--- a/LP/NGrams.py
+++ b/LP/NGrams.py
@@ -2,7 +2,6 @@
 # -*- coding: UTF-8 -*-
 import re
 from Alphabet import RUNES
-from IOReader import re_norune
 from RuneText import RuneText
 from LPath import LPath

@@ -33,7 +32,7 @@ class NGrams(object):
    @staticmethod
    def make(gramsize, infile, outfile):
        with open(infile, 'r') as f:
-            data = re_norune.sub('', f.read())
+            data = re.sub('[^' + ''.join(RUNES) + ']', '', f.read())

        res = {x: 0 for x in RUNES} if gramsize == 1 else {}
        for i in range(len(data) - gramsize + 1):
@@ -74,3 +73,4 @@ if __name__ == '__main__':

    # make_translation(stream=False)
    # make_ngrams(5)
+    print(NGrams.load(2))
--- a/LP/init.py
+++ b/LP/init.py
@@ -10,7 +10,7 @@ from Alphabet import RUNES, alphabet
 from Rune import Rune
 from RuneText import RuneText, RuneTextFile

-from IOReader import load_indices, longest_no_interrupt
+from IOReader import longest_no_interrupt
 from IOWriter import IOWriter

 from RuneSolver import SequenceSolver, VigenereSolver, AffineSolver, AutokeySolver