use a simplified version of regenpfeifer which is faster and doesn’t find briefs

This commit is contained in:
Charlotte 🦝 Delenk 2022-06-17 20:27:31 +01:00
parent 1937db14c2
commit b9cd83489b
Signed by: darkkirb
GPG key ID: AB2BD8DAF2E37122
2 changed files with 140 additions and 1 deletions

View file

@@ -68,9 +68,12 @@ in rec {
pname = "regenpfeifer";
version = inputs.regenpfeifer.lastModifiedDate;
src = inputs.regenpfeifer;
patches = [
./regenpfeifer.patch
];
nativeBuildInputs = [regenpfeifer-env];
buildPhase = ''
PYTHONPATH=${regenpfeifer-env}/site-packages LC_ALL=C.UTF-8 pypy3 -m regenpfeifer.dictionary_generator ${wortformliste} $out unmatched.log 25000 100000
PYTHONPATH=${regenpfeifer-env}/site-packages LC_ALL=C.UTF-8 pypy3 -m regenpfeifer.dictionary_generator ${wortformliste} $out unmatched.log 0 0
'';
installPhase = "cat unmatched.log";
};

136
misc/regenpfeifer.patch Normal file
View file

@@ -0,0 +1,136 @@
diff --git a/regenpfeifer/stroke_generator.py b/regenpfeifer/stroke_generator.py
index e35c6ba..e84630c 100644
--- a/regenpfeifer/stroke_generator.py
+++ b/regenpfeifer/stroke_generator.py
@@ -39,7 +39,7 @@ class StrokeGenerator(object):
word = word.lower()
splitted_word = self.word_splitter.split(word)
- aggregated_words = self.stroke_aggregator.aggregate_strokes('/'.join(splitted_word))
+ aggregated_words = ['/'.join(splitted_word)] #self.stroke_aggregator.aggregate_strokes('/'.join(splitted_word))
matched_strokes_list = []
# for element in itertools.product(*somelists):
diff --git a/regenpfeifer/word_syllable_splitter.py b/regenpfeifer/word_syllable_splitter.py
index 47248d2..9d45887 100644
--- a/regenpfeifer/word_syllable_splitter.py
+++ b/regenpfeifer/word_syllable_splitter.py
@@ -1,11 +1,13 @@
'''
-Created on 27.07.2019
+Rewritten on 17.06.2022
-Based on Algorithm by Daniel Kirsch on https://www.wer-weiss-was.de/t/silbentrennung/544436
+Uses pyphen
@author: mkoerner
'''
+import pyphen
+
class WordSyllableSplitter(object):
'''
@@ -16,91 +18,7 @@ class WordSyllableSplitter(object):
'''
Constructor
'''
- self.vowels = ['a', 'e', 'i', 'o', 'u', 'ä', 'ö', 'ü']
-
- self.split_vowel_pairs = ['io', 'eie', 'eue']
- self.preventing_vowel_split_right = ['nen']
-
- self.splitters = ['sst', 'ier']
- self.non_connectors = ['chl']
- self.certain_connectors = ['sch', 'ch', 'ck', 'schl', 'chl']
- self.left_connectors = ['er', 'an']
- self.left_non_connectors = ['ana']
- self.possible_connectors = ['ph', 'pf', 'br', 'pl', 'tr', 'gr', 'sp', 'kl', 'zw', 'spr', 'fr', 'gl', 'bl', 'ren']
- self.separators = ['-', '*', ';', '.', '+', '=', ')', '(', '&', '!', '?', '', ':', ' ', '_', '~']
-
- def get_split_positions(self, word):
- word = word.lower()
- split_positions = []
- word_length = len(word)
- if word_length > 2:
- split_allowed = False
- for i in range(1, word_length):
- z_minus_3 = ""
- if i > 2:
- z_minus_3 = word[i - 3]
- z_minus_2 = ""
- if i > 1:
- z_minus_2 = word[i - 2]
- z_minus_1 = word[i - 1]
- if not split_allowed and z_minus_1 in self.vowels:
- split_allowed = True
- if split_allowed:
- z = word[i]
- z1 = ""
- if word_length > i + 1:
- z1 = word[i + 1]
-
- v = z_minus_1 + z
-
- v_extended = z_minus_2 + v
- if v_extended in self.certain_connectors or v_extended in self.possible_connectors or v_extended in self.splitters or v_extended in self.split_vowel_pairs or v_extended in self.non_connectors:
- v = v_extended
-
- v_extended = z_minus_3 + v_extended
- if v_extended in self.certain_connectors or v_extended in self.possible_connectors or v_extended in self.splitters or v_extended in self.split_vowel_pairs or v_extended in self.non_connectors:
- v = v_extended
-
- if v in self.split_vowel_pairs:
- # get everything after i
- z_i_plus = word[i + 1:]
- if z_i_plus not in self.preventing_vowel_split_right:
- split_positions.append(i)
- continue
-
- elif z1 in self.vowels and z not in self.vowels and z not in self.separators and z_minus_1 not in self.separators:
- if v in self.non_connectors:
- continue
- if v in self.certain_connectors:
- self.add_split_position(i - len(v) + 1, word, split_positions)
- elif v in self.splitters:
- self.add_split_position(i, word, split_positions)
- elif v + z1 in self.left_non_connectors:
- self.add_split_position(i + 2, word, split_positions)
- elif v in self.left_connectors :
- self.add_split_position(i + 1, word, split_positions)
- elif v in self.possible_connectors:
- self.add_split_position(i - len(v) + 1, word, split_positions)
- else:
- self.add_split_position(i, word, split_positions)
-
- return split_positions
-
- def add_split_position(self, split_position, word, split_positions):
- if split_position > 1 and split_position < len(word) - 1:
- split_positions.append(split_position)
+ self.dic = pyphen.Pyphen(lang="de_DE")
def split(self, word):
- split_positions = self.get_split_positions(word)
- syllables = []
-
- current_syllable = ''
- for i in range(len(word)):
- if i in split_positions:
- syllables.append(current_syllable)
- current_syllable = ''
- current_syllable += word[i]
- if len(current_syllable) > 0:
- syllables.append(current_syllable)
-
- return syllables
+ return self.dic.inserted(word, "\xAD").split("\xAD")
diff --git a/requirements.txt b/requirements.txt
index 7d23c96..054b8a3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-marisa_trie==0.7.7
\ No newline at end of file
+marisa_trie==0.7.7
+pyphen==0.12.0