Use a simplified version of regenpfeifer, which is faster and doesn’t find briefs
This commit is contained in:
parent
1937db14c2
commit
b9cd83489b
2 changed files with 140 additions and 1 deletion
|
@ -68,9 +68,12 @@ in rec {
|
|||
pname = "regenpfeifer";
|
||||
version = inputs.regenpfeifer.lastModifiedDate;
|
||||
src = inputs.regenpfeifer;
|
||||
patches = [
|
||||
./regenpfeifer.patch
|
||||
];
|
||||
nativeBuildInputs = [regenpfeifer-env];
|
||||
buildPhase = ''
|
||||
PYTHONPATH=${regenpfeifer-env}/site-packages LC_ALL=C.UTF-8 pypy3 -m regenpfeifer.dictionary_generator ${wortformliste} $out unmatched.log 25000 100000
|
||||
PYTHONPATH=${regenpfeifer-env}/site-packages LC_ALL=C.UTF-8 pypy3 -m regenpfeifer.dictionary_generator ${wortformliste} $out unmatched.log 0 0
|
||||
'';
|
||||
installPhase = "cat unmatched.log";
|
||||
};
|
||||
|
|
136
misc/regenpfeifer.patch
Normal file
136
misc/regenpfeifer.patch
Normal file
|
@ -0,0 +1,136 @@
|
|||
diff --git a/regenpfeifer/stroke_generator.py b/regenpfeifer/stroke_generator.py
|
||||
index e35c6ba..e84630c 100644
|
||||
--- a/regenpfeifer/stroke_generator.py
|
||||
+++ b/regenpfeifer/stroke_generator.py
|
||||
@@ -39,7 +39,7 @@ class StrokeGenerator(object):
|
||||
word = word.lower()
|
||||
|
||||
splitted_word = self.word_splitter.split(word)
|
||||
- aggregated_words = self.stroke_aggregator.aggregate_strokes('/'.join(splitted_word))
|
||||
+ aggregated_words = ['/'.join(splitted_word)] #self.stroke_aggregator.aggregate_strokes('/'.join(splitted_word))
|
||||
|
||||
matched_strokes_list = []
|
||||
# for element in itertools.product(*somelists):
|
||||
diff --git a/regenpfeifer/word_syllable_splitter.py b/regenpfeifer/word_syllable_splitter.py
|
||||
index 47248d2..9d45887 100644
|
||||
--- a/regenpfeifer/word_syllable_splitter.py
|
||||
+++ b/regenpfeifer/word_syllable_splitter.py
|
||||
@@ -1,11 +1,13 @@
|
||||
'''
|
||||
-Created on 27.07.2019
|
||||
+Rewritten on 17.06.2022
|
||||
|
||||
-Based on Algorithm by Daniel Kirsch on https://www.wer-weiss-was.de/t/silbentrennung/544436
|
||||
+Uses pyphen
|
||||
|
||||
@author: mkoerner
|
||||
'''
|
||||
|
||||
+import pyphen
|
||||
+
|
||||
|
||||
class WordSyllableSplitter(object):
|
||||
'''
|
||||
@@ -16,91 +18,7 @@ class WordSyllableSplitter(object):
|
||||
'''
|
||||
Constructor
|
||||
'''
|
||||
- self.vowels = ['a', 'e', 'i', 'o', 'u', 'ä', 'ö', 'ü']
|
||||
-
|
||||
- self.split_vowel_pairs = ['io', 'eie', 'eue']
|
||||
- self.preventing_vowel_split_right = ['nen']
|
||||
-
|
||||
- self.splitters = ['sst', 'ier']
|
||||
- self.non_connectors = ['chl']
|
||||
- self.certain_connectors = ['sch', 'ch', 'ck', 'schl', 'chl']
|
||||
- self.left_connectors = ['er', 'an']
|
||||
- self.left_non_connectors = ['ana']
|
||||
- self.possible_connectors = ['ph', 'pf', 'br', 'pl', 'tr', 'gr', 'sp', 'kl', 'zw', 'spr', 'fr', 'gl', 'bl', 'ren']
|
||||
- self.separators = ['-', '*', ';', '.', '+', '=', ')', '(', '&', '!', '?', '', ':', ' ', '_', '~']
|
||||
-
|
||||
- def get_split_positions(self, word):
|
||||
- word = word.lower()
|
||||
- split_positions = []
|
||||
- word_length = len(word)
|
||||
- if word_length > 2:
|
||||
- split_allowed = False
|
||||
- for i in range(1, word_length):
|
||||
- z_minus_3 = ""
|
||||
- if i > 2:
|
||||
- z_minus_3 = word[i - 3]
|
||||
- z_minus_2 = ""
|
||||
- if i > 1:
|
||||
- z_minus_2 = word[i - 2]
|
||||
- z_minus_1 = word[i - 1]
|
||||
- if not split_allowed and z_minus_1 in self.vowels:
|
||||
- split_allowed = True
|
||||
- if split_allowed:
|
||||
- z = word[i]
|
||||
- z1 = ""
|
||||
- if word_length > i + 1:
|
||||
- z1 = word[i + 1]
|
||||
-
|
||||
- v = z_minus_1 + z
|
||||
-
|
||||
- v_extended = z_minus_2 + v
|
||||
- if v_extended in self.certain_connectors or v_extended in self.possible_connectors or v_extended in self.splitters or v_extended in self.split_vowel_pairs or v_extended in self.non_connectors:
|
||||
- v = v_extended
|
||||
-
|
||||
- v_extended = z_minus_3 + v_extended
|
||||
- if v_extended in self.certain_connectors or v_extended in self.possible_connectors or v_extended in self.splitters or v_extended in self.split_vowel_pairs or v_extended in self.non_connectors:
|
||||
- v = v_extended
|
||||
-
|
||||
- if v in self.split_vowel_pairs:
|
||||
- # get everything after i
|
||||
- z_i_plus = word[i + 1:]
|
||||
- if z_i_plus not in self.preventing_vowel_split_right:
|
||||
- split_positions.append(i)
|
||||
- continue
|
||||
-
|
||||
- elif z1 in self.vowels and z not in self.vowels and z not in self.separators and z_minus_1 not in self.separators:
|
||||
- if v in self.non_connectors:
|
||||
- continue
|
||||
- if v in self.certain_connectors:
|
||||
- self.add_split_position(i - len(v) + 1, word, split_positions)
|
||||
- elif v in self.splitters:
|
||||
- self.add_split_position(i, word, split_positions)
|
||||
- elif v + z1 in self.left_non_connectors:
|
||||
- self.add_split_position(i + 2, word, split_positions)
|
||||
- elif v in self.left_connectors :
|
||||
- self.add_split_position(i + 1, word, split_positions)
|
||||
- elif v in self.possible_connectors:
|
||||
- self.add_split_position(i - len(v) + 1, word, split_positions)
|
||||
- else:
|
||||
- self.add_split_position(i, word, split_positions)
|
||||
-
|
||||
- return split_positions
|
||||
-
|
||||
- def add_split_position(self, split_position, word, split_positions):
|
||||
- if split_position > 1 and split_position < len(word) - 1:
|
||||
- split_positions.append(split_position)
|
||||
+ self.dic = pyphen.Pyphen(lang="de_DE")
|
||||
|
||||
def split(self, word):
|
||||
- split_positions = self.get_split_positions(word)
|
||||
- syllables = []
|
||||
-
|
||||
- current_syllable = ''
|
||||
- for i in range(len(word)):
|
||||
- if i in split_positions:
|
||||
- syllables.append(current_syllable)
|
||||
- current_syllable = ''
|
||||
- current_syllable += word[i]
|
||||
- if len(current_syllable) > 0:
|
||||
- syllables.append(current_syllable)
|
||||
-
|
||||
- return syllables
|
||||
+ return self.dic.inserted(word, "\xAD").split("\xAD")
|
||||
diff --git a/requirements.txt b/requirements.txt
|
||||
index 7d23c96..054b8a3 100644
|
||||
--- a/requirements.txt
|
||||
+++ b/requirements.txt
|
||||
@@ -1 +1,2 @@
|
||||
-marisa_trie==0.7.7
|
||||
\ No newline at end of file
|
||||
+marisa_trie==0.7.7
|
||||
+pyphen==0.12.0
|
Reference in a new issue