diff --git a/misc/plover.nix b/misc/plover.nix index 8199db1..7e4c5ff 100644 --- a/misc/plover.nix +++ b/misc/plover.nix @@ -68,9 +68,12 @@ in rec { pname = "regenpfeifer"; version = inputs.regenpfeifer.lastModifiedDate; src = inputs.regenpfeifer; + patches = [ + ./regenpfeifer.patch + ]; nativeBuildInputs = [regenpfeifer-env]; buildPhase = '' - PYTHONPATH=${regenpfeifer-env}/site-packages LC_ALL=C.UTF-8 pypy3 -m regenpfeifer.dictionary_generator ${wortformliste} $out unmatched.log 25000 100000 + PYTHONPATH=${regenpfeifer-env}/site-packages LC_ALL=C.UTF-8 pypy3 -m regenpfeifer.dictionary_generator ${wortformliste} $out unmatched.log 0 0 ''; installPhase = "cat unmatched.log"; }; diff --git a/misc/regenpfeifer.patch b/misc/regenpfeifer.patch new file mode 100644 index 0000000..f25a26a --- /dev/null +++ b/misc/regenpfeifer.patch @@ -0,0 +1,136 @@ +diff --git a/regenpfeifer/stroke_generator.py b/regenpfeifer/stroke_generator.py +index e35c6ba..e84630c 100644 +--- a/regenpfeifer/stroke_generator.py ++++ b/regenpfeifer/stroke_generator.py +@@ -39,7 +39,7 @@ class StrokeGenerator(object): + word = word.lower() + + splitted_word = self.word_splitter.split(word) +- aggregated_words = self.stroke_aggregator.aggregate_strokes('/'.join(splitted_word)) ++ aggregated_words = ['/'.join(splitted_word)] #self.stroke_aggregator.aggregate_strokes('/'.join(splitted_word)) + + matched_strokes_list = [] + # for element in itertools.product(*somelists): +diff --git a/regenpfeifer/word_syllable_splitter.py b/regenpfeifer/word_syllable_splitter.py +index 47248d2..9d45887 100644 +--- a/regenpfeifer/word_syllable_splitter.py ++++ b/regenpfeifer/word_syllable_splitter.py +@@ -1,11 +1,13 @@ + ''' +-Created on 27.07.2019 ++Rewritten on 17.06.2022 + +-Based on Algorithm by Daniel Kirsch on https://www.wer-weiss-was.de/t/silbentrennung/544436 ++Uses pyphen + + @author: mkoerner + ''' + ++import pyphen ++ + + class WordSyllableSplitter(object): + ''' +@@ -16,91 +18,7 @@ class WordSyllableSplitter(object): + ''' + Constructor + ''' +- self.vowels = ['a', 'e', 'i', 'o', 'u', 'ä', 'ö', 'ü'] +- +- self.split_vowel_pairs = ['io', 'eie', 'eue'] +- self.preventing_vowel_split_right = ['nen'] +- +- self.splitters = ['sst', 'ier'] +- self.non_connectors = ['chl'] +- self.certain_connectors = ['sch', 'ch', 'ck', 'schl', 'chl'] +- self.left_connectors = ['er', 'an'] +- self.left_non_connectors = ['ana'] +- self.possible_connectors = ['ph', 'pf', 'br', 'pl', 'tr', 'gr', 'sp', 'kl', 'zw', 'spr', 'fr', 'gl', 'bl', 'ren'] +- self.separators = ['-', '*', ';', '.', '+', '=', ')', '(', '&', '!', '?', '', ':', ' ', '_', '~'] +- +- def get_split_positions(self, word): +- word = word.lower() +- split_positions = [] +- word_length = len(word) +- if word_length > 2: +- split_allowed = False +- for i in range(1, word_length): +- z_minus_3 = "" +- if i > 2: +- z_minus_3 = word[i - 3] +- z_minus_2 = "" +- if i > 1: +- z_minus_2 = word[i - 2] +- z_minus_1 = word[i - 1] +- if not split_allowed and z_minus_1 in self.vowels: +- split_allowed = True +- if split_allowed: +- z = word[i] +- z1 = "" +- if word_length > i + 1: +- z1 = word[i + 1] +- +- v = z_minus_1 + z +- +- v_extended = z_minus_2 + v +- if v_extended in self.certain_connectors or v_extended in self.possible_connectors or v_extended in self.splitters or v_extended in self.split_vowel_pairs or v_extended in self.non_connectors: +- v = v_extended +- +- v_extended = z_minus_3 + v_extended +- if v_extended in self.certain_connectors or v_extended in self.possible_connectors or v_extended in self.splitters or v_extended in self.split_vowel_pairs or v_extended in self.non_connectors: +- v = v_extended +- +- if v in self.split_vowel_pairs: +- # get everything after i +- z_i_plus = word[i + 1:] +- if z_i_plus not in self.preventing_vowel_split_right: +- split_positions.append(i) +- continue +- +- elif z1 in self.vowels and z not in self.vowels and z not in self.separators and z_minus_1 not in self.separators: +- if v in self.non_connectors: +- continue +- if v in self.certain_connectors: +- self.add_split_position(i - len(v) + 1, word, split_positions) +- elif v in self.splitters: +- self.add_split_position(i, word, split_positions) +- elif v + z1 in self.left_non_connectors: +- self.add_split_position(i + 2, word, split_positions) +- elif v in self.left_connectors : +- self.add_split_position(i + 1, word, split_positions) +- elif v in self.possible_connectors: +- self.add_split_position(i - len(v) + 1, word, split_positions) +- else: +- self.add_split_position(i, word, split_positions) +- +- return split_positions +- +- def add_split_position(self, split_position, word, split_positions): +- if split_position > 1 and split_position < len(word) - 1: +- split_positions.append(split_position) ++ self.dic = pyphen.Pyphen(lang="de_DE") + + def split(self, word): +- split_positions = self.get_split_positions(word) +- syllables = [] +- +- current_syllable = '' +- for i in range(len(word)): +- if i in split_positions: +- syllables.append(current_syllable) +- current_syllable = '' +- current_syllable += word[i] +- if len(current_syllable) > 0: +- syllables.append(current_syllable) +- +- return syllables ++ return self.dic.inserted(word, "\xAD").split("\xAD") +diff --git a/requirements.txt b/requirements.txt +index 7d23c96..054b8a3 100644 +--- a/requirements.txt ++++ b/requirements.txt +@@ -1 +1,2 @@ +-marisa_trie==0.7.7 +\ No newline at end of file ++marisa_trie==0.7.7 ++pyphen==0.12.0