Migrate to gitea

This commit is contained in:
jonasgaudian
2026-02-13 00:10:40 +01:00
commit 6d06a9e14e
38 changed files with 31427 additions and 0 deletions

View File

@@ -0,0 +1,225 @@
import re
from typing import Optional
class UniversalInflectionCompressor:
    """
    A generic inflection compressor that uses a configuration dictionary
    to process, partition, and compress verb forms for any language.

    Configuration keys read by this class:
        normalization_rules: list of rules that add tags to matching forms.
        skip_normalization_if_source: when true (the default), forms that
            carry a truthy 'source' are not normalized.
        properties: list of table-level property definitions.
        clean_prefixes: prefixes stripped from the start of form strings.
        schema: mapping of slot name to slot definition ('single' or 'list').
        validate_completeness: when true, raise on any unfilled required slot.
    """

    # A form whose 'source' contains one of these markers may overwrite an
    # already-filled cell of a list slot.
    _PREFERRED_SOURCE_MARKERS = ("Flexion", "Conjugaison")

    # Prefixes ending in an apostrophe attach directly to the verb; all
    # other prefixes must be followed by a space to count as a prefix.
    # NOTE(review): the second marker is assumed to be the typographic
    # apostrophe (U+2019); the original comparison was against an empty
    # string, which is always true - confirm against the upstream file.
    _APOSTROPHES = ("'", "\u2019")

    def __init__(self, config: dict):
        self.config = config

    def _matches_criteria(self, form: dict, criteria: dict) -> bool:
        """Return True when *form* satisfies every condition in *criteria*.

        Supported criteria keys: 'form_regex' (searched in the form string),
        'tags' and 'raw_tags' (all must be present on the form) and
        'exclude_tags' (none may be present). Empty criteria match any form.
        """
        # Missing or None tag lists are treated as empty.
        form_tags = set(form.get('tags') or [])

        # Regex match
        if 'form_regex' in criteria:
            form_str = form.get('form', '')
            if form_str is None:
                form_str = ''
            if not re.search(criteria['form_regex'], form_str):
                return False

        # Tags inclusion
        if 'tags' in criteria and not set(criteria['tags']).issubset(form_tags):
            return False

        # Raw tags inclusion
        if 'raw_tags' in criteria:
            form_raw = set(form.get('raw_tags') or [])
            if not set(criteria['raw_tags']).issubset(form_raw):
                return False

        # Tag exclusion
        if 'exclude_tags' in criteria:
            if not form_tags.isdisjoint(criteria['exclude_tags']):
                return False

        return True

    @staticmethod
    def _value_matches(form_value, expected, match_mode: str) -> bool:
        """Compare one form field against a normalization rule's 'match' value.

        In 'regex' mode *expected* is searched in the stringified value (or in
        any stringified item when the value is a list). In any other mode the
        value must equal *expected* (or contain it when the value is a list).
        """
        if match_mode == 'regex':
            if isinstance(form_value, list):
                return any(re.search(expected, str(item)) for item in form_value)
            return re.search(expected, str(form_value)) is not None
        if isinstance(form_value, list):
            return expected in form_value
        return expected == form_value

    def _normalize_forms(self, forms: list) -> list:
        """Enrich forms with tags based on 'normalization_rules'.

        The form dicts are modified in place and the same list is returned.
        A form's 'tags' list is only rewritten when a rule matches; it is then
        de-duplicated, keeping existing tags in their original order followed
        by newly added ones.
        """
        rules = self.config.get('normalization_rules', [])
        skip_if_source = self.config.get('skip_normalization_if_source', True)

        for form in forms:
            if skip_if_source and form.get('source'):
                continue
            for rule in rules:
                form_value = form.get(rule.get('field'))
                if form_value is None:
                    continue
                if not self._value_matches(
                    form_value, rule.get('match'), rule.get('match_mode', 'exact')
                ):
                    continue
                merged = list(form.get('tags') or [])
                merged.extend(rule.get('add_tags', []))
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so the resulting tag list is the same on every run.
                form['tags'] = list(dict.fromkeys(merged))
        return forms

    def _extract_properties(self, forms: list, entry_context: Optional[dict] = None) -> dict:
        """Determine global properties (e.g. aux, group).

        Each property definition's rules are tested against every form and,
        when given, against *entry_context*. A single-valued property takes
        the value of the first rule (in configuration order) matched by any
        candidate; a multivalue property collects the values of all matched
        rules as a sorted list. Without a match the property's 'default' is
        used (wrapped in a list for multivalue properties unless it is None).
        """
        properties = {}
        candidates = forms.copy()
        if entry_context:
            candidates.append(entry_context)

        for prop_def in self.config.get('properties', []):
            name = prop_def['name']
            default_val = prop_def.get('default')
            is_multivalue = prop_def.get('multivalue', False)

            matched_values = []
            for rule in prop_def.get('rules', []):
                criteria = rule.get('criteria', {})
                if any(self._matches_criteria(c, criteria) for c in candidates):
                    matched_values.append(rule['value'])
                    if not is_multivalue:
                        break

            if not matched_values:
                if is_multivalue and default_val is not None:
                    properties[name] = (
                        default_val if isinstance(default_val, list) else [default_val]
                    )
                else:
                    properties[name] = default_val
            elif is_multivalue:
                properties[name] = sorted(set(matched_values))
            else:
                properties[name] = matched_values[0]
        return properties

    def _clean_verb_string(self, form_string: str) -> str:
        """Strip surrounding whitespace and configured leading prefixes.

        Prefixes from 'clean_prefixes' are removed repeatedly until none
        applies. A prefix ending in an apostrophe is removed when the string
        starts with it; any other prefix is removed only when followed by a
        space, so a verb that merely begins with the same letters is kept.
        """
        # Empty prefixes would match forever without consuming anything.
        prefixes = [p for p in self.config.get('clean_prefixes', []) if p]
        current_string = form_string.strip()

        changed = True
        while changed:
            changed = False
            for prefix in prefixes:
                if prefix.endswith(self._APOSTROPHES):
                    marker = prefix
                else:
                    marker = prefix + " "
                if current_string.startswith(marker):
                    current_string = current_string[len(marker):]
                    changed = True
                    break  # restart from the first prefix
        return current_string

    def _fill_single_slot(self, slot_def: dict, forms: list) -> Optional[str]:
        """Return the cleaned form for a 'single' slot, or None if none matches.

        The first matching form wins; a later matching form that has a truthy
        'source' may replace a value that cleaned down to an empty string.
        """
        criteria = slot_def.get('criteria', {})
        value = None
        for form in forms:
            if not self._matches_criteria(form, criteria):
                continue
            form_text = form.get('form')
            if form_text is None:
                continue  # nothing usable to store
            if value is None or (form.get('source') and not value):
                value = self._clean_verb_string(form_text)
        return value

    def _fill_list_slot(self, slot_def: dict, forms: list) -> list:
        """Return the cell list for a 'list' slot ('size' cells, default 6).

        Forms matching 'base_criteria' are placed at the index of the first
        entry in 'indices' they satisfy. An occupied cell is only overwritten
        by a form whose 'source' contains a preferred source marker.
        """
        size = slot_def.get('size', 6)
        cells = [None] * size
        base_criteria = slot_def.get('base_criteria', {})
        index_rules = slot_def.get('indices', [])

        for form in forms:
            if not self._matches_criteria(form, base_criteria):
                continue
            form_text = form.get('form')
            if form_text is None:
                continue  # nothing usable to store

            idx = -1
            for index_rule in index_rules:
                # Support full criteria in indices (e.g. form_regex); a
                # top-level 'tags' key is a shortcut that overrides
                # criteria['tags'].
                rule_criteria = index_rule.get('criteria', {})
                if 'tags' in index_rule:
                    rule_criteria = rule_criteria.copy()
                    rule_criteria['tags'] = index_rule['tags']
                if self._matches_criteria(form, rule_criteria):
                    idx = index_rule['index']
                    break

            if not 0 <= idx < size:
                continue

            source = form.get('source')
            is_preferred = bool(source) and any(
                marker in source for marker in self._PREFERRED_SOURCE_MARKERS
            )
            if cells[idx] is None or is_preferred:
                cells[idx] = self._clean_verb_string(form_text)
        return cells

    @staticmethod
    def _validate_completeness(result: dict, schema: dict, word) -> None:
        """Raise ValueError if a non-optional entry of *result* is unfilled.

        Every key of *result* is checked, including extracted properties; a
        key counts as optional only when its schema entry sets 'optional'.
        """
        for key, val in result.items():
            if schema.get(key, {}).get('optional', False):
                continue
            if val is None:
                raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
            if isinstance(val, list):
                for i, v in enumerate(val):
                    if v is None:
                        raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")

    def compress(self, forms_list: list, word: str = None, entry: dict = None) -> Optional[dict]:
        """Compress a list of form dicts into the configured slot table.

        Args:
            forms_list: form dicts; they are normalized in place.
            word: used as 'infinitive' when no infinitive was filled, and in
                validation error messages.
            entry: optional entry whose 'word', 'tags' and 'raw_tags' are
                offered as an extra candidate for property rules.

        Returns:
            A dict of extracted properties plus one key per schema slot, or
            None when *forms_list* is empty or None.

        Raises:
            ValueError: if 'validate_completeness' is enabled and a required
                slot or list cell is still None.
        """
        if not forms_list:
            return None

        # 1. Normalize tags
        normalized_forms = self._normalize_forms(forms_list)

        # 2. Extract properties
        entry_context = None
        if entry:
            entry_context = {
                'form': entry.get('word', ''),
                'tags': entry.get('tags', []),
                'raw_tags': entry.get('raw_tags', []),
            }
        table_properties = self._extract_properties(normalized_forms, entry_context)

        # 3. Initialize output
        result = table_properties.copy()

        # 4. Fill slots (slot types other than 'single'/'list' are ignored)
        schema = self.config.get('schema', {})
        for slot_name, slot_def in schema.items():
            slot_type = slot_def.get('type', 'single')
            if slot_type == 'single':
                result[slot_name] = self._fill_single_slot(slot_def, normalized_forms)
            elif slot_type == 'list':
                result[slot_name] = self._fill_list_slot(slot_def, normalized_forms)

        # 5. Fallbacks
        if not result.get('infinitive') and word:
            result['infinitive'] = word

        # 6. Validation
        if self.config.get('validate_completeness', False):
            self._validate_completeness(result, schema, word)

        return result
class InflectionProcessor:
    """Routes dictionary entries to a per-language, per-part-of-speech compressor."""

    def __init__(self, configs):
        """Build one UniversalInflectionCompressor per key/config pair in *configs*."""
        self.compressors = {k: UniversalInflectionCompressor(v) for k, v in configs.items()}

    def process(self, entry: dict) -> dict:
        """Replace entry['forms'] with its compressed table when a compressor applies.

        The compressor is looked up under "<lang_code>_<pos>". The entry is
        returned unchanged when no compressor is registered for that key,
        when compression yields nothing, or when compression raises (the
        error is printed and otherwise ignored).
        """
        lang_code = entry.get('lang_code')
        pos = entry.get('pos')
        key = f"{lang_code}_{pos}"

        # Guard clause: nothing to do for unconfigured language/POS pairs.
        if key not in self.compressors:
            return entry

        try:
            compressor = self.compressors[key]
            compressed = compressor.compress(entry.get('forms'), entry.get('word'), entry=entry)
            if compressed:
                entry['forms'] = compressed
        except Exception as e:
            # Best effort: a failing entry keeps its original forms.
            print(f"Error processing {entry.get('word')}: {e}")
        return entry