Files
Wictionary-Data-Parser/scripts/InflectionProcessor.py
2026-02-13 00:10:40 +01:00

225 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
class UniversalInflectionCompressor:
    """
    Configuration-driven compressor for a list of inflected forms.

    The behaviour is controlled entirely by the ``config`` dictionary. The
    keys read by this class are:

    * ``normalization_rules``: rules that add tags to forms.
    * ``skip_normalization_if_source``: when true (the default), forms with
      a truthy ``source`` are not normalized.
    * ``properties``: definitions of table-wide properties.
    * ``clean_prefixes``: leading strings stripped from form strings.
    * ``schema``: slot definitions (``single`` or ``list``) to fill.
    * ``validate_completeness``: when true, missing slots raise ValueError.
    """

    # A clean-prefix ending in an apostrophe (ASCII or typographic U+2019)
    # attaches directly to the next word; any other prefix is a separate
    # word and must be followed by a space to count as a match.
    _ELISION_MARKS = ("'", "\u2019")

    # In list slots, an already filled cell is only overwritten by a form
    # whose 'source' contains one of these markers.
    _PREFERRED_SOURCE_MARKERS = ("Flexion", "Conjugaison")

    def __init__(self, config: dict):
        """Store the configuration dictionary used by all other methods."""
        self.config = config

    def _matches_criteria(self, form: dict, criteria: dict) -> bool:
        """
        Check whether ``form`` satisfies every condition in ``criteria``.

        Supported criteria keys: ``form_regex`` (searched in the form
        string), ``tags`` and ``raw_tags`` (all listed values must be
        present) and ``exclude_tags`` (none may be present). An empty
        criteria dict matches everything.
        """
        if 'form_regex' in criteria:
            form_str = form.get('form', '')
            if form_str is None:
                form_str = ''
            if not re.search(criteria['form_regex'], form_str):
                return False

        # 'or []' also covers an explicit None stored under the key.
        form_tags = set(form.get('tags') or [])

        if 'tags' in criteria and not set(criteria['tags']).issubset(form_tags):
            return False

        if 'raw_tags' in criteria:
            form_raw = set(form.get('raw_tags') or [])
            if not set(criteria['raw_tags']).issubset(form_raw):
                return False

        if 'exclude_tags' in criteria:
            if not form_tags.isdisjoint(criteria['exclude_tags']):
                return False

        return True

    @staticmethod
    def _rule_matches(form_value, value_to_match, match_mode: str) -> bool:
        """Apply one normalization rule's match test to a field value."""
        if match_mode == 'regex':
            items = form_value if isinstance(form_value, list) else [form_value]
            return any(re.search(value_to_match, str(item)) for item in items)
        if isinstance(form_value, list):
            return value_to_match in form_value
        return value_to_match == form_value

    def _normalize_forms(self, forms: list) -> list:
        """
        Add tags to forms according to ``normalization_rules``.

        The form dictionaries are modified in place and the same list is
        returned. Existing tags keep their order; tags added by a rule are
        appended after them without duplicates, so the result is
        deterministic.
        """
        rules = self.config.get('normalization_rules', [])
        skip_if_source = self.config.get('skip_normalization_if_source', True)

        for form in forms:
            if skip_if_source and form.get('source'):
                continue
            for rule in rules:
                form_value = form.get(rule.get('field'))
                if form_value is None:
                    continue
                if self._rule_matches(form_value, rule.get('match'),
                                      rule.get('match_mode', 'exact')):
                    existing = form.get('tags') or []
                    added = rule.get('add_tags', [])
                    # dict.fromkeys de-duplicates while keeping first-seen order.
                    form['tags'] = list(dict.fromkeys([*existing, *added]))
        return forms

    def _extract_properties(self, forms: list, entry_context: dict = None) -> dict:
        """
        Compute the table-wide properties defined under ``properties``.

        Each property has rules of the shape ``{'value': ..., 'criteria': ...}``.
        A rule fires when any form (or ``entry_context``, if given) matches
        its criteria. Single-valued properties take the value of the first
        rule that fires; multi-valued ones collect the values of all firing
        rules as a sorted list. When no rule fires the ``default`` is used,
        wrapped in a list for multi-valued properties unless it is None.
        """
        candidates = list(forms)
        if entry_context:
            candidates.append(entry_context)

        properties = {}
        for prop_def in self.config.get('properties', []):
            name = prop_def['name']
            default_val = prop_def.get('default')
            is_multivalue = prop_def.get('multivalue', False)

            matched_values = []
            for rule in prop_def.get('rules', []):
                criteria = rule.get('criteria', {})
                if any(self._matches_criteria(c, criteria) for c in candidates):
                    matched_values.append(rule['value'])
                    if not is_multivalue:
                        break

            if not matched_values:
                if is_multivalue and default_val is not None:
                    properties[name] = (default_val if isinstance(default_val, list)
                                        else [default_val])
                else:
                    properties[name] = default_val
            elif is_multivalue:
                properties[name] = sorted(set(matched_values))
            else:
                properties[name] = matched_values[0]
        return properties

    def _clean_verb_string(self, form_string: str) -> str:
        """
        Strip configured leading prefixes from ``form_string``.

        Prefixes from ``clean_prefixes`` are removed repeatedly until none
        applies. A prefix ending in an apostrophe is removed when the string
        starts with it; any other prefix is removed, together with the
        following space, only when it is followed by a space. Empty prefixes
        are ignored because they would match forever.
        """
        prefixes = [p for p in self.config.get('clean_prefixes', []) if p]
        current_string = form_string.strip()

        changed = True
        while changed:
            changed = False
            for prefix in prefixes:
                if prefix.endswith(self._ELISION_MARKS):
                    lead = prefix
                else:
                    # Require the separating space so that a word prefix is
                    # never cut out of the beginning of a longer word.
                    lead = prefix + " "
                if current_string.startswith(lead):
                    current_string = current_string[len(lead):]
                    changed = True
                    break
        return current_string

    def _fill_single_slot(self, result: dict, slot_name: str, slot_def: dict,
                          forms: list) -> None:
        """Fill a 'single' slot with the cleaned string of a matching form."""
        criteria = slot_def.get('criteria', {})
        result[slot_name] = None
        for form in forms:
            if not self._matches_criteria(form, criteria):
                continue
            # The first match wins; a form with a source may still replace
            # a value that is empty.
            if result[slot_name] is None or (form.get('source') and not result[slot_name]):
                result[slot_name] = self._clean_verb_string(form['form'])

    def _resolve_index(self, form: dict, index_rules: list) -> int:
        """Return the index of the first index rule matching ``form``, or -1."""
        for index_rule in index_rules:
            rule_criteria = index_rule.get('criteria', {})
            if 'tags' in index_rule:
                # 'tags' directly on the rule is a shortcut that overrides
                # criteria['tags']; copy so the config is not modified.
                rule_criteria = rule_criteria.copy()
                rule_criteria['tags'] = index_rule['tags']
            if self._matches_criteria(form, rule_criteria):
                return index_rule['index']
        return -1

    def _fill_list_slot(self, result: dict, slot_name: str, slot_def: dict,
                        forms: list) -> None:
        """Fill a fixed-size 'list' slot, placing forms by their index rules."""
        size = slot_def.get('size', 6)
        cells = [None] * size
        result[slot_name] = cells

        base_criteria = slot_def.get('base_criteria', {})
        index_rules = slot_def.get('indices', [])
        for form in forms:
            if not self._matches_criteria(form, base_criteria):
                continue
            idx = self._resolve_index(form, index_rules)
            if not (0 <= idx < size):
                continue
            source = form.get('source')
            preferred = bool(source) and any(
                marker in source for marker in self._PREFERRED_SOURCE_MARKERS)
            if cells[idx] is None or preferred:
                cells[idx] = self._clean_verb_string(form['form'])

    def _validate(self, result: dict, schema: dict, word: str) -> None:
        """Raise ValueError for any non-optional result entry that is missing."""
        for key, val in result.items():
            if schema.get(key, {}).get('optional', False):
                continue
            if val is None:
                raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
            if isinstance(val, list):
                for i, v in enumerate(val):
                    if v is None:
                        raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")

    def compress(self, forms_list: list, word: str = None, entry: dict = None) -> dict:
        """
        Compress ``forms_list`` into a dictionary of properties and slots.

        Args:
            forms_list: Form dictionaries. They are normalized in place, so
                rules may add tags to them.
            word: Used as the ``infinitive`` value when that entry is
                missing or empty, and in error messages.
            entry: Optional entry; its ``word``, ``tags`` and ``raw_tags``
                take part in property extraction as an extra candidate.

        Returns:
            The extracted properties plus one entry per schema slot, or
            None when ``forms_list`` is empty.

        Raises:
            ValueError: If ``validate_completeness`` is enabled and a
                non-optional entry or list cell is None.
        """
        if not forms_list:
            return None

        # 1. Normalize tags
        normalized_forms = self._normalize_forms(forms_list)

        # 2. Extract properties
        entry_context = None
        if entry:
            entry_context = {
                'form': entry.get('word', ''),
                'tags': entry.get('tags', []),
                'raw_tags': entry.get('raw_tags', []),
            }
        result = self._extract_properties(normalized_forms, entry_context)

        # 3. Fill slots. Forms without a string cannot be cleaned, so they
        # are skipped here instead of raising inside _clean_verb_string.
        fillable = [f for f in normalized_forms if isinstance(f.get('form'), str)]
        schema = self.config.get('schema', {})
        for slot_name, slot_def in schema.items():
            slot_type = slot_def.get('type', 'single')
            if slot_type == 'single':
                self._fill_single_slot(result, slot_name, slot_def, fillable)
            elif slot_type == 'list':
                self._fill_list_slot(result, slot_name, slot_def, fillable)

        # 4. Fallbacks
        if not result.get('infinitive') and word:
            result['infinitive'] = word

        # 5. Validation
        if self.config.get('validate_completeness', False):
            self._validate(result, schema, word)

        return result
class InflectionProcessor:
    """
    Dispatches entries to a compressor chosen by language and part of speech.

    ``configs`` maps keys of the form ``"<lang_code>_<pos>"`` to compressor
    configuration dictionaries; one compressor is built per key.
    """

    def __init__(self, configs):
        """Build one UniversalInflectionCompressor for every config key."""
        self.compressors = {}
        for config_key, config in configs.items():
            self.compressors[config_key] = UniversalInflectionCompressor(config)

    def process(self, entry: dict) -> dict:
        """
        Replace ``entry['forms']`` with its compressed form when possible.

        The entry is returned unchanged when no compressor is registered for
        its ``lang_code``/``pos`` pair, when compression yields a falsy
        result, or when compression raises (the error is printed).
        """
        lookup_key = "{}_{}".format(entry.get('lang_code'), entry.get('pos'))
        if lookup_key not in self.compressors:
            return entry

        try:
            compressor = self.compressors[lookup_key]
            compressed = compressor.compress(
                entry.get('forms'),
                entry.get('word'),
                entry=entry,
            )
            if compressed:
                entry['forms'] = compressed
        except Exception as e:
            # Best effort: report the failure and keep the original entry.
            print(f"Error processing {entry.get('word')}: {e}")
        return entry