225 lines
9.3 KiB
Python
225 lines
9.3 KiB
Python
import re
|
||
|
||
class UniversalInflectionCompressor:
    """Compress a flat list of inflected forms into a slot-based table.

    All behaviour is driven by a configuration dictionary. The keys read by
    this class are:

    * ``normalization_rules`` - list of rules ``{'field', 'match',
      'match_mode', 'add_tags'}`` used to enrich forms with extra tags.
    * ``skip_normalization_if_source`` - when true (the default), forms that
      carry a truthy ``'source'`` value are not normalized.
    * ``properties`` - list of table-level property definitions
      ``{'name', 'default', 'multivalue', 'rules'}``.
    * ``clean_prefixes`` - prefixes stripped from the start of a form string.
    * ``schema`` - mapping of slot name to slot definition; a slot is either
      ``'single'`` (one string) or ``'list'`` (fixed-size list of strings).
    * ``preferred_sources`` - markers looked up in a form's ``'source'``;
      a form whose source contains one of them replaces a list cell that is
      already filled. Defaults to ``DEFAULT_PREFERRED_SOURCES``.
    * ``validate_completeness`` - when true, ``compress`` raises
      ``ValueError`` if a non-optional value is missing.
    """

    # Markers used when the config has no 'preferred_sources' key. These are
    # the values that were previously hard-coded in the list-slot logic.
    DEFAULT_PREFERRED_SOURCES = ("Flexion", "Conjugaison")

    def __init__(self, config: dict):
        """Store the configuration dictionary that drives every method."""
        self.config = config

    def _matches_criteria(self, form: dict, criteria: dict) -> bool:
        """Return True when ``form`` satisfies every key present in ``criteria``.

        Supported criteria keys:

        * ``form_regex`` - ``re.search`` pattern applied to ``form['form']``
          (a missing or ``None`` form string is treated as ``''``).
        * ``tags`` - every listed tag must be in ``form['tags']``.
        * ``raw_tags`` - every listed tag must be in ``form['raw_tags']``.
        * ``exclude_tags`` - none of the listed tags may be in ``form['tags']``.

        Missing or ``None`` tag lists on the form are treated as empty. An
        empty ``criteria`` dict matches every form.
        """
        if 'form_regex' in criteria:
            text = form.get('form', '')
            if text is None:
                text = ''
            if not re.search(criteria['form_regex'], text):
                return False

        if 'tags' in criteria or 'exclude_tags' in criteria:
            # `or ()` makes an explicit None behave like a missing key.
            form_tags = set(form.get('tags') or ())
            if 'tags' in criteria and not set(criteria['tags']).issubset(form_tags):
                return False
            if 'exclude_tags' in criteria and not form_tags.isdisjoint(criteria['exclude_tags']):
                return False

        if 'raw_tags' in criteria:
            form_raw = set(form.get('raw_tags') or ())
            if not set(criteria['raw_tags']).issubset(form_raw):
                return False

        return True

    @staticmethod
    def _rule_matches(form_value, expected, match_mode: str) -> bool:
        """Test one normalization rule against a (non-None) field value.

        In ``'regex'`` mode ``expected`` is a pattern searched in the string
        form of the value, or of any item when the value is a list. In any
        other mode the value must equal ``expected``, or contain it when the
        value is a list.
        """
        if match_mode == 'regex':
            if isinstance(form_value, list):
                return any(re.search(expected, str(item)) for item in form_value)
            return re.search(expected, str(form_value)) is not None
        if isinstance(form_value, list):
            return expected in form_value
        return expected == form_value

    def _normalize_forms(self, forms: list) -> list:
        """Enrich ``forms`` in place with tags from ``normalization_rules``.

        Forms with a truthy ``'source'`` are skipped unless
        ``skip_normalization_if_source`` is set to a false value. When a rule
        matches, its ``add_tags`` are appended to the form's tags; existing
        tag order is preserved and duplicates are dropped, so the result is
        deterministic from run to run.

        Returns the same list object that was passed in.
        """
        rules = self.config.get('normalization_rules', [])
        skip_if_source = self.config.get('skip_normalization_if_source', True)

        for form in forms:
            if skip_if_source and form.get('source'):
                continue

            for rule in rules:
                form_value = form.get(rule.get('field'))
                if form_value is None:
                    continue

                match_mode = rule.get('match_mode', 'exact')
                if not self._rule_matches(form_value, rule.get('match'), match_mode):
                    continue

                # A dict is used as an ordered set: keeps first-seen order
                # while removing duplicates.
                merged = dict.fromkeys(form.get('tags') or ())
                merged.update(dict.fromkeys(rule.get('add_tags', [])))
                form['tags'] = list(merged)
        return forms

    def _extract_properties(self, forms: list, entry_context: dict = None) -> dict:
        """Determine table-level properties (e.g. auxiliary, group).

        Each definition in ``config['properties']`` is evaluated against all
        ``forms`` plus ``entry_context`` (when given). A rule contributes its
        ``'value'`` if any candidate matches its ``'criteria'``.

        * single-value property: the value of the first matching rule, or
          ``default`` when no rule matches;
        * multi-value property: the sorted list of values of all matching
          rules, or ``default`` wrapped in a list (``None`` stays ``None``).

        Returns a new dict mapping property name to value.
        """
        candidates = list(forms)
        if entry_context:
            candidates.append(entry_context)

        properties = {}
        for prop_def in self.config.get('properties', []):
            name = prop_def['name']
            default_val = prop_def.get('default')
            is_multivalue = prop_def.get('multivalue', False)

            found_values = set()
            for rule in prop_def.get('rules', []):
                criteria = rule.get('criteria', {})
                if any(self._matches_criteria(c, criteria) for c in candidates):
                    found_values.add(rule['value'])
                    if not is_multivalue:
                        # First matching rule wins for single-value properties.
                        break

            if not found_values:
                if is_multivalue and default_val is not None:
                    # Copy list defaults so callers cannot mutate the config.
                    if isinstance(default_val, list):
                        properties[name] = list(default_val)
                    else:
                        properties[name] = [default_val]
                else:
                    properties[name] = default_val
            elif is_multivalue:
                properties[name] = sorted(found_values)
            else:
                properties[name] = next(iter(found_values))

        return properties

    def _clean_verb_string(self, form_string: str) -> str:
        """Strip configured leading prefixes from ``form_string``.

        Prefixes come from ``config['clean_prefixes']``. A prefix ending in an
        apostrophe (``'`` or ``’``) is removed as-is; any other prefix is only
        removed when followed by a space, and the space is removed with it.
        Stripping repeats until no prefix matches, so stacked prefixes are
        all removed.
        """
        prefixes = self.config.get('clean_prefixes', [])
        current = form_string.strip()

        changed = True
        while changed:
            changed = False
            for prefix in prefixes:
                token = prefix if prefix.endswith(("'", "’")) else prefix + " "
                if current.startswith(token):
                    current = current[len(token):]
                    changed = True
                    break  # restart from the first prefix
        return current

    def _is_preferred_source(self, source) -> bool:
        """Return True when ``source`` contains a preferred-source marker."""
        if not source:
            return False
        markers = self.config.get('preferred_sources', self.DEFAULT_PREFERRED_SOURCES)
        return any(marker in source for marker in markers)

    @staticmethod
    def _index_rule_criteria(index_rule: dict) -> dict:
        """Build the criteria for one index rule.

        Uses the rule's ``'criteria'`` dict; a top-level ``'tags'`` key is a
        shortcut that overrides the ``'tags'`` entry of those criteria.
        """
        criteria = index_rule.get('criteria', {})
        if 'tags' in index_rule:
            criteria = dict(criteria)
            criteria['tags'] = index_rule['tags']
        return criteria

    def _fill_single_slot(self, slot_def: dict, forms: list):
        """Return the cleaned form string for a ``'single'`` slot, or None.

        The first form matching ``slot_def['criteria']`` is used. A later
        matching form replaces it only if the current value is empty and the
        later form has a truthy ``'source'``.
        """
        criteria = slot_def.get('criteria', {})
        value = None
        for form in forms:
            if not self._matches_criteria(form, criteria):
                continue
            if value is None or (form.get('source') and not value):
                value = self._clean_verb_string(form['form'])
        return value

    def _fill_list_slot(self, slot_def: dict, forms: list) -> list:
        """Return the fixed-size list of cleaned strings for a ``'list'`` slot.

        Forms matching ``slot_def['base_criteria']`` are placed at the index
        of the first entry in ``slot_def['indices']`` they match. An occupied
        cell is only overwritten by a form from a preferred source. Cells
        that receive no form stay ``None``.
        """
        size = slot_def.get('size', 6)
        cells = [None] * size
        base_criteria = slot_def.get('base_criteria', {})
        # Criteria do not depend on the form, so build them once per slot.
        index_rules = [
            (self._index_rule_criteria(rule), rule)
            for rule in slot_def.get('indices', [])
        ]

        for form in forms:
            if not self._matches_criteria(form, base_criteria):
                continue

            idx = -1
            for criteria, rule in index_rules:
                if self._matches_criteria(form, criteria):
                    idx = rule['index']
                    break

            if not 0 <= idx < size:
                continue
            if cells[idx] is None or self._is_preferred_source(form.get('source')):
                cells[idx] = self._clean_verb_string(form['form'])
        return cells

    def _validate(self, result: dict, schema: dict, word) -> None:
        """Raise ValueError if ``result`` has a missing non-optional value.

        Every key of ``result`` is checked (properties as well as slots)
        unless its schema entry sets ``'optional'`` to true.
        """
        for key, val in result.items():
            if schema.get(key, {}).get('optional', False):
                continue
            if val is None:
                raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
            if isinstance(val, list):
                for i, v in enumerate(val):
                    if v is None:
                        raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")

    def compress(self, forms_list: list, word: str = None, entry: dict = None) -> dict:
        """Compress ``forms_list`` into a dict of properties and slots.

        Args:
            forms_list: form dicts; each is read through the keys ``'form'``,
                ``'tags'``, ``'raw_tags'`` and ``'source'``. The list and its
                dicts are not modified.
            word: fallback for the ``'infinitive'`` key of the result and
                used in error messages.
            entry: optional dict whose ``'word'``, ``'tags'`` and
                ``'raw_tags'`` are offered as an extra candidate when
                extracting properties.

        Returns:
            ``None`` when ``forms_list`` is empty; otherwise a dict holding
            every configured property and schema slot.

        Raises:
            ValueError: if ``validate_completeness`` is enabled and a
                non-optional value is missing.
        """
        if not forms_list:
            return None

        # 1. Normalize tags on shallow copies so the caller's forms are left
        #    untouched, including when validation fails further down.
        normalized_forms = self._normalize_forms([dict(form) for form in forms_list])

        # 2. Extract table-level properties; they seed the output.
        entry_context = None
        if entry:
            entry_context = {
                'form': entry.get('word', ''),
                'tags': entry.get('tags', []),
                'raw_tags': entry.get('raw_tags', []),
            }
        result = self._extract_properties(normalized_forms, entry_context)

        # 3. Fill slots; unknown slot types are ignored.
        schema = self.config.get('schema', {})
        for slot_name, slot_def in schema.items():
            slot_type = slot_def.get('type', 'single')
            if slot_type == 'single':
                result[slot_name] = self._fill_single_slot(slot_def, normalized_forms)
            elif slot_type == 'list':
                result[slot_name] = self._fill_list_slot(slot_def, normalized_forms)

        # 4. Fallback: use the headword when no infinitive was found.
        if not result.get('infinitive') and word:
            result['infinitive'] = word

        # 5. Optional completeness validation.
        if self.config.get('validate_completeness', False):
            self._validate(result, schema, word)

        return result
|
||
|
||
class InflectionProcessor:
    """Route entries to the compressor registered for their language and POS."""

    def __init__(self, configs):
        """Build one ``UniversalInflectionCompressor`` per item of ``configs``.

        ``configs`` maps a lookup key to a compressor configuration dict.
        ``process`` looks compressors up under ``"<lang_code>_<pos>"``.
        """
        self.compressors = {}
        for lookup_key, compressor_config in configs.items():
            self.compressors[lookup_key] = UniversalInflectionCompressor(compressor_config)

    def process(self, entry: dict) -> dict:
        """Replace ``entry['forms']`` with its compressed table when possible.

        The entry is returned unchanged when no compressor is registered for
        its ``lang_code``/``pos`` pair, or when compression yields a falsy
        result. Any exception raised while compressing is reported with
        ``print`` and the entry is still returned. The same ``entry`` object
        is always returned.
        """
        lang_code = entry.get('lang_code')
        pos = entry.get('pos')
        lookup_key = f"{lang_code}_{pos}"

        if lookup_key not in self.compressors:
            return entry

        try:
            compressor = self.compressors[lookup_key]
            table = compressor.compress(entry.get('forms'), entry.get('word'), entry=entry)
            if table:
                entry['forms'] = table
        except Exception as e:
            # Best effort: report the failure and keep the entry as it is.
            print(f"Error processing {entry.get('word')}: {e}")

        return entry