Migrate to gitea
This commit is contained in:
225
scripts/InflectionProcessor.py
Normal file
225
scripts/InflectionProcessor.py
Normal file
@@ -0,0 +1,225 @@
|
||||
import re
|
||||
|
||||
class UniversalInflectionCompressor:
    """
    A generic inflection compressor that uses a configuration dictionary
    to process, partition, and compress verb forms for any language.

    Configuration keys read by this class (all optional):

    * ``normalization_rules``: list of rules (``field``, ``match``,
      ``match_mode``, ``add_tags``) used to add tags to forms.
    * ``skip_normalization_if_source``: when true (the default), forms with
      a truthy ``source`` are not normalized.
    * ``properties``: list of property definitions (``name``, ``default``,
      ``multivalue``, ``rules``) copied into the result.
    * ``clean_prefixes``: prefixes stripped from the start of form strings.
    * ``schema``: mapping of slot name to slot definition (``type`` of
      ``'single'`` or ``'list'``).
    * ``validate_completeness``: when true, missing values raise
      ``ValueError``.
    * ``preferred_source_markers``: substrings of a form's ``source`` that
      let the form overwrite an already-filled list-slot position.
    """

    # Defaults for 'preferred_source_markers'; these are the two markers the
    # list-slot override has always checked for.
    DEFAULT_PREFERRED_SOURCE_MARKERS = ("Flexion", "Conjugaison")

    def __init__(self, config: dict):
        """Store the configuration dictionary used by every other method."""
        self.config = config

    def _matches_criteria(self, form: dict, criteria: dict) -> bool:
        """Helper: Checks if a form matches specific criteria.

        Supported criteria keys, all of which must hold when present:
        ``form_regex`` (``re.search`` against the form string), ``tags`` and
        ``raw_tags`` (required subsets), ``exclude_tags`` (must not overlap
        the form's tags). Empty criteria match every form.
        """
        # Regex match; a missing or None form string is treated as ''.
        if 'form_regex' in criteria:
            form_str = form.get('form')
            if form_str is None:
                form_str = ''
            if not re.search(criteria['form_regex'], form_str):
                return False

        # 'or []' tolerates a key that is present but explicitly None.
        form_tags = set(form.get('tags') or [])

        # Tags inclusion
        if 'tags' in criteria and not set(criteria['tags']).issubset(form_tags):
            return False

        # Raw tags inclusion
        if 'raw_tags' in criteria:
            form_raw = set(form.get('raw_tags') or [])
            if not set(criteria['raw_tags']).issubset(form_raw):
                return False

        # Tag exclusion
        if 'exclude_tags' in criteria:
            if not form_tags.isdisjoint(criteria['exclude_tags']):
                return False

        return True

    @staticmethod
    def _value_matches(form_value, expected, match_mode: str) -> bool:
        """Return whether a form field value satisfies one normalization rule.

        In ``'regex'`` mode the pattern is searched in the string form of the
        value (or of any item when the value is a list). In any other mode
        the comparison is equality (or membership for a list value).
        """
        is_list = isinstance(form_value, list)
        if match_mode == 'regex':
            items = form_value if is_list else [form_value]
            return any(re.search(expected, str(item)) for item in items)
        if is_list:
            return expected in form_value
        return expected == form_value

    def _normalize_forms(self, forms: list) -> list:
        """Enriches forms with tags based on 'normalization_rules'.

        The form dictionaries are updated in place and the same list is
        returned. Existing tag order is preserved and new tags are appended
        in rule order without duplicates, so the result is deterministic.
        """
        rules = self.config.get('normalization_rules', [])
        skip_if_source = self.config.get('skip_normalization_if_source', True)

        for form in forms:
            if form.get('source') and skip_if_source:
                continue

            for rule in rules:
                form_value = form.get(rule.get('field'))
                if form_value is None:
                    continue

                matched = self._value_matches(
                    form_value,
                    rule.get('match'),
                    rule.get('match_mode', 'exact'),
                )
                if matched:
                    existing = form.get('tags') or []
                    # dict.fromkeys de-duplicates while keeping first-seen order.
                    form['tags'] = list(
                        dict.fromkeys([*existing, *rule.get('add_tags', [])])
                    )
        return forms

    def _extract_properties(self, forms: list, entry_context: dict = None) -> dict:
        """Determines global properties (e.g. aux, group).

        Each property definition is evaluated against every form plus the
        optional ``entry_context``. A single-valued property takes the value
        of the first rule that matches any candidate; a multivalue property
        collects the values of all matching rules, sorted. When nothing
        matches, the definition's ``default`` is used (wrapped in a list for
        multivalue properties with a non-None default).
        """
        candidates = list(forms)
        if entry_context:
            candidates.append(entry_context)

        properties = {}
        for prop_def in self.config.get('properties', []):
            name = prop_def['name']
            default_val = prop_def.get('default')
            is_multivalue = prop_def.get('multivalue', False)

            matched_values = []
            for rule in prop_def.get('rules', []):
                criteria = rule.get('criteria', {})
                if any(self._matches_criteria(c, criteria) for c in candidates):
                    matched_values.append(rule['value'])
                    if not is_multivalue:
                        break  # first matching rule wins

            if matched_values:
                if is_multivalue:
                    properties[name] = sorted(set(matched_values))
                else:
                    properties[name] = matched_values[0]
            elif is_multivalue and default_val is not None:
                # Copy a list default so the result never aliases the config.
                if isinstance(default_val, list):
                    properties[name] = list(default_val)
                else:
                    properties[name] = [default_val]
            else:
                properties[name] = default_val

        return properties

    def _clean_verb_string(self, form_string: str) -> str:
        """Strip surrounding whitespace and leading 'clean_prefixes'.

        Prefixes ending in an apostrophe (``'`` or ``’``) are removed when the
        string starts with them directly; any other prefix is removed only
        when followed by a space. Stripping repeats until no prefix applies.
        """
        prefixes = self.config.get('clean_prefixes', [])
        text = form_string.strip()

        stripped_any = True
        while stripped_any:
            stripped_any = False
            for prefix in prefixes:
                if prefix.endswith(("'", "’")):
                    needle = prefix
                else:
                    needle = prefix + " "
                if text.startswith(needle):
                    text = text[len(needle):]
                    stripped_any = True
                    break  # restart from the first prefix
        return text

    def _fill_single_slot(self, slot_def: dict, forms: list):
        """Return the cleaned form for a 'single' slot, or None if none match."""
        criteria = slot_def.get('criteria', {})
        value = None
        for form in forms:
            if not self._matches_criteria(form, criteria):
                continue
            # First match fills the slot; a sourced form may replace a value
            # that was cleaned down to an empty string.
            if value is None or (form.get('source') and not value):
                value = self._clean_verb_string(form['form'])
        return value

    def _fill_list_slot(self, slot_def: dict, forms: list) -> list:
        """Return the fixed-size list for a 'list' slot (None = unfilled)."""
        size = slot_def.get('size', 6)
        values = [None] * size
        base_criteria = slot_def.get('base_criteria', {})
        index_rules = slot_def.get('indices', [])
        markers = self.config.get(
            'preferred_source_markers', self.DEFAULT_PREFERRED_SOURCE_MARKERS
        )

        for form in forms:
            if not self._matches_criteria(form, base_criteria):
                continue

            # Find the first index rule this form satisfies.
            idx = -1
            for index_rule in index_rules:
                # Full criteria are supported; a top-level 'tags' key is a
                # shortcut that overrides criteria['tags'].
                rule_criteria = index_rule.get('criteria', {})
                if 'tags' in index_rule:
                    rule_criteria = {**rule_criteria, 'tags': index_rule['tags']}
                if self._matches_criteria(form, rule_criteria):
                    idx = index_rule['index']
                    break

            if not 0 <= idx < size:
                continue

            source = form.get('source')
            is_preferred = bool(source) and any(m in source for m in markers)
            # Fill an empty position, or overwrite it from a preferred source.
            if values[idx] is None or is_preferred:
                values[idx] = self._clean_verb_string(form['form'])
        return values

    @staticmethod
    def _validate(result: dict, schema: dict, word) -> None:
        """Raise ValueError if any non-optional result value is missing.

        Every key of ``result`` is checked, not only schema slots; a key is
        skipped only when its schema definition sets ``optional``.
        """
        for key, val in result.items():
            if schema.get(key, {}).get('optional', False):
                continue
            if val is None:
                raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
            if isinstance(val, list):
                for i, v in enumerate(val):
                    if v is None:
                        raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")

    def compress(self, forms_list: list, word: str = None, entry: dict = None) -> dict:
        """Compress a list of form dictionaries into a property/slot mapping.

        Args:
            forms_list: form dictionaries; they are tag-normalized in place.
            word: fallback value for the ``'infinitive'`` key when that key
                is missing or falsy in the result.
            entry: optional entry whose ``word``, ``tags`` and ``raw_tags``
                are offered as an extra candidate for property rules.

        Returns:
            A dict holding the extracted properties plus one key per schema
            slot, or ``None`` when ``forms_list`` is empty.

        Raises:
            ValueError: if ``validate_completeness`` is set and a
                non-optional value is missing.
            KeyError: if a form selected for a slot has no ``'form'`` key.
        """
        if not forms_list:
            return None

        # 1. Normalize tags
        normalized_forms = self._normalize_forms(forms_list)

        # 2. Extract properties
        entry_context = None
        if entry:
            entry_context = {
                'form': entry.get('word', ''),
                'tags': entry.get('tags', []),
                'raw_tags': entry.get('raw_tags', []),
            }

        # 3. Initialize output (a fresh dict owned by this call)
        result = self._extract_properties(normalized_forms, entry_context)

        # 4. Fill slots; slot types other than 'single'/'list' are ignored.
        schema = self.config.get('schema', {})
        for slot_name, slot_def in schema.items():
            slot_type = slot_def.get('type', 'single')
            if slot_type == 'single':
                result[slot_name] = self._fill_single_slot(slot_def, normalized_forms)
            elif slot_type == 'list':
                result[slot_name] = self._fill_list_slot(slot_def, normalized_forms)

        # 5. Fallbacks
        if not result.get('infinitive') and word:
            result['infinitive'] = word

        # 6. Validation
        if self.config.get('validate_completeness', False):
            self._validate(result, schema, word)

        return result
|
||||
|
||||
class InflectionProcessor:
    """Routes entries to a per-language, per-part-of-speech compressor."""

    def __init__(self, configs):
        """Build one UniversalInflectionCompressor per key of ``configs``."""
        self.compressors = {}
        for config_key, config in configs.items():
            self.compressors[config_key] = UniversalInflectionCompressor(config)

    def process(self, entry: dict) -> dict:
        """Replace ``entry['forms']`` with its compressed form when possible.

        The compressor is looked up under ``"<lang_code>_<pos>"``. The entry
        is returned unchanged when no compressor is registered, when the
        compressor returns a falsy result, or when compression raises (the
        error is printed and otherwise ignored).
        """
        lookup_key = f"{entry.get('lang_code')}_{entry.get('pos')}"
        if lookup_key not in self.compressors:
            return entry

        try:
            compressor = self.compressors[lookup_key]
            packed = compressor.compress(
                entry.get('forms'),
                entry.get('word'),
                entry=entry,
            )
            if packed:
                entry['forms'] = packed
        except Exception as e:
            # Best-effort: one bad entry must not abort the whole run.
            print(f"Error processing {entry.get('word')}: {e}")

        return entry
|
||||
Reference in New Issue
Block a user