# Wictionary-Data-Parser/scripts/InflectionProcessor.py

import re

class UniversalInflectionCompressor:
    """
    A generic inflection compressor that uses a configuration dictionary
    to process, partition, and compress verb forms for any language.
    """
    def __init__(self, config: dict):
        """Remember the configuration dictionary that drives this compressor.

        The dictionary is stored by reference (no copy is made); the other
        methods look up their settings in it each time they run.
        """
        self.config = config

    def _matches_criteria(self, form: dict, criteria: dict) -> bool:
        """Return True when ``form`` satisfies every condition in ``criteria``.

        Recognised criteria keys (all optional; an empty dict matches anything):

        - ``'form_regex'``: pattern that must be found with ``re.search`` in
          ``form['form']``.
        - ``'tags'``: every listed tag must be present in ``form['tags']``.
        - ``'raw_tags'``: every listed tag must be present in
          ``form['raw_tags']``.
        - ``'exclude_tags'``: none of the listed tags may be present in
          ``form['tags']``.

        A missing or ``None`` value for ``'form'``, ``'tags'`` or
        ``'raw_tags'`` on the form is treated as empty rather than raising.
        """
        # Regex match against the surface form.
        if 'form_regex' in criteria:
            form_str = form.get('form')
            if form_str is None:
                form_str = ''
            if not re.search(criteria['form_regex'], form_str):
                return False

        # Tag inclusion / exclusion share the same tag set, so build it once.
        if 'tags' in criteria or 'exclude_tags' in criteria:
            # ``or ()`` makes an explicit ``"tags": None`` behave like no tags.
            form_tags = set(form.get('tags') or ())
            if 'tags' in criteria and not form_tags.issuperset(criteria['tags']):
                return False
            if 'exclude_tags' in criteria and not form_tags.isdisjoint(criteria['exclude_tags']):
                return False

        # Raw tag inclusion.
        if 'raw_tags' in criteria:
            form_raw = set(form.get('raw_tags') or ())
            if not form_raw.issuperset(criteria['raw_tags']):
                return False

        return True

    def _normalize_forms(self, forms: list) -> list:
        """Enrich forms with extra tags according to ``normalization_rules``.

        Each rule in ``config['normalization_rules']`` is a dict with:

        - ``'field'``: key of the form to inspect.
        - ``'match'``: value (or regex pattern) to look for.
        - ``'match_mode'``: ``'regex'`` to use ``re.search`` on the stringified
          value(s); anything else (default ``'exact'``) compares by equality,
          or by membership when the field holds a list.
        - ``'add_tags'``: tags appended to ``form['tags']`` on a match.

        Forms that carry a truthy ``'source'`` are left untouched unless
        ``config['skip_normalization_if_source']`` is set to a falsy value
        (it defaults to True).

        The form dicts are modified in place and the same ``forms`` list is
        returned. Existing tags keep their order; newly added tags are
        appended after them and duplicates are dropped, so the result is
        deterministic.
        """
        rules = self.config.get('normalization_rules', [])
        skip_if_source = self.config.get('skip_normalization_if_source', True)

        for form in forms:
            if skip_if_source and form.get('source'):
                continue

            for rule in rules:
                form_value = form.get(rule.get('field'))
                if form_value is None:
                    continue

                value_to_match = rule.get('match')
                if rule.get('match_mode', 'exact') == 'regex':
                    # Treat a scalar as a one-element list so both shapes
                    # share the same search.
                    items = form_value if isinstance(form_value, list) else [form_value]
                    is_match = any(re.search(value_to_match, str(item)) for item in items)
                elif isinstance(form_value, list):
                    is_match = value_to_match in form_value
                else:
                    is_match = value_to_match == form_value

                if is_match:
                    # dict keys keep insertion order, giving an ordered,
                    # de-duplicated merge; ``or []`` tolerates "tags": None.
                    merged = dict.fromkeys(form.get('tags') or [])
                    merged.update(dict.fromkeys(rule.get('add_tags', [])))
                    form['tags'] = list(merged)
        return forms

    def _extract_properties(self, forms: list, entry_context: dict = None) -> dict:
        """Work out table-wide properties (e.g. aux, group) from the config.

        Every definition in ``config['properties']`` has a ``'name'``, an
        optional ``'default'``, an optional ``'multivalue'`` flag and a list
        of ``'rules'``; a rule contributes its ``'value'`` when any form (or
        the ``entry_context`` pseudo-form, if given) matches its
        ``'criteria'``.

        Single-valued properties take the value of the first rule that
        matches, else the default. Multi-valued properties collect the values
        of all matching rules as a sorted list, else the default wrapped in a
        list (a ``None`` default stays ``None``).

        Returns a dict mapping property name to its resolved value.
        """
        # The pool is only read, so the caller's list is never modified.
        pool = forms + [entry_context] if entry_context else forms

        resolved = {}
        for definition in self.config.get('properties', []):
            prop_name = definition['name']
            fallback = definition.get('default')
            multi = definition.get('multivalue', False)

            hits = set()
            for rule in definition.get('rules', []):
                if any(self._matches_criteria(item, rule.get('criteria', {})) for item in pool):
                    hits.add(rule['value'])
                    if not multi:
                        # Single-valued: the first matching rule wins.
                        break

            if hits:
                resolved[prop_name] = sorted(hits) if multi else next(iter(hits))
            elif multi and fallback is not None:
                resolved[prop_name] = fallback if isinstance(fallback, list) else [fallback]
            else:
                resolved[prop_name] = fallback

        return resolved

    def _clean_verb_string(self, form_string: str) -> str:
        """Strip configured leading prefixes from a form string.

        The string is trimmed of surrounding whitespace, then prefixes from
        ``config['clean_prefixes']`` are removed from the front repeatedly
        until none applies. A prefix ending in an apostrophe (``'`` or ``’``)
        is removed when the string starts with it directly; any other prefix
        is removed only when followed by a space (the space goes too).
        """
        prefixes = self.config.get('clean_prefixes', [])
        text = form_string.strip()

        while True:
            for prefix in prefixes:
                # Apostrophe prefixes attach directly to the word; all others
                # must be a separate, space-delimited token.
                token = prefix if prefix.endswith(("'", "’")) else prefix + " "
                if text.startswith(token):
                    text = text[len(token):]
                    break  # restart the scan from the first prefix
            else:
                # A full pass removed nothing: the string is clean.
                return text

    def compress(self, forms_list: list, word: str = None, entry: dict = None) -> dict:
        """Compress a flat list of forms into the slot layout from the config.

        Steps:
          1. Normalize tags (the dicts in ``forms_list`` are modified in place).
          2. Extract table-wide properties; when ``entry`` is given, its
             'word', 'tags' and 'raw_tags' take part as an extra pseudo-form.
          3. Fill every slot declared in ``config['schema']``:
             - ``'single'`` slots take the first form matching ``'criteria'``;
               a later form with a truthy 'source' may replace an empty value.
             - ``'list'`` slots (``'size'``, default 6) place each form that
               matches ``'base_criteria'`` at the index of the first matching
               entry of ``'indices'``. A filled position is overwritten only
               by a form whose 'source' contains one of the markers in
               ``config['preferred_sources']`` (default
               ``["Flexion", "Conjugaison"]``).
          4. Fall back to ``word`` for an empty 'infinitive'.
          5. If ``config['validate_completeness']`` is truthy, check that no
             non-optional value is missing.

        Forms whose 'form' is missing or not a string cannot fill a slot and
        are skipped during step 3; they still take part in steps 1 and 2.

        Args:
            forms_list: list of form dicts.
            word: lemma, used as the 'infinitive' fallback and in error text.
            entry: optional entry dict supplying context for the properties.

        Returns:
            A dict of properties and filled slots, or None when ``forms_list``
            is empty or None.

        Raises:
            ValueError: when validation is enabled and a non-optional value
                (or a position inside a list slot) is still None.
        """
        if not forms_list:
            return None

        # 1. Normalize tags
        normalized_forms = self._normalize_forms(forms_list)

        # 2. Extract properties
        entry_context = None
        if entry:
            entry_context = {
                'form': entry.get('word', ''),
                'tags': entry.get('tags', []),
                'raw_tags': entry.get('raw_tags', [])
            }
        table_properties = self._extract_properties(normalized_forms, entry_context)

        # 3. Initialize output
        result = table_properties.copy()

        # Only forms with an actual string can fill a slot; skipping the rest
        # keeps one malformed form from aborting the whole entry.
        slot_forms = [f for f in normalized_forms if isinstance(f.get('form'), str)]

        # Source markers whose forms take precedence inside list slots.
        preferred_sources = self.config.get('preferred_sources', ["Flexion", "Conjugaison"])

        # 4. Fill slots
        schema = self.config.get('schema', {})
        for slot_name, slot_def in schema.items():
            slot_type = slot_def.get('type', 'single')

            if slot_type == 'single':
                result[slot_name] = None
                criteria = slot_def.get('criteria', {})
                for form in slot_forms:
                    if not self._matches_criteria(form, criteria):
                        continue
                    if result[slot_name] is None or (form.get('source') and not result[slot_name]):
                        result[slot_name] = self._clean_verb_string(form['form'])

            elif slot_type == 'list':
                size = slot_def.get('size', 6)
                result[slot_name] = [None] * size
                base_criteria = slot_def.get('base_criteria', {})

                # Resolve each index rule's criteria once, not once per form.
                # Full criteria are supported; a top-level 'tags' key on the
                # rule is a shortcut that overrides criteria['tags'].
                index_rules = []
                for index_rule in slot_def.get('indices', []):
                    rule_criteria = index_rule.get('criteria', {})
                    if 'tags' in index_rule:
                        rule_criteria = {**rule_criteria, 'tags': index_rule['tags']}
                    index_rules.append((rule_criteria, index_rule))

                for form in slot_forms:
                    if not self._matches_criteria(form, base_criteria):
                        continue

                    # The first matching index rule decides the position.
                    idx = next(
                        (rule['index'] for rule_criteria, rule in index_rules
                         if self._matches_criteria(form, rule_criteria)),
                        -1,
                    )
                    if not 0 <= idx < size:
                        continue

                    source = form.get('source')
                    if result[slot_name][idx] is None:
                        result[slot_name][idx] = self._clean_verb_string(form['form'])
                    elif source and any(marker in source for marker in preferred_sources):
                        result[slot_name][idx] = self._clean_verb_string(form['form'])

        # 5. Fallbacks
        if not result.get('infinitive') and word:
            result['infinitive'] = word

        # 6. Validation
        if self.config.get('validate_completeness', False):
            for key, val in result.items():
                slot_config = schema.get(key, {})
                if slot_config.get('optional', False):
                    continue
                if val is None:
                    raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
                if isinstance(val, list):
                    for i, v in enumerate(val):
                        if v is None:
                            raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")

        return result

class InflectionProcessor:
    """Route dictionary entries to the compressor registered for them.

    Compressors are keyed by ``"<lang_code>_<pos>"``.
    """

    def __init__(self, configs):
        """Build one UniversalInflectionCompressor per key of ``configs``."""
        compressors = {}
        for key, config in configs.items():
            compressors[key] = UniversalInflectionCompressor(config)
        self.compressors = compressors

    def process(self, entry: dict) -> dict:
        """Replace ``entry['forms']`` with its compressed table when possible.

        The entry is returned unchanged when no compressor is registered for
        its language/part-of-speech key, when compression yields an empty
        result, or when compression raises (the error is printed, not
        propagated). The same ``entry`` object is always returned.
        """
        lookup_key = f"{entry.get('lang_code')}_{entry.get('pos')}"
        if lookup_key not in self.compressors:
            return entry

        try:
            compressed = self.compressors[lookup_key].compress(
                entry.get('forms'), entry.get('word'), entry=entry
            )
            if compressed:
                entry['forms'] = compressed
        except Exception as e:
            # Best effort: one bad entry must not stop the batch.
            print(f"Error processing {entry.get('word')}: {e}")
        return entry