import re
from typing import Optional


class UniversalInflectionCompressor:
    """Config-driven compressor for inflection tables.

    A configuration dictionary describes how to normalize, classify and
    compress a list of form dicts into a compact table.  Keys read from the
    configuration by this class:

    * ``normalization_rules``: rules that add tags to forms.
    * ``skip_normalization_if_source``: skip normalization for forms that
      carry a truthy ``source`` (default ``True``).
    * ``properties``: definitions of table-wide properties.
    * ``clean_prefixes``: leading tokens stripped from form strings.
    * ``schema``: slot definitions (``type`` is ``'single'`` or ``'list'``).
    * ``override_source_markers``: substrings of a form's ``source`` that let
      it replace an already filled list-slot cell
      (default ``("Flexion", "Conjugaison")``).
    * ``validate_completeness``: raise on missing values (default ``False``).
    """

    # Source markers that allow a later form to overwrite a filled list cell
    # when the configuration does not provide 'override_source_markers'.
    DEFAULT_OVERRIDE_SOURCE_MARKERS = ("Flexion", "Conjugaison")

    def __init__(self, config: dict):
        self.config = config

    def _matches_criteria(self, form: dict, criteria: dict) -> bool:
        """Return True when ``form`` satisfies every condition in ``criteria``.

        Supported criteria keys: ``form_regex`` (``re.search`` against the
        form string), ``tags`` / ``raw_tags`` (all listed values must be
        present) and ``exclude_tags`` (none of the listed tags may be
        present).  A missing or ``None`` form string, ``tags`` or
        ``raw_tags`` value is treated as empty.
        """
        if 'form_regex' in criteria:
            text = form.get('form', '')
            if text is None:
                text = ''
            if not re.search(criteria['form_regex'], text):
                return False

        # `or ()` guards against an explicit None stored under the key.
        form_tags = set(form.get('tags') or ())

        if 'tags' in criteria and not set(criteria['tags']) <= form_tags:
            return False

        if 'raw_tags' in criteria:
            form_raw_tags = set(form.get('raw_tags') or ())
            if not set(criteria['raw_tags']) <= form_raw_tags:
                return False

        if 'exclude_tags' in criteria:
            if not form_tags.isdisjoint(criteria['exclude_tags']):
                return False

        return True

    @staticmethod
    def _rule_value_matches(form_value, expected, match_mode: str) -> bool:
        """Return True when ``form_value`` matches ``expected``.

        In ``'regex'`` mode ``expected`` is a pattern searched in the
        stringified value (or in any item when the value is a list); in any
        other mode it is compared by membership (list) or equality.
        """
        if match_mode == 'regex':
            items = form_value if isinstance(form_value, list) else [form_value]
            return any(re.search(expected, str(item)) for item in items)
        if isinstance(form_value, list):
            return expected in form_value
        return expected == form_value

    def _normalize_forms(self, forms: list) -> list:
        """Add tags to forms according to the 'normalization_rules' config.

        The form dicts are updated in place and the same list is returned.
        Existing tags keep their order, new tags are appended in rule order
        and duplicates are dropped, so the result is deterministic.
        """
        rules = self.config.get('normalization_rules', [])
        skip_if_source = self.config.get('skip_normalization_if_source', True)

        for form in forms:
            if skip_if_source and form.get('source'):
                continue
            for rule in rules:
                form_value = form.get(rule.get('field'))
                if form_value is None:
                    continue
                matched = self._rule_value_matches(
                    form_value,
                    rule.get('match'),
                    rule.get('match_mode', 'exact'),
                )
                if matched:
                    merged = [*(form.get('tags') or ()), *rule.get('add_tags', [])]
                    # dict.fromkeys de-duplicates while preserving order.
                    form['tags'] = list(dict.fromkeys(merged))
        return forms

    def _extract_properties(self, forms: list, entry_context: dict = None) -> dict:
        """Compute the table-wide properties declared under 'properties'.

        Every rule of a property is tested against all forms plus the
        optional ``entry_context``.  A single-valued property takes the value
        of the first rule that matches any candidate; a multi-valued one
        collects the values of all matching rules, sorted.  When nothing
        matches, the configured default is used (wrapped in a list for
        multi-valued properties unless it is ``None``).
        """
        candidates = list(forms)
        if entry_context:
            candidates.append(entry_context)

        properties = {}
        for prop_def in self.config.get('properties', []):
            name = prop_def['name']
            default_val = prop_def.get('default')
            is_multivalue = prop_def.get('multivalue', False)

            found_values = set()
            for rule in prop_def.get('rules', []):
                criteria = rule.get('criteria', {})
                if any(self._matches_criteria(c, criteria) for c in candidates):
                    found_values.add(rule['value'])
                    if not is_multivalue:
                        break

            if found_values:
                if is_multivalue:
                    properties[name] = sorted(found_values)
                else:
                    properties[name] = next(iter(found_values))
            elif is_multivalue and default_val is not None:
                # Copy list defaults so results never alias the configuration.
                if isinstance(default_val, list):
                    properties[name] = list(default_val)
                else:
                    properties[name] = [default_val]
            else:
                properties[name] = default_val
        return properties

    def _clean_verb_string(self, form_string: str) -> str:
        """Strip configured leading prefixes from a form string.

        Prefixes come from the 'clean_prefixes' config.  A prefix ending in
        an apostrophe (``'`` or ``’``) is removed when the string starts with
        it; any other prefix is removed only when followed by a space.
        Stripping repeats until no prefix matches.
        """
        prefixes = self.config.get('clean_prefixes', [])
        text = form_string.strip()

        stripped = True
        while stripped:
            stripped = False
            for prefix in prefixes:
                if prefix.endswith(("'", "’")):
                    token = prefix
                else:
                    token = prefix + " "
                if text.startswith(token):
                    text = text[len(token):]
                    stripped = True
                    break
        return text

    def _fill_single_slot(self, forms: list, slot_def: dict):
        """Return the cleaned form for a 'single' slot, or None if unmatched."""
        criteria = slot_def.get('criteria', {})
        value = None
        for form in forms:
            if not self._matches_criteria(form, criteria):
                continue
            # The first match fills the slot; afterwards only a form with a
            # truthy 'source' may replace a still-falsy (empty) value.
            if value is None or (form.get('source') and not value):
                value = self._clean_verb_string(form['form'])
        return value

    def _fill_list_slot(self, forms: list, slot_def: dict) -> list:
        """Return the fixed-size list of cleaned forms for a 'list' slot.

        Forms matching 'base_criteria' are placed at the index of the first
        entry in 'indices' whose criteria they satisfy.  A filled cell is only
        overwritten by a form whose 'source' contains one of the override
        markers.
        """
        size = slot_def.get('size', 6)
        cells = [None] * size
        base_criteria = slot_def.get('base_criteria', {})
        markers = self.config.get(
            'override_source_markers', self.DEFAULT_OVERRIDE_SOURCE_MARKERS
        )

        # Build each index rule's effective criteria once per slot.  A
        # top-level 'tags' key is a shortcut that overrides criteria['tags'].
        index_rules = []
        for index_rule in slot_def.get('indices', []):
            rule_criteria = index_rule.get('criteria', {})
            if 'tags' in index_rule:
                rule_criteria = dict(rule_criteria)
                rule_criteria['tags'] = index_rule['tags']
            index_rules.append((rule_criteria, index_rule))

        for form in forms:
            if not self._matches_criteria(form, base_criteria):
                continue

            idx = -1
            for rule_criteria, index_rule in index_rules:
                if self._matches_criteria(form, rule_criteria):
                    idx = index_rule['index']
                    break
            if not 0 <= idx < size:
                continue

            if cells[idx] is None:
                cells[idx] = self._clean_verb_string(form['form'])
            else:
                source = form.get('source')
                if source and any(marker in source for marker in markers):
                    cells[idx] = self._clean_verb_string(form['form'])
        return cells

    def compress(self, forms_list: list, word: str = None, entry: dict = None) -> Optional[dict]:
        """Compress a list of form dicts into a table described by the config.

        Args:
            forms_list: form dicts; their 'tags' may be updated in place by
                normalization.
            word: fallback for the 'infinitive' key when it ends up empty.
            entry: optional entry dict; its 'word', 'tags' and 'raw_tags' are
                offered as an extra candidate when extracting properties.

        Returns:
            A dict holding the extracted properties and the filled schema
            slots, or ``None`` when ``forms_list`` is empty.

        Raises:
            ValueError: if 'validate_completeness' is enabled and a
                non-optional value (or a list cell) is ``None``.
        """
        if not forms_list:
            return None

        # 1. Normalize tags
        normalized_forms = self._normalize_forms(forms_list)

        # 2. Extract properties
        entry_context = None
        if entry:
            entry_context = {
                'form': entry.get('word', ''),
                'tags': entry.get('tags', []),
                'raw_tags': entry.get('raw_tags', []),
            }
        result = dict(self._extract_properties(normalized_forms, entry_context))

        # 3. Fill slots
        schema = self.config.get('schema', {})
        for slot_name, slot_def in schema.items():
            slot_type = slot_def.get('type', 'single')
            if slot_type == 'single':
                result[slot_name] = self._fill_single_slot(normalized_forms, slot_def)
            elif slot_type == 'list':
                result[slot_name] = self._fill_list_slot(normalized_forms, slot_def)

        # 4. Fallbacks
        if not result.get('infinitive') and word:
            result['infinitive'] = word

        # 5. Validation (covers properties as well as slots; keys whose
        # schema entry is marked 'optional' are skipped)
        if self.config.get('validate_completeness', False):
            for key, val in result.items():
                if schema.get(key, {}).get('optional', False):
                    continue
                if val is None:
                    raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
                if isinstance(val, list):
                    for i, v in enumerate(val):
                        if v is None:
                            raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")

        return result


class InflectionProcessor:
    """Dispatch entries to a compressor selected by '<lang_code>_<pos>'."""

    def __init__(self, configs):
        # One compressor per configuration key.
        self.compressors = {k: UniversalInflectionCompressor(v) for k, v in configs.items()}

    def process(self, entry: dict) -> dict:
        """Replace ``entry['forms']`` with its compressed table when possible.

        The entry is returned in every case.  It is left without a compressed
        table when no compressor is registered for its key, when compression
        yields nothing, or when compression raises (the error is printed).
        """
        key = f"{entry.get('lang_code')}_{entry.get('pos')}"
        if key in self.compressors:
            try:
                compressed = self.compressors[key].compress(entry.get('forms'), entry.get('word'), entry=entry)
                if compressed:
                    entry['forms'] = compressed
            except Exception as e:
                # Best-effort: one bad entry must not stop the caller's run.
                print(f"Error processing {entry.get('word')}: {e}")
        return entry