Migrate to gitea

This commit is contained in:
jonasgaudian
2026-02-13 00:10:40 +01:00
commit 6d06a9e14e
38 changed files with 31427 additions and 0 deletions

401
scripts/lang_config.py Normal file
View File

@@ -0,0 +1,401 @@
# Settings for German verbs: pronoun prefixes, pronoun-to-tag normalization
# rules, derived verb properties, and the slot layout of the output schema.
# NOTE(review): the meaning of each key is decided by the code that consumes
# this dict, which is not visible in this module.


def _german_pronoun_rule(pronoun, person, number):
    """Build one normalization rule on the `pronouns` field.

    The rule adds the given person and number tags, followed by
    "indicative" and "active".
    """
    return {
        "field": "pronouns",
        "match": pronoun,
        "add_tags": [person, number, "indicative", "active"],
    }


def _german_finite_table(mood, tense):
    """Build a six-slot, active-voice list entry for the given mood/tense tags.

    Slots 0-2 are first/second/third person singular and slots 3-5 are
    first/second/third person plural. Every call returns freshly built
    containers, so no two schema entries share mutable state.
    """
    slots = []
    for number in ("singular", "plural"):
        for person in ("first-person", "second-person", "third-person"):
            slots.append({"index": len(slots), "tags": [person, number]})
    return {
        "type": "list",
        "size": 6,
        "base_criteria": {
            "tags": [mood, tense, "active"],
            "exclude_tags": ["passive"],
        },
        "indices": slots,
    }


GERMAN_VERB_CONFIG = {
    "clean_prefixes": ["ich", "du", "er/sie/es", "wir", "ihr", "sie"],
    # NOTE(review): "sie" is mapped to third-person singular only. German "sie"
    # is also the third-person plural pronoun and no rule here adds plural tags
    # for it -- confirm this is intended.
    "normalization_rules": [
        _german_pronoun_rule("ich", "first-person", "singular"),
        _german_pronoun_rule("du", "second-person", "singular"),
        _german_pronoun_rule("er", "third-person", "singular"),
        _german_pronoun_rule("sie", "third-person", "singular"),
        _german_pronoun_rule("es", "third-person", "singular"),
        _german_pronoun_rule("wir", "first-person", "plural"),
        _german_pronoun_rule("ihr", "second-person", "plural"),
    ],
    "properties": [
        {
            "name": "auxiliary",
            # Multi-valued with a list default; presumably because a verb can
            # take both auxiliaries -- confirm against the consumer.
            "multivalue": True,
            "default": ["haben"],
            "rules": [
                # Explicit raw tags naming the auxiliary.
                {"value": "sein", "criteria": {"raw_tags": ["Hilfsverb sein"]}},
                {"value": "haben", "criteria": {"raw_tags": ["Hilfsverb haben"]}},
                # A form that is itself the auxiliary, tagged auxiliary + perfect.
                {
                    "value": "sein",
                    "criteria": {"form_regex": "^sein$", "tags": ["auxiliary", "perfect"]},
                },
                {
                    "value": "haben",
                    "criteria": {"form_regex": "^haben$", "tags": ["auxiliary", "perfect"]},
                },
            ],
        },
        {
            "name": "separability",
            "default": "inseparable",
            "rules": [
                {"value": "separable", "criteria": {"tags": ["separable"]}},
                {"value": "inseparable", "criteria": {"tags": ["inseparable"]}},
                # Heuristic: a participle-2 form with "ge" somewhere after a
                # prefix that is not itself "ge".
                {
                    "value": "separable",
                    "criteria": {"tags": ["participle-2"], "form_regex": "^(?!ge).+ge.+$"},
                },
            ],
        },
    ],
    "schema": {
        "infinitive": {
            "type": "single",
            "criteria": {
                "tags": ["infinitive", "present"],
                "exclude_tags": ["extended", "passive", "reflexive", "zu"],
            },
        },
        "participle_perfect": {
            "type": "single",
            "criteria": {
                "tags": ["participle-2", "perfect"],
                "exclude_tags": ["active", "passive", "auxiliary"],
            },
        },
        "imperative": {
            "type": "list",
            "size": 2,
            "base_criteria": {"tags": ["imperative", "present", "active"]},
            "indices": [
                {"index": 0, "tags": ["singular", "second-person"]},
                {"index": 1, "tags": ["plural", "second-person"]},
            ],
        },
        "present": _german_finite_table("indicative", "present"),
        "past": _german_finite_table("indicative", "past"),
        "subjunctive_ii": _german_finite_table("subjunctive-ii", "past"),
    },
}
# Settings for modern French verbs: pronoun prefixes, regex-based tag
# normalization on the form text, derived properties (auxiliary, conjugation
# group), and the slot layout of the output schema.
# NOTE(review): the meaning of each key is decided by the code that consumes
# this dict, which is not visible in this module.
FRENCH_VERB_CONFIG = {
    "skip_normalization_if_source": False,
    # Disabled to prevent crashes on idioms, rare words, and defective verbs.
    "validate_completeness": False,
    "clean_prefixes": [
        "qu'", "qu", "que", "j'", "j", "je", "tu",
        "il/elle/on", "il", "elle", "on", "nous", "vous", "ils/elles", "ils", "elles",
    ],
    "normalization_rules": [
        # Subject pronouns (or their bracketed phonetic spelling) found in the form text.
        {"field": "form", "match": r"\bje\b", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
        {"field": "form", "match": r"\bj[']", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
        {"field": "form", "match": r"\btu\b", "match_mode": "regex", "add_tags": ["second-person", "singular"]},
        {"field": "form", "match": r"\b(il|elle|on|il/elle/on)\b", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
        {"field": "form", "match": r"\[il/ɛl/ɔ̃\]", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
        {"field": "form", "match": r"\bnous\b", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
        {"field": "form", "match": r"\bvous\b", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
        {"field": "form", "match": r"\b(ils|elles|ils/elles)\b", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
        {"field": "form", "match": r"\[il/ɛl\]", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
        # Suffix heuristics for forms that carry no pronoun.
        {"field": "form", "match": r"ons$", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
        {"field": "form", "match": r"ez$", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
    ],
    "properties": [
        {
            "name": "auxiliary",
            "multivalue": True,
            "default": ["avoir"],
            "rules": [
                {"value": "être", "criteria": {"raw_tags": ["auxiliary être"]}},
                {"value": "avoir", "criteria": {"raw_tags": ["auxiliary avoir"]}},
                {"value": "être", "criteria": {"tags": ["auxiliary-être"]}},
                {"value": "avoir", "criteria": {"tags": ["auxiliary-avoir"]}},
            ],
        },
        {
            "name": "group",
            "default": "unknown",
            "rules": [
                # Explicit group labels from the raw tags.
                {"value": "1st-group", "criteria": {"raw_tags": ["1ᵉʳ groupe"]}},
                {"value": "2nd-group", "criteria": {"raw_tags": ["2ᵉ groupe"]}},
                {"value": "3rd-group", "criteria": {"raw_tags": ["3ᵉ groupe"]}},
                # Ending-based fallbacks. A bare `ir$` would also match every
                # -oir infinitive (voir, avoir, pouvoir) and label it 2nd-group;
                # the lookbehind keeps this pattern disjoint from `(re|oir)$`,
                # so the result no longer depends on rule evaluation order.
                {"value": "1st-group", "criteria": {"form_regex": "er$"}},
                {"value": "2nd-group", "criteria": {"form_regex": "(?<!o)ir$"}},
                {"value": "3rd-group", "criteria": {"form_regex": "(re|oir)$"}},
            ],
        },
    ],
    "schema": {
        "infinitive": {
            "type": "single",
            "criteria": {"tags": ["infinitive", "present"]},
        },
        "participle_present": {
            "type": "single",
            "optional": True,
            "criteria": {"tags": ["participle", "present"]},
        },
        "participle_past": {
            "type": "single",
            "optional": True,
            "criteria": {"tags": ["participle", "past"], "exclude_tags": ["multiword-construction"]},
        },
        # Every list below is optional to handle defective verbs (like
        # 'traire') and sparse data.
        "indicative_present": {
            "type": "list", "size": 6, "optional": True,
            "base_criteria": {"tags": ["indicative", "present"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "indicative_imperfect": {
            "type": "list", "size": 6, "optional": True,
            "base_criteria": {"tags": ["indicative", "imperfect"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "indicative_future": {
            "type": "list", "size": 6, "optional": True,
            "base_criteria": {"tags": ["indicative", "future"], "exclude_tags": ["perfect"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "indicative_simple_past": {
            # Traire/clore do not have this tense.
            "type": "list", "size": 6, "optional": True,
            "base_criteria": {"tags": ["indicative", "past"], "exclude_tags": ["multiword-construction", "imperfect", "perfect", "anterior"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "subjunctive_present": {
            "type": "list", "size": 6, "optional": True,
            "base_criteria": {"tags": ["subjunctive", "present"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "conditional_present": {
            "type": "list", "size": 6, "optional": True,
            "base_criteria": {"tags": ["conditional", "present"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "imperative": {
            "type": "list", "size": 3, "optional": True,
            "base_criteria": {"tags": ["imperative", "present"]},
            "indices": [
                {"index": 0, "tags": ["singular"]},
                {"index": 1, "tags": ["plural", "first-person"]},
                {"index": 2, "tags": ["plural", "second-person"]},
                # Entries keyed on the form's ending rather than on tags.
                # NOTE(review): `[es]$` also matches forms ending in -ons;
                # whether that matters depends on how the consumer resolves
                # several matching indices -- confirm.
                {"index": 1, "criteria": {"form_regex": r"ons$"}},
                {"index": 2, "criteria": {"form_regex": r"ez$"}},
                {"index": 0, "criteria": {"form_regex": r"[es]$"}},
            ],
        },
    },
}
# Earlier variant of the French verb settings, with completeness validation
# switched on and the six-person tables not marked optional.
# NOTE(review): the meaning of each key is decided by the code that consumes
# this dict, which is not visible in this module.
OLD_FRENCH_VERB_CONFIG = {
    "skip_normalization_if_source": False,
    "validate_completeness": True,
    # --- 1. Normalization ---
    "clean_prefixes": [
        "qu'", "qu", "que", "j'", "j", "je", "tu",
        "il/elle/on", "il", "elle", "on", "nous", "vous", "ils/elles", "ils", "elles",
    ],
    "normalization_rules": [
        # Subject pronouns (or their bracketed phonetic spelling) found in the form text.
        {"field": "form", "match": r"\bje\b", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
        {"field": "form", "match": r"\bj[']", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
        {"field": "form", "match": r"\btu\b", "match_mode": "regex", "add_tags": ["second-person", "singular"]},
        {"field": "form", "match": r"\b(il|elle|on|il/elle/on)\b", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
        {"field": "form", "match": r"\[il/ɛl/ɔ̃\]", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
        {"field": "form", "match": r"\bnous\b", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
        {"field": "form", "match": r"\bvous\b", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
        {"field": "form", "match": r"\b(ils|elles|ils/elles)\b", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
        {"field": "form", "match": r"\[il/ɛl\]", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
    ],
    # --- 2. Properties ---
    "properties": [
        {
            "name": "auxiliary",
            "multivalue": True,
            "default": ["avoir"],
            "rules": [
                {"value": "être", "criteria": {"raw_tags": ["auxiliary être"]}},
                {"value": "avoir", "criteria": {"raw_tags": ["auxiliary avoir"]}},
                {"value": "être", "criteria": {"tags": ["auxiliary-être"]}},
                {"value": "avoir", "criteria": {"tags": ["auxiliary-avoir"]}},
            ],
        },
        {
            "name": "group",
            "default": "unknown",
            "rules": [
                # Explicit group labels from the raw tags.
                {"value": "1st-group", "criteria": {"raw_tags": ["1ᵉʳ groupe"]}},
                {"value": "2nd-group", "criteria": {"raw_tags": ["2ᵉ groupe"]}},
                {"value": "3rd-group", "criteria": {"raw_tags": ["3ᵉ groupe"]}},
                # Ending-based fallbacks. A bare `ir$` would also match every
                # -oir infinitive (voir, avoir, pouvoir) and label it 2nd-group;
                # the lookbehind keeps this pattern disjoint from `(re|oir)$`,
                # so the result no longer depends on rule evaluation order.
                {"value": "1st-group", "criteria": {"form_regex": "er$"}},
                {"value": "2nd-group", "criteria": {"form_regex": "(?<!o)ir$"}},
                {"value": "3rd-group", "criteria": {"form_regex": "(re|oir)$"}},
            ],
        },
    ],
    # --- 3. Schema ---
    "schema": {
        "infinitive": {
            "type": "single",
            "criteria": {"tags": ["infinitive", "present"]},
        },
        "participle_present": {
            "type": "single",
            "optional": True,  # allows a missing participle
            "criteria": {"tags": ["participle", "present"]},
        },
        "participle_past": {
            "type": "single",
            "optional": True,  # often missing in defective verbs
            "criteria": {"tags": ["participle", "past"], "exclude_tags": ["multiword-construction"]},
        },
        "indicative_present": {
            "type": "list", "size": 6,
            "base_criteria": {"tags": ["indicative", "present"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "indicative_imperfect": {
            "type": "list", "size": 6,
            "base_criteria": {"tags": ["indicative", "imperfect"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "indicative_future": {
            "type": "list", "size": 6,
            "base_criteria": {"tags": ["indicative", "future"], "exclude_tags": ["perfect"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "indicative_simple_past": {
            "type": "list", "size": 6,
            "base_criteria": {"tags": ["indicative", "past"], "exclude_tags": ["multiword-construction", "imperfect", "perfect", "anterior"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "subjunctive_present": {
            "type": "list", "size": 6,
            "base_criteria": {"tags": ["subjunctive", "present"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "conditional_present": {
            "type": "list", "size": 6,
            "base_criteria": {"tags": ["conditional", "present"]},
            "indices": [
                {"index": 0, "tags": ["first-person", "singular"]},
                {"index": 1, "tags": ["second-person", "singular"]},
                {"index": 2, "tags": ["third-person", "singular"]},
                {"index": 3, "tags": ["first-person", "plural"]},
                {"index": 4, "tags": ["second-person", "plural"]},
                {"index": 5, "tags": ["third-person", "plural"]},
            ],
        },
        "imperative": {
            "type": "list", "size": 3,
            "optional": True,  # often missing for phrases/defective verbs
            "base_criteria": {"tags": ["imperative", "present"]},
            "indices": [
                {"index": 0, "tags": ["singular"]},
                {"index": 1, "tags": ["plural", "first-person"]},
                {"index": 2, "tags": ["plural", "second-person"]},
            ],
        },
    },
}