Migrate to gitea

This commit is contained in:
jonasgaudian
2026-02-13 00:10:40 +01:00
commit 6d06a9e14e
38 changed files with 31427 additions and 0 deletions

View File

@@ -0,0 +1,419 @@
#!/usr/bin/env python3
"""
Universal Wiktionary Format Transformer
========================================
Transforms any Wiktionary JSON format to a standardized universal schema.
Usage:
python transform_wiktionary.py input.jsonl output.jsonl
python transform_wiktionary.py input.jsonl output.jsonl --validate
"""
import json
import sys
import argparse
from typing import Dict, List, Any, Optional
from pathlib import Path
class WiktionaryTransformer:
    """Transforms Wiktionary entries to universal format."""

    def __init__(self, validate: bool = False):
        """Create a transformer.

        Args:
            validate: When True, every transformed entry is checked for the
                required universal-schema fields before being written out.
        """
        self.validate = validate
        # Running counters plus collected warning messages for the final summary.
        self.stats = {"total": 0, "successful": 0, "errors": 0, "warnings": []}
def transform_entry(self, raw_entry: Dict[str, Any]) -> Dict[str, Any]:
    """Transform a single Wiktionary entry to universal format.

    Args:
        raw_entry: Raw entry from any Wiktionary edition.

    Returns:
        Transformed entry in universal format.

    Raises:
        ValueError: If any required core field is missing from the input.
    """
    # Required core fields -- absence of any of them is a hard error.
    try:
        universal = {key: raw_entry[key] for key in ("word", "lang_code", "pos", "senses")}
    except KeyError as e:
        raise ValueError(f"Missing required field: {e}")

    # Derived sections: attached only when extraction produced something.
    phon = self._extract_phonetics(raw_entry)
    if phon:
        universal["phonetics"] = phon
    hyph = self._extract_hyphenation(raw_entry)
    if hyph:
        universal["hyphenation"] = hyph

    # Forms are copied verbatim whenever the key is present (even if empty).
    if "forms" in raw_entry:
        universal["forms"] = raw_entry["forms"]

    gram = self._extract_grammatical_features(raw_entry)
    if gram:
        universal["grammatical_features"] = gram
    etym = self._extract_etymology(raw_entry)
    if etym:
        universal["etymology"] = etym
    rel = self._extract_relations(raw_entry)
    if rel:
        universal["relations"] = rel

    # Straight pass-through sections, keyed on presence rather than truthiness.
    for passthrough in ("translations", "descendants"):
        if passthrough in raw_entry:
            universal[passthrough] = raw_entry[passthrough]

    # Metadata is always attached, even when the extracted dict is empty.
    universal["metadata"] = self._extract_metadata(raw_entry)
    return universal
def _extract_phonetics(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract and normalize phonetic information."""
phonetics = {}
# Process sounds array
if "sounds" in entry and entry["sounds"]:
ipa_variations = []
audio_list = []
homophones = []
for sound in entry["sounds"]:
# IPA transcription with country information
if "ipa" in sound:
ipa_entry = {"ipa": sound["ipa"]}
# Preserve country information from raw_tags
if "raw_tags" in sound:
ipa_entry["raw_tags"] = sound["raw_tags"]
# Clean IPA string by removing special characters at beginning/end
cleaned_ipa = self._clean_ipa_string(sound["ipa"])
ipa_entry["ipa_cleaned"] = cleaned_ipa
ipa_variations.append(ipa_entry)
# Audio files (keep for now, will be removed in filter step)
if "audio" in sound:
audio_obj = {}
# Try multiple URL formats
for url_key in ["ogg_url", "mp3_url", "url"]:
if url_key in sound:
audio_obj["url"] = sound[url_key]
break
audio_obj["text"] = sound.get("audio", "")
if audio_obj:
audio_list.append(audio_obj)
# Homophones
if "homophone" in sound:
homophones.append(sound["homophone"])
if ipa_variations:
phonetics["ipa_variations"] = ipa_variations
if audio_list:
phonetics["audio"] = audio_list
if homophones:
phonetics["homophones"] = homophones
# Handle extra_sounds (some editions)
if "extra_sounds" in entry:
if "pronunciación" in entry["extra_sounds"]:
phonetics["notes"] = entry["extra_sounds"]["pronunciación"]
return phonetics if phonetics else None
def _clean_ipa_string(self, ipa_string: str) -> str:
"""Clean IPA string by removing special characters at beginning/end."""
if not ipa_string:
return ipa_string
# Remove leading/trailing special characters: [, ], \, :
cleaned = ipa_string.strip("[]\\:")
return cleaned
def _extract_hyphenation(self, entry: Dict[str, Any]) -> Optional[List[str]]:
"""Extract and normalize hyphenation."""
# Format 1: hyphenations array with parts
if "hyphenations" in entry and entry["hyphenations"]:
parts = []
for h in entry["hyphenations"]:
if isinstance(h, dict) and "parts" in h:
parts.extend(h["parts"])
elif isinstance(h, str):
parts.append(h)
if parts:
return parts
# Format 2: hyphenation string with separator
if "hyphenation" in entry:
# Split on common separators
hyph = entry["hyphenation"]
for sep in ["", "-", "·", ""]:
if sep in hyph:
return hyph.split(sep)
return [hyph]
return None
def _extract_grammatical_features(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract grammatical features and tags."""
if "tags" not in entry:
return None
grammatical = {"tags": entry["tags"]}
# Extract gender from tags
gender_map = {
"masculine": "masculine",
"feminine": "feminine",
"neuter": "neuter",
"common": "common",
"m": "masculine",
"f": "feminine",
"n": "neuter",
"c": "common"
}
for tag in entry["tags"]:
tag_lower = tag.lower()
if tag_lower in gender_map:
grammatical["gender"] = gender_map[tag_lower]
break
# Extract number
number_map = {
"singular": "singular",
"plural": "plural",
"dual": "dual",
"sg": "singular",
"pl": "plural"
}
for tag in entry["tags"]:
tag_lower = tag.lower()
if tag_lower in number_map:
grammatical["number"] = number_map[tag_lower]
break
return grammatical
def _extract_etymology(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract etymology information."""
etymology = {}
if "etymology_text" in entry:
etymology["text"] = entry["etymology_text"]
if "etymology_texts" in entry:
etymology["texts"] = entry["etymology_texts"]
if "etymology_number" in entry:
etymology["number"] = entry["etymology_number"]
return etymology if etymology else None
def _extract_relations(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract semantic and lexical relations."""
relations = {}
# Define all possible relation types
relation_fields = [
"synonyms", "antonyms", "hypernyms", "hyponyms",
"meronyms", "holonyms", "related", "derived",
"coordinate_terms", "troponyms", "compounds"
]
for field in relation_fields:
if field in entry and entry[field]:
relations[field] = entry[field]
return relations if relations else None
def _extract_metadata(self, entry: Dict[str, Any]) -> Dict[str, Any]:
"""Extract metadata and source information."""
metadata = {}
# Source language
if "lang" in entry:
metadata["source_lang"] = entry["lang"]
# Infer source language code if possible
if "lang_code" in entry:
metadata["source_lang_code"] = entry["lang_code"]
# POS title (localized)
if "pos_title" in entry:
metadata["pos_title"] = entry["pos_title"]
elif "pos_text" in entry:
metadata["pos_title"] = entry["pos_text"]
# Categories
if "categories" in entry:
metadata["categories"] = entry["categories"]
# Templates
templates = []
if "head_templates" in entry:
templates.extend(entry["head_templates"])
if "inflection_templates" in entry:
templates.extend(entry["inflection_templates"])
if templates:
metadata["templates"] = templates
# Additional metadata
if "attestations" in entry:
metadata["attestations"] = entry["attestations"]
return metadata
def transform_file(self, input_path: str, output_path: str) -> None:
"""
Transform an entire JSONL file.
Args:
input_path: Path to input JSONL file
output_path: Path to output JSONL file
"""
input_file = Path(input_path)
output_file = Path(output_path)
if not input_file.exists():
raise FileNotFoundError(f"Input file not found: {input_path}")
print(f"Transforming: {input_path}{output_path}")
with open(input_file, 'r', encoding='utf-8') as infile, \
open(output_file, 'w', encoding='utf-8') as outfile:
for line_num, line in enumerate(infile, 1):
line = line.strip()
if not line:
continue
self.stats["total"] += 1
try:
# Parse input
raw_entry = json.loads(line)
# Transform
universal_entry = self.transform_entry(raw_entry)
# Validate if requested
if self.validate:
self._validate_entry(universal_entry)
# Write output
outfile.write(json.dumps(universal_entry, ensure_ascii=False) + '\n')
self.stats["successful"] += 1
except json.JSONDecodeError as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: JSON decode error - {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
except ValueError as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
except Exception as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: Unexpected error - {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
self._print_summary()
def _validate_entry(self, entry: Dict[str, Any]) -> None:
"""Validate a transformed entry."""
required = ["word", "lang_code", "pos", "senses"]
for field in required:
if field not in entry:
raise ValueError(f"Missing required field after transformation: {field}")
def _print_summary(self) -> None:
"""Print transformation summary."""
print("\n" + "="*60)
print("TRANSFORMATION SUMMARY")
print("="*60)
print(f"Total entries: {self.stats['total']}")
print(f"Successful: {self.stats['successful']}")
print(f"Errors: {self.stats['errors']}")
if self.stats['successful'] > 0:
success_rate = (self.stats['successful'] / self.stats['total']) * 100
print(f"Success rate: {success_rate:.1f}%")
if self.stats['warnings']:
print(f"\nWarnings: {len(self.stats['warnings'])}")
if len(self.stats['warnings']) <= 10:
for warning in self.stats['warnings']:
print(f" - {warning}")
else:
print(f" (showing first 10 of {len(self.stats['warnings'])})")
for warning in self.stats['warnings'][:10]:
print(f" - {warning}")
def main():
    """CLI entry point: parse arguments, run the transformer, set exit code."""
    parser = argparse.ArgumentParser(
        description="Transform Wiktionary JSONL to universal format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s input.jsonl output.jsonl
%(prog)s data/raw.jsonl data/transformed.jsonl --validate
""",
    )
    parser.add_argument("input", help="Input JSONL file")
    parser.add_argument("output", help="Output JSONL file")
    parser.add_argument("--validate", action="store_true",
                        help="Validate transformed entries")
    args = parser.parse_args()
    try:
        transformer = WiktionaryTransformer(validate=args.validate)
        transformer.transform_file(args.input, args.output)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    # Non-zero exit when any line failed, so shell pipelines can detect it.
    # (Equivalent to the original placement inside `try`: SystemExit is not
    # an Exception, so it was never caught there either.)
    if transformer.stats["errors"] > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()