Wictionary-Data-Parser/scripts/transform_wiktionary.py

#!/usr/bin/env python3
"""
Universal Wiktionary Format Transformer
========================================
Transforms any Wiktionary JSON format to a standardized universal schema.
Usage:
python transform_wiktionary.py input.jsonl output.jsonl
python transform_wiktionary.py input.jsonl output.jsonl --validate
"""
import json
import sys
import argparse
from typing import Dict, List, Any, Optional
from pathlib import Path


class WiktionaryTransformer:
    """Transforms Wiktionary entries to universal format."""

    def __init__(self, validate: bool = False):
        self.validate = validate
        self.stats = {
            "total": 0,
            "successful": 0,
            "errors": 0,
            "warnings": []
        }

    def transform_entry(self, raw_entry: Dict[str, Any]) -> Dict[str, Any]:
        """
        Transform a single Wiktionary entry to universal format.

        Args:
            raw_entry: Raw entry from any Wiktionary edition

        Returns:
            Transformed entry in universal format
        """
        # === REQUIRED CORE FIELDS ===
        try:
            universal = {
                "word": raw_entry["word"],
                "lang_code": raw_entry["lang_code"],
                "pos": raw_entry["pos"],
                "senses": raw_entry["senses"]
            }
        except KeyError as e:
            raise ValueError(f"Missing required field: {e}")

        # === PHONETICS ===
        phonetics = self._extract_phonetics(raw_entry)
        if phonetics:
            universal["phonetics"] = phonetics

        # === HYPHENATION ===
        hyphenation = self._extract_hyphenation(raw_entry)
        if hyphenation:
            universal["hyphenation"] = hyphenation

        # === FORMS ===
        if "forms" in raw_entry:
            universal["forms"] = raw_entry["forms"]

        # === GRAMMATICAL FEATURES ===
        grammatical = self._extract_grammatical_features(raw_entry)
        if grammatical:
            universal["grammatical_features"] = grammatical

        # === ETYMOLOGY ===
        etymology = self._extract_etymology(raw_entry)
        if etymology:
            universal["etymology"] = etymology

        # === RELATIONS ===
        relations = self._extract_relations(raw_entry)
        if relations:
            universal["relations"] = relations

        # === TRANSLATIONS ===
        if "translations" in raw_entry:
            universal["translations"] = raw_entry["translations"]

        # === DESCENDANTS ===
        if "descendants" in raw_entry:
            universal["descendants"] = raw_entry["descendants"]

        # === METADATA ===
        metadata = self._extract_metadata(raw_entry)
        universal["metadata"] = metadata

        return universal

    def _extract_phonetics(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract and normalize phonetic information."""
        phonetics = {}

        # Process sounds array
        if "sounds" in entry and entry["sounds"]:
            ipa_variations = []
            audio_list = []
            homophones = []
            for sound in entry["sounds"]:
                # IPA transcription with country information
                if "ipa" in sound:
                    ipa_entry = {"ipa": sound["ipa"]}
                    # Preserve country information from raw_tags
                    if "raw_tags" in sound:
                        ipa_entry["raw_tags"] = sound["raw_tags"]
                    # Clean IPA string by removing special characters at beginning/end
                    cleaned_ipa = self._clean_ipa_string(sound["ipa"])
                    ipa_entry["ipa_cleaned"] = cleaned_ipa
                    ipa_variations.append(ipa_entry)
                # Audio files (keep for now, will be removed in filter step)
                if "audio" in sound:
                    audio_obj = {}
                    # Try multiple URL formats
                    for url_key in ["ogg_url", "mp3_url", "url"]:
                        if url_key in sound:
                            audio_obj["url"] = sound[url_key]
                            break
                    audio_obj["text"] = sound.get("audio", "")
                    if audio_obj:
                        audio_list.append(audio_obj)
                # Homophones
                if "homophone" in sound:
                    homophones.append(sound["homophone"])
            if ipa_variations:
                phonetics["ipa_variations"] = ipa_variations
            if audio_list:
                phonetics["audio"] = audio_list
            if homophones:
                phonetics["homophones"] = homophones

        # Handle extra_sounds (some editions)
        if "extra_sounds" in entry:
            if "pronunciación" in entry["extra_sounds"]:
                phonetics["notes"] = entry["extra_sounds"]["pronunciación"]

        return phonetics if phonetics else None

    def _clean_ipa_string(self, ipa_string: str) -> str:
        """Clean IPA string by removing special characters at beginning/end."""
        if not ipa_string:
            return ipa_string
        # Remove leading/trailing special characters: [, ], \, :
        cleaned = ipa_string.strip("[]\\:")
        return cleaned
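
    # Behavior sketch (illustrative values, not from the repository):
    #   _clean_ipa_string("[ˈhaʊ̯s]") -> "ˈhaʊ̯s"
    #   _clean_ipa_string("\\ka\\")   -> "ka"
    # Note: str.strip() removes any run of the listed characters at either
    # end, so slash-delimited transcriptions like "/haʊs/" pass through intact.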

    def _extract_hyphenation(self, entry: Dict[str, Any]) -> Optional[List[str]]:
        """Extract and normalize hyphenation."""
        # Format 1: hyphenations array with parts
        if "hyphenations" in entry and entry["hyphenations"]:
            parts = []
            for h in entry["hyphenations"]:
                if isinstance(h, dict) and "parts" in h:
                    parts.extend(h["parts"])
                elif isinstance(h, str):
                    parts.append(h)
            if parts:
                return parts

        # Format 2: hyphenation string with separator
        if "hyphenation" in entry:
            # Split on common separators. NOTE: two separators in the source
            # were lost to encoding; the hyphenation point (U+2027) and the
            # Unicode hyphen (U+2010) below are assumed reconstructions.
            hyph = entry["hyphenation"]
            for sep in ["‧", "-", "·", "‐"]:
                if sep in hyph:
                    return hyph.split(sep)
            return [hyph]

        return None
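
    # Illustrative inputs (assumed shapes, matching the two formats above):
    #   {"hyphenations": [{"parts": ["Wör", "ter", "buch"]}]} -> ["Wör", "ter", "buch"]
    #   {"hyphenation": "Wör·ter·buch"}                       -> ["Wör", "ter", "buch"]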

    def _extract_grammatical_features(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract grammatical features and tags."""
        if "tags" not in entry:
            return None
        grammatical = {"tags": entry["tags"]}

        # Extract gender from tags
        gender_map = {
            "masculine": "masculine",
            "feminine": "feminine",
            "neuter": "neuter",
            "common": "common",
            "m": "masculine",
            "f": "feminine",
            "n": "neuter",
            "c": "common"
        }
        for tag in entry["tags"]:
            tag_lower = tag.lower()
            if tag_lower in gender_map:
                grammatical["gender"] = gender_map[tag_lower]
                break

        # Extract number
        number_map = {
            "singular": "singular",
            "plural": "plural",
            "dual": "dual",
            "sg": "singular",
            "pl": "plural"
        }
        for tag in entry["tags"]:
            tag_lower = tag.lower()
            if tag_lower in number_map:
                grammatical["number"] = number_map[tag_lower]
                break

        return grammatical
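
    # e.g. tags ["m", "plural"] -> {"tags": ["m", "plural"],
    #                               "gender": "masculine", "number": "plural"}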

    def _extract_etymology(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract etymology information."""
        etymology = {}
        if "etymology_text" in entry:
            etymology["text"] = entry["etymology_text"]
        if "etymology_texts" in entry:
            etymology["texts"] = entry["etymology_texts"]
        if "etymology_number" in entry:
            etymology["number"] = entry["etymology_number"]
        return etymology if etymology else None

    def _extract_relations(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract semantic and lexical relations."""
        relations = {}
        # Define all possible relation types
        relation_fields = [
            "synonyms", "antonyms", "hypernyms", "hyponyms",
            "meronyms", "holonyms", "related", "derived",
            "coordinate_terms", "troponyms", "compounds"
        ]
        for field in relation_fields:
            if field in entry and entry[field]:
                relations[field] = entry[field]
        return relations if relations else None

    def _extract_metadata(self, entry: Dict[str, Any]) -> Dict[str, Any]:
        """Extract metadata and source information."""
        metadata = {}

        # Source language
        if "lang" in entry:
            metadata["source_lang"] = entry["lang"]
        # Infer source language code if possible
        if "lang_code" in entry:
            metadata["source_lang_code"] = entry["lang_code"]

        # POS title (localized)
        if "pos_title" in entry:
            metadata["pos_title"] = entry["pos_title"]
        elif "pos_text" in entry:
            metadata["pos_title"] = entry["pos_text"]

        # Categories
        if "categories" in entry:
            metadata["categories"] = entry["categories"]

        # Templates
        templates = []
        if "head_templates" in entry:
            templates.extend(entry["head_templates"])
        if "inflection_templates" in entry:
            templates.extend(entry["inflection_templates"])
        if templates:
            metadata["templates"] = templates

        # Additional metadata
        if "attestations" in entry:
            metadata["attestations"] = entry["attestations"]

        return metadata

    def transform_file(self, input_path: str, output_path: str) -> None:
        """
        Transform an entire JSONL file.

        Args:
            input_path: Path to input JSONL file
            output_path: Path to output JSONL file
        """
        input_file = Path(input_path)
        output_file = Path(output_path)
        if not input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        print(f"Transforming: {input_path} → {output_path}")
        with open(input_file, 'r', encoding='utf-8') as infile, \
             open(output_file, 'w', encoding='utf-8') as outfile:
            for line_num, line in enumerate(infile, 1):
                line = line.strip()
                if not line:
                    continue
                self.stats["total"] += 1
                try:
                    # Parse input
                    raw_entry = json.loads(line)
                    # Transform
                    universal_entry = self.transform_entry(raw_entry)
                    # Validate if requested
                    if self.validate:
                        self._validate_entry(universal_entry)
                    # Write output
                    outfile.write(json.dumps(universal_entry, ensure_ascii=False) + '\n')
                    self.stats["successful"] += 1
                except json.JSONDecodeError as e:
                    self.stats["errors"] += 1
                    warning = f"Line {line_num}: JSON decode error - {e}"
                    self.stats["warnings"].append(warning)
                    print(warning, file=sys.stderr)
                except ValueError as e:
                    self.stats["errors"] += 1
                    warning = f"Line {line_num}: {e}"
                    self.stats["warnings"].append(warning)
                    print(warning, file=sys.stderr)
                except Exception as e:
                    self.stats["errors"] += 1
                    warning = f"Line {line_num}: Unexpected error - {e}"
                    self.stats["warnings"].append(warning)
                    print(warning, file=sys.stderr)

        self._print_summary()

    def _validate_entry(self, entry: Dict[str, Any]) -> None:
        """Validate a transformed entry."""
        required = ["word", "lang_code", "pos", "senses"]
        for field in required:
            if field not in entry:
                raise ValueError(f"Missing required field after transformation: {field}")

    def _print_summary(self) -> None:
        """Print transformation summary."""
        print("\n" + "=" * 60)
        print("TRANSFORMATION SUMMARY")
        print("=" * 60)
        print(f"Total entries: {self.stats['total']}")
        print(f"Successful: {self.stats['successful']}")
        print(f"Errors: {self.stats['errors']}")
        if self.stats['successful'] > 0:
            success_rate = (self.stats['successful'] / self.stats['total']) * 100
            print(f"Success rate: {success_rate:.1f}%")
        if self.stats['warnings']:
            print(f"\nWarnings: {len(self.stats['warnings'])}")
            if len(self.stats['warnings']) <= 10:
                for warning in self.stats['warnings']:
                    print(f"  - {warning}")
            else:
                print(f"  (showing first 10 of {len(self.stats['warnings'])})")
                for warning in self.stats['warnings'][:10]:
                    print(f"  - {warning}")


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Transform Wiktionary JSONL to universal format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s input.jsonl output.jsonl
  %(prog)s data/raw.jsonl data/transformed.jsonl --validate
"""
    )
    parser.add_argument("input", help="Input JSONL file")
    parser.add_argument("output", help="Output JSONL file")
    parser.add_argument("--validate", action="store_true",
                        help="Validate transformed entries")
    args = parser.parse_args()

    try:
        transformer = WiktionaryTransformer(validate=args.validate)
        transformer.transform_file(args.input, args.output)
        # Exit with error code if there were errors
        if transformer.stats["errors"] > 0:
            sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
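
# Programmatic use (sketch; relies only on the classes defined above):
#   from transform_wiktionary import WiktionaryTransformer
#   transformer = WiktionaryTransformer(validate=True)
#   entry = transformer.transform_entry({"word": "Haus", "lang_code": "de",
#                                        "pos": "noun", "senses": []})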