Files
Wictionary-Data-Parser/tests/test_transform_wiktionary.py
2026-02-13 00:10:40 +01:00

264 lines
10 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Test Suite for Wiktionary Transformer
======================================
Comprehensive tests for the transform_wiktionary.py module.
"""
import json
import sys
import pathlib
from typing import Dict, Any
# Add parent directory to path for imports
sys.path.append(str(pathlib.Path(__file__).parent.parent))
from tests.test_framework import TestFramework, SchemaValidator, TestDataLoader
from scripts.transform_wiktionary import WiktionaryTransformer
class TestWiktionaryTransformer(TestFramework):
    """Test suite for the WiktionaryTransformer class.

    Every ``test_*`` method passes an entry dict to
    ``WiktionaryTransformer.transform_entry`` and records its checks through
    the assertion helpers inherited from ``TestFramework``
    (``assert_true``, ``assert_false``, ``assert_equal``,
    ``assert_is_instance``).
    """

    def __init__(self):
        super().__init__()
        # One transformer, constructed with validate=True, is shared by all tests.
        self.transformer = WiktionaryTransformer(validate=True)

    def _expect_value_error(
        self,
        entry: Dict[str, Any],
        not_raised_msg: str,
        raised_msg: str,
    ) -> None:
        """Check that transforming *entry* raises ``ValueError``.

        Args:
            entry: Entry dict handed to ``transform_entry``.
            not_raised_msg: Message recorded as a failure when no
                ``ValueError`` is raised.
            raised_msg: Message recorded as a success when it is raised.
        """
        try:
            self.transformer.transform_entry(entry)
        except ValueError:
            self.assert_true(True, raised_msg)
        else:
            self.assert_false(True, not_raised_msg)

    def test_required_fields(self) -> None:
        """Test that required fields are properly handled."""
        print("Testing required fields...")

        # Entry carrying every required field.
        valid_entry = {
            "word": "test",
            "lang_code": "en",
            "pos": "noun",
            "senses": [{"glosses": ["a test word"]}],
        }
        # The broad except is deliberate: any exception at all on a valid
        # entry is recorded as a test failure instead of aborting the suite.
        try:
            result = self.transformer.transform_entry(valid_entry)
            self.assert_true("word" in result, "Word field should be present")
            self.assert_true("pos" in result, "POS field should be present")
            self.assert_true("senses" in result, "Senses field should be present")
        except Exception as e:
            self.assert_false(True, f"Should not raise exception: {e}")

        # Same entry without "senses" must be rejected.
        invalid_entry = {
            "word": "test",
            "lang_code": "en",
            "pos": "noun",
        }
        self._expect_value_error(
            invalid_entry,
            "Should raise exception for missing required field",
            "Should raise ValueError for missing required field",
        )

    def test_phonetics_extraction(self) -> None:
        """Test phonetics extraction and normalization."""
        print("Testing phonetics extraction...")
        entry_with_phonetics = {
            "word": "test",
            "lang_code": "en",
            "pos": "noun",
            "senses": [{"glosses": ["test"]}],
            "sounds": [
                {"ipa": "/tɛst/", "audio": "test.ogg"},
                {"ipa": "/ˈtɛst/", "homophone": "test"},
            ],
        }
        result = self.transformer.transform_entry(entry_with_phonetics)
        self.assert_true("phonetics" in result, "Phonetics should be extracted")
        phonetics = result["phonetics"]
        self.assert_true("ipa" in phonetics, "IPA should be present")
        self.assert_equal(len(phonetics["ipa"]), 2, "Should have 2 IPA entries")
        self.assert_true("homophones" in phonetics, "Homophones should be present")

    def test_hyphenation_extraction(self) -> None:
        """Test hyphenation extraction."""
        print("Testing hyphenation extraction...")
        entry_with_hyphenation = {
            "word": "hyphenation",
            "lang_code": "en",
            "pos": "noun",
            "senses": [{"glosses": ["test"]}],
            "hyphenation": "hy-phen-a-tion",
        }
        result = self.transformer.transform_entry(entry_with_hyphenation)
        self.assert_true("hyphenation" in result, "Hyphenation should be extracted")
        self.assert_is_instance(result["hyphenation"], list, "Hyphenation should be a list")
        # "hy-phen-a-tion" has four dash-separated parts.
        self.assert_equal(len(result["hyphenation"]), 4, "Should have 4 parts")

    def test_grammatical_features_extraction(self) -> None:
        """Test grammatical features extraction."""
        print("Testing grammatical features extraction...")
        entry_with_tags = {
            "word": "test",
            "lang_code": "de",
            "pos": "noun",
            "senses": [{"glosses": ["test"]}],
            "tags": ["masculine", "singular"],
        }
        result = self.transformer.transform_entry(entry_with_tags)
        self.assert_true(
            "grammatical_features" in result,
            "Grammatical features should be extracted",
        )
        features = result["grammatical_features"]
        self.assert_true("gender" in features, "Gender should be present")
        self.assert_equal(features["gender"], "masculine", "Gender should be masculine")
        self.assert_true("number" in features, "Number should be present")
        self.assert_equal(features["number"], "singular", "Number should be singular")

    def test_etymology_extraction(self) -> None:
        """Test etymology extraction."""
        print("Testing etymology extraction...")
        entry_with_etymology = {
            "word": "test",
            "lang_code": "en",
            "pos": "noun",
            "senses": [{"glosses": ["test"]}],
            "etymology_text": "From Latin testum",
            "etymology_number": 1,
        }
        result = self.transformer.transform_entry(entry_with_etymology)
        self.assert_true("etymology" in result, "Etymology should be extracted")
        etymology = result["etymology"]
        self.assert_true("text" in etymology, "Etymology text should be present")
        self.assert_true("number" in etymology, "Etymology number should be present")

    def test_relations_extraction(self) -> None:
        """Test relations extraction."""
        print("Testing relations extraction...")
        entry_with_relations = {
            "word": "test",
            "lang_code": "en",
            "pos": "noun",
            "senses": [{"glosses": ["test"]}],
            "synonyms": [{"word": "exam"}],
            "antonyms": [{"word": "ignore"}],
            "related": ["examination", "quiz"],
        }
        result = self.transformer.transform_entry(entry_with_relations)
        self.assert_true("relations" in result, "Relations should be extracted")
        relations = result["relations"]
        self.assert_true("synonyms" in relations, "Synonyms should be present")
        self.assert_true("antonyms" in relations, "Antonyms should be present")
        self.assert_true("related" in relations, "Related terms should be present")

    def test_schema_validation(self) -> None:
        """Test schema validation."""
        print("Testing schema validation...")

        # A complete entry must transform into something the schema accepts.
        valid_entry = {
            "word": "test",
            "lang_code": "en",
            "pos": "noun",
            "senses": [{"glosses": ["a test word"]}],
        }
        result = self.transformer.transform_entry(valid_entry)
        self.assert_true(
            SchemaValidator.validate_universal_schema(result),
            "Valid entry should pass schema validation",
        )

        # An entry without "senses" must be rejected.
        invalid_entry = {
            "word": "test",
            "lang_code": "en",
            "pos": "noun",
        }
        self._expect_value_error(
            invalid_entry,
            "Should raise exception for invalid schema",
            "Should raise ValueError for invalid schema",
        )

    def test_real_world_data(self) -> None:
        """Test with real sample data.

        When the "laufen" sample cannot be loaded the test records a passing
        "skipped" check and returns without exercising the transformer.
        """
        print("Testing with real sample data...")

        # Only the fixture load may legitimately hit a missing file; keeping
        # the try this narrow stops a FileNotFoundError raised later (e.g. by
        # the transformer) from being misreported as a harmless skip.
        try:
            german_data = TestDataLoader.load_sample_data("laufen")
        except FileNotFoundError:
            self.assert_true(True, "Sample data not available, skipping real data test")
            return

        # Force known values for these fields; whatever the sample holds
        # under these keys is overwritten unconditionally.
        german_data["lang_code"] = "de"
        german_data["senses"] = [{"glosses": ["to run", "to walk"]}]

        result = self.transformer.transform_entry(german_data)
        self.assert_true(
            SchemaValidator.validate_universal_schema(result),
            "Real data should pass schema validation",
        )
        self.assert_equal(result["word"], "laufen", "Word should be preserved")
        self.assert_equal(result["pos"], "verb", "POS should be preserved")
        self.assert_true("forms" in result, "Forms should be preserved")

    def test_error_handling(self) -> None:
        """Test error handling."""
        print("Testing error handling...")

        # NOTE(review): json.loads() raises before transform_entry() is ever
        # called, so this case exercises the json module rather than the
        # transformer. Behaviour kept as-is; consider replacing it with a
        # malformed *entry* if transformer coverage is the goal.
        invalid_json = "not valid json"
        try:
            self.transformer.transform_entry(json.loads(invalid_json))
        except json.JSONDecodeError:
            self.assert_true(True, "Should handle JSON decode errors gracefully")
        else:
            self.assert_false(True, "Should raise JSON decode error")

        # Entry lacking both "pos" and "senses": the error text is checked too.
        incomplete_entry = {
            "word": "test",
            "lang_code": "en",
        }
        try:
            self.transformer.transform_entry(incomplete_entry)
        except ValueError as e:
            self.assert_true(
                "Missing required field" in str(e),
                "Should provide descriptive error message",
            )
        else:
            self.assert_false(True, "Should raise ValueError for missing required fields")

    def run_all_tests(self):
        """Run all tests in this suite.

        Returns:
            The value returned by ``print_summary()``; the script entry point
            treats it as a truthy "all tests passed" flag.
        """
        print("\n" + "=" * 60)
        print("WIKTIONARY TRANSFORMER TEST SUITE")
        print("=" * 60)

        tests = (
            self.test_required_fields,
            self.test_phonetics_extraction,
            self.test_hyphenation_extraction,
            self.test_grammatical_features_extraction,
            self.test_etymology_extraction,
            self.test_relations_extraction,
            self.test_schema_validation,
            self.test_real_world_data,
            self.test_error_handling,
        )
        # cleanup() must run even when a test (or the summary) raises, so it
        # lives in a finally block instead of trailing the happy path.
        try:
            for test in tests:
                test()
            success = self.print_summary()
        finally:
            self.cleanup()
        return success
if __name__ == "__main__":
    # Run the suite and translate its outcome into the process exit status:
    # 0 when everything passed, 1 otherwise.
    test_suite = TestWiktionaryTransformer()
    success = test_suite.run_all_tests()
    if not success:
        print("\n[FAILED] Some tests failed!")
        sys.exit(1)
    print("\n[SUCCESS] All tests passed!")
    sys.exit(0)