BatchVocabListGenerator/check_duplicates.py

"""
Sanity Check for Vocabulary Files
---------------------------------
Checks all JSON files in the output directory for TRUE duplicate word pairs.
A TRUE duplicate is when BOTH wordFirst AND wordSecond are identical.
Throws a warning if any word pair appears 3 times or more.
Also supports deleting files that have 3+ true duplicates.
Usage:
python check_duplicates.py
python check_duplicates.py --output output
python check_duplicates.py --threshold 3
python check_duplicates.py --delete # Delete files with 3+ true duplicates
"""
import argparse
import json
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Dict, List
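
# Expected input shape, inferred from the field access in
# check_file_for_true_duplicates below (there is no published schema):
#
#   {
#     "items": [
#       {"wordFirst": "house", "wordSecond": "Haus"},
#       {"wordFirst": "dog", "wordSecond": "Hund"}
#     ]
#   }
#
# Only the "items" list is read; any other keys are ignored.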


def find_json_files(output_dir: str) -> List[Path]:
    """Find all JSON files in the output directory."""
    output_path = Path(output_dir)
    if not output_path.exists():
        print(f"ERROR: Output directory '{output_dir}' not found.")
        sys.exit(1)
    # Find all JSON files (including subdirectories)
    json_files = list(output_path.rglob("*.json"))
    return [f for f in json_files if f.name != "vocab_manifest.json"]


def check_file_for_true_duplicates(file_path: Path, threshold: int = 3) -> Dict:
    """
    Check a single file for TRUE duplicates.
    TRUE duplicate = when BOTH wordFirst AND wordSecond are identical.
    Returns a dict with duplicate information.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return {
            "file": str(file_path),
            "error": f"JSON parse error: {e}",
            "true_dupes": {},
            "item_count": 0
        }
    except Exception as e:
        return {
            "file": str(file_path),
            "error": f"Error reading file: {e}",
            "true_dupes": {},
            "item_count": 0
        }

    # Extract items
    items = data.get("items", [])
    if not items:
        return {
            "file": str(file_path),
            "item_count": 0,
            "true_dupes": {}
        }

    # Create tuples of (wordFirst, wordSecond) to find TRUE duplicates
    pair_list = [
        (item.get("wordFirst", ""), item.get("wordSecond", ""))
        for item in items
        if item.get("wordFirst") and item.get("wordSecond")
    ]

    # Count occurrences of each exact pair (both first AND second must match)
    pair_counts = Counter(pair_list)

    # Keep only pairs that appear at least `threshold` times
    true_dupes = {pair: count for pair, count in pair_counts.items() if count >= threshold}

    return {
        "file": str(file_path),
        "item_count": len(items),
        "true_dupes": true_dupes,
        "unique_pairs": len(pair_counts)
    }
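
# A tiny worked example of the pair counting above (illustrative values,
# not taken from any real vocabulary file):
#
#   items = [{"wordFirst": "dog", "wordSecond": "Hund"},
#            {"wordFirst": "dog", "wordSecond": "Hund"},
#            {"wordFirst": "dog", "wordSecond": "chien"}]
#
# Counter yields {("dog", "Hund"): 2, ("dog", "chien"): 1}. With
# threshold=2, only ("dog", "Hund") is reported as a TRUE duplicate;
# ("dog", "chien") shares wordFirst but is a different pair.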


def delete_file(file_path: Path) -> bool:
    """Delete a file and return True if successful."""
    try:
        os.remove(file_path)
        return True
    except Exception as e:
        print(f"  ❌ Could not delete {file_path}: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Check vocabulary files for TRUE duplicates")
    parser.add_argument("--output", default="output", help="Output directory to check (default: output)")
    parser.add_argument("--threshold", type=int, default=3, help="Warning threshold for duplicates (default: 3)")
    parser.add_argument("--delete", action="store_true", help="Delete files with TRUE duplicates at or above the threshold")
    args = parser.parse_args()

    print("=" * 60)
    print("  Vocabulary Duplicate Sanity Check")
    print("=" * 60)
    print(f"  Output dir : {args.output}")
    print(f"  Threshold  : {args.threshold}+ occurrences = warning")
    print(f"  Mode       : {'DELETE' if args.delete else 'CHECK'}")
    print()

    json_files = find_json_files(args.output)
    print(f"  Found {len(json_files)} JSON files to check...\n")

    total_warnings = 0
    files_with_issues = 0
    files_deleted = 0

    for file_path in json_files:
        result = check_file_for_true_duplicates(file_path, args.threshold)

        if "error" in result:
            print(f"  ❌ ERROR: {result['file']}")
            print(f"     {result['error']}")
            files_with_issues += 1
            continue

        true_dupes = result["true_dupes"]
        if true_dupes:
            files_with_issues += 1
            total_warnings += len(true_dupes)
            try:
                rel_path = file_path.relative_to(Path(args.output))
            except ValueError:
                rel_path = file_path.name

            # Show details of true duplicates
            print(f"  ⚠️ {rel_path}")
            print(f"     Items: {result['item_count']} | Unique pairs: {result['unique_pairs']}")
            print("     TRUE duplicates (both wordFirst AND wordSecond identical):")
            # Show up to 5 duplicates, most frequent first
            for pair, count in sorted(true_dupes.items(), key=lambda x: -x[1])[:5]:
                wf, ws = pair
                print(f"       - \"{wf}\" → \"{ws}\" appears {count} times")

            # Delete if requested
            if args.delete:
                if delete_file(file_path):
                    files_deleted += 1
                    print(f"     ✅ DELETED due to {len(true_dupes)} duplicate pairs")
                else:
                    print("     ❌ Failed to delete")
            print()

    print("=" * 60)
    if files_with_issues == 0:
        print(f"  ✅ All {len(json_files)} files passed sanity check!")
    else:
        print(f"  ⚠️ Found {total_warnings} true duplicate warnings in {files_with_issues} files")
        if args.delete:
            print(f"  🗑️ Deleted {files_deleted} files")
        else:
            print(f"  💡 Run with --delete to remove files with {args.threshold}+ true duplicates")
    print("=" * 60)


if __name__ == "__main__":
    main()