"""
Sanity Check for Vocabulary Files
---------------------------------

Checks all JSON files in the output directory for TRUE duplicate word pairs.
A TRUE duplicate is when BOTH wordFirst AND wordSecond are identical.
Throws a warning if any word pair appears 3 times or more.

Also supports deleting files that have 3+ true duplicates.

Usage:
    python check_duplicates.py
    python check_duplicates.py --output output
    python check_duplicates.py --threshold 3
    python check_duplicates.py --delete    # Delete files with 3+ true duplicates
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
|
|
|
|
def find_json_files(output_dir: str) -> List[Path]:
    """Return all JSON files under *output_dir* (recursively), excluding the manifest.

    Exits the process with status 1 when the directory does not exist.
    """
    output_path = Path(output_dir)
    if not output_path.exists():
        print(f"ERROR: Output directory '{output_dir}' not found.")
        # raise SystemExit instead of the site-injected exit() builtin, which is
        # not guaranteed to exist when Python runs without the site module (-S).
        raise SystemExit(1)

    # Find all JSON files (including subdirectories); sorted so repeated runs
    # report files in a deterministic order (rglob order is filesystem-dependent).
    json_files = sorted(output_path.rglob("*.json"))
    return [f for f in json_files if f.name != "vocab_manifest.json"]
def check_file_for_true_duplicates(file_path: Path, threshold: int = 3) -> Dict:
    """
    Check a single file for TRUE duplicates.

    TRUE duplicate = when BOTH wordFirst AND wordSecond are identical.

    Args:
        file_path: JSON file expected to contain {"items": [{"wordFirst": ..., "wordSecond": ...}, ...]}.
        threshold: minimum occurrence count for a pair to be reported.

    Returns:
        Dict with "file", "item_count", "true_dupes" (pair tuple -> count) and
        "unique_pairs"; on read/parse failure, an "error" key is present and the
        counts are zeroed so callers can treat the result uniformly.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return {
            "file": str(file_path),
            "error": f"JSON parse error: {e}",
            "true_dupes": {},
            "item_count": 0,
            "unique_pairs": 0,
        }
    except Exception as e:
        # Best-effort scan: report unreadable files instead of aborting the run.
        return {
            "file": str(file_path),
            "error": f"Error reading file: {e}",
            "true_dupes": {},
            "item_count": 0,
            "unique_pairs": 0,
        }

    # Extract items. Guard against a non-dict top level (e.g. a bare JSON array),
    # which would otherwise raise AttributeError on .get and crash the scan.
    items = data.get("items", []) if isinstance(data, dict) else []
    if not isinstance(items, list):
        items = []
    if not items:
        return {
            "file": str(file_path),
            "item_count": 0,
            "true_dupes": {},
            "unique_pairs": 0,
        }

    # Create tuples of (wordFirst, wordSecond) to find TRUE duplicates;
    # entries missing either word are skipped.
    pair_list = [
        (item.get("wordFirst", ""), item.get("wordSecond", ""))
        for item in items
        if item.get("wordFirst") and item.get("wordSecond")
    ]

    # Count TRUE duplicates (both first AND second must match)
    pair_counts = Counter(pair_list)

    # Find duplicates above threshold
    true_dupes = {pair: count for pair, count in pair_counts.items() if count >= threshold}

    return {
        "file": str(file_path),
        "item_count": len(items),
        "true_dupes": true_dupes,
        "unique_pairs": len(pair_counts),
    }
def delete_file(file_path: Path) -> bool:
    """Attempt to remove *file_path* from disk; report success as a boolean."""
    try:
        os.remove(file_path)
    except Exception as e:
        # Never raise — the caller only counts failures and moves on.
        print(f"   ❌ Could not delete {file_path}: {e}")
        return False
    return True
def main():
    """CLI entry point: scan the output directory, report offenders, optionally delete them."""
    parser = argparse.ArgumentParser(description="Check vocabulary files for TRUE duplicates")
    parser.add_argument("--output", default="output", help="Output directory to check (default: output)")
    parser.add_argument("--threshold", type=int, default=3, help="Warning threshold for duplicates (default: 3)")
    # Help text reflects the configurable threshold instead of a hard-coded "3+".
    parser.add_argument("--delete", action="store_true",
                        help="Delete files whose TRUE duplicate count reaches the threshold")
    args = parser.parse_args()

    print("=" * 60)
    print(" Vocabulary Duplicate Sanity Check")
    print("=" * 60)
    print(f" Output dir : {args.output}")
    print(f" Threshold : {args.threshold}+ occurrences = warning")
    print(f" Mode : {'DELETE' if args.delete else 'CHECK'}")
    print()

    json_files = find_json_files(args.output)
    print(f" Found {len(json_files)} JSON files to check...\n")

    total_warnings = 0
    files_with_issues = 0
    files_deleted = 0

    for file_path in json_files:
        result = check_file_for_true_duplicates(file_path, args.threshold)

        # Unreadable/unparseable files count as issues but are not deleted.
        if "error" in result:
            print(f" ❌ ERROR: {result['file']}")
            print(f" {result['error']}")
            files_with_issues += 1
            continue

        true_dupes = result["true_dupes"]

        if true_dupes:
            files_with_issues += 1
            total_warnings += len(true_dupes)

            # Prefer a path relative to the scanned directory for readability;
            # fall back to the bare filename if the file lies outside it.
            try:
                rel_path = file_path.relative_to(Path(args.output))
            except ValueError:
                rel_path = file_path.name

            # Show details of true duplicates
            print(f" ⚠️ {rel_path}")
            print(f" Items: {result['item_count']} | Unique pairs: {result['unique_pairs']}")
            print(f" TRUE duplicates (both wordFirst AND wordSecond identical):")

            # Show up to 5 duplicates, most frequent first
            for pair, count in sorted(true_dupes.items(), key=lambda x: -x[1])[:5]:
                wf, ws = pair
                print(f" - \"{wf}\" → \"{ws}\" appears {count} times")

            # Delete if requested
            if args.delete:
                if delete_file(file_path):
                    files_deleted += 1
                    print(f" ✅ DELETED due to {len(true_dupes)} duplicate pairs")
                else:
                    print(f" ❌ Failed to delete")

            print()

    print("=" * 60)
    if files_with_issues == 0:
        print(f" ✅ All {len(json_files)} files passed sanity check!")
    else:
        print(f" ⚠️ Found {total_warnings} true duplicate warnings in {files_with_issues} files")
        if args.delete:
            print(f" 🗑️ Deleted {files_deleted} files")
        else:
            # Hint mirrors the actual threshold in effect, not a hard-coded "3+".
            print(f" 💡 Run with --delete to remove files with {args.threshold}+ true duplicates")
    print("=" * 60)
if __name__ == "__main__":
    # Only run the check when executed as a script, not when imported.
    main()