"""
Sanity Check for Vocabulary Files
---------------------------------
Checks all JSON files in the output directory for TRUE duplicate word pairs.

A TRUE duplicate is when BOTH wordFirst AND wordSecond are identical.
Emits a warning if any word pair appears ``--threshold`` times or more
(default: 3). Also supports deleting files that contain such duplicates.

Usage:
    python check_duplicates.py
    python check_duplicates.py --output output
    python check_duplicates.py --threshold 3
    python check_duplicates.py --delete   # Delete files with flagged duplicates
"""

import argparse
import json
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List, Tuple

# The manifest lives next to the vocabulary files but is not one itself.
MANIFEST_NAME = "vocab_manifest.json"

# json.load() can only produce these two unhashable container types; a pair
# containing one of them cannot be used as a Counter key.
_UNHASHABLE_JSON_TYPES = (list, dict)


def find_json_files(output_dir: str) -> List[Path]:
    """Return every vocabulary JSON file below *output_dir*, sorted by path.

    The search is recursive. ``vocab_manifest.json`` is excluded, as are
    directories whose name happens to end in ``.json``.

    Exits the process with status 1 (after printing an error) when
    *output_dir* is not an existing directory.
    """
    output_path = Path(output_dir)
    if not output_path.is_dir():
        print(f"ERROR: Output directory '{output_dir}' not found.")
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive interpreter and is absent under ``python -S``.
        sys.exit(1)

    # Sorted so that the report order does not depend on the filesystem.
    return sorted(
        candidate
        for candidate in output_path.rglob("*.json")
        if candidate.is_file() and candidate.name != MANIFEST_NAME
    )


def _error_result(file_path: Path, message: str) -> Dict[str, Any]:
    """Build the result dict reported for a file that could not be checked."""
    return {
        "file": str(file_path),
        "error": message,
        "true_dupes": {},
        "item_count": 0,
        "unique_pairs": 0,
    }


def _extract_pairs(items: List[Any]) -> List[Tuple[Any, Any]]:
    """Collect the (wordFirst, wordSecond) pairs of all well-formed items.

    Items that are not JSON objects, that lack either word (missing or
    empty), or whose words are unhashable containers are skipped.
    """
    pairs = []
    for item in items:
        if not isinstance(item, dict):
            continue
        first = item.get("wordFirst")
        second = item.get("wordSecond")
        if not (first and second):
            continue
        if isinstance(first, _UNHASHABLE_JSON_TYPES) or isinstance(
            second, _UNHASHABLE_JSON_TYPES
        ):
            continue
        pairs.append((first, second))
    return pairs


def check_file_for_true_duplicates(file_path: Path, threshold: int = 3) -> Dict[str, Any]:
    """Check a single file for TRUE duplicates.

    A TRUE duplicate is a pair for which BOTH wordFirst AND wordSecond are
    identical.

    Args:
        file_path: JSON file expected to hold an object with an ``items`` list.
        threshold: Minimum number of occurrences for a pair to be reported.

    Returns:
        A dict with the keys ``file``, ``item_count``, ``true_dupes`` (mapping
        ``(wordFirst, wordSecond)`` to its occurrence count) and
        ``unique_pairs``. When the file cannot be read, parsed, or does not
        have the expected structure, the dict additionally carries an
        ``error`` message and the counts are zero.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return _error_result(file_path, f"JSON parse error: {e}")
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the scan
        # of all the others; the problem is reported through the result.
        return _error_result(file_path, f"Error reading file: {e}")

    if not isinstance(data, dict):
        return _error_result(
            file_path,
            f"Unexpected JSON structure: top-level value is "
            f"{type(data).__name__}, expected an object",
        )

    items = data.get("items") or []
    if not isinstance(items, list):
        return _error_result(
            file_path,
            f"Unexpected JSON structure: 'items' is "
            f"{type(items).__name__}, expected a list",
        )

    if not items:
        return {
            "file": str(file_path),
            "item_count": 0,
            "true_dupes": {},
            "unique_pairs": 0,
        }

    # Count TRUE duplicates (both first AND second must match).
    pair_counts = Counter(_extract_pairs(items))
    true_dupes = {
        pair: count for pair, count in pair_counts.items() if count >= threshold
    }

    return {
        "file": str(file_path),
        "item_count": len(items),
        "true_dupes": true_dupes,
        "unique_pairs": len(pair_counts),
    }


def delete_file(file_path: Path) -> bool:
    """Delete *file_path*; return True on success, False (with a message) on failure."""
    try:
        os.remove(file_path)
    except OSError as e:
        print(f" ❌ Could not delete {file_path}: {e}")
        return False
    return True


def main():
    """Command-line entry point: scan the output directory and print a report."""
    parser = argparse.ArgumentParser(
        description="Check vocabulary files for TRUE duplicates"
    )
    parser.add_argument(
        "--output",
        default="output",
        help="Output directory to check (default: output)",
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=3,
        help="Warning threshold for duplicates (default: 3)",
    )
    parser.add_argument(
        "--delete",
        action="store_true",
        help="Delete files with TRUE duplicates at or above the threshold",
    )
    args = parser.parse_args()

    print("=" * 60)
    print(" Vocabulary Duplicate Sanity Check")
    print("=" * 60)
    print(f" Output dir : {args.output}")
    print(f" Threshold : {args.threshold}+ occurrences = warning")
    print(f" Mode : {'DELETE' if args.delete else 'CHECK'}")
    print()

    json_files = find_json_files(args.output)
    print(f" Found {len(json_files)} JSON files to check...\n")

    total_warnings = 0
    files_with_dupes = 0
    # Tracked separately so unreadable files are not reported as duplicates.
    files_with_errors = 0
    files_deleted = 0

    for file_path in json_files:
        result = check_file_for_true_duplicates(file_path, args.threshold)

        if "error" in result:
            print(f" ❌ ERROR: {result['file']}")
            print(f" {result['error']}")
            files_with_errors += 1
            continue

        true_dupes = result["true_dupes"]
        if not true_dupes:
            continue

        files_with_dupes += 1
        total_warnings += len(true_dupes)

        try:
            rel_path = file_path.relative_to(Path(args.output))
        except ValueError:
            rel_path = file_path.name

        # Show details of true duplicates
        print(f" ⚠️ {rel_path}")
        print(
            f" Items: {result['item_count']} | "
            f"Unique pairs: {result['unique_pairs']}"
        )
        print(" TRUE duplicates (both wordFirst AND wordSecond identical):")

        # Show up to 5 duplicates, most frequent first
        worst_first = sorted(true_dupes.items(), key=lambda entry: -entry[1])
        for (wf, ws), count in worst_first[:5]:
            print(f" - \"{wf}\" → \"{ws}\" appears {count} times")

        # Delete if requested
        if args.delete:
            if delete_file(file_path):
                files_deleted += 1
                print(f" ✅ DELETED due to {len(true_dupes)} duplicate pairs")
            else:
                print(" ❌ Failed to delete")

        print()

    print("=" * 60)
    if files_with_dupes == 0 and files_with_errors == 0:
        print(f" ✅ All {len(json_files)} files passed sanity check!")
    else:
        if files_with_dupes:
            print(
                f" ⚠️ Found {total_warnings} true duplicate warnings "
                f"in {files_with_dupes} files"
            )
        if files_with_errors:
            print(f" ❌ {files_with_errors} files could not be read or parsed")
        if args.delete:
            print(f" 🗑️ Deleted {files_deleted} files")
        elif files_with_dupes:
            print(
                f" 💡 Run with --delete to remove files with "
                f"{args.threshold}+ true duplicates"
            )
    print("=" * 60)


if __name__ == "__main__":
    main()