BatchVocabListGenerator/check_duplicates.py

"""
Sanity Check for Vocabulary Files
---------------------------------
Checks all JSON files in the output directory for TRUE duplicate word pairs.
A TRUE duplicate is when BOTH wordFirst AND wordSecond are identical.
Throws a warning if any word pair appears 3 times or more.
Also supports deleting files that have 3+ true duplicates.
Usage:
python check_duplicates.py
python check_duplicates.py --output output
python check_duplicates.py --threshold 3
python check_duplicates.py --delete # Delete files with 3+ true duplicates
"""
import argparse
import json
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Dict, List
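
# Expected input shape, inferred from the field access in
# check_file_for_true_duplicates below (there is no published schema):
#
#   {
#     "items": [
#       {"wordFirst": "house", "wordSecond": "Haus"},
#       {"wordFirst": "dog", "wordSecond": "Hund"}
#     ]
#   }
#
# Only the "items" list is read; any other keys are ignored.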


def find_json_files(output_dir: str) -> List[Path]:
    """Find all JSON files in the output directory."""
    output_path = Path(output_dir)
    if not output_path.exists():
        print(f"ERROR: Output directory '{output_dir}' not found.")
        sys.exit(1)
    # Find all JSON files (including subdirectories)
    json_files = list(output_path.rglob("*.json"))
    return [f for f in json_files if f.name != "vocab_manifest.json"]


def check_file_for_true_duplicates(file_path: Path, threshold: int = 3) -> Dict:
    """
    Check a single file for TRUE duplicates.
    TRUE duplicate = when BOTH wordFirst AND wordSecond are identical.
    Returns a dict with duplicate information.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return {
            "file": str(file_path),
            "error": f"JSON parse error: {e}",
            "true_dupes": {},
            "item_count": 0
        }
    except Exception as e:
        return {
            "file": str(file_path),
            "error": f"Error reading file: {e}",
            "true_dupes": {},
            "item_count": 0
        }

    # Extract items
    items = data.get("items", [])
    if not items:
        return {
            "file": str(file_path),
            "item_count": 0,
            "true_dupes": {}
        }

    # Create tuples of (wordFirst, wordSecond) to find TRUE duplicates
    pair_list = [
        (item.get("wordFirst", ""), item.get("wordSecond", ""))
        for item in items
        if item.get("wordFirst") and item.get("wordSecond")
    ]

    # Count occurrences of each exact pair (both first AND second must match)
    pair_counts = Counter(pair_list)

    # Keep only pairs that appear at least `threshold` times
    true_dupes = {pair: count for pair, count in pair_counts.items() if count >= threshold}

    return {
        "file": str(file_path),
        "item_count": len(items),
        "true_dupes": true_dupes,
        "unique_pairs": len(pair_counts)
    }
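
# A tiny worked example of the pair counting above (illustrative values,
# not taken from any real vocabulary file):
#
#   items = [{"wordFirst": "dog", "wordSecond": "Hund"},
#            {"wordFirst": "dog", "wordSecond": "Hund"},
#            {"wordFirst": "dog", "wordSecond": "chien"}]
#
# Counter yields {("dog", "Hund"): 2, ("dog", "chien"): 1}. With
# threshold=2, only ("dog", "Hund") is reported as a TRUE duplicate;
# ("dog", "chien") shares wordFirst but is a different pair.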


def delete_file(file_path: Path) -> bool:
    """Delete a file and return True if successful."""
    try:
        os.remove(file_path)
        return True
    except Exception as e:
        print(f"  ❌ Could not delete {file_path}: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Check vocabulary files for TRUE duplicates")
    parser.add_argument("--output", default="output", help="Output directory to check (default: output)")
    parser.add_argument("--threshold", type=int, default=3, help="Warning threshold for duplicates (default: 3)")
    parser.add_argument("--delete", action="store_true", help="Delete files with TRUE duplicates at or above the threshold")
    args = parser.parse_args()

    print("=" * 60)
    print("  Vocabulary Duplicate Sanity Check")
    print("=" * 60)
    print(f"  Output dir : {args.output}")
    print(f"  Threshold  : {args.threshold}+ occurrences = warning")
    print(f"  Mode       : {'DELETE' if args.delete else 'CHECK'}")
    print()

    json_files = find_json_files(args.output)
    print(f"  Found {len(json_files)} JSON files to check...\n")

    total_warnings = 0
    files_with_issues = 0
    files_deleted = 0

    for file_path in json_files:
        result = check_file_for_true_duplicates(file_path, args.threshold)

        if "error" in result:
            print(f"  ❌ ERROR: {result['file']}")
            print(f"     {result['error']}")
            files_with_issues += 1
            continue

        true_dupes = result["true_dupes"]
        if true_dupes:
            files_with_issues += 1
            total_warnings += len(true_dupes)
            try:
                rel_path = file_path.relative_to(Path(args.output))
            except ValueError:
                rel_path = file_path.name

            # Show details of true duplicates
            print(f"  ⚠️ {rel_path}")
            print(f"     Items: {result['item_count']} | Unique pairs: {result['unique_pairs']}")
            print("     TRUE duplicates (both wordFirst AND wordSecond identical):")
            # Show up to 5 duplicates, most frequent first
            for pair, count in sorted(true_dupes.items(), key=lambda x: -x[1])[:5]:
                wf, ws = pair
                print(f"       - \"{wf}\" → \"{ws}\" appears {count} times")

            # Delete if requested
            if args.delete:
                if delete_file(file_path):
                    files_deleted += 1
                    print(f"     ✅ DELETED due to {len(true_dupes)} duplicate pairs")
                else:
                    print("     ❌ Failed to delete")
            print()

    print("=" * 60)
    if files_with_issues == 0:
        print(f"  ✅ All {len(json_files)} files passed sanity check!")
    else:
        print(f"  ⚠️ Found {total_warnings} true duplicate warnings in {files_with_issues} files")
        if args.delete:
            print(f"  🗑️ Deleted {files_deleted} files")
        else:
            print(f"  💡 Run with --delete to remove files with {args.threshold}+ true duplicates")
    print("=" * 60)


if __name__ == "__main__":
    main()