welcome gitea
check_duplicates.py | 177 lines | Normal file
@@ -0,0 +1,177 @@
"""
Sanity Check for Vocabulary Files
---------------------------------
Checks all JSON files in the output directory for TRUE duplicate word pairs.
A TRUE duplicate is when BOTH wordFirst AND wordSecond are identical.
Prints a warning if any word pair appears at or above the threshold (default: 3).

Also supports deleting files that meet the duplicate threshold.

Usage:
    python check_duplicates.py
    python check_duplicates.py --output output
    python check_duplicates.py --threshold 3
    python check_duplicates.py --delete    # Delete files that meet the threshold
"""

import argparse
import json
import os
import sys  # for sys.exit when the output directory is missing
from collections import Counter
from pathlib import Path
from typing import Dict, List

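# Expected shape of each vocabulary file, inferred from the checks below
# (the sample words are illustrative only, not taken from real data):
#
#   {
#     "items": [
#       {"wordFirst": "cat", "wordSecond": "Katze"},
#       {"wordFirst": "cat", "wordSecond": "Katze"}
#     ]
#   }
#
# The two entries above form one TRUE duplicate pair occurring twice.

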
def find_json_files(output_dir: str) -> List[Path]:
    """Find all JSON files in the output directory."""
    output_path = Path(output_dir)
    if not output_path.exists():
        print(f"ERROR: Output directory '{output_dir}' not found.")
        sys.exit(1)

    # Find all JSON files (including subdirectories),
    # skipping the manifest file, which is not a vocabulary list
    json_files = list(output_path.rglob("*.json"))
    return [f for f in json_files if f.name != "vocab_manifest.json"]


def check_file_for_true_duplicates(file_path: Path, threshold: int = 3) -> Dict:
    """
    Check a single file for TRUE duplicates.
    TRUE duplicate = when BOTH wordFirst AND wordSecond are identical.

    Returns a dict with duplicate information.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return {
            "file": str(file_path),
            "error": f"JSON parse error: {e}",
            "true_dupes": {},
            "item_count": 0
        }
    except Exception as e:
        return {
            "file": str(file_path),
            "error": f"Error reading file: {e}",
            "true_dupes": {},
            "item_count": 0
        }

    # Extract items
    items = data.get("items", [])
    if not items:
        return {
            "file": str(file_path),
            "item_count": 0,
            "true_dupes": {}
        }

    # Create tuples of (wordFirst, wordSecond) to find TRUE duplicates;
    # items missing either word are skipped
    pair_list = [
        (item.get("wordFirst", ""), item.get("wordSecond", ""))
        for item in items
        if item.get("wordFirst") and item.get("wordSecond")
    ]

    # Count TRUE duplicates (both first AND second must match)
    pair_counts = Counter(pair_list)

    # Find duplicates at or above the threshold
    true_dupes = {pair: count for pair, count in pair_counts.items() if count >= threshold}

    return {
        "file": str(file_path),
        "item_count": len(items),
        "true_dupes": true_dupes,
        "unique_pairs": len(pair_counts)
    }

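
# Example of the counting step in check_file_for_true_duplicates above
# (words illustrative only):
#   Counter([("cat", "Katze"), ("cat", "Katze"), ("cat", "Hund")])
#   -> {("cat", "Katze"): 2, ("cat", "Hund"): 1}
# With threshold=2 only ("cat", "Katze") is reported: ("cat", "Hund")
# shares wordFirst but not wordSecond, so it is not a TRUE duplicate.

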
def delete_file(file_path: Path) -> bool:
    """Delete a file and return True if successful."""
    try:
        os.remove(file_path)
        return True
    except Exception as e:
        print(f"  ❌ Could not delete {file_path}: {e}")
        return False


def main():
    parser = argparse.ArgumentParser(description="Check vocabulary files for TRUE duplicates")
    parser.add_argument("--output", default="output", help="Output directory to check (default: output)")
    parser.add_argument("--threshold", type=int, default=3, help="Warning threshold for duplicates (default: 3)")
    parser.add_argument("--delete", action="store_true", help="Delete files that reach the duplicate threshold")
    args = parser.parse_args()

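    # Note: --delete removes any file whose worst pair count reaches
    # --threshold, so lowering the threshold makes deletion more aggressive.
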
print("=" * 60)
|
||||
print(" Vocabulary Duplicate Sanity Check")
|
||||
print("=" * 60)
|
||||
print(f" Output dir : {args.output}")
|
||||
print(f" Threshold : {args.threshold}+ occurrences = warning")
|
||||
print(f" Mode : {'DELETE' if args.delete else 'CHECK'}")
|
||||
print()
|
||||
|
||||
json_files = find_json_files(args.output)
|
||||
print(f" Found {len(json_files)} JSON files to check...\n")
|
||||
|
||||
total_warnings = 0
|
||||
files_with_issues = 0
|
||||
files_deleted = 0
|
||||
|
||||
    for file_path in json_files:
        result = check_file_for_true_duplicates(file_path, args.threshold)

        if "error" in result:
            print(f"  ❌ ERROR: {result['file']}")
            print(f"     {result['error']}")
            files_with_issues += 1
            continue

        true_dupes = result["true_dupes"]

        if true_dupes:
            files_with_issues += 1
            total_warnings += len(true_dupes)

            try:
                rel_path = file_path.relative_to(Path(args.output))
            except ValueError:
                rel_path = file_path.name

            # Show details of true duplicates
            print(f"  ⚠️  {rel_path}")
            print(f"     Items: {result['item_count']} | Unique pairs: {result['unique_pairs']}")
            print("     TRUE duplicates (both wordFirst AND wordSecond identical):")

            # Show up to 5 duplicates, most frequent first
            for pair, count in sorted(true_dupes.items(), key=lambda x: -x[1])[:5]:
                wf, ws = pair
                print(f"       - \"{wf}\" → \"{ws}\" appears {count} times")

            # Delete if requested
            if args.delete:
                if delete_file(file_path):
                    files_deleted += 1
                    print(f"     ✅ DELETED due to {len(true_dupes)} duplicate pairs")
                else:
                    print("     ❌ Failed to delete")

            print()

print("=" * 60)
|
||||
if files_with_issues == 0:
|
||||
print(f" ✅ All {len(json_files)} files passed sanity check!")
|
||||
else:
|
||||
print(f" ⚠️ Found {total_warnings} true duplicate warnings in {files_with_issues} files")
|
||||
if args.delete:
|
||||
print(f" 🗑️ Deleted {files_deleted} files")
|
||||
else:
|
||||
print(f" 💡 Run with --delete to remove files with 3+ true duplicates")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    main()