Files
Wictionary-Data-Parser/scripts/Json Analyzer/run_schema_analysis.py
2026-02-13 00:10:40 +01:00

213 lines
7.8 KiB
Python

#!/usr/bin/env python3
"""
Run JSONL Schema Analysis with Default Configuration
This script runs the JSONL schema analyzer using predefined constants,
so you don't need to pass any command line arguments.
"""
import sys
from pathlib import Path
# Get the root directory (assuming this script is in the scripts folder)
# Layout assumed: <ROOT>/scripts/Json Analyzer/run_schema_analysis.py,
# so three .parent hops land on <ROOT>.
ROOT_DIR = Path(__file__).parent.parent.parent
# Configuration constants
# Input JSONL dumps live under <ROOT>/raw_data; analysis output goes to
# <ROOT>/intermediate.
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_OUTPUT_DIR = ROOT_DIR / "intermediate"
# Language code used to derive both the input and output file names below.
DEFAULT_LANG_FILTER = "fr"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INPUT_FILE = DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME
# Analyzer configuration
DEFAULT_MAX_SAMPLES = 1000
DEFAULT_MAX_WORKERS = None # Will use CPU count
# Files larger than this (in MB) are processed with the parallel strategy.
DEFAULT_PARALLEL_THRESHOLD_MB = 100
DEFAULT_CHUNK_SIZE = 1000
# Output configuration
DEFAULT_OUTPUT_FILENAME = f"{DEFAULT_LANG_FILTER}_schema_analysis.json"
DEFAULT_OUTPUT_FILE = DEFAULT_OUTPUT_DIR / DEFAULT_OUTPUT_FILENAME
def main():
    """Run the schema analysis on the default single input file.

    All configuration comes from the module-level DEFAULT_* constants,
    so no command-line arguments are needed.

    Returns:
        bool: True if the analysis completed and results were saved;
        False if the input file is missing, the analyzer module cannot
        be imported, or the analysis itself raises.
    """
    print("=" * 60)
    print("JSONL Schema Analysis - Default Configuration")
    print("=" * 60)

    # Echo the effective configuration so each run is self-documenting.
    print(f"Root directory: {ROOT_DIR}")
    print(f"Input directory: {DEFAULT_INPUT_DIR}")
    print(f"Input file: {DEFAULT_INPUT_FILENAME}")
    print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
    print(f"Output file: {DEFAULT_OUTPUT_FILENAME}")
    print(f"Language filter: {DEFAULT_LANG_FILTER}")
    print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
    print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
    print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
    print(f"Max workers: {DEFAULT_MAX_WORKERS or 'Auto (CPU count)'}")
    print()

    # Fail fast (with guidance) when the expected input file is absent.
    if not DEFAULT_INPUT_FILE.exists():
        print(f"❌ Input file not found: {DEFAULT_INPUT_FILE}")
        print()
        print("Available files in raw_data directory:")
        # List available JSONL files to help the user pick a valid one.
        if DEFAULT_INPUT_DIR.exists():
            jsonl_files = list(DEFAULT_INPUT_DIR.glob("*.jsonl"))
            if jsonl_files:
                for i, file in enumerate(sorted(jsonl_files), 1):
                    size_mb = file.stat().st_size / (1024 * 1024)
                    print(f" {i:2d}. {file.name} ({size_mb:.1f} MB)")
            else:
                print(" No JSONL files found.")
        else:
            print(" raw_data directory not found.")
        print()
        print("To analyze a different file, modify the constants in this script:")
        print(f" - DEFAULT_LANG_FILTER (currently: '{DEFAULT_LANG_FILTER}')")
        print(f" - DEFAULT_INPUT_FILENAME (currently: '{DEFAULT_INPUT_FILENAME}')")
        return False

    # BUG FIX: include parents=True — a bare mkdir(exist_ok=True) raises
    # FileNotFoundError when an ancestor directory is also missing.
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"✅ Input file found: {DEFAULT_INPUT_FILE.stat().st_size / (1024*1024):.1f} MB")
    print()

    try:
        # The analyzer module lives next to this script; make it importable.
        sys.path.insert(0, str(Path(__file__).parent))
        from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer

        # Initialize analyzer with default configuration
        analyzer = HybridJSONLSchemaAnalyzer(
            max_samples=DEFAULT_MAX_SAMPLES,
            max_workers=DEFAULT_MAX_WORKERS,
            parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
            chunk_size=DEFAULT_CHUNK_SIZE
        )

        print("🚀 Starting analysis...")
        print()

        # Run analysis and persist the resulting schema report.
        results = analyzer.analyze_jsonl_file(DEFAULT_INPUT_FILE)
        analyzer.save_results(results, DEFAULT_OUTPUT_FILE)

        print()
        print("=" * 60)
        print("ANALYSIS COMPLETE")
        print("=" * 60)
        print(f"📊 Results saved to: {DEFAULT_OUTPUT_FILE}")
        print(f"📈 Valid lines processed: {results.get('valid_lines', 0):,}")
        print(f"🔑 Unique keys found: {results.get('unique_key_count', 0):,}")
        print(f"⏱️ Processing time: {results.get('processing_time_seconds', 0):.2f} seconds")
        print(f"📁 File size: {results.get('file_size_bytes', 0) / (1024*1024):.1f} MB")
        if results.get('processing_strategy'):
            print(f"🔧 Strategy used: {results['processing_strategy']}")
        return True
    except ImportError as e:
        print(f"❌ Error importing analyzer: {e}")
        print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
        return False
    except Exception as e:
        # Broad catch is acceptable at this top-level script boundary:
        # report the failure and signal it via the return value.
        print(f"❌ Error during analysis: {e}")
        return False
def run_directory_analysis():
    """Run the schema analysis on every *.jsonl file in the input directory.

    All configuration comes from the module-level DEFAULT_* constants.
    Results are written to a single combined report in DEFAULT_OUTPUT_DIR.

    Returns:
        bool: True if the directory analysis completed and results were
        saved; False if the input directory is missing, the analyzer
        module cannot be imported, or the analysis itself raises.
    """
    print("=" * 60)
    print("Directory JSONL Schema Analysis - Default Configuration")
    print("=" * 60)

    # Echo the effective configuration so each run is self-documenting.
    print(f"Input directory: {DEFAULT_INPUT_DIR}")
    print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
    print("Pattern: *.jsonl")  # plain string: no placeholders needed (was f-string)
    print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
    print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
    print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
    print()

    # Fail fast when the input directory does not exist.
    if not DEFAULT_INPUT_DIR.exists():
        print(f"❌ Input directory not found: {DEFAULT_INPUT_DIR}")
        return False

    # BUG FIX: include parents=True — a bare mkdir(exist_ok=True) raises
    # FileNotFoundError when an ancestor directory is also missing.
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        # The analyzer module lives next to this script; make it importable.
        sys.path.insert(0, str(Path(__file__).parent))
        from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer

        # Initialize analyzer with default configuration
        analyzer = HybridJSONLSchemaAnalyzer(
            max_samples=DEFAULT_MAX_SAMPLES,
            max_workers=DEFAULT_MAX_WORKERS,
            parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
            chunk_size=DEFAULT_CHUNK_SIZE
        )

        print("🚀 Starting directory analysis...")
        print()

        # Run analysis across the directory and persist the combined report.
        results = analyzer.analyze_directory(DEFAULT_INPUT_DIR, "*.jsonl")
        output_file = DEFAULT_OUTPUT_DIR / "directory_schema_analysis.json"
        analyzer.save_results(results, output_file)

        print()
        print("=" * 60)
        print("DIRECTORY ANALYSIS COMPLETE")
        print("=" * 60)
        print(f"📊 Results saved to: {output_file}")
        summary = results.get('summary', {})
        print(f"📁 Files analyzed: {summary.get('successfully_analyzed', 0)}")
        print(f"📈 Total valid lines: {summary.get('total_valid_lines', 0):,}")
        print(f"⏱️ Total processing time: {summary.get('total_processing_time', 0):.2f} seconds")
        print(f"📦 Total data: {summary.get('total_size_bytes', 0) / (1024*1024*1024):.2f} GB")
        print(f"🚀 Average speed: {summary.get('average_processing_speed_mb_per_sec', 0):.2f} MB/sec")
        if summary.get('strategies_used'):
            strategies = summary['strategies_used']
            print(f"🔧 Sequential files: {strategies.get('sequential', 0)}")
            print(f"🔧 Parallel files: {strategies.get('parallel', 0)}")
        return True
    except ImportError as e:
        print(f"❌ Error importing analyzer: {e}")
        print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
        return False
    except Exception as e:
        # Broad catch is acceptable at this top-level script boundary:
        # report the failure and signal it via the return value.
        print(f"❌ Error during analysis: {e}")
        return False
if __name__ == "__main__":
    # Default entry point: analyze the single file selected by
    # DEFAULT_LANG_FILTER. To analyze every *.jsonl file in the input
    # directory instead, comment out the call to main() and uncomment
    # the run_directory_analysis() line below.
    success = main()
    # success = run_directory_analysis()
    if not success:
        sys.exit(1)