#!/usr/bin/env python3
"""
Run JSONL Schema Analysis with Default Configuration

This script runs the JSONL schema analyzer using predefined constants,
so you don't need to pass any command line arguments.
"""

import sys
from pathlib import Path

# Get the root directory (assuming this script is in the scripts folder)
ROOT_DIR = Path(__file__).parent.parent.parent

# Configuration constants
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_OUTPUT_DIR = ROOT_DIR / "intermediate"
DEFAULT_LANG_FILTER = "fr"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INPUT_FILE = DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME

# Analyzer configuration
DEFAULT_MAX_SAMPLES = 1000
DEFAULT_MAX_WORKERS = None  # Will use CPU count
DEFAULT_PARALLEL_THRESHOLD_MB = 100
DEFAULT_CHUNK_SIZE = 1000

# Output configuration
DEFAULT_OUTPUT_FILENAME = f"{DEFAULT_LANG_FILTER}_schema_analysis.json"
DEFAULT_OUTPUT_FILE = DEFAULT_OUTPUT_DIR / DEFAULT_OUTPUT_FILENAME


def _make_analyzer():
    """Import the hybrid analyzer and return an instance with default config.

    Shared by main() and run_directory_analysis() so the import/construction
    logic lives in one place.  Raises ImportError when
    jsonl_schema_analyzer_hybrid is not importable; callers turn that into a
    user-facing message.
    """
    # Make the sibling module importable; avoid growing sys.path on repeat calls.
    script_dir = str(Path(__file__).parent)
    if script_dir not in sys.path:
        sys.path.insert(0, script_dir)
    from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer

    return HybridJSONLSchemaAnalyzer(
        max_samples=DEFAULT_MAX_SAMPLES,
        max_workers=DEFAULT_MAX_WORKERS,
        parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
        chunk_size=DEFAULT_CHUNK_SIZE,
    )


def _print_header(title):
    """Print a banner line, the given title, and a second banner line."""
    print("=" * 60)
    print(title)
    print("=" * 60)


def main():
    """Run the schema analysis with default configuration.

    Returns True on success, False when the input file is missing or the
    analysis fails for any reason (message printed to stdout).
    """
    _print_header("JSONL Schema Analysis - Default Configuration")

    # Display configuration
    print(f"Root directory: {ROOT_DIR}")
    print(f"Input directory: {DEFAULT_INPUT_DIR}")
    print(f"Input file: {DEFAULT_INPUT_FILENAME}")
    print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
    print(f"Output file: {DEFAULT_OUTPUT_FILENAME}")
    print(f"Language filter: {DEFAULT_LANG_FILTER}")
    print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
    print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
    print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
    print(f"Max workers: {DEFAULT_MAX_WORKERS or 'Auto (CPU count)'}")
    print()

    # Check if input file exists
    if not DEFAULT_INPUT_FILE.exists():
        print(f"❌ Input file not found: {DEFAULT_INPUT_FILE}")
        print()
        print("Available files in raw_data directory:")
        # List available JSONL files so the user can pick a valid one
        if DEFAULT_INPUT_DIR.exists():
            jsonl_files = list(DEFAULT_INPUT_DIR.glob("*.jsonl"))
            if jsonl_files:
                for i, file in enumerate(sorted(jsonl_files), 1):
                    size_mb = file.stat().st_size / (1024 * 1024)
                    print(f" {i:2d}. {file.name} ({size_mb:.1f} MB)")
            else:
                print(" No JSONL files found.")
        else:
            print(" raw_data directory not found.")
        print()
        print("To analyze a different file, modify the constants in this script:")
        print(f" - DEFAULT_LANG_FILTER (currently: '{DEFAULT_LANG_FILTER}')")
        print(f" - DEFAULT_INPUT_FILENAME (currently: '{DEFAULT_INPUT_FILENAME}')")
        return False

    # Create output directory (and any missing ancestors) if it doesn't exist
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"✅ Input file found: {DEFAULT_INPUT_FILE.stat().st_size / (1024*1024):.1f} MB")
    print()

    try:
        analyzer = _make_analyzer()

        print("🚀 Starting analysis...")
        print()

        # Run analysis
        results = analyzer.analyze_jsonl_file(DEFAULT_INPUT_FILE)

        # Save results
        analyzer.save_results(results, DEFAULT_OUTPUT_FILE)

        print()
        _print_header("ANALYSIS COMPLETE")
        print(f"📊 Results saved to: {DEFAULT_OUTPUT_FILE}")
        print(f"📈 Valid lines processed: {results.get('valid_lines', 0):,}")
        print(f"🔑 Unique keys found: {results.get('unique_key_count', 0):,}")
        print(f"⏱️ Processing time: {results.get('processing_time_seconds', 0):.2f} seconds")
        print(f"📁 File size: {results.get('file_size_bytes', 0) / (1024*1024):.1f} MB")
        if results.get('processing_strategy'):
            print(f"🔧 Strategy used: {results['processing_strategy']}")
        return True

    except ImportError as e:
        print(f"❌ Error importing analyzer: {e}")
        print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
        return False
    except Exception as e:
        # Top-level boundary for a CLI script: report and signal failure.
        print(f"❌ Error during analysis: {e}")
        return False


def run_directory_analysis():
    """Run analysis on entire directory with default configuration.

    Analyzes every *.jsonl file in DEFAULT_INPUT_DIR and writes a combined
    report to DEFAULT_OUTPUT_DIR.  Returns True on success, False otherwise.
    """
    _print_header("Directory JSONL Schema Analysis - Default Configuration")

    # Display configuration
    print(f"Input directory: {DEFAULT_INPUT_DIR}")
    print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
    print("Pattern: *.jsonl")
    print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
    print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
    print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
    print()

    # Check if input directory exists
    if not DEFAULT_INPUT_DIR.exists():
        print(f"❌ Input directory not found: {DEFAULT_INPUT_DIR}")
        return False

    # Create output directory (and any missing ancestors) if it doesn't exist
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        analyzer = _make_analyzer()

        print("🚀 Starting directory analysis...")
        print()

        # Run analysis
        results = analyzer.analyze_directory(DEFAULT_INPUT_DIR, "*.jsonl")

        # Save results
        output_file = DEFAULT_OUTPUT_DIR / "directory_schema_analysis.json"
        analyzer.save_results(results, output_file)

        print()
        _print_header("DIRECTORY ANALYSIS COMPLETE")
        print(f"📊 Results saved to: {output_file}")
        summary = results.get('summary', {})
        print(f"📁 Files analyzed: {summary.get('successfully_analyzed', 0)}")
        print(f"📈 Total valid lines: {summary.get('total_valid_lines', 0):,}")
        print(f"⏱️ Total processing time: {summary.get('total_processing_time', 0):.2f} seconds")
        print(f"📦 Total data: {summary.get('total_size_bytes', 0) / (1024*1024*1024):.2f} GB")
        print(f"🚀 Average speed: {summary.get('average_processing_speed_mb_per_sec', 0):.2f} MB/sec")
        if summary.get('strategies_used'):
            strategies = summary['strategies_used']
            print(f"🔧 Sequential files: {strategies.get('sequential', 0)}")
            print(f"🔧 Parallel files: {strategies.get('parallel', 0)}")
        return True

    except ImportError as e:
        print(f"❌ Error importing analyzer: {e}")
        print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
        return False
    except Exception as e:
        # Top-level boundary for a CLI script: report and signal failure.
        print(f"❌ Error during analysis: {e}")
        return False


if __name__ == "__main__":
    # You can choose what to run by default:
    # Option 1: Analyze single file (based on DEFAULT_LANG_FILTER)
    success = main()
    # Option 2: Analyze entire directory (comment out the line above and uncomment below)
    # success = run_directory_analysis()
    if not success:
        sys.exit(1)