#!/usr/bin/env python3
"""
Run JSONL Schema Analysis with Default Configuration

This script runs the JSONL schema analyzer using predefined constants,
so you don't need to pass any command line arguments.
"""

import sys
from pathlib import Path

# Project root — assumes this script lives two directories below the root
# (e.g. <root>/tools/scripts/), hence three .parent hops from the file.
ROOT_DIR = Path(__file__).parent.parent.parent

# --- Input/output locations ---------------------------------------------
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_OUTPUT_DIR = ROOT_DIR / "intermediate"
DEFAULT_LANG_FILTER = "fr"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INPUT_FILE = DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME

# --- Analyzer configuration ----------------------------------------------
DEFAULT_MAX_SAMPLES = 1000
DEFAULT_MAX_WORKERS = None  # None -> analyzer falls back to the CPU count
DEFAULT_PARALLEL_THRESHOLD_MB = 100
DEFAULT_CHUNK_SIZE = 1000

# --- Output configuration ------------------------------------------------
DEFAULT_OUTPUT_FILENAME = f"{DEFAULT_LANG_FILTER}_schema_analysis.json"
DEFAULT_OUTPUT_FILE = DEFAULT_OUTPUT_DIR / DEFAULT_OUTPUT_FILENAME
def main():
    """Run the schema analysis on a single file using the default configuration.

    Returns:
        bool: True if the analysis completed and results were saved;
        False if the input file is missing, the analyzer module cannot
        be imported, or the analysis raises an error.
    """
    print("=" * 60)
    print("JSONL Schema Analysis - Default Configuration")
    print("=" * 60)

    # Display configuration
    print(f"Root directory: {ROOT_DIR}")
    print(f"Input directory: {DEFAULT_INPUT_DIR}")
    print(f"Input file: {DEFAULT_INPUT_FILENAME}")
    print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
    print(f"Output file: {DEFAULT_OUTPUT_FILENAME}")
    print(f"Language filter: {DEFAULT_LANG_FILTER}")
    print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
    print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
    print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
    print(f"Max workers: {DEFAULT_MAX_WORKERS or 'Auto (CPU count)'}")
    print()

    # Check if input file exists
    if not DEFAULT_INPUT_FILE.exists():
        print(f"❌ Input file not found: {DEFAULT_INPUT_FILE}")
        print()
        print("Available files in raw_data directory:")

        # List available JSONL files so the user can pick a valid one
        if DEFAULT_INPUT_DIR.exists():
            jsonl_files = list(DEFAULT_INPUT_DIR.glob("*.jsonl"))
            if jsonl_files:
                for i, file in enumerate(sorted(jsonl_files), 1):
                    size_mb = file.stat().st_size / (1024 * 1024)
                    print(f" {i:2d}. {file.name} ({size_mb:.1f} MB)")
            else:
                print(" No JSONL files found.")
        else:
            print(" raw_data directory not found.")

        print()
        print("To analyze a different file, modify the constants in this script:")
        print(f" - DEFAULT_LANG_FILTER (currently: '{DEFAULT_LANG_FILTER}')")
        print(f" - DEFAULT_INPUT_FILENAME (currently: '{DEFAULT_INPUT_FILENAME}')")
        return False

    # Create output directory if it doesn't exist.
    # parents=True so a missing intermediate parent doesn't raise FileNotFoundError.
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"✅ Input file found: {DEFAULT_INPUT_FILE.stat().st_size / (1024*1024):.1f} MB")
    print()

    try:
        # Import the hybrid analyzer lazily so a missing module is reported
        # as a friendly message rather than a traceback at import time.
        sys.path.insert(0, str(Path(__file__).parent))
        from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer

        # Initialize analyzer with default configuration
        analyzer = HybridJSONLSchemaAnalyzer(
            max_samples=DEFAULT_MAX_SAMPLES,
            max_workers=DEFAULT_MAX_WORKERS,
            parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
            chunk_size=DEFAULT_CHUNK_SIZE
        )

        print("🚀 Starting analysis...")
        print()

        # Run analysis
        results = analyzer.analyze_jsonl_file(DEFAULT_INPUT_FILE)

        # Save results
        analyzer.save_results(results, DEFAULT_OUTPUT_FILE)

        print()
        print("=" * 60)
        print("ANALYSIS COMPLETE")
        print("=" * 60)
        print(f"📊 Results saved to: {DEFAULT_OUTPUT_FILE}")
        print(f"📈 Valid lines processed: {results.get('valid_lines', 0):,}")
        print(f"🔑 Unique keys found: {results.get('unique_key_count', 0):,}")
        print(f"⏱️ Processing time: {results.get('processing_time_seconds', 0):.2f} seconds")
        print(f"📁 File size: {results.get('file_size_bytes', 0) / (1024*1024):.1f} MB")

        if results.get('processing_strategy'):
            print(f"🔧 Strategy used: {results['processing_strategy']}")

        return True

    except ImportError as e:
        print(f"❌ Error importing analyzer: {e}")
        print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
        return False
    except Exception as e:
        # Broad catch is deliberate: this is a CLI entry point, and we prefer
        # a clean message + False return over an uncaught traceback.
        print(f"❌ Error during analysis: {e}")
        return False
def run_directory_analysis():
    """Run the schema analysis on every *.jsonl file in the input directory.

    Returns:
        bool: True if the directory analysis completed and results were
        saved; False if the input directory is missing, the analyzer
        module cannot be imported, or the analysis raises an error.
    """
    print("=" * 60)
    print("Directory JSONL Schema Analysis - Default Configuration")
    print("=" * 60)

    # Display configuration
    print(f"Input directory: {DEFAULT_INPUT_DIR}")
    print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
    print("Pattern: *.jsonl")
    print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
    print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
    print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
    print()

    # Check if input directory exists
    if not DEFAULT_INPUT_DIR.exists():
        print(f"❌ Input directory not found: {DEFAULT_INPUT_DIR}")
        return False

    # Create output directory if it doesn't exist.
    # parents=True so a missing intermediate parent doesn't raise FileNotFoundError.
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        # Import the hybrid analyzer lazily so a missing module is reported
        # as a friendly message rather than a traceback at import time.
        sys.path.insert(0, str(Path(__file__).parent))
        from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer

        # Initialize analyzer with default configuration
        analyzer = HybridJSONLSchemaAnalyzer(
            max_samples=DEFAULT_MAX_SAMPLES,
            max_workers=DEFAULT_MAX_WORKERS,
            parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
            chunk_size=DEFAULT_CHUNK_SIZE
        )

        print("🚀 Starting directory analysis...")
        print()

        # Run analysis
        results = analyzer.analyze_directory(DEFAULT_INPUT_DIR, "*.jsonl")

        # Save results
        output_file = DEFAULT_OUTPUT_DIR / "directory_schema_analysis.json"
        analyzer.save_results(results, output_file)

        print()
        print("=" * 60)
        print("DIRECTORY ANALYSIS COMPLETE")
        print("=" * 60)
        print(f"📊 Results saved to: {output_file}")

        summary = results.get('summary', {})
        print(f"📁 Files analyzed: {summary.get('successfully_analyzed', 0)}")
        print(f"📈 Total valid lines: {summary.get('total_valid_lines', 0):,}")
        print(f"⏱️ Total processing time: {summary.get('total_processing_time', 0):.2f} seconds")
        print(f"📦 Total data: {summary.get('total_size_bytes', 0) / (1024*1024*1024):.2f} GB")
        print(f"🚀 Average speed: {summary.get('average_processing_speed_mb_per_sec', 0):.2f} MB/sec")

        if summary.get('strategies_used'):
            strategies = summary['strategies_used']
            print(f"🔧 Sequential files: {strategies.get('sequential', 0)}")
            print(f"🔧 Parallel files: {strategies.get('parallel', 0)}")

        return True

    except ImportError as e:
        print(f"❌ Error importing analyzer: {e}")
        print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
        return False
    except Exception as e:
        # Broad catch is deliberate: this is a CLI entry point, and we prefer
        # a clean message + False return over an uncaught traceback.
        print(f"❌ Error during analysis: {e}")
        return False
if __name__ == "__main__":
    # Pick what runs by default:
    #
    # Option 1: analyze the single file selected by DEFAULT_LANG_FILTER.
    success = main()

    # Option 2: analyze every JSONL file in the input directory instead.
    # Comment out the call above and uncomment the line below.
    # success = run_directory_analysis()

    if not success:
        sys.exit(1)