Migrate to gitea
This commit is contained in:
212
scripts/Json Analyzer/run_schema_analysis.py
Normal file
212
scripts/Json Analyzer/run_schema_analysis.py
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
"""Run JSONL schema analysis using predefined configuration constants.

No command-line arguments are required; to analyze a different file,
edit the constants defined below.
"""

import sys
from pathlib import Path

# Project root (this script lives three directory levels below it).
ROOT_DIR = Path(__file__).parent.parent.parent

# Input/output locations.
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_OUTPUT_DIR = ROOT_DIR / "intermediate"
DEFAULT_LANG_FILTER = "fr"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INPUT_FILE = DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME

# Analyzer tuning knobs.
DEFAULT_MAX_SAMPLES = 1000
DEFAULT_MAX_WORKERS = None  # None -> analyzer falls back to the CPU count
DEFAULT_PARALLEL_THRESHOLD_MB = 100
DEFAULT_CHUNK_SIZE = 1000

# Output file naming.
DEFAULT_OUTPUT_FILENAME = f"{DEFAULT_LANG_FILTER}_schema_analysis.json"
DEFAULT_OUTPUT_FILE = DEFAULT_OUTPUT_DIR / DEFAULT_OUTPUT_FILENAME
def _list_available_jsonl_files():
    """Print the JSONL files found in DEFAULT_INPUT_DIR, with sizes in MB."""
    if DEFAULT_INPUT_DIR.exists():
        jsonl_files = list(DEFAULT_INPUT_DIR.glob("*.jsonl"))
        if jsonl_files:
            for i, file in enumerate(sorted(jsonl_files), 1):
                size_mb = file.stat().st_size / (1024 * 1024)
                print(f" {i:2d}. {file.name} ({size_mb:.1f} MB)")
        else:
            print(" No JSONL files found.")
    else:
        print(" raw_data directory not found.")


def main():
    """Run the schema analysis on the default input file.

    Uses the module-level DEFAULT_* constants for all configuration.

    Returns:
        bool: True if the analysis completed and results were saved;
        False if the input file is missing, the analyzer module cannot
        be imported, or the analysis raises.
    """
    print("=" * 60)
    print("JSONL Schema Analysis - Default Configuration")
    print("=" * 60)

    # Echo the effective configuration so each run is self-documenting.
    print(f"Root directory: {ROOT_DIR}")
    print(f"Input directory: {DEFAULT_INPUT_DIR}")
    print(f"Input file: {DEFAULT_INPUT_FILENAME}")
    print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
    print(f"Output file: {DEFAULT_OUTPUT_FILENAME}")
    print(f"Language filter: {DEFAULT_LANG_FILTER}")
    print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
    print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
    print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
    print(f"Max workers: {DEFAULT_MAX_WORKERS or 'Auto (CPU count)'}")
    print()

    # Fail early with guidance if the configured input file is absent.
    if not DEFAULT_INPUT_FILE.exists():
        print(f"❌ Input file not found: {DEFAULT_INPUT_FILE}")
        print()
        print("Available files in raw_data directory:")
        _list_available_jsonl_files()
        print()
        print("To analyze a different file, modify the constants in this script:")
        print(f" - DEFAULT_LANG_FILTER (currently: '{DEFAULT_LANG_FILTER}')")
        print(f" - DEFAULT_INPUT_FILENAME (currently: '{DEFAULT_INPUT_FILENAME}')")
        return False

    # parents=True so a missing intermediate directory tree does not
    # raise FileNotFoundError and abort the run.
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"✅ Input file found: {DEFAULT_INPUT_FILE.stat().st_size / (1024*1024):.1f} MB")
    print()

    try:
        # The analyzer module lives next to this script; make it importable.
        sys.path.insert(0, str(Path(__file__).parent))
        from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer

        analyzer = HybridJSONLSchemaAnalyzer(
            max_samples=DEFAULT_MAX_SAMPLES,
            max_workers=DEFAULT_MAX_WORKERS,
            parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
            chunk_size=DEFAULT_CHUNK_SIZE,
        )

        print("🚀 Starting analysis...")
        print()

        results = analyzer.analyze_jsonl_file(DEFAULT_INPUT_FILE)
        analyzer.save_results(results, DEFAULT_OUTPUT_FILE)

        print()
        print("=" * 60)
        print("ANALYSIS COMPLETE")
        print("=" * 60)
        print(f"📊 Results saved to: {DEFAULT_OUTPUT_FILE}")
        print(f"📈 Valid lines processed: {results.get('valid_lines', 0):,}")
        print(f"🔑 Unique keys found: {results.get('unique_key_count', 0):,}")
        print(f"⏱️ Processing time: {results.get('processing_time_seconds', 0):.2f} seconds")
        print(f"📁 File size: {results.get('file_size_bytes', 0) / (1024*1024):.1f} MB")

        if results.get('processing_strategy'):
            print(f"🔧 Strategy used: {results['processing_strategy']}")

        return True

    except ImportError as e:
        print(f"❌ Error importing analyzer: {e}")
        print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
        return False
    except Exception as e:
        # Broad catch is deliberate: this is a top-level CLI entry point
        # that reports failure via its return value instead of a traceback.
        print(f"❌ Error during analysis: {e}")
        return False
def run_directory_analysis():
    """Run the schema analysis over every *.jsonl file in DEFAULT_INPUT_DIR.

    Uses the module-level DEFAULT_* constants for all configuration and
    writes a combined report to 'directory_schema_analysis.json' in the
    output directory.

    Returns:
        bool: True if the analysis completed and results were saved;
        False if the input directory is missing, the analyzer module
        cannot be imported, or the analysis raises.
    """
    print("=" * 60)
    print("Directory JSONL Schema Analysis - Default Configuration")
    print("=" * 60)

    # Echo the effective configuration so each run is self-documenting.
    print(f"Input directory: {DEFAULT_INPUT_DIR}")
    print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
    # Plain string: no placeholders, so no f-string needed (F541).
    print("Pattern: *.jsonl")
    print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
    print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
    print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
    print()

    if not DEFAULT_INPUT_DIR.exists():
        print(f"❌ Input directory not found: {DEFAULT_INPUT_DIR}")
        return False

    # parents=True so a missing intermediate directory tree does not
    # raise FileNotFoundError and abort the run.
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        # The analyzer module lives next to this script; make it importable.
        sys.path.insert(0, str(Path(__file__).parent))
        from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer

        analyzer = HybridJSONLSchemaAnalyzer(
            max_samples=DEFAULT_MAX_SAMPLES,
            max_workers=DEFAULT_MAX_WORKERS,
            parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
            chunk_size=DEFAULT_CHUNK_SIZE,
        )

        print("🚀 Starting directory analysis...")
        print()

        results = analyzer.analyze_directory(DEFAULT_INPUT_DIR, "*.jsonl")

        output_file = DEFAULT_OUTPUT_DIR / "directory_schema_analysis.json"
        analyzer.save_results(results, output_file)

        print()
        print("=" * 60)
        print("DIRECTORY ANALYSIS COMPLETE")
        print("=" * 60)
        print(f"📊 Results saved to: {output_file}")

        # Report aggregate statistics; .get() keeps this resilient to a
        # partially-populated summary.
        summary = results.get('summary', {})
        print(f"📁 Files analyzed: {summary.get('successfully_analyzed', 0)}")
        print(f"📈 Total valid lines: {summary.get('total_valid_lines', 0):,}")
        print(f"⏱️ Total processing time: {summary.get('total_processing_time', 0):.2f} seconds")
        print(f"📦 Total data: {summary.get('total_size_bytes', 0) / (1024*1024*1024):.2f} GB")
        print(f"🚀 Average speed: {summary.get('average_processing_speed_mb_per_sec', 0):.2f} MB/sec")

        if summary.get('strategies_used'):
            strategies = summary['strategies_used']
            print(f"🔧 Sequential files: {strategies.get('sequential', 0)}")
            print(f"🔧 Parallel files: {strategies.get('parallel', 0)}")

        return True

    except ImportError as e:
        print(f"❌ Error importing analyzer: {e}")
        print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
        return False
    except Exception as e:
        # Broad catch is deliberate: this is a top-level CLI entry point
        # that reports failure via its return value instead of a traceback.
        print(f"❌ Error during analysis: {e}")
        return False
if __name__ == "__main__":
    # Choose the default entry point:
    #   Option 1 (default) — analyze the single file selected by
    #                        DEFAULT_LANG_FILTER.
    #   Option 2           — analyze the whole input directory: swap the
    #                        call below for run_directory_analysis().
    succeeded = main()
    # succeeded = run_directory_analysis()

    if not succeeded:
        sys.exit(1)
Reference in New Issue
Block a user