#!/usr/bin/env python3
"""
Hybrid JSONL Schema Analyzer
Intelligently chooses between sequential and parallel processing based on file size.
For small files, uses sequential processing. For large files, uses parallel processing.
"""
import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
import threading
from functools import partial
import gc
# Import the optimized analyzer for parallel processing
sys.path.insert(0, str(Path(__file__).parent))
try:
    from jsonl_schema_analyzer_optimized import OptimizedJSONLSchemaAnalyzer
except ImportError:
    print("Warning: Could not import optimized analyzer, using fallback")
    OptimizedJSONLSchemaAnalyzer = None

class HybridJSONLSchemaAnalyzer:
"""Hybrid analyzer that intelligently chooses processing strategy."""
def __init__(self, max_samples: int = 1000, max_workers: int = None,
parallel_threshold_mb: int = 100, chunk_size: int = 1000):
"""
Initialize the hybrid analyzer.
Args:
max_samples: Maximum number of JSON objects to sample per file
max_workers: Maximum number of worker processes (default: cpu_count)
parallel_threshold_mb: File size threshold in MB to use parallel processing
chunk_size: Number of lines to process in each chunk
"""
self.max_samples = max_samples
self.max_workers = max_workers or min(cpu_count(), 8)
self.parallel_threshold_mb = parallel_threshold_mb
self.chunk_size = chunk_size
# Import the original analyzer for small files
sys.path.insert(0, str(Path(__file__).parent))
try:
from jsonl_schema_analyzer import JSONLSchemaAnalyzer
self.sequential_analyzer = JSONLSchemaAnalyzer(max_samples=max_samples)
except ImportError:
print("Warning: Could not import sequential analyzer")
self.sequential_analyzer = None
# Initialize optimized analyzer for large files
if OptimizedJSONLSchemaAnalyzer:
self.parallel_analyzer = OptimizedJSONLSchemaAnalyzer(
max_samples=max_samples,
max_workers=max_workers,
chunk_size=chunk_size
)
else:
self.parallel_analyzer = None
print(f"Hybrid analyzer initialized:")
print(f" Parallel threshold: {parallel_threshold_mb} MB")
print(f" Max workers: {self.max_workers}")
print(f" Chunk size: {self.chunk_size}")
    def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Analyze a JSONL file using the appropriate strategy.

        Args:
            file_path: Path to the JSONL file

        Returns:
            Dictionary containing schema analysis results
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        # Get file size in MB
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        print(f"Analyzing {file_path.name} ({file_size_mb:.2f} MB)...")
        # Choose processing strategy
        if file_size_mb >= self.parallel_threshold_mb and self.parallel_analyzer:
            print(f"  Using parallel processing (file >= {self.parallel_threshold_mb} MB)")
            result = self.parallel_analyzer.analyze_jsonl_file(file_path)
            result["processing_strategy"] = "parallel"
        elif self.sequential_analyzer:
            print(f"  Using sequential processing (file < {self.parallel_threshold_mb} MB)")
            result = self.sequential_analyzer.analyze_jsonl_file(file_path)
            result["processing_strategy"] = "sequential"
        else:
            # Fall back to parallel if the sequential analyzer is unavailable
            print("  Using parallel processing (sequential analyzer unavailable)")
            if self.parallel_analyzer:
                result = self.parallel_analyzer.analyze_jsonl_file(file_path)
                result["processing_strategy"] = "parallel_fallback"
            else:
                raise RuntimeError("No analyzer available")
        # Add hybrid-specific metadata
        result["file_size_mb"] = file_size_mb
        result["parallel_threshold_mb"] = self.parallel_threshold_mb
        return result
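
    # A minimal usage sketch (the file name below is hypothetical):
    #   analyzer = HybridJSONLSchemaAnalyzer(parallel_threshold_mb=100)
    #   result = analyzer.analyze_jsonl_file("data/entries.jsonl")
    #   print(result["processing_strategy"])  # "sequential" for files under 100 MB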
    def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
        """
        Analyze all JSONL files in a directory using hybrid processing.

        Args:
            directory_path: Path to directory containing JSONL files
            pattern: File pattern to match (default: *.jsonl)

        Returns:
            Dictionary containing analysis results for all files
        """
        directory_path = Path(directory_path)
        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")
        # Find all JSONL files
        jsonl_files = list(directory_path.glob(pattern))
        if not jsonl_files:
            print(f"No JSONL files found in {directory_path} with pattern {pattern}")
            return {"files": [], "summary": {}}
        print(f"Found {len(jsonl_files)} JSONL files to analyze...")
        start_time = time.time()
        # Categorize files by size
        small_files = []
        large_files = []
        for file_path in jsonl_files:
            size_mb = file_path.stat().st_size / (1024 * 1024)
            if size_mb >= self.parallel_threshold_mb:
                large_files.append(file_path)
            else:
                small_files.append(file_path)
        print(f"  Small files (< {self.parallel_threshold_mb} MB): {len(small_files)}")
        print(f"  Large files (>= {self.parallel_threshold_mb} MB): {len(large_files)}")
        file_results = {}
        # Process small files sequentially (they're fast anyway)
        if small_files and self.sequential_analyzer:
            print(f"Processing {len(small_files)} small files sequentially...")
            for file_path in small_files:
                try:
                    result = self.analyze_jsonl_file(file_path)
                    file_results[file_path.name] = result
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}
        # Process large files in parallel
        if large_files and self.parallel_analyzer:
            print(f"Processing {len(large_files)} large files in parallel...")
            if len(large_files) == 1:
                # Single large file - just process it directly
                file_path = large_files[0]
                try:
                    result = self.analyze_jsonl_file(file_path)
                    file_results[file_path.name] = result
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}
            else:
                # Multiple large files - process in parallel
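                # A thread pool (rather than a process pool) is used at the
                # file level, presumably because each call into the optimized
                # analyzer already spawns its own worker processes.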
                with ThreadPoolExecutor(max_workers=min(len(large_files), self.max_workers)) as executor:
                    future_to_file = {
                        executor.submit(self.analyze_jsonl_file, file_path): file_path
                        for file_path in large_files
                    }
                    for future in as_completed(future_to_file):
                        file_path = future_to_file[future]
                        try:
                            result = future.result()
                            file_results[file_path.name] = result
                        except Exception as e:
                            print(f"Error analyzing {file_path.name}: {e}")
                            file_results[file_path.name] = {"error": str(e)}
        # Create summary
        successful_results = [r for r in file_results.values() if "error" not in r]
        summary = {
            "total_files": len(jsonl_files),
            "small_files": len(small_files),
            "large_files": len(large_files),
            "successfully_analyzed": len(successful_results),
            "total_size_bytes": sum(
                r.get("file_size_bytes", 0) for r in successful_results
            ),
            "total_lines": sum(
                r.get("total_lines", 0) for r in successful_results
            ),
            "total_valid_lines": sum(
                r.get("valid_lines", 0) for r in successful_results
            ),
            "total_processing_time": sum(
                r.get("processing_time_seconds", 0) for r in successful_results
            ),
            "parallel_threshold_mb": self.parallel_threshold_mb,
            "strategies_used": {
                "sequential": len([r for r in successful_results if r.get("processing_strategy") == "sequential"]),
                "parallel": len([r for r in successful_results if r.get("processing_strategy") in ["parallel", "parallel_fallback"]])
            }
        }
        # Calculate processing speed; the key is only set when timing data
        # exists, so the print below guards against a KeyError
        if summary["total_processing_time"] > 0:
            total_mb = summary["total_size_bytes"] / (1024 * 1024)
            summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]
        elapsed_time = time.time() - start_time
        summary["total_elapsed_time"] = elapsed_time
        print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
        print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
        print(f"Sequential: {summary['strategies_used']['sequential']}, Parallel: {summary['strategies_used']['parallel']}")
        if "average_processing_speed_mb_per_sec" in summary:
            print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")
        return {
            "directory": str(directory_path),
            "pattern": pattern,
            "files": file_results,
            "summary": summary
        }
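
    # Directory analysis sketch (the directory name below is hypothetical):
    #   analyzer = HybridJSONLSchemaAnalyzer()
    #   report = analyzer.analyze_directory("extracted/", pattern="*.jsonl")
    #   print(report["summary"]["strategies_used"])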
    def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
        """
        Save analysis results to a JSON file.

        Args:
            results: Analysis results to save
            output_path: Path to save the results
        """
        output_path = Path(output_path)
        try:
            start_time = time.time()
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            save_time = time.time() - start_time
            file_size = output_path.stat().st_size
            print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")
        except Exception as e:
            raise RuntimeError(f"Error saving results to {output_path}: {e}") from e

def main():
"""Main function for command-line usage."""
parser = argparse.ArgumentParser(
description="Hybrid JSONL schema analyzer with intelligent processing strategy"
)
parser.add_argument(
"path",
help="Path to JSONL file or directory containing JSONL files"
)
parser.add_argument(
"-o", "--output",
help="Output file for analysis results (JSON format)"
)
parser.add_argument(
"-p", "--pattern",
default="*.jsonl",
help="File pattern when analyzing directory (default: *.jsonl)"
)
parser.add_argument(
"-s", "--max-samples",
type=int,
default=1000,
help="Maximum number of JSON objects to sample per file (default: 1000)"
)
parser.add_argument(
"-w", "--workers",
type=int,
default=None,
help="Number of worker processes for parallel processing (default: CPU count, max 8)"
)
parser.add_argument(
"-t", "--threshold",
type=int,
default=100,
help="File size threshold in MB for parallel processing (default: 100)"
)
parser.add_argument(
"-c", "--chunk-size",
type=int,
default=1000,
help="Number of lines to process in each chunk (default: 1000)"
)
parser.add_argument(
"--directory",
action="store_true",
help="Treat path as directory instead of single file"
)
args = parser.parse_args()
# Initialize hybrid analyzer
analyzer = HybridJSONLSchemaAnalyzer(
max_samples=args.max_samples,
max_workers=args.workers,
parallel_threshold_mb=args.threshold,
chunk_size=args.chunk_size
)
try:
start_time = time.time()
# Analyze file or directory
if args.directory or Path(args.path).is_dir():
results = analyzer.analyze_directory(args.path, args.pattern)
else:
results = analyzer.analyze_jsonl_file(args.path)
total_time = time.time() - start_time
# Save or print results
if args.output:
analyzer.save_results(results, args.output)
else:
print("\n" + "="*50)
print("ANALYSIS RESULTS")
print("="*50)
print(json.dumps(results, indent=2, ensure_ascii=False))
print(f"\nTotal analysis time: {total_time:.2f}s")
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
    main()
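
# Example invocations (file and directory names are hypothetical):
#   python jsonl_schema_analyzer_hybrid.py data/entries.jsonl -o schema.json
#   python jsonl_schema_analyzer_hybrid.py extracted/ --directory -t 50 -w 4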