Migrate to gitea
scripts/Json Analyzer/jsonl_schema_analyzer_hybrid.py (new file, 358 lines added)
@@ -0,0 +1,358 @@
#!/usr/bin/env python3
"""
Hybrid JSONL Schema Analyzer

Chooses between sequential and parallel processing based on file size: small
files are analyzed sequentially, while files at or above a configurable size
threshold are handed to the parallel analyzer. An example command-line
invocation is given at the end of this file.
"""

import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple, Optional
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
import threading
from functools import partial
import gc

# Import the optimized analyzer for parallel processing
sys.path.insert(0, str(Path(__file__).parent))
try:
    from jsonl_schema_analyzer_optimized import OptimizedJSONLSchemaAnalyzer
except ImportError:
    print("Warning: Could not import optimized analyzer, using fallback")
    OptimizedJSONLSchemaAnalyzer = None


class HybridJSONLSchemaAnalyzer:
    """Hybrid analyzer that intelligently chooses processing strategy."""

    def __init__(self, max_samples: int = 1000, max_workers: Optional[int] = None,
                 parallel_threshold_mb: int = 100, chunk_size: int = 1000):
        """
        Initialize the hybrid analyzer.

        Args:
            max_samples: Maximum number of JSON objects to sample per file
            max_workers: Maximum number of worker processes (default: cpu_count, capped at 8)
            parallel_threshold_mb: File size threshold in MB to use parallel processing
            chunk_size: Number of lines to process in each chunk
        """
        self.max_samples = max_samples
        self.max_workers = max_workers or min(cpu_count(), 8)
        self.parallel_threshold_mb = parallel_threshold_mb
        self.chunk_size = chunk_size

        # Import the original analyzer for small files
        sys.path.insert(0, str(Path(__file__).parent))
        try:
            from jsonl_schema_analyzer import JSONLSchemaAnalyzer
            self.sequential_analyzer = JSONLSchemaAnalyzer(max_samples=max_samples)
        except ImportError:
            print("Warning: Could not import sequential analyzer")
            self.sequential_analyzer = None

        # Initialize optimized analyzer for large files
        if OptimizedJSONLSchemaAnalyzer:
            self.parallel_analyzer = OptimizedJSONLSchemaAnalyzer(
                max_samples=max_samples,
                max_workers=max_workers,
                chunk_size=chunk_size
            )
        else:
            self.parallel_analyzer = None

        print("Hybrid analyzer initialized:")
        print(f"  Parallel threshold: {parallel_threshold_mb} MB")
        print(f"  Max workers: {self.max_workers}")
        print(f"  Chunk size: {self.chunk_size}")

    def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Analyze a JSONL file using the appropriate strategy.

        Args:
            file_path: Path to the JSONL file

        Returns:
            Dictionary containing schema analysis results
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Get file size in MB
        file_size_mb = file_path.stat().st_size / (1024 * 1024)

        print(f"Analyzing {file_path.name} ({file_size_mb:.2f} MB)...")

        # Choose processing strategy
        if file_size_mb >= self.parallel_threshold_mb and self.parallel_analyzer:
            print(f"  Using parallel processing (file >= {self.parallel_threshold_mb} MB)")
            result = self.parallel_analyzer.analyze_jsonl_file(file_path)
            result["processing_strategy"] = "parallel"
        elif self.sequential_analyzer:
            print(f"  Using sequential processing (file < {self.parallel_threshold_mb} MB)")
            result = self.sequential_analyzer.analyze_jsonl_file(file_path)
            result["processing_strategy"] = "sequential"
        else:
            # Fallback to parallel if sequential not available
            print("  Using parallel processing (sequential analyzer unavailable)")
            if self.parallel_analyzer:
                result = self.parallel_analyzer.analyze_jsonl_file(file_path)
                result["processing_strategy"] = "parallel_fallback"
            else:
                raise RuntimeError("No analyzer available")

        # Add hybrid-specific metadata
        result["file_size_mb"] = file_size_mb
        result["parallel_threshold_mb"] = self.parallel_threshold_mb

        return result
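
    # A minimal sketch of calling this method directly from Python; the file
    # name below is a placeholder, not something shipped with this script:
    #
    #     analyzer = HybridJSONLSchemaAnalyzer(parallel_threshold_mb=50)
    #     result = analyzer.analyze_jsonl_file("events.jsonl")
    #     print(result["processing_strategy"], result["file_size_mb"])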

    def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
        """
        Analyze all JSONL files in a directory using hybrid processing.

        Args:
            directory_path: Path to directory containing JSONL files
            pattern: File pattern to match (default: *.jsonl)

        Returns:
            Dictionary containing analysis results for all files
        """
        directory_path = Path(directory_path)

        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        # Find all JSONL files
        jsonl_files = list(directory_path.glob(pattern))

        if not jsonl_files:
            print(f"No JSONL files found in {directory_path} with pattern {pattern}")
            return {"files": [], "summary": {}}

        print(f"Found {len(jsonl_files)} JSONL files to analyze...")
        start_time = time.time()

        # Categorize files by size
        small_files = []
        large_files = []

        for file_path in jsonl_files:
            size_mb = file_path.stat().st_size / (1024 * 1024)
            if size_mb >= self.parallel_threshold_mb:
                large_files.append(file_path)
            else:
                small_files.append(file_path)

        print(f"  Small files (< {self.parallel_threshold_mb} MB): {len(small_files)}")
        print(f"  Large files (>= {self.parallel_threshold_mb} MB): {len(large_files)}")

        file_results = {}

        # Process small files sequentially (they're fast anyway)
        if small_files and self.sequential_analyzer:
            print(f"Processing {len(small_files)} small files sequentially...")
            for file_path in small_files:
                try:
                    result = self.analyze_jsonl_file(file_path)
                    file_results[file_path.name] = result
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}

        # Process large files in parallel
        if large_files and self.parallel_analyzer:
            print(f"Processing {len(large_files)} large files in parallel...")

            if len(large_files) == 1:
                # Single large file - just process it directly
                file_path = large_files[0]
                try:
                    result = self.analyze_jsonl_file(file_path)
                    file_results[file_path.name] = result
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}
            else:
                # Multiple large files - process in parallel
                with ThreadPoolExecutor(max_workers=min(len(large_files), self.max_workers)) as executor:
                    future_to_file = {
                        executor.submit(self.analyze_jsonl_file, file_path): file_path
                        for file_path in large_files
                    }

                    for future in as_completed(future_to_file):
                        file_path = future_to_file[future]
                        try:
                            result = future.result()
                            file_results[file_path.name] = result
                        except Exception as e:
                            print(f"Error analyzing {file_path.name}: {e}")
                            file_results[file_path.name] = {"error": str(e)}

        # Create summary
        successful_results = [r for r in file_results.values() if "error" not in r]
        summary = {
            "total_files": len(jsonl_files),
            "small_files": len(small_files),
            "large_files": len(large_files),
            "successfully_analyzed": len(successful_results),
            "total_size_bytes": sum(
                r.get("file_size_bytes", 0) for r in successful_results
            ),
            "total_lines": sum(
                r.get("total_lines", 0) for r in successful_results
            ),
            "total_valid_lines": sum(
                r.get("valid_lines", 0) for r in successful_results
            ),
            "total_processing_time": sum(
                r.get("processing_time_seconds", 0) for r in successful_results
            ),
            "parallel_threshold_mb": self.parallel_threshold_mb,
            "strategies_used": {
                "sequential": len([r for r in successful_results if r.get("processing_strategy") == "sequential"]),
                "parallel": len([r for r in successful_results if r.get("processing_strategy") in ["parallel", "parallel_fallback"]])
            }
        }

        # Calculate processing speed
        if summary["total_processing_time"] > 0:
            total_mb = summary["total_size_bytes"] / (1024 * 1024)
            summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]

        elapsed_time = time.time() - start_time
        summary["total_elapsed_time"] = elapsed_time

        print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
        print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
        print(f"Sequential: {summary['strategies_used']['sequential']}, Parallel: {summary['strategies_used']['parallel']}")
        # Only report speed when a total processing time was recorded; otherwise the key is absent
        if "average_processing_speed_mb_per_sec" in summary:
            print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")

        return {
            "directory": str(directory_path),
            "pattern": pattern,
            "files": file_results,
            "summary": summary
        }

    def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
        """
        Save analysis results to a JSON file.

        Args:
            results: Analysis results to save
            output_path: Path to save the results
        """
        output_path = Path(output_path)

        try:
            start_time = time.time()
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            save_time = time.time() - start_time
            file_size = output_path.stat().st_size
            print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")

        except Exception as e:
            raise RuntimeError(f"Error saving results to {output_path}: {e}")


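# A minimal sketch of directory-level use together with save_results; the
# paths below are placeholders, not files shipped with this repository:
#
#     analyzer = HybridJSONLSchemaAnalyzer()
#     report = analyzer.analyze_directory("data/exports", pattern="*.jsonl")
#     analyzer.save_results(report, "schema_report.json")
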
def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser(
        description="Hybrid JSONL schema analyzer with intelligent processing strategy"
    )
    parser.add_argument(
        "path",
        help="Path to JSONL file or directory containing JSONL files"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file for analysis results (JSON format)"
    )
    parser.add_argument(
        "-p", "--pattern",
        default="*.jsonl",
        help="File pattern when analyzing directory (default: *.jsonl)"
    )
    parser.add_argument(
        "-s", "--max-samples",
        type=int,
        default=1000,
        help="Maximum number of JSON objects to sample per file (default: 1000)"
    )
    parser.add_argument(
        "-w", "--workers",
        type=int,
        default=None,
        help="Number of worker processes for parallel processing (default: CPU count, max 8)"
    )
    parser.add_argument(
        "-t", "--threshold",
        type=int,
        default=100,
        help="File size threshold in MB for parallel processing (default: 100)"
    )
    parser.add_argument(
        "-c", "--chunk-size",
        type=int,
        default=1000,
        help="Number of lines to process in each chunk (default: 1000)"
    )
    parser.add_argument(
        "--directory",
        action="store_true",
        help="Treat path as directory instead of single file"
    )

    args = parser.parse_args()

    # Initialize hybrid analyzer
    analyzer = HybridJSONLSchemaAnalyzer(
        max_samples=args.max_samples,
        max_workers=args.workers,
        parallel_threshold_mb=args.threshold,
        chunk_size=args.chunk_size
    )

    try:
        start_time = time.time()

        # Analyze file or directory
        if args.directory or Path(args.path).is_dir():
            results = analyzer.analyze_directory(args.path, args.pattern)
        else:
            results = analyzer.analyze_jsonl_file(args.path)

        total_time = time.time() - start_time

        # Save or print results
        if args.output:
            analyzer.save_results(results, args.output)
        else:
            print("\n" + "="*50)
            print("ANALYSIS RESULTS")
            print("="*50)
            print(json.dumps(results, indent=2, ensure_ascii=False))

        print(f"\nTotal analysis time: {total_time:.2f}s")

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
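
# Example command-line invocation; the flags mirror the argparse options
# defined in main(), and the data path is a placeholder:
#
#     python jsonl_schema_analyzer_hybrid.py data/ --directory -o results.json -t 50 -w 4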