Migrate to gitea

scripts/Json Analyzer/jsonl_schema_analyzer_optimized.py (new file, 567 lines)

@@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""
Optimized JSONL Schema Analyzer

Analyzes JSONL files to extract and aggregate schema information using multiple cores.
For each JSONL file, it generates a schema showing the JSON structure
and aggregates all possible keys found across all records.
"""

import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count, Manager
import threading
from functools import partial
import gc

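# Example of the kind of report produced for a single file (see analyze_jsonl_file()
# below); the path and numbers here are illustrative, not taken from real data:
#   {
#     "file_path": "data/events.jsonl",
#     "total_lines": 120000,                    # lines read, including blank/invalid ones
#     "valid_lines": 119987,                    # lines that parsed as JSON
#     "error_lines": 13,                        # lines that failed json.loads()
#     "all_keys": {"id": 119987, "user.name": 98000, ...},   # flattened key -> occurrence count
#     "unique_key_count": 42,
#     "schema": {"type": "object", "properties": {...}, ...},
#     "processing_time_seconds": 8.4,
#     ...
#   }
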
class OptimizedJSONLSchemaAnalyzer:
    """Optimized analyzer that uses multiple cores and system resources efficiently."""

    def __init__(self, max_samples: int = 1000, max_workers: int = None, chunk_size: int = 1000):
        """
        Initialize the optimized analyzer.

        Args:
            max_samples: Maximum number of JSON objects to sample per file
            max_workers: Maximum number of worker processes (default: CPU count, capped at 8)
            chunk_size: Number of lines to process in each chunk
        """
        self.max_samples = max_samples
        self.max_workers = max_workers or min(cpu_count(), 8)  # Limit to 8 to avoid memory issues
        self.chunk_size = chunk_size
        self.schema_cache = {}

        print(f"Initialized analyzer with {self.max_workers} workers, chunk size: {self.chunk_size}")

    def analyze_json_value(self, value: Any, depth: int = 0, max_depth: int = 10) -> Dict[str, Any]:
        """
        Analyze a JSON value and return its type and structure.

        Args:
            value: The JSON value to analyze
            depth: Current depth in the structure
            max_depth: Maximum depth to analyze

        Returns:
            Dictionary describing the value's type and structure
        """
        if depth > max_depth:
            return {"type": "unknown", "note": "max_depth_reached"}

        if value is None:
            return {"type": "null"}
        elif isinstance(value, bool):
            return {"type": "boolean"}
        elif isinstance(value, int):
            return {"type": "integer"}
        elif isinstance(value, float):
            return {"type": "number"}
        elif isinstance(value, str):
            return {"type": "string", "sample_length": len(value)}
        elif isinstance(value, list):
            if not value:
                return {"type": "array", "item_types": [], "length_range": [0, 0]}

            item_types = set()
            item_schemas = []

            # Sample the first few items to determine the array structure
            sample_size = min(10, len(value))
            for item in value[:sample_size]:
                item_schema = self.analyze_json_value(item, depth + 1, max_depth)
                item_schemas.append(item_schema)
                item_types.add(item_schema["type"])

            return {
                "type": "array",
                "item_types": sorted(list(item_types)),
                "length_range": [len(value), len(value)],
                "sample_items": item_schemas[:3]  # Keep the first 3 as examples
            }
        elif isinstance(value, dict):
            if not value:
                return {"type": "object", "properties": {}, "required_keys": []}

            properties = {}
            for key, val in value.items():
                properties[key] = self.analyze_json_value(val, depth + 1, max_depth)

            return {
                "type": "object",
                "properties": properties,
                "required_keys": list(value.keys())
            }
        else:
            return {"type": "unknown", "note": f"unexpected_type: {type(value)}"}

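    # For example, analyze_json_value({"score": 1.5, "ok": True, "tags": ["a"]})
    # returns roughly:
    #   {"type": "object",
    #    "properties": {
    #        "score": {"type": "number"},
    #        "ok": {"type": "boolean"},
    #        "tags": {"type": "array", "item_types": ["string"],
    #                 "length_range": [1, 1], "sample_items": [...]}},
    #    "required_keys": ["score", "ok", "tags"]}
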
    def merge_schemas(self, schema1: Dict[str, Any], schema2: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge two schemas, combining their information.

        Args:
            schema1: First schema
            schema2: Second schema

        Returns:
            Merged schema
        """
        if schema1["type"] != schema2["type"]:
            # Different types: create a union
            return {
                "type": "union",
                "possible_types": sorted(set([schema1["type"], schema2["type"]])),
                "schemas": [schema1, schema2]
            }

        merged = {"type": schema1["type"]}

        if schema1["type"] == "array":
            # Merge array item types
            item_types = set(schema1.get("item_types", []))
            item_types.update(schema2.get("item_types", []))
            merged["item_types"] = sorted(list(item_types))

            # Merge length ranges
            len1 = schema1.get("length_range", [0, 0])
            len2 = schema2.get("length_range", [0, 0])
            merged["length_range"] = [min(len1[0], len2[0]), max(len1[1], len2[1])]

            # Merge sample items if available
            if "sample_items" in schema1 or "sample_items" in schema2:
                merged["sample_items"] = (
                    schema1.get("sample_items", []) +
                    schema2.get("sample_items", [])
                )[:5]  # Keep at most 5 samples

        elif schema1["type"] == "object":
            # Merge object properties
            properties = {}
            all_keys = set()

            # Copy properties from the first schema
            for key, val in schema1.get("properties", {}).items():
                properties[key] = val
                all_keys.add(key)

            # Merge properties from the second schema
            for key, val in schema2.get("properties", {}).items():
                if key in properties:
                    properties[key] = self.merge_schemas(properties[key], val)
                else:
                    properties[key] = val
                all_keys.add(key)

            merged["properties"] = properties
            merged["required_keys"] = sorted(list(all_keys))

        # Copy other fields from the first schema
        for key in schema1:
            if key not in merged and key != "type":
                merged[key] = schema1[key]

        return merged

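    # For example, merging {"type": "object", "properties": {"a": {"type": "integer"}},
    # "required_keys": ["a"]} with {"type": "object", "properties": {"b": {"type": "string",
    # "sample_length": 3}}, "required_keys": ["b"]} yields an object schema whose
    # "properties" contain both "a" and "b" and whose "required_keys" are ["a", "b"].
    # Note that "required_keys" is the union of keys seen across merged samples, not an
    # intersection; schemas of differing types are wrapped in a {"type": "union", ...} node.
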
    def _extract_all_keys(self, obj: Any, prefix: str = "") -> List[str]:
        """
        Recursively extract all keys from a JSON object.

        Args:
            obj: JSON object to analyze
            prefix: Prefix for nested keys

        Returns:
            List of all keys found
        """
        keys = []

        if isinstance(obj, dict):
            for key, value in obj.items():
                full_key = f"{prefix}.{key}" if prefix else key
                keys.append(full_key)
                keys.extend(self._extract_all_keys(value, full_key))

        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                keys.extend(self._extract_all_keys(item, f"{prefix}[{i}]" if prefix else f"[{i}]"))

        return keys

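    # For example, _extract_all_keys({"id": 7, "user": {"name": "alice"}, "tags": [{"label": "a"}]})
    # returns:
    #   ["id", "user", "user.name", "tags", "tags[0].label"]
    # (nested dict keys are dotted, list elements are addressed by index).
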
    def _process_chunk(self, chunk_data: List[str]) -> Tuple[Counter, List[Dict], int, int]:
        """
        Process a chunk of JSONL lines.

        Args:
            chunk_data: List of JSONL lines to process

        Returns:
            Tuple of (keys_counter, sample_objects, valid_count, error_count)
        """
        all_keys = Counter()
        sample_objects = []
        valid_count = 0
        error_count = 0

        for line in chunk_data:
            line = line.strip()
            if not line:
                continue

            try:
                obj = json.loads(line)
                valid_count += 1

                # Collect all keys from this object
                keys = self._extract_all_keys(obj)
                all_keys.update(keys)

                # Keep sample objects for schema analysis
                if len(sample_objects) < self.max_samples:
                    sample_objects.append(obj)

            except json.JSONDecodeError:
                error_count += 1

        return all_keys, sample_objects, valid_count, error_count

    def _read_file_chunks(self, file_path: Path) -> List[List[str]]:
        """
        Read a JSONL file in chunks for parallel processing.

        Args:
            file_path: Path to the JSONL file

        Returns:
            List of chunks, each containing lines to process
        """
        chunks = []
        current_chunk = []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    current_chunk.append(line)

                    if len(current_chunk) >= self.chunk_size:
                        chunks.append(current_chunk)
                        current_chunk = []

            # Add remaining lines
            if current_chunk:
                chunks.append(current_chunk)

        except Exception as e:
            raise RuntimeError(f"Error reading file {file_path}: {e}")

        return chunks

    def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Analyze a JSONL file and return schema information using parallel processing.

        Args:
            file_path: Path to the JSONL file

        Returns:
            Dictionary containing schema analysis results
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        start_time = time.time()
        file_size = file_path.stat().st_size
        print(f"Analyzing {file_path.name} ({file_size / (1024*1024*1024):.2f} GB)...")

        # Statistics
        total_lines = 0
        valid_lines = 0
        error_lines = 0
        all_keys = Counter()
        merged_schema = None
        sample_objects = []

        # Read file in chunks and process in parallel
        chunks = self._read_file_chunks(file_path)

        if len(chunks) == 1 or self.max_workers == 1:
            # Process sequentially for small files or a single worker
            for chunk in chunks:
                chunk_keys, chunk_samples, chunk_valid, chunk_errors = self._process_chunk(chunk)
                all_keys.update(chunk_keys)
                sample_objects.extend(chunk_samples)
                valid_lines += chunk_valid
                error_lines += chunk_errors
                total_lines += len(chunk)
        else:
            # Process chunks in parallel
            with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all chunks for processing
                future_to_chunk = {
                    executor.submit(self._process_chunk, chunk): chunk
                    for chunk in chunks
                }

                # Collect results as they complete
                for future in as_completed(future_to_chunk):
                    chunk_keys, chunk_samples, chunk_valid, chunk_errors = future.result()
                    all_keys.update(chunk_keys)
                    sample_objects.extend(chunk_samples)
                    valid_lines += chunk_valid
                    error_lines += chunk_errors
                    total_lines += len(future_to_chunk[future])

        # Limit sample objects
        if len(sample_objects) >= self.max_samples:
            sample_objects = sample_objects[:self.max_samples]

        # Analyze schema from sample objects
        if sample_objects:
            for obj in sample_objects:
                obj_schema = self.analyze_json_value(obj)

                if merged_schema is None:
                    merged_schema = obj_schema
                else:
                    merged_schema = self.merge_schemas(merged_schema, obj_schema)

        # Prepare results
        elapsed_time = time.time() - start_time
        results = {
            "file_path": str(file_path),
            "file_size_bytes": file_size,
            "total_lines": total_lines,
            "valid_lines": valid_lines,
            "error_lines": error_lines,
            "sample_count": len(sample_objects),
            "all_keys": dict(all_keys.most_common()),
            "unique_key_count": len(all_keys),
            "schema": merged_schema,
            "analysis_timestamp": time.time(),
            "processing_time_seconds": elapsed_time,
            "workers_used": self.max_workers,
            "chunks_processed": len(chunks)
        }

        print(f" Completed in {elapsed_time:.2f}s - {valid_lines:,} valid lines, {error_lines:,} errors")

        # Clean up memory
        gc.collect()

        return results

    def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
        """
        Analyze all JSONL files in a directory using parallel processing.

        Args:
            directory_path: Path to directory containing JSONL files
            pattern: File pattern to match (default: *.jsonl)

        Returns:
            Dictionary containing analysis results for all files
        """
        directory_path = Path(directory_path)

        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        # Find all JSONL files
        jsonl_files = list(directory_path.glob(pattern))

        if not jsonl_files:
            print(f"No JSONL files found in {directory_path} with pattern {pattern}")
            return {"files": [], "summary": {}}

        print(f"Found {len(jsonl_files)} JSONL files to analyze using {self.max_workers} workers...")
        start_time = time.time()

        # Sort files by size (largest first) for better load balancing
        jsonl_files.sort(key=lambda f: f.stat().st_size, reverse=True)

        # Analyze files in parallel
        file_results = {}

        if len(jsonl_files) == 1 or self.max_workers == 1:
            # Process sequentially for a single file or a single worker
            for file_path in jsonl_files:
                try:
                    file_results[file_path.name] = self.analyze_jsonl_file(file_path)
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}
        else:
            # Process files in parallel
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all files for analysis
                future_to_file = {
                    executor.submit(self.analyze_jsonl_file, file_path): file_path
                    for file_path in jsonl_files
                }

                # Collect results as they complete
                for future in as_completed(future_to_file):
                    file_path = future_to_file[future]
                    try:
                        result = future.result()
                        file_results[file_path.name] = result
                    except Exception as e:
                        print(f"Error analyzing {file_path.name}: {e}")
                        file_results[file_path.name] = {"error": str(e)}

        # Create summary
        successful_results = [r for r in file_results.values() if "error" not in r]
        summary = {
            "total_files": len(jsonl_files),
            "successfully_analyzed": len(successful_results),
            "total_size_bytes": sum(
                r.get("file_size_bytes", 0) for r in successful_results
            ),
            "total_lines": sum(
                r.get("total_lines", 0) for r in successful_results
            ),
            "total_valid_lines": sum(
                r.get("valid_lines", 0) for r in successful_results
            ),
            "total_processing_time": sum(
                r.get("processing_time_seconds", 0) for r in successful_results
            ),
            "average_processing_speed_mb_per_sec": 0
        }

        # Calculate processing speed
        if summary["total_processing_time"] > 0:
            total_mb = summary["total_size_bytes"] / (1024 * 1024)
            summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]

        elapsed_time = time.time() - start_time
        summary["total_elapsed_time"] = elapsed_time

        print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
        print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
        print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")

        return {
            "directory": str(directory_path),
            "pattern": pattern,
            "files": file_results,
            "summary": summary
        }

    def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
        """
        Save analysis results to a JSON file.

        Args:
            results: Analysis results to save
            output_path: Path to save the results
        """
        output_path = Path(output_path)

        try:
            start_time = time.time()
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            save_time = time.time() - start_time
            file_size = output_path.stat().st_size
            print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")

        except Exception as e:
            raise RuntimeError(f"Error saving results to {output_path}: {e}")

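# Programmatic usage (illustrative; "data/events.jsonl" and "schema_report.json"
# are placeholder paths):
#
#   analyzer = OptimizedJSONLSchemaAnalyzer(max_samples=500, max_workers=4)
#   report = analyzer.analyze_jsonl_file("data/events.jsonl")
#   analyzer.save_results(report, "schema_report.json")
#
# or, for every *.jsonl file under a directory:
#
#   report = analyzer.analyze_directory("data/", pattern="*.jsonl")
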
def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser(
        description="Optimized JSONL schema analyzer using multiple cores"
    )
    parser.add_argument(
        "path",
        help="Path to JSONL file or directory containing JSONL files"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file for analysis results (JSON format)"
    )
    parser.add_argument(
        "-p", "--pattern",
        default="*.jsonl",
        help="File pattern when analyzing a directory (default: *.jsonl)"
    )
    parser.add_argument(
        "-s", "--max-samples",
        type=int,
        default=1000,
        help="Maximum number of JSON objects to sample per file (default: 1000)"
    )
    parser.add_argument(
        "-w", "--workers",
        type=int,
        default=None,
        help="Number of worker processes (default: CPU count, max 8)"
    )
    parser.add_argument(
        "-c", "--chunk-size",
        type=int,
        default=1000,
        help="Number of lines to process in each chunk (default: 1000)"
    )
    parser.add_argument(
        "--directory",
        action="store_true",
        help="Treat path as a directory instead of a single file"
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Enable performance profiling"
    )  # NOTE: --profile is accepted but not currently acted on anywhere in main()

    args = parser.parse_args()

    # Initialize analyzer
    analyzer = OptimizedJSONLSchemaAnalyzer(
        max_samples=args.max_samples,
        max_workers=args.workers,
        chunk_size=args.chunk_size
    )

    try:
        start_time = time.time()

        # Analyze a file or a directory
        if args.directory or Path(args.path).is_dir():
            results = analyzer.analyze_directory(args.path, args.pattern)
        else:
            results = analyzer.analyze_jsonl_file(args.path)

        total_time = time.time() - start_time

        # Save or print results
        if args.output:
            analyzer.save_results(results, args.output)
        else:
            print("\n" + "="*50)
            print("ANALYSIS RESULTS")
            print("="*50)
            print(json.dumps(results, indent=2, ensure_ascii=False))

        print(f"\nTotal analysis time: {total_time:.2f}s")

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
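# Command-line usage (illustrative paths):
#
#   python jsonl_schema_analyzer_optimized.py data/events.jsonl -o schema_report.json
#   python jsonl_schema_analyzer_optimized.py data/ --directory -p "*.jsonl" -w 4 -c 2000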