#!/usr/bin/env python3
"""
Optimized JSONL Schema Analyzer

Analyzes JSONL files to extract and aggregate schema information using multiple cores.
For each JSONL file, it generates a schema showing the JSON structure and aggregates
all possible keys found across all records.
"""

import argparse
import gc
import json
import sys
import time
from collections import Counter
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union


class OptimizedJSONLSchemaAnalyzer:
    """Optimized analyzer that uses multiple cores and system resources efficiently."""

    def __init__(self, max_samples: int = 1000, max_workers: Optional[int] = None,
                 chunk_size: int = 1000):
        """
        Initialize the optimized analyzer.

        Args:
            max_samples: Maximum number of JSON objects to sample per file
            max_workers: Maximum number of worker processes (default: cpu_count, capped at 8)
            chunk_size: Number of lines to process in each chunk
        """
        self.max_samples = max_samples
        self.max_workers = max_workers or min(cpu_count(), 8)  # Cap at 8 to avoid memory issues
        self.chunk_size = chunk_size
        self.schema_cache = {}

        print(f"Initialized analyzer with {self.max_workers} workers, chunk size: {self.chunk_size}")

    def analyze_json_value(self, value: Any, depth: int = 0, max_depth: int = 10) -> Dict[str, Any]:
        """
        Analyze a JSON value and return its type and structure.

        Args:
            value: The JSON value to analyze
            depth: Current depth in the structure
            max_depth: Maximum depth to analyze

        Returns:
            Dictionary describing the value's type and structure
        """
        if depth > max_depth:
            return {"type": "unknown", "note": "max_depth_reached"}

        if value is None:
            return {"type": "null"}
        elif isinstance(value, bool):
            # Checked before int: bool is a subclass of int in Python
            return {"type": "boolean"}
        elif isinstance(value, int):
            return {"type": "integer"}
        elif isinstance(value, float):
            return {"type": "number"}
        elif isinstance(value, str):
            return {"type": "string", "sample_length": len(value)}
        elif isinstance(value, list):
            if not value:
                return {"type": "array", "item_types": [], "length_range": [0, 0]}

            item_types = set()
            item_schemas = []

            # Sample the first few items to determine the array structure
            sample_size = min(10, len(value))
            for item in value[:sample_size]:
                item_schema = self.analyze_json_value(item, depth + 1, max_depth)
                item_schemas.append(item_schema)
                item_types.add(item_schema["type"])

            return {
                "type": "array",
                "item_types": sorted(item_types),
                "length_range": [len(value), len(value)],
                "sample_items": item_schemas[:3]  # Keep the first 3 as examples
            }
        elif isinstance(value, dict):
            if not value:
                return {"type": "object", "properties": {}, "required_keys": []}

            properties = {}
            for key, val in value.items():
                properties[key] = self.analyze_json_value(val, depth + 1, max_depth)

            return {
                "type": "object",
                "properties": properties,
                "required_keys": list(value.keys())
            }
        else:
            return {"type": "unknown", "note": f"unexpected_type: {type(value)}"}
    def merge_schemas(self, schema1: Dict[str, Any], schema2: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge two schemas, combining their information.

        Args:
            schema1: First schema
            schema2: Second schema

        Returns:
            Merged schema
        """
        if schema1["type"] != schema2["type"]:
            # Different types, create a union
            return {
                "type": "union",
                "possible_types": sorted({schema1["type"], schema2["type"]}),
                "schemas": [schema1, schema2]
            }

        merged = {"type": schema1["type"]}

        if schema1["type"] == "array":
            # Merge array item types
            item_types = set(schema1.get("item_types", []))
            item_types.update(schema2.get("item_types", []))
            merged["item_types"] = sorted(item_types)

            # Merge length ranges
            len1 = schema1.get("length_range", [0, 0])
            len2 = schema2.get("length_range", [0, 0])
            merged["length_range"] = [min(len1[0], len2[0]), max(len1[1], len2[1])]

            # Merge sample items if available
            if "sample_items" in schema1 or "sample_items" in schema2:
                merged["sample_items"] = (
                    schema1.get("sample_items", []) + schema2.get("sample_items", [])
                )[:5]  # Keep at most 5 samples

        elif schema1["type"] == "object":
            # Merge object properties
            properties = {}
            all_keys = set()

            # Copy properties from the first schema
            for key, val in schema1.get("properties", {}).items():
                properties[key] = val
                all_keys.add(key)

            # Merge properties from the second schema
            for key, val in schema2.get("properties", {}).items():
                if key in properties:
                    properties[key] = self.merge_schemas(properties[key], val)
                else:
                    properties[key] = val
                all_keys.add(key)

            merged["properties"] = properties
            merged["required_keys"] = sorted(all_keys)

        # Copy other fields
        for key in schema1:
            if key not in merged and key != "type":
                merged[key] = schema1[key]

        return merged

    def _extract_all_keys(self, obj: Any, prefix: str = "") -> List[str]:
        """
        Recursively extract all keys from a JSON object.

        Args:
            obj: JSON object to analyze
            prefix: Prefix for nested keys

        Returns:
            List of all keys found
        """
        keys = []

        if isinstance(obj, dict):
            for key, value in obj.items():
                full_key = f"{prefix}.{key}" if prefix else key
                keys.append(full_key)
                keys.extend(self._extract_all_keys(value, full_key))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                keys.extend(
                    self._extract_all_keys(item, f"{prefix}[{i}]" if prefix else f"[{i}]")
                )

        return keys

    def _process_chunk(self, chunk_data: List[str]) -> Tuple[Counter, List[Dict], int, int]:
        """
        Process a chunk of JSONL lines.

        Args:
            chunk_data: List of JSONL lines to process

        Returns:
            Tuple of (keys_counter, sample_objects, valid_count, error_count)
        """
        all_keys = Counter()
        sample_objects = []
        valid_count = 0
        error_count = 0

        for line in chunk_data:
            line = line.strip()
            if not line:
                continue

            try:
                obj = json.loads(line)
                valid_count += 1

                # Collect all keys from this object
                keys = self._extract_all_keys(obj)
                all_keys.update(keys)

                # Keep sample objects for schema analysis
                if len(sample_objects) < self.max_samples:
                    sample_objects.append(obj)
            except json.JSONDecodeError:
                error_count += 1

        return all_keys, sample_objects, valid_count, error_count
    def _read_file_chunks(self, file_path: Path) -> List[List[str]]:
        """
        Read a JSONL file in chunks for parallel processing.

        Args:
            file_path: Path to the JSONL file

        Returns:
            List of chunks, each containing lines to process
        """
        chunks = []
        current_chunk = []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    current_chunk.append(line)
                    if len(current_chunk) >= self.chunk_size:
                        chunks.append(current_chunk)
                        current_chunk = []

                # Add remaining lines
                if current_chunk:
                    chunks.append(current_chunk)
        except Exception as e:
            raise RuntimeError(f"Error reading file {file_path}: {e}")

        return chunks

    def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Analyze a JSONL file and return schema information using parallel processing.

        Args:
            file_path: Path to the JSONL file

        Returns:
            Dictionary containing schema analysis results
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        start_time = time.time()
        file_size = file_path.stat().st_size
        print(f"Analyzing {file_path.name} ({file_size / (1024*1024*1024):.2f} GB)...")

        # Statistics
        total_lines = 0
        valid_lines = 0
        error_lines = 0
        all_keys = Counter()
        merged_schema = None
        sample_objects = []

        # Read the file in chunks and process them in parallel
        chunks = self._read_file_chunks(file_path)

        if len(chunks) == 1 or self.max_workers == 1:
            # Process sequentially for small files or a single worker
            for chunk in chunks:
                chunk_keys, chunk_samples, chunk_valid, chunk_errors = self._process_chunk(chunk)
                all_keys.update(chunk_keys)
                sample_objects.extend(chunk_samples)
                valid_lines += chunk_valid
                error_lines += chunk_errors
                total_lines += len(chunk)
        else:
            # Process chunks in parallel
            with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all chunks for processing
                future_to_chunk = {
                    executor.submit(self._process_chunk, chunk): chunk
                    for chunk in chunks
                }

                # Collect results as they complete
                for future in as_completed(future_to_chunk):
                    chunk_keys, chunk_samples, chunk_valid, chunk_errors = future.result()
                    all_keys.update(chunk_keys)
                    sample_objects.extend(chunk_samples)
                    valid_lines += chunk_valid
                    error_lines += chunk_errors
                    total_lines += len(future_to_chunk[future])

                    # Limit sample objects
                    if len(sample_objects) >= self.max_samples:
                        sample_objects = sample_objects[:self.max_samples]

        # Analyze schema from sample objects
        if sample_objects:
            for obj in sample_objects:
                obj_schema = self.analyze_json_value(obj)
                if merged_schema is None:
                    merged_schema = obj_schema
                else:
                    merged_schema = self.merge_schemas(merged_schema, obj_schema)

        # Prepare results
        elapsed_time = time.time() - start_time
        results = {
            "file_path": str(file_path),
            "file_size_bytes": file_size,
            "total_lines": total_lines,
            "valid_lines": valid_lines,
            "error_lines": error_lines,
            "sample_count": len(sample_objects),
            "all_keys": dict(all_keys.most_common()),
            "unique_key_count": len(all_keys),
            "schema": merged_schema,
            "analysis_timestamp": time.time(),
            "processing_time_seconds": elapsed_time,
            "workers_used": self.max_workers,
            "chunks_processed": len(chunks)
        }

        print(f" Completed in {elapsed_time:.2f}s - {valid_lines:,} valid lines, {error_lines:,} errors")

        # Clean up memory
        gc.collect()

        return results
    def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
        """
        Analyze all JSONL files in a directory using parallel processing.

        Args:
            directory_path: Path to directory containing JSONL files
            pattern: File pattern to match (default: *.jsonl)

        Returns:
            Dictionary containing analysis results for all files
        """
        directory_path = Path(directory_path)
        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        # Find all JSONL files
        jsonl_files = list(directory_path.glob(pattern))
        if not jsonl_files:
            print(f"No JSONL files found in {directory_path} with pattern {pattern}")
            return {"files": [], "summary": {}}

        print(f"Found {len(jsonl_files)} JSONL files to analyze using {self.max_workers} workers...")
        start_time = time.time()

        # Sort files by size (largest first) for better load balancing
        jsonl_files.sort(key=lambda f: f.stat().st_size, reverse=True)

        # Analyze files in parallel
        file_results = {}

        if len(jsonl_files) == 1 or self.max_workers == 1:
            # Process sequentially for a single file or a single worker
            for file_path in jsonl_files:
                try:
                    file_results[file_path.name] = self.analyze_jsonl_file(file_path)
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}
        else:
            # Process files in parallel (threads here; each file spawns its own process pool)
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all files for analysis
                future_to_file = {
                    executor.submit(self.analyze_jsonl_file, file_path): file_path
                    for file_path in jsonl_files
                }

                # Collect results as they complete
                for future in as_completed(future_to_file):
                    file_path = future_to_file[future]
                    try:
                        result = future.result()
                        file_results[file_path.name] = result
                    except Exception as e:
                        print(f"Error analyzing {file_path.name}: {e}")
                        file_results[file_path.name] = {"error": str(e)}

        # Create summary
        successful_results = [r for r in file_results.values() if "error" not in r]
        summary = {
            "total_files": len(jsonl_files),
            "successfully_analyzed": len(successful_results),
            "total_size_bytes": sum(r.get("file_size_bytes", 0) for r in successful_results),
            "total_lines": sum(r.get("total_lines", 0) for r in successful_results),
            "total_valid_lines": sum(r.get("valid_lines", 0) for r in successful_results),
            "total_processing_time": sum(r.get("processing_time_seconds", 0) for r in successful_results),
            "average_processing_speed_mb_per_sec": 0
        }

        # Calculate processing speed
        if summary["total_processing_time"] > 0:
            total_mb = summary["total_size_bytes"] / (1024 * 1024)
            summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]

        elapsed_time = time.time() - start_time
        summary["total_elapsed_time"] = elapsed_time

        print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
        print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
        print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")

        return {
            "directory": str(directory_path),
            "pattern": pattern,
            "files": file_results,
            "summary": summary
        }
    def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
        """
        Save analysis results to a JSON file.

        Args:
            results: Analysis results to save
            output_path: Path to save the results
        """
        output_path = Path(output_path)

        try:
            start_time = time.time()
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            save_time = time.time() - start_time
            file_size = output_path.stat().st_size
            print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")
        except Exception as e:
            raise RuntimeError(f"Error saving results to {output_path}: {e}")


def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser(
        description="Optimized JSONL schema analyzer using multiple cores"
    )
    parser.add_argument(
        "path",
        help="Path to JSONL file or directory containing JSONL files"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file for analysis results (JSON format)"
    )
    parser.add_argument(
        "-p", "--pattern",
        default="*.jsonl",
        help="File pattern when analyzing directory (default: *.jsonl)"
    )
    parser.add_argument(
        "-s", "--max-samples",
        type=int,
        default=1000,
        help="Maximum number of JSON objects to sample per file (default: 1000)"
    )
    parser.add_argument(
        "-w", "--workers",
        type=int,
        default=None,
        help="Number of worker processes (default: CPU count, max 8)"
    )
    parser.add_argument(
        "-c", "--chunk-size",
        type=int,
        default=1000,
        help="Number of lines to process in each chunk (default: 1000)"
    )
    parser.add_argument(
        "--directory",
        action="store_true",
        help="Treat path as directory instead of single file"
    )
    # Note: --profile is parsed but no profiling hook is wired up yet
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Enable performance profiling"
    )

    args = parser.parse_args()

    # Initialize analyzer
    analyzer = OptimizedJSONLSchemaAnalyzer(
        max_samples=args.max_samples,
        max_workers=args.workers,
        chunk_size=args.chunk_size
    )

    try:
        start_time = time.time()

        # Analyze file or directory
        if args.directory or Path(args.path).is_dir():
            results = analyzer.analyze_directory(args.path, args.pattern)
        else:
            results = analyzer.analyze_jsonl_file(args.path)

        total_time = time.time() - start_time

        # Save or print results
        if args.output:
            analyzer.save_results(results, args.output)
        else:
            print("\n" + "=" * 50)
            print("ANALYSIS RESULTS")
            print("=" * 50)
            print(json.dumps(results, indent=2, ensure_ascii=False))

        print(f"\nTotal analysis time: {total_time:.2f}s")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
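
# ---------------------------------------------------------------------------
# Example usage (a minimal sketch; the script name "jsonl_schema_analyzer.py"
# and the data paths below are placeholders, not part of the tool itself):
#
#   # Analyze a single file and print the schema report to stdout
#   python jsonl_schema_analyzer.py data/events.jsonl
#
#   # Analyze a directory of *.jsonl files with 4 workers and save the report
#   python jsonl_schema_analyzer.py data/ --directory -w 4 -o schema_report.json
#
# Programmatic use mirrors the CLI entry points:
#
#   analyzer = OptimizedJSONLSchemaAnalyzer(max_samples=500, max_workers=4)
#   report = analyzer.analyze_jsonl_file("data/events.jsonl")
#   analyzer.save_results(report, "schema_report.json")
# ---------------------------------------------------------------------------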