#!/usr/bin/env python3
"""
Optimized JSONL Schema Analyzer

Analyzes JSONL files to extract and aggregate schema information using multiple cores.
For each JSONL file, it generates a schema describing the JSON structure
and aggregates all keys found across all records.
"""

import json
import sys
import time
from collections import Counter
from typing import Dict, List, Any, Optional, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
import gc


class OptimizedJSONLSchemaAnalyzer:
    """Optimized analyzer that uses multiple cores and system resources efficiently."""

    def __init__(self, max_samples: int = 1000, max_workers: Optional[int] = None, chunk_size: int = 1000):
        """
        Initialize the optimized analyzer.

        Args:
            max_samples: Maximum number of JSON objects to sample per file
            max_workers: Maximum number of worker processes (default: CPU count, capped at 8)
            chunk_size: Number of lines to process in each chunk
        """
        self.max_samples = max_samples
        self.max_workers = max_workers or min(cpu_count(), 8)  # Limit to 8 to avoid memory issues
        self.chunk_size = chunk_size
        self.schema_cache = {}

        print(f"Initialized analyzer with {self.max_workers} workers, chunk size: {self.chunk_size}")

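    # Illustrative construction (values here are arbitrary, not recommendations):
    #   analyzer = OptimizedJSONLSchemaAnalyzer(max_samples=500, max_workers=4, chunk_size=2000)
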
    def analyze_json_value(self, value: Any, depth: int = 0, max_depth: int = 10) -> Dict[str, Any]:
        """
        Analyze a JSON value and return its type and structure.

        Args:
            value: The JSON value to analyze
            depth: Current depth in the structure
            max_depth: Maximum depth to analyze

        Returns:
            Dictionary describing the value's type and structure
        """
        if depth > max_depth:
            return {"type": "unknown", "note": "max_depth_reached"}

        if value is None:
            return {"type": "null"}
        elif isinstance(value, bool):
            return {"type": "boolean"}
        elif isinstance(value, int):
            return {"type": "integer"}
        elif isinstance(value, float):
            return {"type": "number"}
        elif isinstance(value, str):
            return {"type": "string", "sample_length": len(value)}
        elif isinstance(value, list):
            if not value:
                return {"type": "array", "item_types": [], "length_range": [0, 0]}

            item_types = set()
            item_schemas = []

            # Sample first few items to determine array structure
            sample_size = min(10, len(value))
            for item in value[:sample_size]:
                item_schema = self.analyze_json_value(item, depth + 1, max_depth)
                item_schemas.append(item_schema)
                item_types.add(item_schema["type"])

            return {
                "type": "array",
                "item_types": sorted(list(item_types)),
                "length_range": [len(value), len(value)],
                "sample_items": item_schemas[:3]  # Keep first 3 as examples
            }
        elif isinstance(value, dict):
            if not value:
                return {"type": "object", "properties": {}, "required_keys": []}

            properties = {}
            for key, val in value.items():
                properties[key] = self.analyze_json_value(val, depth + 1, max_depth)

            return {
                "type": "object",
                "properties": properties,
                "required_keys": list(value.keys())
            }
        else:
            return {"type": "unknown", "note": f"unexpected_type: {type(value)}"}

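    # Illustrative example (not executed): analyze_json_value({"id": 7, "tags": ["a", "b"]})
    # returns a nested description along the lines of
    #   {"type": "object", "required_keys": ["id", "tags"],
    #    "properties": {"id": {"type": "integer"},
    #                   "tags": {"type": "array", "item_types": ["string"], ...}}}
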
    def merge_schemas(self, schema1: Dict[str, Any], schema2: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge two schemas, combining their information.

        Args:
            schema1: First schema
            schema2: Second schema

        Returns:
            Merged schema
        """
        if schema1["type"] != schema2["type"]:
            # Different types, create a union
            return {
                "type": "union",
                "possible_types": sorted(set([schema1["type"], schema2["type"]])),
                "schemas": [schema1, schema2]
            }

        merged = {"type": schema1["type"]}

        if schema1["type"] == "array":
            # Merge array item types
            item_types = set(schema1.get("item_types", []))
            item_types.update(schema2.get("item_types", []))
            merged["item_types"] = sorted(list(item_types))

            # Merge length ranges
            len1 = schema1.get("length_range", [0, 0])
            len2 = schema2.get("length_range", [0, 0])
            merged["length_range"] = [min(len1[0], len2[0]), max(len1[1], len2[1])]

            # Merge sample items if available
            if "sample_items" in schema1 or "sample_items" in schema2:
                merged["sample_items"] = (
                    schema1.get("sample_items", []) +
                    schema2.get("sample_items", [])
                )[:5]  # Keep max 5 samples

        elif schema1["type"] == "object":
            # Merge object properties
            properties = {}
            all_keys = set()

            # Copy properties from first schema
            for key, val in schema1.get("properties", {}).items():
                properties[key] = val
                all_keys.add(key)

            # Merge properties from second schema
            for key, val in schema2.get("properties", {}).items():
                if key in properties:
                    properties[key] = self.merge_schemas(properties[key], val)
                else:
                    properties[key] = val
                all_keys.add(key)

            merged["properties"] = properties
            merged["required_keys"] = sorted(list(all_keys))

        # Copy other fields
        for key in schema1:
            if key not in merged and key != "type":
                merged[key] = schema1[key]

        return merged

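    # Illustrative example (not executed): schemas with conflicting types collapse
    # into a union node, e.g.
    #   merge_schemas({"type": "integer"}, {"type": "string"})
    #   -> {"type": "union", "possible_types": ["integer", "string"], "schemas": [...]}
    # Two object schemas are instead merged key by key, and "required_keys" ends up
    # listing every key seen in either schema.
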
    def _extract_all_keys(self, obj: Any, prefix: str = "") -> List[str]:
        """
        Recursively extract all keys from a JSON object.

        Args:
            obj: JSON object to analyze
            prefix: Prefix for nested keys

        Returns:
            List of all keys found
        """
        keys = []

        if isinstance(obj, dict):
            for key, value in obj.items():
                full_key = f"{prefix}.{key}" if prefix else key
                keys.append(full_key)
                keys.extend(self._extract_all_keys(value, full_key))

        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                keys.extend(self._extract_all_keys(item, f"{prefix}[{i}]" if prefix else f"[{i}]"))

        return keys

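    # Illustrative example (not executed):
    #   _extract_all_keys({"meta": {"source": "web"}, "tags": ["a", {"x": 1}]})
    #   -> ["meta", "meta.source", "tags", "tags[1].x"]
    # Nested object keys are dotted; list elements only contribute keys when they
    # contain objects, using the "[index]" form as a prefix.
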
    def _process_chunk(self, chunk_data: List[str]) -> Tuple[Counter, List[Dict], int, int]:
        """
        Process a chunk of JSONL lines.

        Args:
            chunk_data: List of JSONL lines to process

        Returns:
            Tuple of (keys_counter, sample_objects, valid_count, error_count)
        """
        all_keys = Counter()
        sample_objects = []
        valid_count = 0
        error_count = 0

        for line in chunk_data:
            line = line.strip()
            if not line:
                continue

            try:
                obj = json.loads(line)
                valid_count += 1

                # Collect all keys from this object
                keys = self._extract_all_keys(obj)
                all_keys.update(keys)

                # Keep sample objects for schema analysis
                if len(sample_objects) < self.max_samples:
                    sample_objects.append(obj)

            except json.JSONDecodeError:
                error_count += 1

        return all_keys, sample_objects, valid_count, error_count

    def _read_file_chunks(self, file_path: Path) -> List[List[str]]:
        """
        Read a JSONL file in chunks for parallel processing.

        Args:
            file_path: Path to the JSONL file

        Returns:
            List of chunks, each containing lines to process
        """
        chunks = []
        current_chunk = []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    current_chunk.append(line)

                    if len(current_chunk) >= self.chunk_size:
                        chunks.append(current_chunk)
                        current_chunk = []

            # Add remaining lines
            if current_chunk:
                chunks.append(current_chunk)

        except Exception as e:
            raise RuntimeError(f"Error reading file {file_path}: {e}")

        return chunks

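    # Note: _read_file_chunks keeps every line of the file in memory at once; the
    # chunk size only controls how lines are grouped for workers, so peak memory
    # use grows with the size of the input file.
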
    def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Analyze a JSONL file and return schema information using parallel processing.

        Args:
            file_path: Path to the JSONL file

        Returns:
            Dictionary containing schema analysis results
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        start_time = time.time()
        file_size = file_path.stat().st_size
        print(f"Analyzing {file_path.name} ({file_size / (1024*1024*1024):.2f} GB)...")

        # Statistics
        total_lines = 0
        valid_lines = 0
        error_lines = 0
        all_keys = Counter()
        merged_schema = None
        sample_objects = []

        # Read file in chunks and process in parallel
        chunks = self._read_file_chunks(file_path)

        if len(chunks) == 1 or self.max_workers == 1:
            # Process sequentially for small files or single worker
            for chunk in chunks:
                chunk_keys, chunk_samples, chunk_valid, chunk_errors = self._process_chunk(chunk)
                all_keys.update(chunk_keys)
                sample_objects.extend(chunk_samples)
                valid_lines += chunk_valid
                error_lines += chunk_errors
                total_lines += len(chunk)
        else:
            # Process chunks in parallel
            with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all chunks for processing
                future_to_chunk = {
                    executor.submit(self._process_chunk, chunk): chunk
                    for chunk in chunks
                }

                # Collect results as they complete
                for future in as_completed(future_to_chunk):
                    chunk_keys, chunk_samples, chunk_valid, chunk_errors = future.result()
                    all_keys.update(chunk_keys)
                    sample_objects.extend(chunk_samples)
                    valid_lines += chunk_valid
                    error_lines += chunk_errors
                    total_lines += len(future_to_chunk[future])

        # Limit sample objects
        if len(sample_objects) >= self.max_samples:
            sample_objects = sample_objects[:self.max_samples]

        # Analyze schema from sample objects
        if sample_objects:
            for obj in sample_objects:
                obj_schema = self.analyze_json_value(obj)

                if merged_schema is None:
                    merged_schema = obj_schema
                else:
                    merged_schema = self.merge_schemas(merged_schema, obj_schema)

        # Prepare results
        elapsed_time = time.time() - start_time
        results = {
            "file_path": str(file_path),
            "file_size_bytes": file_size,
            "total_lines": total_lines,
            "valid_lines": valid_lines,
            "error_lines": error_lines,
            "sample_count": len(sample_objects),
            "all_keys": dict(all_keys.most_common()),
            "unique_key_count": len(all_keys),
            "schema": merged_schema,
            "analysis_timestamp": time.time(),
            "processing_time_seconds": elapsed_time,
            "workers_used": self.max_workers,
            "chunks_processed": len(chunks)
        }

        print(f" Completed in {elapsed_time:.2f}s - {valid_lines:,} valid lines, {error_lines:,} errors")

        # Clean up memory
        gc.collect()

        return results

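    # Illustrative usage (the file name is hypothetical):
    #   report = OptimizedJSONLSchemaAnalyzer().analyze_jsonl_file("records.jsonl")
    #   print(report["unique_key_count"], report["valid_lines"])
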
    def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
        """
        Analyze all JSONL files in a directory using parallel processing.

        Args:
            directory_path: Path to directory containing JSONL files
            pattern: File pattern to match (default: *.jsonl)

        Returns:
            Dictionary containing analysis results for all files
        """
        directory_path = Path(directory_path)

        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        # Find all JSONL files
        jsonl_files = list(directory_path.glob(pattern))

        if not jsonl_files:
            print(f"No JSONL files found in {directory_path} with pattern {pattern}")
            return {"files": [], "summary": {}}

        print(f"Found {len(jsonl_files)} JSONL files to analyze using {self.max_workers} workers...")
        start_time = time.time()

        # Sort files by size (largest first) for better load balancing
        jsonl_files.sort(key=lambda f: f.stat().st_size, reverse=True)

        # Analyze files in parallel
        file_results = {}

        if len(jsonl_files) == 1 or self.max_workers == 1:
            # Process sequentially for a single file or a single worker
            for file_path in jsonl_files:
                try:
                    file_results[file_path.name] = self.analyze_jsonl_file(file_path)
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}
        else:
            # Process files in parallel
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all files for analysis
                future_to_file = {
                    executor.submit(self.analyze_jsonl_file, file_path): file_path
                    for file_path in jsonl_files
                }

                # Collect results as they complete
                for future in as_completed(future_to_file):
                    file_path = future_to_file[future]
                    try:
                        result = future.result()
                        file_results[file_path.name] = result
                    except Exception as e:
                        print(f"Error analyzing {file_path.name}: {e}")
                        file_results[file_path.name] = {"error": str(e)}

        # Create summary
        successful_results = [r for r in file_results.values() if "error" not in r]
        summary = {
            "total_files": len(jsonl_files),
            "successfully_analyzed": len(successful_results),
            "total_size_bytes": sum(
                r.get("file_size_bytes", 0) for r in successful_results
            ),
            "total_lines": sum(
                r.get("total_lines", 0) for r in successful_results
            ),
            "total_valid_lines": sum(
                r.get("valid_lines", 0) for r in successful_results
            ),
            "total_processing_time": sum(
                r.get("processing_time_seconds", 0) for r in successful_results
            ),
            "average_processing_speed_mb_per_sec": 0
        }

        # Calculate processing speed
        if summary["total_processing_time"] > 0:
            total_mb = summary["total_size_bytes"] / (1024 * 1024)
            summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]

        elapsed_time = time.time() - start_time
        summary["total_elapsed_time"] = elapsed_time

        print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
        print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
        print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")

        return {
            "directory": str(directory_path),
            "pattern": pattern,
            "files": file_results,
            "summary": summary
        }

    def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
        """
        Save analysis results to a JSON file.

        Args:
            results: Analysis results to save
            output_path: Path to save the results
        """
        output_path = Path(output_path)

        try:
            start_time = time.time()
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            save_time = time.time() - start_time
            file_size = output_path.stat().st_size
            print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")

        except Exception as e:
            raise RuntimeError(f"Error saving results to {output_path}: {e}")


def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser(
        description="Optimized JSONL schema analyzer using multiple cores"
    )
    parser.add_argument(
        "path",
        help="Path to JSONL file or directory containing JSONL files"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file for analysis results (JSON format)"
    )
    parser.add_argument(
        "-p", "--pattern",
        default="*.jsonl",
        help="File pattern when analyzing directory (default: *.jsonl)"
    )
    parser.add_argument(
        "-s", "--max-samples",
        type=int,
        default=1000,
        help="Maximum number of JSON objects to sample per file (default: 1000)"
    )
    parser.add_argument(
        "-w", "--workers",
        type=int,
        default=None,
        help="Number of worker processes (default: CPU count, max 8)"
    )
    parser.add_argument(
        "-c", "--chunk-size",
        type=int,
        default=1000,
        help="Number of lines to process in each chunk (default: 1000)"
    )
    parser.add_argument(
        "--directory",
        action="store_true",
        help="Treat path as directory instead of single file"
    )
    # Note: --profile is accepted but not currently acted upon anywhere in this script
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Enable performance profiling"
    )

    args = parser.parse_args()

    # Initialize analyzer
    analyzer = OptimizedJSONLSchemaAnalyzer(
        max_samples=args.max_samples,
        max_workers=args.workers,
        chunk_size=args.chunk_size
    )

    try:
        start_time = time.time()

        # Analyze file or directory
        if args.directory or Path(args.path).is_dir():
            results = analyzer.analyze_directory(args.path, args.pattern)
        else:
            results = analyzer.analyze_jsonl_file(args.path)

        total_time = time.time() - start_time

        # Save or print results
        if args.output:
            analyzer.save_results(results, args.output)
        else:
            print("\n" + "="*50)
            print("ANALYSIS RESULTS")
            print("="*50)
            print(json.dumps(results, indent=2, ensure_ascii=False))

        print(f"\nTotal analysis time: {total_time:.2f}s")

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
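# Example invocations (the script name and paths below are placeholders):
#   python optimized_jsonl_schema_analyzer.py data/records.jsonl -o schema_report.json
#   python optimized_jsonl_schema_analyzer.py data/ --directory -p "*.jsonl" -w 4 -c 5000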