Migrate to gitea

scripts/Json Analyzer/jsonl_schema_analyzer_optimized.py (new file, 567 lines)

@@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""
Optimized JSONL Schema Analyzer

Analyzes JSONL files to extract and aggregate schema information using multiple cores.
For each JSONL file, it generates a schema showing the JSON structure
and aggregates all possible keys found across all records.
"""

import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count, Manager
import threading
from functools import partial
import gc

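# Example of the kind of report produced for a single file (see analyze_jsonl_file()
# below); the path and numbers here are illustrative, not taken from real data:
#   {
#     "file_path": "data/events.jsonl",
#     "total_lines": 120000,                    # lines read, including blank/invalid ones
#     "valid_lines": 119987,                    # lines that parsed as JSON
#     "error_lines": 13,                        # lines that failed json.loads()
#     "all_keys": {"id": 119987, "user.name": 98000, ...},   # flattened key -> occurrence count
#     "unique_key_count": 42,
#     "schema": {"type": "object", "properties": {...}, ...},
#     "processing_time_seconds": 8.4,
#     ...
#   }
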
class OptimizedJSONLSchemaAnalyzer:
    """Optimized analyzer that uses multiple cores and system resources efficiently."""

    def __init__(self, max_samples: int = 1000, max_workers: int = None, chunk_size: int = 1000):
        """
        Initialize the optimized analyzer.

        Args:
            max_samples: Maximum number of JSON objects to sample per file
            max_workers: Maximum number of worker processes (default: CPU count, capped at 8)
            chunk_size: Number of lines to process in each chunk
        """
        self.max_samples = max_samples
        self.max_workers = max_workers or min(cpu_count(), 8)  # Limit to 8 to avoid memory issues
        self.chunk_size = chunk_size
        self.schema_cache = {}

        print(f"Initialized analyzer with {self.max_workers} workers, chunk size: {self.chunk_size}")

    def analyze_json_value(self, value: Any, depth: int = 0, max_depth: int = 10) -> Dict[str, Any]:
        """
        Analyze a JSON value and return its type and structure.

        Args:
            value: The JSON value to analyze
            depth: Current depth in the structure
            max_depth: Maximum depth to analyze

        Returns:
            Dictionary describing the value's type and structure
        """
        if depth > max_depth:
            return {"type": "unknown", "note": "max_depth_reached"}

        if value is None:
            return {"type": "null"}
        elif isinstance(value, bool):
            return {"type": "boolean"}
        elif isinstance(value, int):
            return {"type": "integer"}
        elif isinstance(value, float):
            return {"type": "number"}
        elif isinstance(value, str):
            return {"type": "string", "sample_length": len(value)}
        elif isinstance(value, list):
            if not value:
                return {"type": "array", "item_types": [], "length_range": [0, 0]}

            item_types = set()
            item_schemas = []

            # Sample the first few items to determine the array structure
            sample_size = min(10, len(value))
            for item in value[:sample_size]:
                item_schema = self.analyze_json_value(item, depth + 1, max_depth)
                item_schemas.append(item_schema)
                item_types.add(item_schema["type"])

            return {
                "type": "array",
                "item_types": sorted(list(item_types)),
                "length_range": [len(value), len(value)],
                "sample_items": item_schemas[:3]  # Keep the first 3 as examples
            }
        elif isinstance(value, dict):
            if not value:
                return {"type": "object", "properties": {}, "required_keys": []}

            properties = {}
            for key, val in value.items():
                properties[key] = self.analyze_json_value(val, depth + 1, max_depth)

            return {
                "type": "object",
                "properties": properties,
                "required_keys": list(value.keys())
            }
        else:
            return {"type": "unknown", "note": f"unexpected_type: {type(value)}"}

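    # For example, analyze_json_value({"score": 1.5, "ok": True, "tags": ["a"]})
    # returns roughly:
    #   {"type": "object",
    #    "properties": {
    #        "score": {"type": "number"},
    #        "ok": {"type": "boolean"},
    #        "tags": {"type": "array", "item_types": ["string"],
    #                 "length_range": [1, 1], "sample_items": [...]}},
    #    "required_keys": ["score", "ok", "tags"]}
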
    def merge_schemas(self, schema1: Dict[str, Any], schema2: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge two schemas, combining their information.

        Args:
            schema1: First schema
            schema2: Second schema

        Returns:
            Merged schema
        """
        if schema1["type"] != schema2["type"]:
            # Different types: create a union
            return {
                "type": "union",
                "possible_types": sorted(set([schema1["type"], schema2["type"]])),
                "schemas": [schema1, schema2]
            }

        merged = {"type": schema1["type"]}

        if schema1["type"] == "array":
            # Merge array item types
            item_types = set(schema1.get("item_types", []))
            item_types.update(schema2.get("item_types", []))
            merged["item_types"] = sorted(list(item_types))

            # Merge length ranges
            len1 = schema1.get("length_range", [0, 0])
            len2 = schema2.get("length_range", [0, 0])
            merged["length_range"] = [min(len1[0], len2[0]), max(len1[1], len2[1])]

            # Merge sample items if available
            if "sample_items" in schema1 or "sample_items" in schema2:
                merged["sample_items"] = (
                    schema1.get("sample_items", []) +
                    schema2.get("sample_items", [])
                )[:5]  # Keep at most 5 samples

        elif schema1["type"] == "object":
            # Merge object properties
            properties = {}
            all_keys = set()

            # Copy properties from the first schema
            for key, val in schema1.get("properties", {}).items():
                properties[key] = val
                all_keys.add(key)

            # Merge properties from the second schema
            for key, val in schema2.get("properties", {}).items():
                if key in properties:
                    properties[key] = self.merge_schemas(properties[key], val)
                else:
                    properties[key] = val
                all_keys.add(key)

            merged["properties"] = properties
            merged["required_keys"] = sorted(list(all_keys))

        # Copy other fields from the first schema
        for key in schema1:
            if key not in merged and key != "type":
                merged[key] = schema1[key]

        return merged

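    # For example, merging {"type": "object", "properties": {"a": {"type": "integer"}},
    # "required_keys": ["a"]} with {"type": "object", "properties": {"b": {"type": "string",
    # "sample_length": 3}}, "required_keys": ["b"]} yields an object schema whose
    # "properties" contain both "a" and "b" and whose "required_keys" are ["a", "b"].
    # Note that "required_keys" is the union of keys seen across merged samples, not an
    # intersection; schemas of differing types are wrapped in a {"type": "union", ...} node.
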
    def _extract_all_keys(self, obj: Any, prefix: str = "") -> List[str]:
        """
        Recursively extract all keys from a JSON object.

        Args:
            obj: JSON object to analyze
            prefix: Prefix for nested keys

        Returns:
            List of all keys found
        """
        keys = []

        if isinstance(obj, dict):
            for key, value in obj.items():
                full_key = f"{prefix}.{key}" if prefix else key
                keys.append(full_key)
                keys.extend(self._extract_all_keys(value, full_key))

        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                keys.extend(self._extract_all_keys(item, f"{prefix}[{i}]" if prefix else f"[{i}]"))

        return keys

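    # For example, _extract_all_keys({"id": 7, "user": {"name": "alice"}, "tags": [{"label": "a"}]})
    # returns:
    #   ["id", "user", "user.name", "tags", "tags[0].label"]
    # (nested dict keys are dotted, list elements are addressed by index).
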
    def _process_chunk(self, chunk_data: List[str]) -> Tuple[Counter, List[Dict], int, int]:
        """
        Process a chunk of JSONL lines.

        Args:
            chunk_data: List of JSONL lines to process

        Returns:
            Tuple of (keys_counter, sample_objects, valid_count, error_count)
        """
        all_keys = Counter()
        sample_objects = []
        valid_count = 0
        error_count = 0

        for line in chunk_data:
            line = line.strip()
            if not line:
                continue

            try:
                obj = json.loads(line)
                valid_count += 1

                # Collect all keys from this object
                keys = self._extract_all_keys(obj)
                all_keys.update(keys)

                # Keep sample objects for schema analysis
                if len(sample_objects) < self.max_samples:
                    sample_objects.append(obj)

            except json.JSONDecodeError:
                error_count += 1

        return all_keys, sample_objects, valid_count, error_count

    def _read_file_chunks(self, file_path: Path) -> List[List[str]]:
        """
        Read a JSONL file in chunks for parallel processing.

        Args:
            file_path: Path to the JSONL file

        Returns:
            List of chunks, each containing lines to process
        """
        chunks = []
        current_chunk = []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    current_chunk.append(line)

                    if len(current_chunk) >= self.chunk_size:
                        chunks.append(current_chunk)
                        current_chunk = []

            # Add remaining lines
            if current_chunk:
                chunks.append(current_chunk)

        except Exception as e:
            raise RuntimeError(f"Error reading file {file_path}: {e}")

        return chunks

    def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Analyze a JSONL file and return schema information using parallel processing.

        Args:
            file_path: Path to the JSONL file

        Returns:
            Dictionary containing schema analysis results
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        start_time = time.time()
        file_size = file_path.stat().st_size
        print(f"Analyzing {file_path.name} ({file_size / (1024*1024*1024):.2f} GB)...")

        # Statistics
        total_lines = 0
        valid_lines = 0
        error_lines = 0
        all_keys = Counter()
        merged_schema = None
        sample_objects = []

        # Read file in chunks and process in parallel
        chunks = self._read_file_chunks(file_path)

        if len(chunks) == 1 or self.max_workers == 1:
            # Process sequentially for small files or a single worker
            for chunk in chunks:
                chunk_keys, chunk_samples, chunk_valid, chunk_errors = self._process_chunk(chunk)
                all_keys.update(chunk_keys)
                sample_objects.extend(chunk_samples)
                valid_lines += chunk_valid
                error_lines += chunk_errors
                total_lines += len(chunk)
        else:
            # Process chunks in parallel
            with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all chunks for processing
                future_to_chunk = {
                    executor.submit(self._process_chunk, chunk): chunk
                    for chunk in chunks
                }

                # Collect results as they complete
                for future in as_completed(future_to_chunk):
                    chunk_keys, chunk_samples, chunk_valid, chunk_errors = future.result()
                    all_keys.update(chunk_keys)
                    sample_objects.extend(chunk_samples)
                    valid_lines += chunk_valid
                    error_lines += chunk_errors
                    total_lines += len(future_to_chunk[future])

        # Limit sample objects
        if len(sample_objects) >= self.max_samples:
            sample_objects = sample_objects[:self.max_samples]

        # Analyze schema from sample objects
        if sample_objects:
            for obj in sample_objects:
                obj_schema = self.analyze_json_value(obj)

                if merged_schema is None:
                    merged_schema = obj_schema
                else:
                    merged_schema = self.merge_schemas(merged_schema, obj_schema)

        # Prepare results
        elapsed_time = time.time() - start_time
        results = {
            "file_path": str(file_path),
            "file_size_bytes": file_size,
            "total_lines": total_lines,
            "valid_lines": valid_lines,
            "error_lines": error_lines,
            "sample_count": len(sample_objects),
            "all_keys": dict(all_keys.most_common()),
            "unique_key_count": len(all_keys),
            "schema": merged_schema,
            "analysis_timestamp": time.time(),
            "processing_time_seconds": elapsed_time,
            "workers_used": self.max_workers,
            "chunks_processed": len(chunks)
        }

        print(f" Completed in {elapsed_time:.2f}s - {valid_lines:,} valid lines, {error_lines:,} errors")

        # Clean up memory
        gc.collect()

        return results

    def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
        """
        Analyze all JSONL files in a directory using parallel processing.

        Args:
            directory_path: Path to directory containing JSONL files
            pattern: File pattern to match (default: *.jsonl)

        Returns:
            Dictionary containing analysis results for all files
        """
        directory_path = Path(directory_path)

        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        # Find all JSONL files
        jsonl_files = list(directory_path.glob(pattern))

        if not jsonl_files:
            print(f"No JSONL files found in {directory_path} with pattern {pattern}")
            return {"files": [], "summary": {}}

        print(f"Found {len(jsonl_files)} JSONL files to analyze using {self.max_workers} workers...")
        start_time = time.time()

        # Sort files by size (largest first) for better load balancing
        jsonl_files.sort(key=lambda f: f.stat().st_size, reverse=True)

        # Analyze files in parallel
        file_results = {}

        if len(jsonl_files) == 1 or self.max_workers == 1:
            # Process sequentially for a single file or a single worker
            for file_path in jsonl_files:
                try:
                    file_results[file_path.name] = self.analyze_jsonl_file(file_path)
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}
        else:
            # Process files in parallel
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all files for analysis
                future_to_file = {
                    executor.submit(self.analyze_jsonl_file, file_path): file_path
                    for file_path in jsonl_files
                }

                # Collect results as they complete
                for future in as_completed(future_to_file):
                    file_path = future_to_file[future]
                    try:
                        result = future.result()
                        file_results[file_path.name] = result
                    except Exception as e:
                        print(f"Error analyzing {file_path.name}: {e}")
                        file_results[file_path.name] = {"error": str(e)}

        # Create summary
        successful_results = [r for r in file_results.values() if "error" not in r]
        summary = {
            "total_files": len(jsonl_files),
            "successfully_analyzed": len(successful_results),
            "total_size_bytes": sum(
                r.get("file_size_bytes", 0) for r in successful_results
            ),
            "total_lines": sum(
                r.get("total_lines", 0) for r in successful_results
            ),
            "total_valid_lines": sum(
                r.get("valid_lines", 0) for r in successful_results
            ),
            "total_processing_time": sum(
                r.get("processing_time_seconds", 0) for r in successful_results
            ),
            "average_processing_speed_mb_per_sec": 0
        }

        # Calculate processing speed
        if summary["total_processing_time"] > 0:
            total_mb = summary["total_size_bytes"] / (1024 * 1024)
            summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]

        elapsed_time = time.time() - start_time
        summary["total_elapsed_time"] = elapsed_time

        print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
        print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
        print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")

        return {
            "directory": str(directory_path),
            "pattern": pattern,
            "files": file_results,
            "summary": summary
        }

    def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
        """
        Save analysis results to a JSON file.

        Args:
            results: Analysis results to save
            output_path: Path to save the results
        """
        output_path = Path(output_path)

        try:
            start_time = time.time()
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            save_time = time.time() - start_time
            file_size = output_path.stat().st_size
            print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")

        except Exception as e:
            raise RuntimeError(f"Error saving results to {output_path}: {e}")

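# Programmatic usage (illustrative; "data/events.jsonl" and "schema_report.json"
# are placeholder paths):
#
#   analyzer = OptimizedJSONLSchemaAnalyzer(max_samples=500, max_workers=4)
#   report = analyzer.analyze_jsonl_file("data/events.jsonl")
#   analyzer.save_results(report, "schema_report.json")
#
# or, for every *.jsonl file under a directory:
#
#   report = analyzer.analyze_directory("data/", pattern="*.jsonl")
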
def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser(
        description="Optimized JSONL schema analyzer using multiple cores"
    )
    parser.add_argument(
        "path",
        help="Path to JSONL file or directory containing JSONL files"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file for analysis results (JSON format)"
    )
    parser.add_argument(
        "-p", "--pattern",
        default="*.jsonl",
        help="File pattern when analyzing a directory (default: *.jsonl)"
    )
    parser.add_argument(
        "-s", "--max-samples",
        type=int,
        default=1000,
        help="Maximum number of JSON objects to sample per file (default: 1000)"
    )
    parser.add_argument(
        "-w", "--workers",
        type=int,
        default=None,
        help="Number of worker processes (default: CPU count, max 8)"
    )
    parser.add_argument(
        "-c", "--chunk-size",
        type=int,
        default=1000,
        help="Number of lines to process in each chunk (default: 1000)"
    )
    parser.add_argument(
        "--directory",
        action="store_true",
        help="Treat path as a directory instead of a single file"
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Enable performance profiling"
    )  # NOTE: --profile is accepted but not currently acted on anywhere in main()

    args = parser.parse_args()

    # Initialize analyzer
    analyzer = OptimizedJSONLSchemaAnalyzer(
        max_samples=args.max_samples,
        max_workers=args.workers,
        chunk_size=args.chunk_size
    )

    try:
        start_time = time.time()

        # Analyze a file or a directory
        if args.directory or Path(args.path).is_dir():
            results = analyzer.analyze_directory(args.path, args.pattern)
        else:
            results = analyzer.analyze_jsonl_file(args.path)

        total_time = time.time() - start_time

        # Save or print results
        if args.output:
            analyzer.save_results(results, args.output)
        else:
            print("\n" + "="*50)
            print("ANALYSIS RESULTS")
            print("="*50)
            print(json.dumps(results, indent=2, ensure_ascii=False))

        print(f"\nTotal analysis time: {total_time:.2f}s")

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
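# Command-line usage (illustrative paths):
#
#   python jsonl_schema_analyzer_optimized.py data/events.jsonl -o schema_report.json
#   python jsonl_schema_analyzer_optimized.py data/ --directory -p "*.jsonl" -w 4 -c 2000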