import json
import sqlite3
import pathlib
import traceback
import os
import argparse
import sys
import multiprocessing
import csv
import statistics
from datetime import datetime

try:
    import zstandard
except ImportError:
    print("ERROR: zstandard library not found. Please install it: pip install zstandard")
    sys.exit(1)

# ======================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ======================================================================
try:
    SCRIPT_DIR = pathlib.Path(__file__).parent
    ROOT_DIR = SCRIPT_DIR.parent
except NameError:
    # __file__ is undefined in interactive sessions; fall back to the CWD.
    SCRIPT_DIR = pathlib.Path.cwd()
    ROOT_DIR = SCRIPT_DIR.parent

DEFAULT_LANG_CODE = "fr"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"
DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"
COMPRESSION_LEVEL = 22
DICTIONARY_SAMPLE_COUNT = 200000
DICTIONARY_MAX_SIZE = 10 * 1024 * 1024  # 10 MB
DEFAULT_UNCOMPRESSED_ONLY = False  # set True to skip the compressed DB and build only the uncompressed one
DEFAULT_MINIMAL = False  # set True to skip the dictionary-configuration sweep and use the defaults above
# ======================================================================


def get_file_size_mb(filepath):
    return os.path.getsize(filepath) / (1024 * 1024)


def count_lines(filepath):
    print("Counting total lines for progress tracking...")
    with open(filepath, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


def process_chunk(chunk, compression_dict_bytes):
    """Compress one chunk of JSONL lines in a worker process."""
    import zstandard
    compression_dict = zstandard.ZstdCompressionDict(compression_dict_bytes)
    local_compressor = zstandard.ZstdCompressor(level=COMPRESSION_LEVEL, dict_data=compression_dict)
    results = []
    for line in chunk:
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
            word = entry.get("word")
            pos = entry.get("pos", "")
            if not word:
                continue
            # Store everything except the key columns inside the compressed blob.
            data_to_compress = entry.copy()
            data_to_compress.pop("word", None)
            data_to_compress.pop("pos", None)
            value_bytes = json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8')
            compressed_blob = local_compressor.compress(value_bytes)
            results.append((word, pos, compressed_blob, len(value_bytes)))
        except Exception:
            # Skip malformed lines rather than aborting the whole chunk.
            pass
    return results


def process_chunk_uncompressed(chunk):
    """Same as process_chunk, but keeps the entry JSON as plain text."""
    results = []
    for line in chunk:
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
            word = entry.get("word")
            pos = entry.get("pos", "")
            if not word:
                continue
            data_to_store = entry.copy()
            data_to_store.pop("word", None)
            data_to_store.pop("pos", None)
            value_str = json.dumps(data_to_store, ensure_ascii=False)
            value_bytes = value_str.encode('utf-8')
            results.append((word, pos, value_str, len(value_bytes)))
        except Exception:
            pass
    return results


def train_config(config, lines):
    """Train a zstd dictionary for one (sample_count, max_size) configuration."""
    import zstandard
    sample_count, max_size = config
    step = max(1, len(lines) // sample_count)
    samples = []
    for j in range(0, len(lines), step):
        line = lines[j]
        if not line.strip():
            continue
        entry = json.loads(line)
        data_to_compress = entry.copy()
        data_to_compress.pop("word", None)
        data_to_compress.pop("pos", None)
        samples.append(json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8'))
        if len(samples) >= sample_count:
            break
    if not samples:
        return None
    compression_dict = zstandard.train_dictionary(max_size, samples)
    dict_bytes = compression_dict.as_bytes()
    return (sample_count, max_size, len(dict_bytes), dict_bytes)


def create_database(lang_code, input_file, output_dir, intermediate_dir, uncompressed_only=False, minimal=False):
    database_file = output_dir / f"dictionary_{lang_code}.db"
    dictionary_file = output_dir / f"dictionary_{lang_code}.zstdict"

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Settings:\n - Language: {lang_code}\n - Input: {input_file}\n - DB Output: {database_file}\n - Dict Output: {dictionary_file}")
    if not input_file.exists():
        print(f"Error: Input file not found at {input_file}")
        sys.exit(1)

    total_lines = count_lines(input_file)
    print(f"Total lines to process: {total_lines:,}")

    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Split the input into one chunk per CPU for the worker pools below.
    num_processes = multiprocessing.cpu_count()
    chunk_size = len(lines) // num_processes + 1
    chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]

    # --- Pass 1: Training Compression Dictionary ---
    if not uncompressed_only:
        print("\n--- Pass 1: Training Compression Dictionary ---")
        try:
            if minimal:
                sample_count = DICTIONARY_SAMPLE_COUNT
                max_size = DICTIONARY_MAX_SIZE
                config = (sample_count, max_size)
                result = train_config(config, lines)
                if result is None:
                    print("Error: No valid dictionary trained.")
                    sys.exit(1)
                sample_count, max_size, dict_size, dict_bytes = result
                print(f"Using default configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
            else:
                # Generate 20 configurations to try (varying both sample_count and max_size)
                configs = []
                for i in range(20):
                    sample_count = 100000 + (i % 5) * 200000     # 5 values: 100k, 300k, 500k, 700k, 900k
                    max_size = (3 + (i // 5) * 2) * 1024 * 1024  # 4 values: 3MB, 5MB, 7MB, 9MB
                    configs.append((sample_count, max_size))
                pool = multiprocessing.Pool(processes=min(20, multiprocessing.cpu_count()))
                results = pool.starmap(train_config, [(config, lines) for config in configs])
                pool.close()
                pool.join()
                # Find the best configuration (largest dictionary size)
                valid_results = [r for r in results if r is not None]
                if not valid_results:
                    print("Error: No valid dictionaries trained.")
                    sys.exit(1)
                print("All configuration results:")
                for sample_count, max_size, dict_size, _ in valid_results:
                    print(f"  samples={sample_count}, max_size={max_size/1024/1024:.1f}MB -> dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
                best_result = max(valid_results, key=lambda x: x[2])
                sample_count, max_size, dict_size, dict_bytes = best_result
                print(f"\nBest configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
            compression_dict = zstandard.ZstdCompressionDict(dict_bytes)
            with open(dictionary_file, "wb") as f:
                f.write(dict_bytes)
            print(f"Saved dictionary to {dictionary_file}")
        except Exception as e:
            print(f"Error during training: {e}")
            traceback.print_exc()
            sys.exit(1)

    if not uncompressed_only:
        # --- Database Setup ---
        if database_file.exists():
            os.remove(database_file)
        conn = sqlite3.connect(database_file)
        conn.execute("PRAGMA journal_mode=WAL;")
        conn.execute("PRAGMA auto_vacuum=full;")
        cursor = conn.cursor()
        compressor = zstandard.ZstdCompressor(level=COMPRESSION_LEVEL, dict_data=compression_dict)
        cursor.execute('''
            CREATE TABLE dictionary_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word TEXT NOT NULL,
                pos TEXT,
                data_blob BLOB,
                uncompressed_size INTEGER
            );
        ''')

        # --- Pass 2: Insert Data ---
        print("\n--- Pass 2: Inserting Data ---")
        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
        print("Processing chunks in parallel for compressed DB...")
        all_results = pool.starmap(process_chunk, zip(chunks, [dict_bytes] * len(chunks)))
        data_to_insert = [item for sublist in all_results for item in sublist]
        print(f"Collected {len(data_to_insert)} items to insert into compressed DB.")
        cursor.executemany("INSERT INTO dictionary_data (word, pos, data_blob, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert)
        word_counter = len(data_to_insert)
        conn.commit()
        print(f"Inserted {word_counter:,} words into compressed DB.")

        # --- Pass 3: FTS & Cleanup ---
        print("Creating FTS4 index...")
        cursor.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
        cursor.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
        conn.commit()
        print("Running VACUUM...")
        cursor.execute('VACUUM')
        conn.commit()
        conn.close()

        db_size_mb = get_file_size_mb(database_file)
        dict_size_mb = get_file_size_mb(dictionary_file)
        print(f"\n{'='*60}")
        print("SUCCESS: Database created.")
        print(f"{'='*60}")
        print(f"Final Database Size: {db_size_mb:.2f} MB ({database_file.name})")
        print(f"Final Dictionary Size: {dict_size_mb:.2f} MB ({dictionary_file.name})")
        print(f"{'='*60}")

    # --- Create Uncompressed Database ---
    print("\n--- Creating Uncompressed Database ---")
    uncompressed_db_file = intermediate_dir / f"dictionary_{lang_code}_uncompressed.db"
    # Ensure intermediate directory exists
    intermediate_dir.mkdir(parents=True, exist_ok=True)
    if uncompressed_db_file.exists():
        os.remove(uncompressed_db_file)
    conn2 = sqlite3.connect(uncompressed_db_file)
    conn2.execute("PRAGMA journal_mode=WAL;")
    conn2.execute("PRAGMA auto_vacuum=full;")
    cursor2 = conn2.cursor()
    cursor2.execute('''
        CREATE TABLE dictionary_data (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            word TEXT NOT NULL,
            pos TEXT,
            data TEXT,
            uncompressed_size INTEGER
        );
    ''')

    # --- Pass 2b: Insert Uncompressed Data ---
    print("\n--- Pass 2b: Inserting Uncompressed Data ---")
    print("Processing chunks in parallel for uncompressed DB...")
    if uncompressed_only:
        # No pool exists yet in this mode, so create one just for this pass.
        pool_uncomp = multiprocessing.Pool(processes=multiprocessing.cpu_count())
        all_results2 = pool_uncomp.map(process_chunk_uncompressed, chunks)
        pool_uncomp.close()
        pool_uncomp.join()
    else:
        # Reuse the pool created in Pass 2.
        all_results2 = pool.map(process_chunk_uncompressed, chunks)
        pool.close()
        pool.join()
    data_to_insert2 = [item for sublist in all_results2 for item in sublist]
    print(f"Collected {len(data_to_insert2)} items to insert into uncompressed DB.")
    cursor2.executemany("INSERT INTO dictionary_data (word, pos, data, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert2)
    word_counter2 = len(data_to_insert2)
    conn2.commit()
    print(f"Inserted {word_counter2:,} words into uncompressed DB.")

    # --- Pass 3b: FTS & Cleanup ---
    print("Creating FTS4 index for uncompressed DB...")
    cursor2.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
    cursor2.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
    conn2.commit()
    print("Running VACUUM on uncompressed DB...")
    cursor2.execute('VACUUM')
    conn2.commit()

    # Compute and print uncompressed_size statistics
    sizes = [row[0] for row in cursor2.execute("SELECT uncompressed_size FROM dictionary_data")]
    if sizes:
        min_size = min(sizes)
        max_size = max(sizes)
        avg_size = statistics.mean(sizes)
        median_size = statistics.median(sizes)
        try:
            stdev_size = statistics.stdev(sizes)
        except statistics.StatisticsError:
            # stdev needs at least two data points.
            stdev_size = 0.0
        print("\nUncompressed Size Statistics:")
        print(f"  Count: {len(sizes):,}")
        print(f"  Min: {min_size}")
        print(f"  Max: {max_size}")
        print(f"  Avg: {avg_size:.2f}")
        print(f"  Median: {median_size}")
        print(f"  Std Dev: {stdev_size:.2f}")
        # Outliers: top 10 largest entries
        outliers = cursor2.execute("SELECT word, uncompressed_size FROM dictionary_data ORDER BY uncompressed_size DESC LIMIT 10").fetchall()
        print("\nTop 10 largest entries by uncompressed size:")
        for word, size in outliers:
            print(f"  {word}: {size:,} bytes")

    conn2.close()

    uncompressed_db_size_mb = get_file_size_mb(uncompressed_db_file)
    print(f"\n{'='*60}")
    print(f"Uncompressed Database Size: {uncompressed_db_size_mb:.2f} MB ({uncompressed_db_file.name})")
    print(f"{'='*60}")


def main():
    parser = argparse.ArgumentParser(description="Compress dictionary JSONL into SQLite DB.")
    parser.add_argument("--lang", type=str, default=DEFAULT_LANG_CODE, help="Language code (e.g., 'de'). Used for naming output files.")
    parser.add_argument("--input", type=pathlib.Path, help="Full path to input JSONL. If omitted, tries to find it in the standard intermediate folder based on lang.")
    parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR, help="Directory to save .db and .zstdict files.")
    parser.add_argument("--intermediate-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR, help="Directory to save the uncompressed .db file.")
    args = parser.parse_args()

    # Determine input file if not explicitly provided
    if args.input:
        input_file = args.input
    else:
        # Guess the filename from the language code, matching the naming of script 1's output
        filename = f"{args.lang.capitalize()}_universal.jsonl"
        input_file = DEFAULT_INTERMEDIATE_DIR / filename

    create_database(args.lang, input_file, args.output_dir, args.intermediate_dir, DEFAULT_UNCOMPRESSED_ONLY, DEFAULT_MINIMAL)

    # Log stats to CSV
    stats_file = ROOT_DIR / "processing_stats.csv"
    timestamp = datetime.now().isoformat()
    files_to_log = [
        (args.output_dir / f"dictionary_{args.lang}.db", "compressed_db"),
        (args.output_dir / f"dictionary_{args.lang}.zstdict", "compression_dict"),
        (args.intermediate_dir / f"dictionary_{args.lang}_uncompressed.db", "uncompressed_db"),
    ]
    write_header = not stats_file.exists()
    with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['timestamp', 'output_file', 'size_bytes', 'type'])
        for file_path, file_type in files_to_log:
            if file_path.exists():
                size = file_path.stat().st_size
                writer.writerow([timestamp, str(file_path), size, file_type])


if __name__ == "__main__":
    main()
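

# ----------------------------------------------------------------------
# Illustrative sketch only (never called by this script): how a reader
# application could fetch one entry back out of the compressed DB produced
# above. The function name and the example word are placeholders, and the
# use of ZstdDecompressor on the consumer side is an assumption; the table
# and column names and the .zstdict file match what this script writes.
#
# Example build invocation (script filename is a placeholder):
#   python build_dictionary_db.py --lang fr
# ----------------------------------------------------------------------
def example_lookup(database_file, dictionary_file, word="bonjour"):
    """Sketch: load the trained zstd dictionary and decompress one entry's blob."""
    with open(dictionary_file, "rb") as f:
        dict_data = zstandard.ZstdCompressionDict(f.read())
    decompressor = zstandard.ZstdDecompressor(dict_data=dict_data)
    conn = sqlite3.connect(database_file)
    row = conn.execute(
        "SELECT pos, data_blob FROM dictionary_data WHERE word = ? LIMIT 1",
        (word,),
    ).fetchone()
    conn.close()
    if row is None:
        return None
    pos, blob = row
    # The blob holds the entry JSON minus the "word"/"pos" keys; restore them.
    entry = json.loads(decompressor.decompress(blob))
    entry["word"] = word
    entry["pos"] = pos
    return entry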