Migrate to gitea

jonasgaudian
2026-02-13 00:10:40 +01:00
commit 6d06a9e14e
38 changed files with 31427 additions and 0 deletions

380
scripts/02_create_db.py Normal file

@@ -0,0 +1,380 @@
import json
import sqlite3
import pathlib
import traceback
import os
import argparse
import sys
import multiprocessing
import csv
import statistics
from datetime import datetime
try:
import zstandard
except ImportError:
print("ERROR: zstandard library not found. Please install it: pip install zstandard")
sys.exit(1)
# ======================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ======================================================================
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
DEFAULT_LANG_CODE = "fr"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"
DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"
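# zstd settings: level 22 is the library's maximum standard compression level (slowest, best ratio).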
COMPRESSION_LEVEL = 22
DICTIONARY_SAMPLE_COUNT = 200000
DICTIONARY_MAX_SIZE = 10 * 1024 * 1024 # 10MB
DEFAULT_UNCOMPRESSED_ONLY = False  # set to True to skip dictionary training and the compressed DB
DEFAULT_MINIMAL = False
# ======================================================================
def get_file_size_mb(filepath):
return os.path.getsize(filepath) / (1024 * 1024)
def count_lines(filepath):
print("Counting total lines for progress tracking...")
with open(filepath, 'r', encoding='utf-8') as f:
return sum(1 for _ in f)
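# Worker for the compressed build: parses a chunk of JSONL lines, strips the
# "word"/"pos" keys, and zstd-compresses the remaining JSON. The trained
# dictionary is passed as raw bytes and rebuilt here so the chunk can be
# processed in a separate worker process.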
def process_chunk(chunk, compression_dict_bytes):
import zstandard
compression_dict = zstandard.ZstdCompressionDict(compression_dict_bytes)
local_compressor = zstandard.ZstdCompressor(level=COMPRESSION_LEVEL, dict_data=compression_dict)
results = []
for line in chunk:
if not line.strip(): continue
try:
entry = json.loads(line)
word = entry.get("word")
pos = entry.get("pos", "")
if not word: continue
data_to_compress = entry.copy()
data_to_compress.pop("word", None)
data_to_compress.pop("pos", None)
value_bytes = json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8')
compressed_blob = local_compressor.compress(value_bytes)
results.append((word, pos, compressed_blob, len(value_bytes)))
except Exception:
pass
return results
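# Worker for the uncompressed build: same parsing as process_chunk, but the
# remaining JSON is stored as plain text so entry sizes can be inspected later.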
def process_chunk_uncompressed(chunk):
results = []
for line in chunk:
if not line.strip(): continue
try:
entry = json.loads(line)
word = entry.get("word")
pos = entry.get("pos", "")
if not word: continue
data_to_store = entry.copy()
data_to_store.pop("word", None)
data_to_store.pop("pos", None)
value_str = json.dumps(data_to_store, ensure_ascii=False)
value_bytes = value_str.encode('utf-8')
results.append((word, pos, value_str, len(value_bytes)))
except Exception:
pass
return results
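# Trains one zstd dictionary candidate: takes evenly spaced samples from the
# input lines (capped at sample_count), each sample being the entry's JSON with
# "word"/"pos" removed to match what will be compressed later. Returns the
# configuration plus the trained dictionary bytes, or None if nothing was sampled.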
def train_config(config, lines):
import zstandard
sample_count, max_size = config
step = max(1, len(lines) // sample_count)
samples = []
for j in range(0, len(lines), step):
line = lines[j]
if not line.strip(): continue
entry = json.loads(line)
data_to_compress = entry.copy()
data_to_compress.pop("word", None)
data_to_compress.pop("pos", None)
samples.append(json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8'))
if len(samples) >= sample_count: break
if not samples:
return None
compression_dict = zstandard.train_dictionary(max_size, samples)
dict_bytes = compression_dict.as_bytes()
return (sample_count, max_size, len(dict_bytes), dict_bytes)
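# Main build routine: optionally trains a compression dictionary (Pass 1),
# builds the compressed SQLite DB with an FTS4 index (Passes 2-3), and always
# builds an uncompressed companion DB with per-entry size statistics (Passes 2b-3b).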
def create_database(lang_code, input_file, output_dir, intermediate_dir, uncompressed_only=False, minimal=False):
database_file = output_dir / f"dictionary_{lang_code}.db"
dictionary_file = output_dir / f"dictionary_{lang_code}.zstdict"
# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Settings:\n - Language: {lang_code}\n - Input: {input_file}\n - DB Output: {database_file}\n - Dict Output: {dictionary_file}")
if not input_file.exists():
print(f"Error: Input file not found at {input_file}")
sys.exit(1)
total_lines = count_lines(input_file)
print(f"Total lines to process: {total_lines:,}")
with open(input_file, "r", encoding="utf-8") as f:
lines = f.readlines()
num_processes = multiprocessing.cpu_count()
chunk_size = len(lines) // num_processes + 1
chunks = [lines[i:i+chunk_size] for i in range(0, len(lines), chunk_size)]
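# Split the input into roughly one chunk per CPU core so the parse/compress
# work can be spread across a multiprocessing pool.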
# --- Pass 1: Training Compression Dictionary ---
if not uncompressed_only:
print(f"\n--- Pass 1: Training Compression Dictionary ---")
try:
if minimal:
sample_count = DICTIONARY_SAMPLE_COUNT
max_size = DICTIONARY_MAX_SIZE
config = (sample_count, max_size)
result = train_config(config, lines)
if result is None:
print("Error: No valid dictionary trained.")
sys.exit(1)
sample_count, max_size, dict_size, dict_bytes = result
print(f"Using default configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
else:
# Generate 20 configurations to try (varying both sample_count and max_size)
configs = []
for i in range(20):
sample_count = 100000 + (i % 5) * 200000 # 5 different: 100k, 300k, 500k, 700k, 900k
max_size = (3 + (i // 5) * 2) * 1024 * 1024 # 4 different: 3MB, 5MB, 7MB, 9MB
configs.append((sample_count, max_size))
pool = multiprocessing.Pool(processes=min(20, multiprocessing.cpu_count()))
results = pool.starmap(train_config, [(config, lines) for config in configs])
pool.close()
pool.join()
# Find the best configuration (largest dictionary size)
valid_results = [r for r in results if r is not None]
if not valid_results:
print("Error: No valid dictionaries trained.")
sys.exit(1)
print("All configurations results:")
for sample_count, max_size, dict_size, _ in valid_results:
print(f" samples={sample_count}, max_size={max_size/1024/1024:.1f}MB -> dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
best_result = max(valid_results, key=lambda x: x[2])
sample_count, max_size, dict_size, dict_bytes = best_result
print(f"\nBest configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
compression_dict = zstandard.ZstdCompressionDict(dict_bytes)
with open(dictionary_file, "wb") as f:
f.write(dict_bytes)
print(f"Saved dictionary to {dictionary_file}")
except Exception as e:
print(f"Error during training: {e}")
traceback.print_exc()
sys.exit(1)
if not uncompressed_only:
# --- Database Setup ---
if database_file.exists():
os.remove(database_file)
conn = sqlite3.connect(database_file)
conn.execute("PRAGMA journal_mode=WAL;")
conn.execute("PRAGMA auto_vacuum=full;")
cursor = conn.cursor()
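# Note: the per-entry compression is done inside the worker processes
# (process_chunk); this compressor does not appear to be used for the bulk
# insert below.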
compressor = zstandard.ZstdCompressor(level=COMPRESSION_LEVEL, dict_data=compression_dict)
cursor.execute('''
CREATE TABLE dictionary_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT NOT NULL,
pos TEXT,
data_blob BLOB,
uncompressed_size INTEGER
);
''')
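# data_blob holds the zstd-compressed JSON of each entry (minus "word"/"pos");
# uncompressed_size records the original payload size in bytes.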
# --- Pass 2: Insert Data ---
print("\n--- Pass 2: Inserting Data ---")
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
print("Processing chunks in parallel for compressed DB...")
all_results = pool.starmap(process_chunk, zip(chunks, [dict_bytes] * len(chunks)))
data_to_insert = [item for sublist in all_results for item in sublist]
print(f"Collected {len(data_to_insert)} items to insert into compressed DB.")
cursor.executemany("INSERT INTO dictionary_data (word, pos, data_blob, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert)
word_counter = len(data_to_insert)
conn.commit()
print(f"Inserted {word_counter:,} words into compressed DB.")
# --- Pass 3: FTS & Cleanup ---
print("Creating FTS4 index...")
cursor.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
cursor.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
conn.commit()
print("Running VACUUM...")
cursor.execute('VACUUM')
conn.commit()
conn.close()
db_size_mb = get_file_size_mb(database_file)
dict_size_mb = get_file_size_mb(dictionary_file)
print(f"\n{'='*60}")
print(f"SUCCESS: Database created.")
print(f"{'='*60}")
print(f"Final Database Size: {db_size_mb:.2f} MB ({database_file.name})")
print(f"Final Dictionary Size: {dict_size_mb:.2f} MB ({dictionary_file.name})")
print(f"{'='*60}")
# --- Create Uncompressed Database ---
print(f"\n--- Creating Uncompressed Database ---")
uncompressed_db_file = intermediate_dir / f"dictionary_{lang_code}_uncompressed.db"
# Ensure intermediate directory exists
intermediate_dir.mkdir(parents=True, exist_ok=True)
if uncompressed_db_file.exists():
os.remove(uncompressed_db_file)
conn2 = sqlite3.connect(uncompressed_db_file)
conn2.execute("PRAGMA journal_mode=WAL;")
conn2.execute("PRAGMA auto_vacuum=full;")
cursor2 = conn2.cursor()
cursor2.execute('''
CREATE TABLE dictionary_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT NOT NULL,
pos TEXT,
data TEXT,
uncompressed_size INTEGER
);
''')
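# Same schema as the compressed DB, except the payload is stored as plain JSON
# text in the data column.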
# --- Pass 2b: Insert Uncompressed Data ---
print("\n--- Pass 2b: Inserting Uncompressed Data ---")
print("Processing chunks in parallel for uncompressed DB...")
if uncompressed_only:
pool_uncomp = multiprocessing.Pool(processes=multiprocessing.cpu_count())
all_results2 = pool_uncomp.map(process_chunk_uncompressed, chunks)
pool_uncomp.close()
pool_uncomp.join()
else:
all_results2 = pool.map(process_chunk_uncompressed, chunks)
pool.close()
pool.join()
data_to_insert2 = [item for sublist in all_results2 for item in sublist]
print(f"Collected {len(data_to_insert2)} items to insert into uncompressed DB.")
cursor2.executemany("INSERT INTO dictionary_data (word, pos, data, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert2)
word_counter2 = len(data_to_insert2)
conn2.commit()
print(f"Inserted {word_counter2:,} words into uncompressed DB.")
# --- Pass 3b: FTS & Cleanup ---
print("Creating FTS4 index for uncompressed DB...")
cursor2.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
cursor2.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
conn2.commit()
print("Running VACUUM on uncompressed DB...")
cursor2.execute('VACUUM')
conn2.commit()
# Compute and print uncompressed_size statistics
sizes = [row[0] for row in cursor2.execute("SELECT uncompressed_size FROM dictionary_data")]
if sizes:
min_size = min(sizes)
max_size = max(sizes)
avg_size = statistics.mean(sizes)
median_size = statistics.median(sizes)
try:
stdev_size = statistics.stdev(sizes)
except statistics.StatisticsError:
stdev_size = 0.0
print(f"\nUncompressed Size Statistics:")
print(f" Count: {len(sizes):,}")
print(f" Min: {min_size}")
print(f" Max: {max_size}")
print(f" Avg: {avg_size:.2f}")
print(f" Median: {median_size}")
print(f" Std Dev: {stdev_size:.2f}")
# Outliers: top 10 largest entries
outliers = cursor2.execute("SELECT word, uncompressed_size FROM dictionary_data ORDER BY uncompressed_size DESC LIMIT 10").fetchall()
print(f"\nTop 10 largest entries by uncompressed size:")
for word, size in outliers:
print(f" {word}: {size:,} bytes")
conn2.close()
uncompressed_db_size_mb = get_file_size_mb(uncompressed_db_file)
print(f"\n{'='*60}")
print(f"Uncompressed Database Size: {uncompressed_db_size_mb:.2f} MB ({uncompressed_db_file.name})")
print(f"{'='*60}")
def main():
parser = argparse.ArgumentParser(description="Compress dictionary JSONL into SQLite DB.")
parser.add_argument("--lang", type=str, default=DEFAULT_LANG_CODE,
help="Language code (e.g., 'de'). Used for naming output files.")
parser.add_argument("--input", type=pathlib.Path,
help="Full path to input JSONL. If omitted, tries to find it in standard intermediate folder based on lang.")
parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR,
help="Directory to save .db and .zstdict files.")
parser.add_argument("--intermediate-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR,
help="Directory to save uncompressed .db file.")
args = parser.parse_args()
# Determine input file if not explicitly provided
if args.input:
input_file = args.input
else:
# Try to guess the filename based on the language code matching script 1's output
filename = f"{args.lang.capitalize()}_universal.jsonl"
input_file = DEFAULT_INTERMEDIATE_DIR / filename
create_database(args.lang, input_file, args.output_dir, args.intermediate_dir, DEFAULT_UNCOMPRESSED_ONLY, DEFAULT_MINIMAL)
# Log stats to CSV
stats_file = ROOT_DIR / "processing_stats.csv"
timestamp = datetime.now().isoformat()
files_to_log = [
(args.output_dir / f"dictionary_{args.lang}.db", "compressed_db"),
(args.output_dir / f"dictionary_{args.lang}.zstdict", "compression_dict"),
(args.intermediate_dir / f"dictionary_{args.lang}_uncompressed.db", "uncompressed_db")
]
write_header = not stats_file.exists()
with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
if write_header:
writer.writerow(['timestamp', 'output_file', 'size_bytes', 'type'])
for file_path, file_type in files_to_log:
if file_path.exists():
size = file_path.stat().st_size
writer.writerow([timestamp, str(file_path), size, file_type])
if __name__ == "__main__":
main()
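# Reading the compressed DB back (a sketch, not part of this script): a consumer
# would load the .zstdict file and decompress each row's data_blob, roughly:
#
#   dict_bytes = open("outputs/dictionary_fr.zstdict", "rb").read()
#   dctx = zstandard.ZstdDecompressor(
#       dict_data=zstandard.ZstdCompressionDict(dict_bytes))
#   entry = json.loads(dctx.decompress(data_blob))
#
# The paths and column names above follow the defaults used in this file.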