Migrate to gitea
scripts/02_create_db.py | 380 lines | Normal file
@@ -0,0 +1,380 @@
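# Builds dictionary SQLite databases from a JSONL export: trains a zstandard
# compression dictionary, stores each entry's compressed JSON payload in SQLite
# with an FTS4 index on word/pos, and also produces an uncompressed reference DB
# plus size statistics.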
import json
import sqlite3
import pathlib
import traceback
import os
import argparse
import sys
import multiprocessing
import csv
import statistics
from datetime import datetime

try:
    import zstandard
except ImportError:
    print("ERROR: zstandard library not found. Please install it: pip install zstandard")
    sys.exit(1)

# ======================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ======================================================================

try:
    SCRIPT_DIR = pathlib.Path(__file__).parent
    ROOT_DIR = SCRIPT_DIR.parent
except NameError:
    SCRIPT_DIR = pathlib.Path.cwd()
    ROOT_DIR = SCRIPT_DIR.parent

DEFAULT_LANG_CODE = "fr"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"
DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"

COMPRESSION_LEVEL = 22
DICTIONARY_SAMPLE_COUNT = 200000
DICTIONARY_MAX_SIZE = 10 * 1024 * 1024  # 10MB

DEFAULT_UNCOMPRESSED_ONLY = False  # Set to True to skip compression and build only the uncompressed DB
DEFAULT_MINIMAL = False

# ======================================================================

def get_file_size_mb(filepath):
    return os.path.getsize(filepath) / (1024 * 1024)

def count_lines(filepath):
    print("Counting total lines for progress tracking...")
    with open(filepath, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

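# Worker for the compressed DB: parses a chunk of JSONL lines and returns
# (word, pos, compressed_blob, uncompressed_size) tuples. The compressor is rebuilt
# inside the worker from the raw dictionary bytes (level 22 mirrors COMPRESSION_LEVEL),
# since the dictionary is handed to each process as plain bytes.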
def process_chunk(chunk, compression_dict_bytes):
    import zstandard
    compression_dict = zstandard.ZstdCompressionDict(compression_dict_bytes)
    local_compressor = zstandard.ZstdCompressor(level=22, dict_data=compression_dict)
    results = []
    for line in chunk:
        if not line.strip(): continue
        try:
            entry = json.loads(line)
            word = entry.get("word")
            pos = entry.get("pos", "")
            if not word: continue
            data_to_compress = entry.copy()
            data_to_compress.pop("word", None)
            data_to_compress.pop("pos", None)
            value_bytes = json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8')
            compressed_blob = local_compressor.compress(value_bytes)
            results.append((word, pos, compressed_blob, len(value_bytes)))
        except Exception:
            pass
    return results

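# Worker for the uncompressed DB: same parsing as process_chunk, but keeps the
# entry payload as plain JSON text and records its UTF-8 byte length instead of
# compressing it.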
def process_chunk_uncompressed(chunk):
    results = []
    for line in chunk:
        if not line.strip(): continue
        try:
            entry = json.loads(line)
            word = entry.get("word")
            pos = entry.get("pos", "")
            if not word: continue
            data_to_store = entry.copy()
            data_to_store.pop("word", None)
            data_to_store.pop("pos", None)
            value_str = json.dumps(data_to_store, ensure_ascii=False)
            value_bytes = value_str.encode('utf-8')
            results.append((word, pos, value_str, len(value_bytes)))
        except Exception:
            pass
    return results

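# Trains a zstandard dictionary for one (sample_count, max_size) configuration,
# sampling entry payloads evenly across the input lines. Returns
# (sample_count, max_size, dictionary_size, dictionary_bytes), or None if no
# samples were collected.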
def train_config(config, lines):
    import zstandard
    sample_count, max_size = config
    step = max(1, len(lines) // sample_count)
    samples = []
    for j in range(0, len(lines), step):
        line = lines[j]
        if not line.strip(): continue
        entry = json.loads(line)
        data_to_compress = entry.copy()
        data_to_compress.pop("word", None)
        data_to_compress.pop("pos", None)
        samples.append(json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8'))
        if len(samples) >= sample_count: break
    if not samples:
        return None
    compression_dict = zstandard.train_dictionary(max_size, samples)
    dict_bytes = compression_dict.as_bytes()
    return (sample_count, max_size, len(dict_bytes), dict_bytes)

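# Builds the output databases for one language: optionally trains a zstandard
# dictionary and writes the compressed SQLite DB (with FTS4 index), then always
# builds an uncompressed reference DB in intermediate_dir and prints size statistics.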
def create_database(lang_code, input_file, output_dir, intermediate_dir, uncompressed_only=False, minimal=False):

    database_file = output_dir / f"dictionary_{lang_code}.db"
    dictionary_file = output_dir / f"dictionary_{lang_code}.zstdict"

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Settings:\n - Language: {lang_code}\n - Input: {input_file}\n - DB Output: {database_file}\n - Dict Output: {dictionary_file}")

    if not input_file.exists():
        print(f"Error: Input file not found at {input_file}")
        sys.exit(1)

    total_lines = count_lines(input_file)
    print(f"Total lines to process: {total_lines:,}")

    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    num_processes = multiprocessing.cpu_count()
    chunk_size = len(lines) // num_processes + 1
    chunks = [lines[i:i+chunk_size] for i in range(0, len(lines), chunk_size)]

    # --- Pass 1: Training Compression Dictionary ---
    if not uncompressed_only:
        print(f"\n--- Pass 1: Training Compression Dictionary ---")
        try:
            if minimal:
                sample_count = DICTIONARY_SAMPLE_COUNT
                max_size = DICTIONARY_MAX_SIZE
                config = (sample_count, max_size)
                result = train_config(config, lines)
                if result is None:
                    print("Error: No valid dictionary trained.")
                    sys.exit(1)
                sample_count, max_size, dict_size, dict_bytes = result
                print(f"Using default configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
            else:
                # Generate 20 configurations to try (varying both sample_count and max_size)
                configs = []
                for i in range(20):
                    sample_count = 100000 + (i % 5) * 200000  # 5 values: 100k, 300k, 500k, 700k, 900k
                    max_size = (3 + (i // 5) * 2) * 1024 * 1024  # 4 values: 3MB, 5MB, 7MB, 9MB
                    configs.append((sample_count, max_size))

                pool = multiprocessing.Pool(processes=min(20, multiprocessing.cpu_count()))
                results = pool.starmap(train_config, [(config, lines) for config in configs])
                pool.close()
                pool.join()

                # Find the best configuration (largest dictionary size)
                valid_results = [r for r in results if r is not None]
                if not valid_results:
                    print("Error: No valid dictionaries trained.")
                    sys.exit(1)

                print("All configurations results:")
                for sample_count, max_size, dict_size, _ in valid_results:
                    print(f"  samples={sample_count}, max_size={max_size/1024/1024:.1f}MB -> dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")

                best_result = max(valid_results, key=lambda x: x[2])
                sample_count, max_size, dict_size, dict_bytes = best_result

                print(f"\nBest configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")

            compression_dict = zstandard.ZstdCompressionDict(dict_bytes)

            with open(dictionary_file, "wb") as f:
                f.write(dict_bytes)
            print(f"Saved dictionary to {dictionary_file}")

        except Exception as e:
            print(f"Error during training: {e}")
            traceback.print_exc()
            sys.exit(1)

    if not uncompressed_only:
        # --- Database Setup ---
        if database_file.exists():
            os.remove(database_file)

        conn = sqlite3.connect(database_file)
        conn.execute("PRAGMA journal_mode=WAL;")
        conn.execute("PRAGMA auto_vacuum=full;")
        cursor = conn.cursor()
        compressor = zstandard.ZstdCompressor(level=COMPRESSION_LEVEL, dict_data=compression_dict)

        cursor.execute('''
            CREATE TABLE dictionary_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word TEXT NOT NULL,
                pos TEXT,
                data_blob BLOB,
                uncompressed_size INTEGER
            );
        ''')

        # --- Pass 2: Insert Data ---
        print("\n--- Pass 2: Inserting Data ---")

        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

        print("Processing chunks in parallel for compressed DB...")
        all_results = pool.starmap(process_chunk, zip(chunks, [dict_bytes] * len(chunks)))
        data_to_insert = [item for sublist in all_results for item in sublist]

        print(f"Collected {len(data_to_insert)} items to insert into compressed DB.")
        cursor.executemany("INSERT INTO dictionary_data (word, pos, data_blob, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert)
        word_counter = len(data_to_insert)

        conn.commit()
        print(f"Inserted {word_counter:,} words into compressed DB.")

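        # Note: the FTS4 table below uses content='dictionary_data' (an external-content
        # table), so it holds only the full-text index and must be populated explicitly
        # from dictionary_data, which the INSERT ... SELECT that follows does.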
        # --- Pass 3: FTS & Cleanup ---
        print("Creating FTS4 index...")
        cursor.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
        cursor.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
        conn.commit()

        print("Running VACUUM...")
        cursor.execute('VACUUM')
        conn.commit()
        conn.close()

        db_size_mb = get_file_size_mb(database_file)
        dict_size_mb = get_file_size_mb(dictionary_file)

        print(f"\n{'='*60}")
        print(f"SUCCESS: Database created.")
        print(f"{'='*60}")
        print(f"Final Database Size: {db_size_mb:.2f} MB ({database_file.name})")
        print(f"Final Dictionary Size: {dict_size_mb:.2f} MB ({dictionary_file.name})")
        print(f"{'='*60}")

    # --- Create Uncompressed Database ---
    print(f"\n--- Creating Uncompressed Database ---")
    uncompressed_db_file = intermediate_dir / f"dictionary_{lang_code}_uncompressed.db"

    # Ensure intermediate directory exists
    intermediate_dir.mkdir(parents=True, exist_ok=True)

    if uncompressed_db_file.exists():
        os.remove(uncompressed_db_file)

    conn2 = sqlite3.connect(uncompressed_db_file)
    conn2.execute("PRAGMA journal_mode=WAL;")
    conn2.execute("PRAGMA auto_vacuum=full;")
    cursor2 = conn2.cursor()

    cursor2.execute('''
        CREATE TABLE dictionary_data (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            word TEXT NOT NULL,
            pos TEXT,
            data TEXT,
            uncompressed_size INTEGER
        );
    ''')

    # --- Pass 2b: Insert Uncompressed Data ---
    print("\n--- Pass 2b: Inserting Uncompressed Data ---")

    print("Processing chunks in parallel for uncompressed DB...")
    if uncompressed_only:
        pool_uncomp = multiprocessing.Pool(processes=multiprocessing.cpu_count())
        all_results2 = pool_uncomp.map(process_chunk_uncompressed, chunks)
        pool_uncomp.close()
        pool_uncomp.join()
    else:
        all_results2 = pool.map(process_chunk_uncompressed, chunks)
        pool.close()
        pool.join()
    data_to_insert2 = [item for sublist in all_results2 for item in sublist]

    print(f"Collected {len(data_to_insert2)} items to insert into uncompressed DB.")
    cursor2.executemany("INSERT INTO dictionary_data (word, pos, data, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert2)
    word_counter2 = len(data_to_insert2)

    conn2.commit()
    print(f"Inserted {word_counter2:,} words into uncompressed DB.")

    # --- Pass 3b: FTS & Cleanup ---
    print("Creating FTS4 index for uncompressed DB...")
    cursor2.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
    cursor2.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
    conn2.commit()

    print("Running VACUUM on uncompressed DB...")
    cursor2.execute('VACUUM')
    conn2.commit()

    # Compute and print uncompressed_size statistics
    sizes = [row[0] for row in cursor2.execute("SELECT uncompressed_size FROM dictionary_data")]
    if sizes:
        min_size = min(sizes)
        max_size = max(sizes)
        avg_size = statistics.mean(sizes)
        median_size = statistics.median(sizes)
        try:
            stdev_size = statistics.stdev(sizes)
        except statistics.StatisticsError:
            stdev_size = 0.0

        print(f"\nUncompressed Size Statistics:")
        print(f"  Count: {len(sizes):,}")
        print(f"  Min: {min_size}")
        print(f"  Max: {max_size}")
        print(f"  Avg: {avg_size:.2f}")
        print(f"  Median: {median_size}")
        print(f"  Std Dev: {stdev_size:.2f}")

        # Outliers: top 10 largest entries
        outliers = cursor2.execute("SELECT word, uncompressed_size FROM dictionary_data ORDER BY uncompressed_size DESC LIMIT 10").fetchall()
        print(f"\nTop 10 largest entries by uncompressed size:")
        for word, size in outliers:
            print(f"  {word}: {size:,} bytes")

    conn2.close()

    uncompressed_db_size_mb = get_file_size_mb(uncompressed_db_file)

    print(f"\n{'='*60}")
    print(f"Uncompressed Database Size: {uncompressed_db_size_mb:.2f} MB ({uncompressed_db_file.name})")
    print(f"{'='*60}")

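# CLI entry point: parses arguments, resolves the input JSONL path, calls
# create_database, and appends the resulting output-file sizes to processing_stats.csv.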
def main():
    parser = argparse.ArgumentParser(description="Compress dictionary JSONL into SQLite DB.")
    parser.add_argument("--lang", type=str, default=DEFAULT_LANG_CODE,
                        help="Language code (e.g., 'de'). Used for naming output files.")
    parser.add_argument("--input", type=pathlib.Path,
                        help="Full path to input JSONL. If omitted, tries to find it in the standard intermediate folder based on lang.")
    parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR,
                        help="Directory to save .db and .zstdict files.")
    parser.add_argument("--intermediate-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR,
                        help="Directory to save the uncompressed .db file.")

    args = parser.parse_args()

    # Determine input file if not explicitly provided
    if args.input:
        input_file = args.input
    else:
        # Try to guess the filename based on the language code, matching script 1's output
        filename = f"{args.lang.capitalize()}_universal.jsonl"
        input_file = DEFAULT_INTERMEDIATE_DIR / filename

    create_database(args.lang, input_file, args.output_dir, args.intermediate_dir, DEFAULT_UNCOMPRESSED_ONLY, DEFAULT_MINIMAL)

    # Log stats to CSV
    stats_file = ROOT_DIR / "processing_stats.csv"
    timestamp = datetime.now().isoformat()
    files_to_log = [
        (args.output_dir / f"dictionary_{args.lang}.db", "compressed_db"),
        (args.output_dir / f"dictionary_{args.lang}.zstdict", "compression_dict"),
        (args.intermediate_dir / f"dictionary_{args.lang}_uncompressed.db", "uncompressed_db")
    ]
    write_header = not stats_file.exists()
    with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['timestamp', 'output_file', 'size_bytes', 'type'])
        for file_path, file_type in files_to_log:
            if file_path.exists():
                size = file_path.stat().st_size
                writer.writerow([timestamp, str(file_path), size, file_type])

if __name__ == "__main__":
    main()