Migrate to gitea

jonasgaudian
2026-02-13 00:10:40 +01:00
commit 6d06a9e14e
38 changed files with 31427 additions and 0 deletions


@@ -0,0 +1,329 @@
#!/usr/bin/env python3
"""
Transforms dictionary data from kaikki.org JSONL format to the universal
dictionary schema defined in 'universal_dictionary_schema.json'.
Uses ALL system cores for parallel processing.
"""
import json
import pathlib
import logging
import sys
import argparse
import csv
import multiprocessing
import traceback
from datetime import datetime
from typing import List, Dict, Any, Set, Optional, Tuple
# ==============================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ==============================================================================
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
sys.path.insert(0, str(ROOT_DIR))
# --- IMPORTS ---
try:
from transform_wiktionary import WiktionaryTransformer
from InflectionProcessor import InflectionProcessor
# Import language configurations
try:
from lang_config import GERMAN_VERB_CONFIG
except ImportError:
GERMAN_VERB_CONFIG = {}
try:
from lang_config import FRENCH_VERB_CONFIG
except ImportError:
FRENCH_VERB_CONFIG = {}
except ImportError:
pass
DEFAULT_LANG_FILTER = "fr"
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"
DEFAULT_POS_WHITELIST = set()
DEFAULT_POS_BLACKLIST = {"unknown"}
DEFAULT_IGNORE_FORM_OF = True
DEFAULT_TRANS_LANGS = {"pt", "es", "en", "de", "it", "fr", "nl"}
# ==============================================================================
# --- LOGGING ---
# ==============================================================================
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# ==============================================================================
# --- WORKER FUNCTION ---
# ==============================================================================
def process_chunk_filtering(
chunk_lines: List[str],
lang_filter: Optional[str],
pos_whitelist: Set[str],
pos_blacklist: Set[str],
ignore_form_of: bool,
translation_languages: Set[str],
inflection_configs: Dict
) -> Tuple[List[str], Dict[str, int], List[str]]:
# Re-instantiate processors inside the worker process
transformer = WiktionaryTransformer()
inflection_processor = InflectionProcessor(inflection_configs)
form_of_tags = {"form-of", "affix", "particle", "suffix", "prefix"}
results = []
errors = []
counters = {"processed": 0, "skipped": 0, "errors": 0}
for line in chunk_lines:
if not line.strip():
continue
try:
data = json.loads(line)
# --- Apply Filters ---
if lang_filter and data.get("lang_code") != lang_filter:
counters["skipped"] += 1; continue
pos = data.get("pos")
if pos_whitelist and pos not in pos_whitelist:
counters["skipped"] += 1; continue
if pos_blacklist and pos in pos_blacklist:
counters["skipped"] += 1; continue
if ignore_form_of:
if set(data.get("tags", [])).intersection(form_of_tags):
counters["skipped"] += 1; continue
# --- Filter Translations ---
if 'translations' in data:
data['translations'] = [
tr for tr in data['translations']
if tr.get('lang_code') in translation_languages
]
# --- 1. Transform Data to Universal Schema ---
new_entry = transformer.transform_entry(data)
# --- CLEANUP PHONETICS (Audio & Duplicates) ---
if 'phonetics' in new_entry:
# Remove Audio
if 'audio' in new_entry['phonetics']:
del new_entry['phonetics']['audio']
# Process IPA variations to remove duplicates while preserving country information
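# Illustrative example of the grouping below (the region tags are assumptions):
#   [{'ipa_cleaned': '/bɔ̃.ʒuʁ/', 'raw_tags': ['France']},
#    {'ipa_cleaned': '/bɔ̃.ʒuʁ/', 'raw_tags': ['Québec']}]
# becomes a single variation {'ipa': '/bɔ̃.ʒuʁ/', 'raw_tags': ['France', 'Québec']}
# and phonetics['ipa'] is reduced to ['/bɔ̃.ʒuʁ/'].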
if 'ipa_variations' in new_entry['phonetics'] and isinstance(new_entry['phonetics']['ipa_variations'], list):
# Group variations by cleaned IPA to collect all regions for each pronunciation
ipa_groups = {}
for variation in new_entry['phonetics']['ipa_variations']:
ipa_cleaned = variation.get('ipa_cleaned', '')
if ipa_cleaned:
if ipa_cleaned not in ipa_groups:
ipa_groups[ipa_cleaned] = {
"ipa": ipa_cleaned,
"raw_tags": []
}
# Collect all raw_tags for this IPA
if 'raw_tags' in variation:
ipa_groups[ipa_cleaned]['raw_tags'].extend(variation['raw_tags'])
# Create compressed variations list
compressed_variations = []
for ipa_cleaned, group_data in ipa_groups.items():
variation = {"ipa": ipa_cleaned}
if group_data['raw_tags']:
# Remove duplicates from raw_tags while preserving order
seen_tags = set()
unique_tags = []
for tag in group_data['raw_tags']:
if tag not in seen_tags:
unique_tags.append(tag)
seen_tags.add(tag)
variation['raw_tags'] = unique_tags
compressed_variations.append(variation)
# Create simplified IPA list and compressed variations
simplified_ipa = list(ipa_groups.keys())
new_entry['phonetics']['ipa'] = simplified_ipa
new_entry['phonetics']['ipa_variations'] = compressed_variations
# --- Filter out unnecessary fields ---
if 'metadata' in new_entry:
del new_entry['metadata']
if 'translations' in new_entry:
for tr in new_entry['translations']:
tr.pop('lang', None)
tr.pop('sense', None)
if 'senses' in new_entry:
for sense in new_entry['senses']:
if 'examples' in sense:
sense['examples'] = [ex['text'] for ex in sense['examples'] if 'text' in ex]
if 'relations' in new_entry and 'derived' in new_entry['relations']:
del new_entry['relations']['derived']
# --- 2. Run Inflection Processor ---
new_entry = inflection_processor.process(new_entry)
# --- Remove lang_code after processing ---
if 'lang_code' in new_entry:
del new_entry['lang_code']
results.append(json.dumps(new_entry, ensure_ascii=False))
counters["processed"] += 1
except ValueError as e:
counters["skipped"] += 1
errors.append(f"Value Error: {str(e)}")
except json.JSONDecodeError:
counters["errors"] += 1
except Exception as e:
counters["errors"] += 1
errors.append(f"Unexpected Error: {str(e)}")
return results, counters, errors
# ==============================================================================
# --- MAIN PROCESS ---
# ==============================================================================
def process_file(input_path: pathlib.Path, output_path: pathlib.Path, lang_filter: Optional[str],
pos_whitelist: Set[str], pos_blacklist: Set[str], ignore_form_of: bool,
translation_languages: Set[str]):
logger.info(f"Starting parallel processing...")
logger.info(f" Input file: {input_path}")
logger.info(f" Output file: {output_path}")
if not input_path.exists():
logger.critical(f"Input file not found: {input_path}")
sys.exit(1)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Prepare Inflection Configs
inflection_configs = {
'de_verb': GERMAN_VERB_CONFIG,
'fr_verb': FRENCH_VERB_CONFIG
}
if lang_filter and f"{lang_filter}_verb" not in inflection_configs:
logger.warning(f"No inflection configuration found for language '{lang_filter}'. Verbs will remain uncompressed.")
logger.info("Reading input file into memory...")
try:
with open(input_path, 'r', encoding='utf-8') as f:
lines = f.readlines()
except Exception as e:
logger.critical(f"Failed to read input file: {e}")
sys.exit(1)
total_lines = len(lines)
logger.info(f"Total lines to process: {total_lines:,}")
num_processes = multiprocessing.cpu_count()
chunk_size = total_lines // num_processes + 1
chunks = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)]
logger.info(f"Split data into {len(chunks)} chunks for {num_processes} cores.")
pool = multiprocessing.Pool(processes=num_processes)
worker_args = [
(chunk, lang_filter, pos_whitelist, pos_blacklist, ignore_form_of, translation_languages, inflection_configs)
for chunk in chunks
]
try:
all_results = pool.starmap(process_chunk_filtering, worker_args)
pool.close()
pool.join()
except KeyboardInterrupt:
logger.warning("Interrupted by user. Terminating pool...")
pool.terminate()
sys.exit(1)
except Exception as e:
logger.critical(f"Error during parallel processing: {e}")
traceback.print_exc()
sys.exit(1)
logger.info("Aggregating results and writing to output...")
final_counters = {"processed": 0, "skipped": 0, "errors": 0}
error_log_path = output_path.parent / "verb_errors.log"
with open(output_path, 'w', encoding='utf-8') as out_f, \
open(error_log_path, 'w', encoding='utf-8') as err_f:
for result_strings, worker_stats, worker_errors in all_results:
for k in final_counters:
final_counters[k] += worker_stats.get(k, 0)
for json_str in result_strings:
out_f.write(json_str + "\n")
for err_msg in worker_errors:
err_f.write(err_msg + "\n")
logger.info(f"DONE. Total Read: {total_lines}")
logger.info(f"Processed: {final_counters['processed']}, Skipped: {final_counters['skipped']}, Errors: {final_counters['errors']}")
def main():
parser = argparse.ArgumentParser(description="Transform kaikki.org JSONL to universal dictionary format (Parallel).")
parser.add_argument("--input", type=pathlib.Path, default=DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME,
help="Path to the raw input JSONL file.")
parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR,
help="Directory to save the transformed JSONL file.")
parser.add_argument("--lang", type=str, default=DEFAULT_LANG_FILTER,
help="Language code to filter for (e.g., 'de').")
parser.add_argument("--trans-langs", type=str, default=",".join(DEFAULT_TRANS_LANGS),
help="Comma-separated list of translation languages to keep.")
args = parser.parse_args()
output_filename = f"{args.lang.capitalize()}_universal.jsonl" if args.lang else "universal.jsonl"
output_file_path = args.output_dir / output_filename
trans_langs_set = set(lang.strip() for lang in args.trans_langs.split(",")) if args.trans_langs else set()
process_file(
args.input,
output_file_path,
args.lang,
DEFAULT_POS_WHITELIST,
DEFAULT_POS_BLACKLIST,
DEFAULT_IGNORE_FORM_OF,
trans_langs_set
)
stats_file = ROOT_DIR / "processing_stats.csv"
if output_file_path.exists():
file_size = output_file_path.stat().st_size
else:
file_size = 0
timestamp = datetime.now().isoformat()
write_header = not stats_file.exists()
try:
with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
if write_header:
writer.writerow(['timestamp', 'output_file', 'size_bytes'])
writer.writerow([timestamp, str(output_file_path), file_size])
except Exception as e:
logger.warning(f"Could not write stats csv: {e}")
if __name__ == "__main__":
multiprocessing.freeze_support()
main()

scripts/02_create_db.py Normal file

@@ -0,0 +1,380 @@
import json
import sqlite3
import pathlib
import traceback
import os
import argparse
import sys
import multiprocessing
import csv
import statistics
from datetime import datetime
try:
import zstandard
except ImportError:
print("ERROR: zstandard library not found. Please install it: pip install zstandard")
sys.exit(1)
# ======================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ======================================================================
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
DEFAULT_LANG_CODE = "fr"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"
DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"
COMPRESSION_LEVEL = 22
DICTIONARY_SAMPLE_COUNT = 200000
DICTIONARY_MAX_SIZE = 10 * 1024 * 1024 # 10MB
DEFAULT_UNCOMPRESSED_ONLY = False  # Set to True to skip dictionary training and the compressed DB, building only the uncompressed DB
DEFAULT_MINIMAL = False
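# Example invocation (illustrative; these values match the defaults above):
#   python scripts/02_create_db.py --lang fr --input intermediate/Fr_universal.jsonl --output-dir outputs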
# ======================================================================
def get_file_size_mb(filepath):
return os.path.getsize(filepath) / (1024 * 1024)
def count_lines(filepath):
print("Counting total lines for progress tracking...")
with open(filepath, 'r', encoding='utf-8') as f:
return sum(1 for _ in f)
def process_chunk(chunk, compression_dict_bytes):
import zstandard
compression_dict = zstandard.ZstdCompressionDict(compression_dict_bytes)
local_compressor = zstandard.ZstdCompressor(level=22, dict_data=compression_dict)
results = []
for line in chunk:
if not line.strip(): continue
try:
entry = json.loads(line)
word = entry.get("word")
pos = entry.get("pos", "")
if not word: continue
data_to_compress = entry.copy()
data_to_compress.pop("word", None)
data_to_compress.pop("pos", None)
value_bytes = json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8')
compressed_blob = local_compressor.compress(value_bytes)
results.append((word, pos, compressed_blob, len(value_bytes)))
except Exception:
pass
return results
def process_chunk_uncompressed(chunk):
results = []
for line in chunk:
if not line.strip(): continue
try:
entry = json.loads(line)
word = entry.get("word")
pos = entry.get("pos", "")
if not word: continue
data_to_store = entry.copy()
data_to_store.pop("word", None)
data_to_store.pop("pos", None)
value_str = json.dumps(data_to_store, ensure_ascii=False)
value_bytes = value_str.encode('utf-8')
results.append((word, pos, value_str, len(value_bytes)))
except Exception:
pass
return results
def train_config(config, lines):
import zstandard
sample_count, max_size = config
step = max(1, len(lines) // sample_count)
samples = []
for j in range(0, len(lines), step):
line = lines[j]
if not line.strip(): continue
entry = json.loads(line)
data_to_compress = entry.copy()
data_to_compress.pop("word", None)
data_to_compress.pop("pos", None)
samples.append(json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8'))
if len(samples) >= sample_count: break
if not samples:
return None
compression_dict = zstandard.train_dictionary(max_size, samples)
dict_bytes = compression_dict.as_bytes()
return (sample_count, max_size, len(dict_bytes), dict_bytes)
def create_database(lang_code, input_file, output_dir, intermediate_dir, uncompressed_only=False, minimal=False):
database_file = output_dir / f"dictionary_{lang_code}.db"
dictionary_file = output_dir / f"dictionary_{lang_code}.zstdict"
# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Settings:\n - Language: {lang_code}\n - Input: {input_file}\n - DB Output: {database_file}\n - Dict Output: {dictionary_file}")
if not input_file.exists():
print(f"Error: Input file not found at {input_file}")
sys.exit(1)
total_lines = count_lines(input_file)
print(f"Total lines to process: {total_lines:,}")
with open(input_file, "r", encoding="utf-8") as f:
lines = f.readlines()
num_processes = multiprocessing.cpu_count()
chunk_size = len(lines) // num_processes + 1
chunks = [lines[i:i+chunk_size] for i in range(0, len(lines), chunk_size)]
# --- Pass 1: Training Compression Dictionary ---
if not uncompressed_only:
print(f"\n--- Pass 1: Training Compression Dictionary ---")
try:
if minimal:
sample_count = DICTIONARY_SAMPLE_COUNT
max_size = DICTIONARY_MAX_SIZE
config = (sample_count, max_size)
result = train_config(config, lines)
if result is None:
print("Error: No valid dictionary trained.")
sys.exit(1)
sample_count, max_size, dict_size, dict_bytes = result
print(f"Using default configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
else:
# Generate 20 configurations to try (varying both sample_count and max_size)
configs = []
for i in range(20):
sample_count = 100000 + (i % 5) * 200000 # 5 values: 100k, 300k, 500k, 700k, 900k
max_size = (3 + (i // 5) * 2) * 1024 * 1024 # 4 different: 3MB, 5MB, 7MB, 9MB
configs.append((sample_count, max_size))
pool = multiprocessing.Pool(processes=min(20, multiprocessing.cpu_count()))
results = pool.starmap(train_config, [(config, lines) for config in configs])
pool.close()
pool.join()
# Find the best configuration (largest dictionary size)
valid_results = [r for r in results if r is not None]
if not valid_results:
print("Error: No valid dictionaries trained.")
sys.exit(1)
print("All configurations results:")
for sample_count, max_size, dict_size, _ in valid_results:
print(f" samples={sample_count}, max_size={max_size/1024/1024:.1f}MB -> dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
best_result = max(valid_results, key=lambda x: x[2])
sample_count, max_size, dict_size, dict_bytes = best_result
print(f"\nBest configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
compression_dict = zstandard.ZstdCompressionDict(dict_bytes)
with open(dictionary_file, "wb") as f:
f.write(dict_bytes)
print(f"Saved dictionary to {dictionary_file}")
except Exception as e:
print(f"Error during training: {e}")
traceback.print_exc()
sys.exit(1)
if not uncompressed_only:
# --- Database Setup ---
if database_file.exists():
os.remove(database_file)
conn = sqlite3.connect(database_file)
conn.execute("PRAGMA journal_mode=WAL;")
conn.execute("PRAGMA auto_vacuum=full;")
cursor = conn.cursor()
compressor = zstandard.ZstdCompressor(level=COMPRESSION_LEVEL, dict_data=compression_dict)
cursor.execute('''
CREATE TABLE dictionary_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT NOT NULL,
pos TEXT,
data_blob BLOB,
uncompressed_size INTEGER
);
''')
# --- Pass 2: Insert Data ---
print("\n--- Pass 2: Inserting Data ---")
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
print("Processing chunks in parallel for compressed DB...")
all_results = pool.starmap(process_chunk, zip(chunks, [dict_bytes] * len(chunks)))
data_to_insert = [item for sublist in all_results for item in sublist]
print(f"Collected {len(data_to_insert)} items to insert into compressed DB.")
cursor.executemany("INSERT INTO dictionary_data (word, pos, data_blob, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert)
word_counter = len(data_to_insert)
conn.commit()
print(f"Inserted {word_counter:,} words into compressed DB.")
# --- Pass 3: FTS & Cleanup ---
print("Creating FTS4 index...")
cursor.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
cursor.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
conn.commit()
print("Running VACUUM...")
cursor.execute('VACUUM')
conn.commit()
conn.close()
db_size_mb = get_file_size_mb(database_file)
dict_size_mb = get_file_size_mb(dictionary_file)
print(f"\n{'='*60}")
print(f"SUCCESS: Database created.")
print(f"{'='*60}")
print(f"Final Database Size: {db_size_mb:.2f} MB ({database_file.name})")
print(f"Final Dictionary Size: {dict_size_mb:.2f} MB ({dictionary_file.name})")
print(f"{'='*60}")
# --- Create Uncompressed Database ---
print(f"\n--- Creating Uncompressed Database ---")
uncompressed_db_file = intermediate_dir / f"dictionary_{lang_code}_uncompressed.db"
# Ensure intermediate directory exists
intermediate_dir.mkdir(parents=True, exist_ok=True)
if uncompressed_db_file.exists():
os.remove(uncompressed_db_file)
conn2 = sqlite3.connect(uncompressed_db_file)
conn2.execute("PRAGMA journal_mode=WAL;")
conn2.execute("PRAGMA auto_vacuum=full;")
cursor2 = conn2.cursor()
cursor2.execute('''
CREATE TABLE dictionary_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT NOT NULL,
pos TEXT,
data TEXT,
uncompressed_size INTEGER
);
''')
# --- Pass 2b: Insert Uncompressed Data ---
print("\n--- Pass 2b: Inserting Uncompressed Data ---")
print("Processing chunks in parallel for uncompressed DB...")
if uncompressed_only:
pool_uncomp = multiprocessing.Pool(processes=multiprocessing.cpu_count())
all_results2 = pool_uncomp.map(process_chunk_uncompressed, chunks)
pool_uncomp.close()
pool_uncomp.join()
else:
all_results2 = pool.map(process_chunk_uncompressed, chunks)
pool.close()
pool.join()
data_to_insert2 = [item for sublist in all_results2 for item in sublist]
print(f"Collected {len(data_to_insert2)} items to insert into uncompressed DB.")
cursor2.executemany("INSERT INTO dictionary_data (word, pos, data, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert2)
word_counter2 = len(data_to_insert2)
conn2.commit()
print(f"Inserted {word_counter2:,} words into uncompressed DB.")
# --- Pass 3b: FTS & Cleanup ---
print("Creating FTS4 index for uncompressed DB...")
cursor2.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
cursor2.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
conn2.commit()
print("Running VACUUM on uncompressed DB...")
cursor2.execute('VACUUM')
conn2.commit()
# Compute and print uncompressed_size statistics
sizes = [row[0] for row in cursor2.execute("SELECT uncompressed_size FROM dictionary_data")]
if sizes:
min_size = min(sizes)
max_size = max(sizes)
avg_size = statistics.mean(sizes)
median_size = statistics.median(sizes)
try:
stdev_size = statistics.stdev(sizes)
except statistics.StatisticsError:
stdev_size = 0.0
print(f"\nUncompressed Size Statistics:")
print(f" Count: {len(sizes):,}")
print(f" Min: {min_size}")
print(f" Max: {max_size}")
print(f" Avg: {avg_size:.2f}")
print(f" Median: {median_size}")
print(f" Std Dev: {stdev_size:.2f}")
# Outliers: top 10 largest entries
outliers = cursor2.execute("SELECT word, uncompressed_size FROM dictionary_data ORDER BY uncompressed_size DESC LIMIT 10").fetchall()
print(f"\nTop 10 largest entries by uncompressed size:")
for word, size in outliers:
print(f" {word}: {size:,} bytes")
conn2.close()
uncompressed_db_size_mb = get_file_size_mb(uncompressed_db_file)
print(f"\n{'='*60}")
print(f"Uncompressed Database Size: {uncompressed_db_size_mb:.2f} MB ({uncompressed_db_file.name})")
print(f"{'='*60}")
def main():
parser = argparse.ArgumentParser(description="Compress dictionary JSONL into SQLite DB.")
parser.add_argument("--lang", type=str, default=DEFAULT_LANG_CODE,
help="Language code (e.g., 'de'). Used for naming output files.")
parser.add_argument("--input", type=pathlib.Path,
help="Full path to input JSONL. If omitted, tries to find it in standard intermediate folder based on lang.")
parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR,
help="Directory to save .db and .zstdict files.")
parser.add_argument("--intermediate-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR,
help="Directory to save uncompressed .db file.")
args = parser.parse_args()
# Determine input file if not explicitly provided
if args.input:
input_file = args.input
else:
# Try to guess the filename based on the language code matching script 1's output
filename = f"{args.lang.capitalize()}_universal.jsonl"
input_file = DEFAULT_INTERMEDIATE_DIR / filename
create_database(args.lang, input_file, args.output_dir, args.intermediate_dir, DEFAULT_UNCOMPRESSED_ONLY, DEFAULT_MINIMAL)
# Log stats to CSV
stats_file = ROOT_DIR / "processing_stats.csv"
timestamp = datetime.now().isoformat()
files_to_log = [
(args.output_dir / f"dictionary_{args.lang}.db", "compressed_db"),
(args.output_dir / f"dictionary_{args.lang}.zstdict", "compression_dict"),
(args.intermediate_dir / f"dictionary_{args.lang}_uncompressed.db", "uncompressed_db")
]
write_header = not stats_file.exists()
with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
if write_header:
writer.writerow(['timestamp', 'output_file', 'size_bytes', 'type'])
for file_path, file_type in files_to_log:
if file_path.exists():
size = file_path.stat().st_size
writer.writerow([timestamp, str(file_path), size, file_type])
if __name__ == "__main__":
main()


@@ -0,0 +1,108 @@
import json
import os
import hashlib
import sys
import pathlib
import re
import argparse
from typing import Dict, Any, Set
# ======================================================================
# --- DEFAULT CONFIGURATION ---
# ======================================================================
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"
# ======================================================================
def calculate_sha256(filepath: pathlib.Path, block_size=65536) -> str | None:
sha256 = hashlib.sha256()
try:
with open(filepath, 'rb') as f:
for block in iter(lambda: f.read(block_size), b''):
sha256.update(block)
except IOError as e:
print(f" ERROR: Could not read file '{filepath.name}': {e}")
return None
return sha256.hexdigest().upper()
def guess_properties_from_base(base_name: str) -> Dict[str, str]:
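# Illustrative mapping: "dictionary_fr" -> {"id": "fr_dict", "name": "Dictionary (FR)", "lang_code": "fr"}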
match = re.match(r"dictionary_([a-zA-Z]{2,3})", base_name)
if match:
lang_code = match.group(1)
return {"id": f"{lang_code}_dict", "name": f"Dictionary ({lang_code.upper()})", "lang_code": lang_code}
return {"id": base_name, "name": f"Dictionary ({base_name})", "lang_code": "xx"}
def create_new_dict_entry(base_name: str, asset_files: list[pathlib.Path]) -> Dict[str, Any]:
props = guess_properties_from_base(base_name)
new_entry = {
"id": props["id"], "name": props["name"], "description": "Auto-generated", "version": "1.0.0", "assets": []
}
for file_path in asset_files:
print(f" -> Adding new asset: '{file_path.name}'")
csum = calculate_sha256(file_path)
if csum:
new_entry["assets"].append({
"filename": file_path.name, "size_bytes": os.path.getsize(file_path), "checksum_sha256": csum
})
return new_entry
def update_manifest(outputs_dir: pathlib.Path):
manifest_path = outputs_dir / 'manifest.json'
if not outputs_dir.exists():
print(f"Error: Outputs directory does not exist: {outputs_dir}")
sys.exit(1)
manifest_data = {"files": []}
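# Expected manifest shape (illustrative sketch based on create_new_dict_entry; the values are examples):
# {"files": [{"id": "fr_dict", "name": "Dictionary (FR)", "description": "Auto-generated", "version": "1.0.0",
#             "assets": [{"filename": "dictionary_fr.db", "size_bytes": 12345678, "checksum_sha256": "ABC..."}]}]}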
if manifest_path.exists():
try:
with open(manifest_path, 'r', encoding='utf-8') as f:
manifest_data = json.load(f)
if 'files' not in manifest_data: manifest_data['files'] = []
except Exception as e:
print(f"Error reading manifest: {e}"); sys.exit(1)
print(f"Scanning {outputs_dir} for assets...")
assets_map = {asset['filename']: asset for entry in manifest_data.get('files', []) for asset in entry.get('assets', [])}
discovered = list(outputs_dir.glob('*.db')) + list(outputs_dir.glob('*.zstdict'))
new_files, updated_count = [], 0
for fpath in discovered:
fname = fpath.name
if fname in assets_map:
print(f"Updating: {fname}")
assets_map[fname]['size_bytes'] = os.path.getsize(fpath)
assets_map[fname]['checksum_sha256'] = calculate_sha256(fpath)
updated_count += 1
else:
new_files.append(fpath)
added_count = 0
if new_files:
grouped = {}
for f in new_files:
grouped.setdefault(f.stem, []).append(f)
for base, files in grouped.items():
print(f"Creating new entry for: {base}")
manifest_data['files'].append(create_new_dict_entry(base, files))
added_count += 1
with open(manifest_path, 'w', encoding='utf-8') as f:
json.dump(manifest_data, f, indent=2, ensure_ascii=False)
print(f"\nComplete. Updated {updated_count} assets, added {added_count} new entries.")
def main():
parser = argparse.ArgumentParser(description="Update manifest.json with .db and .zstdict files.")
parser.add_argument("--outputs-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR,
help="Directory containing assets and manifest.json.")
args = parser.parse_args()
update_manifest(args.outputs_dir)
if __name__ == "__main__":
main()


@@ -0,0 +1,225 @@
import re
class UniversalInflectionCompressor:
"""
A generic inflection compressor that uses a configuration dictionary
to process, partition, and compress verb forms for any language.
"""
def __init__(self, config: dict):
self.config = config
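# Sketch of the expected config shape (illustrative; the real keys come from lang_config, e.g. FRENCH_VERB_CONFIG):
# {
#     'clean_prefixes': ["se", "s'"],
#     'normalization_rules': [{'field': 'source', 'match': 'Conjugaison', 'match_mode': 'regex',
#                              'add_tags': ['conjugated']}],
#     'properties': [{'name': 'aux', 'default': 'avoir',
#                     'rules': [{'criteria': {'tags': ['auxiliary']}, 'value': 'être'}]}],
#     'schema': {'infinitive': {'type': 'single', 'criteria': {'tags': ['infinitive']}},
#                'present': {'type': 'list', 'size': 6, 'base_criteria': {'tags': ['present']},
#                            'indices': [{'tags': ['first-person', 'singular'], 'index': 0}]}},
#     'validate_completeness': False,
# }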
def _matches_criteria(self, form: dict, criteria: dict) -> bool:
"""Helper: Checks if a form matches specific criteria."""
# Regex Match
if 'form_regex' in criteria:
form_str = form.get('form', '')
if form_str is None: form_str = ''
if not re.search(criteria['form_regex'], form_str):
return False
# Tags Inclusion
if 'tags' in criteria:
form_tags = set(form.get('tags', []))
required = set(criteria['tags'])
if not required.issubset(form_tags):
return False
# Raw Tags Inclusion
if 'raw_tags' in criteria:
form_raw = set(form.get('raw_tags', []))
required_raw = set(criteria['raw_tags'])
if not required_raw.issubset(form_raw):
return False
# Tag Exclusion
if 'exclude_tags' in criteria:
form_tags = set(form.get('tags', []))
if not form_tags.isdisjoint(set(criteria['exclude_tags'])):
return False
return True
def _normalize_forms(self, forms: list) -> list:
"""Enriches forms with tags based on 'normalization_rules'."""
rules = self.config.get('normalization_rules', [])
skip_if_source = self.config.get('skip_normalization_if_source', True)
for form in forms:
if form.get('source') and skip_if_source:
continue
for rule in rules:
field = rule.get('field')
value_to_match = rule.get('match')
match_mode = rule.get('match_mode', 'exact')
add_tags = rule.get('add_tags', [])
form_value = form.get(field)
if form_value is None: continue
is_match = False
if match_mode == 'regex':
if isinstance(form_value, list):
for item in form_value:
if re.search(value_to_match, str(item)):
is_match = True; break
else:
if re.search(value_to_match, str(form_value)):
is_match = True
else:
if isinstance(form_value, list):
is_match = value_to_match in form_value
else:
is_match = value_to_match == form_value
if is_match:
current_tags = set(form.get('tags', []))
current_tags.update(add_tags)
form['tags'] = list(current_tags)
return forms
def _extract_properties(self, forms: list, entry_context: dict = None) -> dict:
"""Determines global properties (e.g. aux, group)."""
properties = {}
candidates = forms.copy()
if entry_context:
candidates.append(entry_context)
for prop_def in self.config.get('properties', []):
name = prop_def['name']
default_val = prop_def.get('default')
is_multivalue = prop_def.get('multivalue', False)
found_values = set()
for rule in prop_def.get('rules', []):
for candidate in candidates:
if self._matches_criteria(candidate, rule.get('criteria', {})):
found_values.add(rule['value'])
if not is_multivalue:
break
if found_values and not is_multivalue:
break
if not found_values:
if is_multivalue and default_val is not None:
properties[name] = default_val if isinstance(default_val, list) else [default_val]
else:
properties[name] = default_val
elif is_multivalue:
properties[name] = sorted(list(found_values))
else:
properties[name] = list(found_values)[0]
return properties
def _clean_verb_string(self, form_string: str) -> str:
ignored = self.config.get('clean_prefixes', [])
current_string = form_string.strip()
changed = True
while changed:
changed = False
for prefix in ignored:
# Prefixes ending in an apostrophe (ASCII ' or typographic ’) attach without a trailing space
if prefix.endswith("'") or prefix.endswith("’"):
if current_string.startswith(prefix):
current_string = current_string[len(prefix):]
changed = True
break
else:
if current_string.startswith(prefix + " "):
current_string = current_string[len(prefix)+1:]
changed = True
break
return current_string
def compress(self, forms_list: list, word: str = None, entry: dict = None) -> dict:
if not forms_list:
return None
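# Illustrative result shape for a French verb, assuming a typical FRENCH_VERB_CONFIG
# (actual property and slot names depend entirely on the config):
#   {'aux': 'avoir', 'infinitive': 'parler',
#    'present': ['parle', 'parles', 'parle', 'parlons', 'parlez', 'parlent'], ...}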
# 1. Normalize tags
normalized_forms = self._normalize_forms(forms_list)
# 2. Extract Properties
entry_context = None
if entry:
entry_context = {
'form': entry.get('word', ''),
'tags': entry.get('tags', []),
'raw_tags': entry.get('raw_tags', [])
}
table_properties = self._extract_properties(normalized_forms, entry_context)
# 3. Initialize Output
result = table_properties.copy()
# 4. Fill Slots
schema = self.config.get('schema', {})
for slot_name, slot_def in schema.items():
slot_type = slot_def.get('type', 'single')
if slot_type == 'single':
result[slot_name] = None
for form in normalized_forms:
if self._matches_criteria(form, slot_def.get('criteria', {})):
if result[slot_name] is None or (form.get('source') and not result[slot_name]):
result[slot_name] = self._clean_verb_string(form['form'])
elif slot_type == 'list':
size = slot_def.get('size', 6)
result[slot_name] = [None] * size
base_criteria = slot_def.get('base_criteria', {})
candidates = [f for f in normalized_forms if self._matches_criteria(f, base_criteria)]
for form in candidates:
idx = -1
# Iterate through index rules to find where this form belongs
for index_rule in slot_def.get('indices', []):
# Support full criteria in indices (e.g. form_regex), fallback to 'tags' shortcut
rule_criteria = index_rule.get('criteria', {})
if 'tags' in index_rule:
rule_criteria = rule_criteria.copy()
rule_criteria['tags'] = index_rule['tags']
if self._matches_criteria(form, rule_criteria):
idx = index_rule['index']
break
if idx >= 0 and idx < size:
current_val = result[slot_name][idx]
if current_val is None:
result[slot_name][idx] = self._clean_verb_string(form['form'])
elif form.get('source') and ("Flexion" in form.get('source') or "Conjugaison" in form.get('source')):
result[slot_name][idx] = self._clean_verb_string(form['form'])
# 5. Fallbacks
if not result.get('infinitive') and word:
result['infinitive'] = word
# 6. Validation
if self.config.get('validate_completeness', False):
for key, val in result.items():
slot_config = schema.get(key, {})
if slot_config.get('optional', False):
continue
if val is None:
raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
if isinstance(val, list):
for i, v in enumerate(val):
if v is None:
raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")
return result
class InflectionProcessor:
def __init__(self, configs):
self.compressors = {k: UniversalInflectionCompressor(v) for k, v in configs.items()}
def process(self, entry: dict) -> dict:
key = f"{entry.get('lang_code')}_{entry.get('pos')}"
if key in self.compressors:
try:
compressed = self.compressors[key].compress(entry.get('forms'), entry.get('word'), entry=entry)
if compressed:
entry['forms'] = compressed
except Exception as e:
print(f"Error processing {entry.get('word')}: {e}")
return entry


@@ -0,0 +1,358 @@
#!/usr/bin/env python3
"""
Hybrid JSONL Schema Analyzer
Chooses between sequential and parallel processing based on file size:
small files are handled sequentially, large files in parallel.
"""
import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
import threading
from functools import partial
import gc
# Import the optimized analyzer for parallel processing
sys.path.insert(0, str(Path(__file__).parent))
try:
from jsonl_schema_analyzer_optimized import OptimizedJSONLSchemaAnalyzer
except ImportError:
print("Warning: Could not import optimized analyzer, using fallback")
OptimizedJSONLSchemaAnalyzer = None
class HybridJSONLSchemaAnalyzer:
"""Hybrid analyzer that intelligently chooses processing strategy."""
def __init__(self, max_samples: int = 1000, max_workers: int = None,
parallel_threshold_mb: int = 100, chunk_size: int = 1000):
"""
Initialize the hybrid analyzer.
Args:
max_samples: Maximum number of JSON objects to sample per file
max_workers: Maximum number of worker processes (default: cpu_count)
parallel_threshold_mb: File size threshold in MB to use parallel processing
chunk_size: Number of lines to process in each chunk
"""
self.max_samples = max_samples
self.max_workers = max_workers or min(cpu_count(), 8)
self.parallel_threshold_mb = parallel_threshold_mb
self.chunk_size = chunk_size
# Import the original analyzer for small files
sys.path.insert(0, str(Path(__file__).parent))
try:
from jsonl_schema_analyzer import JSONLSchemaAnalyzer
self.sequential_analyzer = JSONLSchemaAnalyzer(max_samples=max_samples)
except ImportError:
print("Warning: Could not import sequential analyzer")
self.sequential_analyzer = None
# Initialize optimized analyzer for large files
if OptimizedJSONLSchemaAnalyzer:
self.parallel_analyzer = OptimizedJSONLSchemaAnalyzer(
max_samples=max_samples,
max_workers=max_workers,
chunk_size=chunk_size
)
else:
self.parallel_analyzer = None
print(f"Hybrid analyzer initialized:")
print(f" Parallel threshold: {parallel_threshold_mb} MB")
print(f" Max workers: {self.max_workers}")
print(f" Chunk size: {self.chunk_size}")
def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
"""
Analyze a JSONL file using the appropriate strategy.
Args:
file_path: Path to the JSONL file
Returns:
Dictionary containing schema analysis results
"""
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
# Get file size in MB
file_size_mb = file_path.stat().st_size / (1024 * 1024)
print(f"Analyzing {file_path.name} ({file_size_mb:.2f} MB)...")
# Choose processing strategy
if file_size_mb >= self.parallel_threshold_mb and self.parallel_analyzer:
print(f" Using parallel processing (file >= {self.parallel_threshold_mb} MB)")
result = self.parallel_analyzer.analyze_jsonl_file(file_path)
result["processing_strategy"] = "parallel"
elif self.sequential_analyzer:
print(f" Using sequential processing (file < {self.parallel_threshold_mb} MB)")
result = self.sequential_analyzer.analyze_jsonl_file(file_path)
result["processing_strategy"] = "sequential"
else:
# Fallback to parallel if sequential not available
print(f" Using parallel processing (sequential analyzer unavailable)")
if self.parallel_analyzer:
result = self.parallel_analyzer.analyze_jsonl_file(file_path)
result["processing_strategy"] = "parallel_fallback"
else:
raise RuntimeError("No analyzer available")
# Add hybrid-specific metadata
result["file_size_mb"] = file_size_mb
result["parallel_threshold_mb"] = self.parallel_threshold_mb
return result
def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
"""
Analyze all JSONL files in a directory using hybrid processing.
Args:
directory_path: Path to directory containing JSONL files
pattern: File pattern to match (default: *.jsonl)
Returns:
Dictionary containing analysis results for all files
"""
directory_path = Path(directory_path)
if not directory_path.exists():
raise FileNotFoundError(f"Directory not found: {directory_path}")
# Find all JSONL files
jsonl_files = list(directory_path.glob(pattern))
if not jsonl_files:
print(f"No JSONL files found in {directory_path} with pattern {pattern}")
return {"files": [], "summary": {}}
print(f"Found {len(jsonl_files)} JSONL files to analyze...")
start_time = time.time()
# Categorize files by size
small_files = []
large_files = []
for file_path in jsonl_files:
size_mb = file_path.stat().st_size / (1024 * 1024)
if size_mb >= self.parallel_threshold_mb:
large_files.append(file_path)
else:
small_files.append(file_path)
print(f" Small files (< {self.parallel_threshold_mb} MB): {len(small_files)}")
print(f" Large files (>= {self.parallel_threshold_mb} MB): {len(large_files)}")
file_results = {}
# Process small files sequentially (they're fast anyway)
if small_files and self.sequential_analyzer:
print(f"Processing {len(small_files)} small files sequentially...")
for file_path in small_files:
try:
result = self.analyze_jsonl_file(file_path)
file_results[file_path.name] = result
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
# Process large files in parallel
if large_files and self.parallel_analyzer:
print(f"Processing {len(large_files)} large files in parallel...")
if len(large_files) == 1:
# Single large file - just process it directly
file_path = large_files[0]
try:
result = self.analyze_jsonl_file(file_path)
file_results[file_path.name] = result
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
else:
# Multiple large files - process in parallel
with ThreadPoolExecutor(max_workers=min(len(large_files), self.max_workers)) as executor:
future_to_file = {
executor.submit(self.analyze_jsonl_file, file_path): file_path
for file_path in large_files
}
for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
result = future.result()
file_results[file_path.name] = result
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
# Create summary
successful_results = [r for r in file_results.values() if "error" not in r]
summary = {
"total_files": len(jsonl_files),
"small_files": len(small_files),
"large_files": len(large_files),
"successfully_analyzed": len(successful_results),
"total_size_bytes": sum(
r.get("file_size_bytes", 0) for r in successful_results
),
"total_lines": sum(
r.get("total_lines", 0) for r in successful_results
),
"total_valid_lines": sum(
r.get("valid_lines", 0) for r in successful_results
),
"total_processing_time": sum(
r.get("processing_time_seconds", 0) for r in successful_results
),
"parallel_threshold_mb": self.parallel_threshold_mb,
"strategies_used": {
"sequential": len([r for r in successful_results if r.get("processing_strategy") == "sequential"]),
"parallel": len([r for r in successful_results if r.get("processing_strategy") in ["parallel", "parallel_fallback"]])
}
}
# Calculate processing speed
if summary["total_processing_time"] > 0:
total_mb = summary["total_size_bytes"] / (1024 * 1024)
summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]
elapsed_time = time.time() - start_time
summary["total_elapsed_time"] = elapsed_time
print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
print(f"Sequential: {summary['strategies_used']['sequential']}, Parallel: {summary['strategies_used']['parallel']}")
print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")
return {
"directory": str(directory_path),
"pattern": pattern,
"files": file_results,
"summary": summary
}
def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
"""
Save analysis results to a JSON file.
Args:
results: Analysis results to save
output_path: Path to save the results
"""
output_path = Path(output_path)
try:
start_time = time.time()
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
save_time = time.time() - start_time
file_size = output_path.stat().st_size
print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")
except Exception as e:
raise RuntimeError(f"Error saving results to {output_path}: {e}")
def main():
"""Main function for command-line usage."""
parser = argparse.ArgumentParser(
description="Hybrid JSONL schema analyzer with intelligent processing strategy"
)
parser.add_argument(
"path",
help="Path to JSONL file or directory containing JSONL files"
)
parser.add_argument(
"-o", "--output",
help="Output file for analysis results (JSON format)"
)
parser.add_argument(
"-p", "--pattern",
default="*.jsonl",
help="File pattern when analyzing directory (default: *.jsonl)"
)
parser.add_argument(
"-s", "--max-samples",
type=int,
default=1000,
help="Maximum number of JSON objects to sample per file (default: 1000)"
)
parser.add_argument(
"-w", "--workers",
type=int,
default=None,
help="Number of worker processes for parallel processing (default: CPU count, max 8)"
)
parser.add_argument(
"-t", "--threshold",
type=int,
default=100,
help="File size threshold in MB for parallel processing (default: 100)"
)
parser.add_argument(
"-c", "--chunk-size",
type=int,
default=1000,
help="Number of lines to process in each chunk (default: 1000)"
)
parser.add_argument(
"--directory",
action="store_true",
help="Treat path as directory instead of single file"
)
args = parser.parse_args()
# Initialize hybrid analyzer
analyzer = HybridJSONLSchemaAnalyzer(
max_samples=args.max_samples,
max_workers=args.workers,
parallel_threshold_mb=args.threshold,
chunk_size=args.chunk_size
)
try:
start_time = time.time()
# Analyze file or directory
if args.directory or Path(args.path).is_dir():
results = analyzer.analyze_directory(args.path, args.pattern)
else:
results = analyzer.analyze_jsonl_file(args.path)
total_time = time.time() - start_time
# Save or print results
if args.output:
analyzer.save_results(results, args.output)
else:
print("\n" + "="*50)
print("ANALYSIS RESULTS")
print("="*50)
print(json.dumps(results, indent=2, ensure_ascii=False))
print(f"\nTotal analysis time: {total_time:.2f}s")
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()


@@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""
Optimized JSONL Schema Analyzer
Analyzes JSONL files to extract and aggregate schema information using multiple cores.
For each JSONL file, it generates a schema showing the JSON structure
and aggregates all possible keys found across all records.
"""
import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count, Manager
import threading
from functools import partial
import gc
class OptimizedJSONLSchemaAnalyzer:
"""Optimized analyzer that uses multiple cores and system resources efficiently."""
def __init__(self, max_samples: int = 1000, max_workers: int = None, chunk_size: int = 1000):
"""
Initialize the optimized analyzer.
Args:
max_samples: Maximum number of JSON objects to sample per file
max_workers: Maximum number of worker processes (default: cpu_count)
chunk_size: Number of lines to process in each chunk
"""
self.max_samples = max_samples
self.max_workers = max_workers or min(cpu_count(), 8) # Limit to 8 to avoid memory issues
self.chunk_size = chunk_size
self.schema_cache = {}
print(f"Initialized analyzer with {self.max_workers} workers, chunk size: {self.chunk_size}")
def analyze_json_value(self, value: Any, depth: int = 0, max_depth: int = 10) -> Dict[str, Any]:
"""
Analyze a JSON value and return its type and structure.
Args:
value: The JSON value to analyze
depth: Current depth in the structure
max_depth: Maximum depth to analyze
Returns:
Dictionary describing the value's type and structure
"""
if depth > max_depth:
return {"type": "unknown", "note": "max_depth_reached"}
if value is None:
return {"type": "null"}
elif isinstance(value, bool):
return {"type": "boolean"}
elif isinstance(value, int):
return {"type": "integer"}
elif isinstance(value, float):
return {"type": "number"}
elif isinstance(value, str):
return {"type": "string", "sample_length": len(value)}
elif isinstance(value, list):
if not value:
return {"type": "array", "item_types": [], "length_range": [0, 0]}
item_types = set()
item_schemas = []
# Sample first few items to determine array structure
sample_size = min(10, len(value))
for item in value[:sample_size]:
item_schema = self.analyze_json_value(item, depth + 1, max_depth)
item_schemas.append(item_schema)
item_types.add(item_schema["type"])
return {
"type": "array",
"item_types": sorted(list(item_types)),
"length_range": [len(value), len(value)],
"sample_items": item_schemas[:3] # Keep first 3 as examples
}
elif isinstance(value, dict):
if not value:
return {"type": "object", "properties": {}, "required_keys": []}
properties = {}
for key, val in value.items():
properties[key] = self.analyze_json_value(val, depth + 1, max_depth)
return {
"type": "object",
"properties": properties,
"required_keys": list(value.keys())
}
else:
return {"type": "unknown", "note": f"unexpected_type: {type(value)}"}
def merge_schemas(self, schema1: Dict[str, Any], schema2: Dict[str, Any]) -> Dict[str, Any]:
"""
Merge two schemas, combining their information.
Args:
schema1: First schema
schema2: Second schema
Returns:
Merged schema
"""
if schema1["type"] != schema2["type"]:
# Different types, create a union
return {
"type": "union",
"possible_types": sorted(set([schema1["type"], schema2["type"]])),
"schemas": [schema1, schema2]
}
merged = {"type": schema1["type"]}
if schema1["type"] == "array":
# Merge array item types
item_types = set(schema1.get("item_types", []))
item_types.update(schema2.get("item_types", []))
merged["item_types"] = sorted(list(item_types))
# Merge length ranges
len1 = schema1.get("length_range", [0, 0])
len2 = schema2.get("length_range", [0, 0])
merged["length_range"] = [min(len1[0], len2[0]), max(len1[1], len2[1])]
# Merge sample items if available
if "sample_items" in schema1 or "sample_items" in schema2:
merged["sample_items"] = (
schema1.get("sample_items", []) +
schema2.get("sample_items", [])
)[:5] # Keep max 5 samples
elif schema1["type"] == "object":
# Merge object properties
properties = {}
all_keys = set()
# Copy properties from first schema
for key, val in schema1.get("properties", {}).items():
properties[key] = val
all_keys.add(key)
# Merge properties from second schema
for key, val in schema2.get("properties", {}).items():
if key in properties:
properties[key] = self.merge_schemas(properties[key], val)
else:
properties[key] = val
all_keys.add(key)
merged["properties"] = properties
merged["required_keys"] = sorted(list(all_keys))
# Copy other fields
for key in schema1:
if key not in merged and key != "type":
merged[key] = schema1[key]
return merged
def _extract_all_keys(self, obj: Any, prefix: str = "") -> List[str]:
"""
Recursively extract all keys from a JSON object.
Args:
obj: JSON object to analyze
prefix: Prefix for nested keys
Returns:
List of all keys found
"""
keys = []
if isinstance(obj, dict):
for key, value in obj.items():
full_key = f"{prefix}.{key}" if prefix else key
keys.append(full_key)
keys.extend(self._extract_all_keys(value, full_key))
elif isinstance(obj, list):
for i, item in enumerate(obj):
keys.extend(self._extract_all_keys(item, f"{prefix}[{i}]" if prefix else f"[{i}]"))
return keys
def _process_chunk(self, chunk_data: List[str]) -> Tuple[Counter, List[Dict], int, int]:
"""
Process a chunk of JSONL lines.
Args:
chunk_data: List of JSONL lines to process
Returns:
Tuple of (keys_counter, sample_objects, valid_count, error_count)
"""
all_keys = Counter()
sample_objects = []
valid_count = 0
error_count = 0
for line in chunk_data:
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
valid_count += 1
# Collect all keys from this object
keys = self._extract_all_keys(obj)
all_keys.update(keys)
# Keep sample objects for schema analysis
if len(sample_objects) < self.max_samples:
sample_objects.append(obj)
except json.JSONDecodeError:
error_count += 1
return all_keys, sample_objects, valid_count, error_count
def _read_file_chunks(self, file_path: Path) -> List[List[str]]:
"""
Read a JSONL file in chunks for parallel processing.
Args:
file_path: Path to the JSONL file
Returns:
List of chunks, each containing lines to process
"""
chunks = []
current_chunk = []
try:
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
current_chunk.append(line)
if len(current_chunk) >= self.chunk_size:
chunks.append(current_chunk)
current_chunk = []
# Add remaining lines
if current_chunk:
chunks.append(current_chunk)
except Exception as e:
raise RuntimeError(f"Error reading file {file_path}: {e}")
return chunks
def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
"""
Analyze a JSONL file and return schema information using parallel processing.
Args:
file_path: Path to the JSONL file
Returns:
Dictionary containing schema analysis results
"""
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
start_time = time.time()
file_size = file_path.stat().st_size
print(f"Analyzing {file_path.name} ({file_size / (1024*1024*1024):.2f} GB)...")
# Statistics
total_lines = 0
valid_lines = 0
error_lines = 0
all_keys = Counter()
merged_schema = None
sample_objects = []
# Read file in chunks and process in parallel
chunks = self._read_file_chunks(file_path)
if len(chunks) == 1 or self.max_workers == 1:
# Process sequentially for small files or single worker
for chunk in chunks:
chunk_keys, chunk_samples, chunk_valid, chunk_errors = self._process_chunk(chunk)
all_keys.update(chunk_keys)
sample_objects.extend(chunk_samples)
valid_lines += chunk_valid
error_lines += chunk_errors
total_lines += len(chunk)
else:
# Process chunks in parallel
with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all chunks for processing
future_to_chunk = {
executor.submit(self._process_chunk, chunk): chunk
for chunk in chunks
}
# Collect results as they complete
for future in as_completed(future_to_chunk):
chunk_keys, chunk_samples, chunk_valid, chunk_errors = future.result()
all_keys.update(chunk_keys)
sample_objects.extend(chunk_samples)
valid_lines += chunk_valid
error_lines += chunk_errors
total_lines += len(future_to_chunk[future])
# Limit sample objects
if len(sample_objects) >= self.max_samples:
sample_objects = sample_objects[:self.max_samples]
# Analyze schema from sample objects
if sample_objects:
for obj in sample_objects:
obj_schema = self.analyze_json_value(obj)
if merged_schema is None:
merged_schema = obj_schema
else:
merged_schema = self.merge_schemas(merged_schema, obj_schema)
# Prepare results
elapsed_time = time.time() - start_time
results = {
"file_path": str(file_path),
"file_size_bytes": file_size,
"total_lines": total_lines,
"valid_lines": valid_lines,
"error_lines": error_lines,
"sample_count": len(sample_objects),
"all_keys": dict(all_keys.most_common()),
"unique_key_count": len(all_keys),
"schema": merged_schema,
"analysis_timestamp": time.time(),
"processing_time_seconds": elapsed_time,
"workers_used": self.max_workers,
"chunks_processed": len(chunks)
}
print(f" Completed in {elapsed_time:.2f}s - {valid_lines:,} valid lines, {error_lines:,} errors")
# Clean up memory
gc.collect()
return results
def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
"""
Analyze all JSONL files in a directory using parallel processing.
Args:
directory_path: Path to directory containing JSONL files
pattern: File pattern to match (default: *.jsonl)
Returns:
Dictionary containing analysis results for all files
"""
directory_path = Path(directory_path)
if not directory_path.exists():
raise FileNotFoundError(f"Directory not found: {directory_path}")
# Find all JSONL files
jsonl_files = list(directory_path.glob(pattern))
if not jsonl_files:
print(f"No JSONL files found in {directory_path} with pattern {pattern}")
return {"files": [], "summary": {}}
print(f"Found {len(jsonl_files)} JSONL files to analyze using {self.max_workers} workers...")
start_time = time.time()
# Sort files by size (largest first) for better load balancing
jsonl_files.sort(key=lambda f: f.stat().st_size, reverse=True)
# Analyze files in parallel
file_results = {}
if len(jsonl_files) == 1 or self.max_workers == 1:
# Process sequentially for single file
for file_path in jsonl_files:
try:
file_results[file_path.name] = self.analyze_jsonl_file(file_path)
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
else:
# Process files in parallel
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all files for analysis
future_to_file = {
executor.submit(self.analyze_jsonl_file, file_path): file_path
for file_path in jsonl_files
}
# Collect results as they complete
for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
result = future.result()
file_results[file_path.name] = result
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
# Create summary
successful_results = [r for r in file_results.values() if "error" not in r]
summary = {
"total_files": len(jsonl_files),
"successfully_analyzed": len(successful_results),
"total_size_bytes": sum(
r.get("file_size_bytes", 0) for r in successful_results
),
"total_lines": sum(
r.get("total_lines", 0) for r in successful_results
),
"total_valid_lines": sum(
r.get("valid_lines", 0) for r in successful_results
),
"total_processing_time": sum(
r.get("processing_time_seconds", 0) for r in successful_results
),
"average_processing_speed_mb_per_sec": 0
}
# Calculate processing speed
if summary["total_processing_time"] > 0:
total_mb = summary["total_size_bytes"] / (1024 * 1024)
summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]
elapsed_time = time.time() - start_time
summary["total_elapsed_time"] = elapsed_time
print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")
return {
"directory": str(directory_path),
"pattern": pattern,
"files": file_results,
"summary": summary
}
def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
"""
Save analysis results to a JSON file.
Args:
results: Analysis results to save
output_path: Path to save the results
"""
output_path = Path(output_path)
try:
start_time = time.time()
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
save_time = time.time() - start_time
file_size = output_path.stat().st_size
print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")
except Exception as e:
raise RuntimeError(f"Error saving results to {output_path}: {e}")
def main():
"""Main function for command-line usage."""
parser = argparse.ArgumentParser(
description="Optimized JSONL schema analyzer using multiple cores"
)
parser.add_argument(
"path",
help="Path to JSONL file or directory containing JSONL files"
)
parser.add_argument(
"-o", "--output",
help="Output file for analysis results (JSON format)"
)
parser.add_argument(
"-p", "--pattern",
default="*.jsonl",
help="File pattern when analyzing directory (default: *.jsonl)"
)
parser.add_argument(
"-s", "--max-samples",
type=int,
default=1000,
help="Maximum number of JSON objects to sample per file (default: 1000)"
)
parser.add_argument(
"-w", "--workers",
type=int,
default=None,
help="Number of worker processes (default: CPU count, max 8)"
)
parser.add_argument(
"-c", "--chunk-size",
type=int,
default=1000,
help="Number of lines to process in each chunk (default: 1000)"
)
parser.add_argument(
"--directory",
action="store_true",
help="Treat path as directory instead of single file"
)
parser.add_argument(
"--profile",
action="store_true",
help="Enable performance profiling"
)
args = parser.parse_args()
# Initialize analyzer
analyzer = OptimizedJSONLSchemaAnalyzer(
max_samples=args.max_samples,
max_workers=args.workers,
chunk_size=args.chunk_size
)
try:
start_time = time.time()
# Analyze file or directory
if args.directory or Path(args.path).is_dir():
results = analyzer.analyze_directory(args.path, args.pattern)
else:
results = analyzer.analyze_jsonl_file(args.path)
total_time = time.time() - start_time
# Save or print results
if args.output:
analyzer.save_results(results, args.output)
else:
print("\n" + "="*50)
print("ANALYSIS RESULTS")
print("="*50)
print(json.dumps(results, indent=2, ensure_ascii=False))
print(f"\nTotal analysis time: {total_time:.2f}s")
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()
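# Example invocations (illustrative only; the script filename is a placeholder and the
# paths follow the raw_data/ and intermediate/ layout used elsewhere in this repo):
#   python jsonl_schema_analyzer.py raw_data/fr-raw-wiktextract-data.jsonl -o intermediate/fr_schema_analysis.json
#   python jsonl_schema_analyzer.py raw_data --directory -p "*.jsonl" -w 4 -c 2000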

View File

@@ -0,0 +1,212 @@
#!/usr/bin/env python3
"""
Run JSONL Schema Analysis with Default Configuration
This script runs the JSONL schema analyzer using predefined constants,
so you don't need to pass any command line arguments.
"""
import sys
from pathlib import Path
# Get the root directory (assumes this script sits two levels below the project root)
ROOT_DIR = Path(__file__).parent.parent.parent
# Configuration constants
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_OUTPUT_DIR = ROOT_DIR / "intermediate"
DEFAULT_LANG_FILTER = "fr"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INPUT_FILE = DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME
# Analyzer configuration
DEFAULT_MAX_SAMPLES = 1000
DEFAULT_MAX_WORKERS = None # Will use CPU count
DEFAULT_PARALLEL_THRESHOLD_MB = 100
DEFAULT_CHUNK_SIZE = 1000
# Output configuration
DEFAULT_OUTPUT_FILENAME = f"{DEFAULT_LANG_FILTER}_schema_analysis.json"
DEFAULT_OUTPUT_FILE = DEFAULT_OUTPUT_DIR / DEFAULT_OUTPUT_FILENAME
def main():
"""Run the schema analysis with default configuration."""
print("=" * 60)
print("JSONL Schema Analysis - Default Configuration")
print("=" * 60)
# Display configuration
print(f"Root directory: {ROOT_DIR}")
print(f"Input directory: {DEFAULT_INPUT_DIR}")
print(f"Input file: {DEFAULT_INPUT_FILENAME}")
print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
print(f"Output file: {DEFAULT_OUTPUT_FILENAME}")
print(f"Language filter: {DEFAULT_LANG_FILTER}")
print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
print(f"Max workers: {DEFAULT_MAX_WORKERS or 'Auto (CPU count)'}")
print()
# Check if input file exists
if not DEFAULT_INPUT_FILE.exists():
print(f"❌ Input file not found: {DEFAULT_INPUT_FILE}")
print()
print("Available files in raw_data directory:")
# List available JSONL files
if DEFAULT_INPUT_DIR.exists():
jsonl_files = list(DEFAULT_INPUT_DIR.glob("*.jsonl"))
if jsonl_files:
for i, file in enumerate(sorted(jsonl_files), 1):
size_mb = file.stat().st_size / (1024 * 1024)
print(f" {i:2d}. {file.name} ({size_mb:.1f} MB)")
else:
print(" No JSONL files found.")
else:
print(" raw_data directory not found.")
print()
print("To analyze a different file, modify the constants in this script:")
print(f" - DEFAULT_LANG_FILTER (currently: '{DEFAULT_LANG_FILTER}')")
print(f" - DEFAULT_INPUT_FILENAME (currently: '{DEFAULT_INPUT_FILENAME}')")
return False
# Create output directory if it doesn't exist
DEFAULT_OUTPUT_DIR.mkdir(exist_ok=True)
print(f"✅ Input file found: {DEFAULT_INPUT_FILE.stat().st_size / (1024*1024):.1f} MB")
print()
try:
# Import the hybrid analyzer
sys.path.insert(0, str(Path(__file__).parent))
from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer
# Initialize analyzer with default configuration
analyzer = HybridJSONLSchemaAnalyzer(
max_samples=DEFAULT_MAX_SAMPLES,
max_workers=DEFAULT_MAX_WORKERS,
parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
chunk_size=DEFAULT_CHUNK_SIZE
)
print("🚀 Starting analysis...")
print()
# Run analysis
results = analyzer.analyze_jsonl_file(DEFAULT_INPUT_FILE)
# Save results
analyzer.save_results(results, DEFAULT_OUTPUT_FILE)
print()
print("=" * 60)
print("ANALYSIS COMPLETE")
print("=" * 60)
print(f"📊 Results saved to: {DEFAULT_OUTPUT_FILE}")
print(f"📈 Valid lines processed: {results.get('valid_lines', 0):,}")
print(f"🔑 Unique keys found: {results.get('unique_key_count', 0):,}")
print(f"⏱️ Processing time: {results.get('processing_time_seconds', 0):.2f} seconds")
print(f"📁 File size: {results.get('file_size_bytes', 0) / (1024*1024):.1f} MB")
if results.get('processing_strategy'):
print(f"🔧 Strategy used: {results['processing_strategy']}")
return True
except ImportError as e:
print(f"❌ Error importing analyzer: {e}")
print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
return False
except Exception as e:
print(f"❌ Error during analysis: {e}")
return False
def run_directory_analysis():
"""Run analysis on entire directory with default configuration."""
print("=" * 60)
print("Directory JSONL Schema Analysis - Default Configuration")
print("=" * 60)
# Display configuration
print(f"Input directory: {DEFAULT_INPUT_DIR}")
print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
print(f"Pattern: *.jsonl")
print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
print()
# Check if input directory exists
if not DEFAULT_INPUT_DIR.exists():
print(f"❌ Input directory not found: {DEFAULT_INPUT_DIR}")
return False
# Create output directory if it doesn't exist
DEFAULT_OUTPUT_DIR.mkdir(exist_ok=True)
try:
# Import the hybrid analyzer
sys.path.insert(0, str(Path(__file__).parent))
from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer
# Initialize analyzer with default configuration
analyzer = HybridJSONLSchemaAnalyzer(
max_samples=DEFAULT_MAX_SAMPLES,
max_workers=DEFAULT_MAX_WORKERS,
parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
chunk_size=DEFAULT_CHUNK_SIZE
)
print("🚀 Starting directory analysis...")
print()
# Run analysis
results = analyzer.analyze_directory(DEFAULT_INPUT_DIR, "*.jsonl")
# Save results
output_file = DEFAULT_OUTPUT_DIR / "directory_schema_analysis.json"
analyzer.save_results(results, output_file)
print()
print("=" * 60)
print("DIRECTORY ANALYSIS COMPLETE")
print("=" * 60)
print(f"📊 Results saved to: {output_file}")
summary = results.get('summary', {})
print(f"📁 Files analyzed: {summary.get('successfully_analyzed', 0)}")
print(f"📈 Total valid lines: {summary.get('total_valid_lines', 0):,}")
print(f"⏱️ Total processing time: {summary.get('total_processing_time', 0):.2f} seconds")
print(f"📦 Total data: {summary.get('total_size_bytes', 0) / (1024*1024*1024):.2f} GB")
print(f"🚀 Average speed: {summary.get('average_processing_speed_mb_per_sec', 0):.2f} MB/sec")
if summary.get('strategies_used'):
strategies = summary['strategies_used']
print(f"🔧 Sequential files: {strategies.get('sequential', 0)}")
print(f"🔧 Parallel files: {strategies.get('parallel', 0)}")
return True
except ImportError as e:
print(f"❌ Error importing analyzer: {e}")
print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
return False
except Exception as e:
print(f"❌ Error during analysis: {e}")
return False
if __name__ == "__main__":
# You can choose what to run by default:
# Option 1: Analyze single file (based on DEFAULT_LANG_FILTER)
success = main()
# Option 2: Analyze entire directory (comment out the line above and uncomment below)
# success = run_directory_analysis()
if not success:
sys.exit(1)

scripts/collect_samples.py Normal file
View File

@@ -0,0 +1,152 @@
import json
import pathlib
import logging
import sys
import os
# ==============================================================================
# --- CONFIGURATION ---
# ==============================================================================
# --- Paths ---
# Try to determine project root relative to this script location
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
# Input directory containing the source semua.org files
RAW_DATA_DIR = ROOT_DIR / "raw_data"
# The pattern to match source files
FILE_PATTERN = "*raw-wiktextract-data.jsonl"
# Output directory for the collected samples
SAMPLES_DIR = ROOT_DIR / "samples"
# Final output filename
OUTPUT_FILENAME = "combined_samples.jsonl"
# --- Sampling Options ---
# How many matching entries to take from EACH source file.
SAMPLES_PER_FILE = 2
# Filter by Language Code.
# Leave empty set() to include ALL languages.
# Example: {"en", "de", "fr", "no"}
LANG_FILTER = set()
# Filter by Part of Speech.
# Leave empty set() to include ALL parts of speech.
# Example: {"noun", "verb", "adj"}
POS_FILTER = {"verb"}
# Filter to only include entries in their own language (lang_code matches file prefix)
OWN_LANG_FILTER = True
# ==============================================================================
# --- END OF CONFIGURATION ---
# ==============================================================================
# Setup simple logging to console
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
def collect_samples():
# 1. Setup Paths and Directories
input_dir = pathlib.Path(RAW_DATA_DIR)
output_dir = pathlib.Path(SAMPLES_DIR)
output_file = output_dir / OUTPUT_FILENAME
if not input_dir.exists():
logger.error(f"ERROR: Raw data directory not found at: {input_dir}")
logger.error("Please ensure your configuration points to the correct folder.")
sys.exit(1)
# Create samples directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)
# 2. Find all matching input files
source_files = list(input_dir.glob(FILE_PATTERN))
if not source_files:
logger.warning(f"No files matching '{FILE_PATTERN}' found in {input_dir}")
sys.exit(0)
logger.info(f"Found {len(source_files)} source files to sample from.")
logger.info(f"Target: {SAMPLES_PER_FILE} samples per file.")
logger.info(f"Language Filter: {LANG_FILTER if LANG_FILTER else 'ALL'}")
logger.info(f"POS Filter: {POS_FILTER if POS_FILTER else 'ALL'}")
logger.info(f"Own Language Filter: {'ENABLED' if OWN_LANG_FILTER else 'DISABLED'}")
logger.info("-" * 50)
total_collected = 0
# Open the output file once and append samples from all inputs to it
try:
with open(output_file, 'w', encoding='utf-8') as out_f:
for src_file in source_files:
logger.info(f"Scanning: {src_file.name}...")
lang_from_file = src_file.name[:2]
file_collected = 0
lines_read = 0
try:
with open(src_file, 'r', encoding='utf-8') as in_f:
for line in in_f:
lines_read += 1
# Stop reading this file if we have enough samples
if file_collected >= SAMPLES_PER_FILE:
break
if not line.strip():
continue
try:
entry = json.loads(line)
# --- Filtering Logic ---
# 1. Language Filter
if LANG_FILTER and entry.get('lang_code') not in LANG_FILTER:
continue
# 2. POS Filter
if POS_FILTER and entry.get('pos') not in POS_FILTER:
continue
# 3. Own Language Filter
if OWN_LANG_FILTER and entry.get('lang_code') != lang_from_file:
continue
# --- If it passed filters, save it ---
# We write it exactly as it is in the source
json.dump(entry, out_f, ensure_ascii=False)
out_f.write('\n')
file_collected += 1
total_collected += 1
except json.JSONDecodeError:
# Ignore bad lines in source files during sampling
continue
logger.info(f" -> Collected {file_collected} samples (scanned {lines_read} lines)")
except Exception as e:
logger.error(f" ERROR reading {src_file.name}: {e}")
except Exception as e:
logger.critical(f"FATAL ERROR writing output file: {e}")
sys.exit(1)
logger.info("-" * 50)
logger.info("SAMPLING COMPLETE")
logger.info(f"Total entries collected: {total_collected}")
logger.info(f"Output saved to: {output_file}")
if __name__ == "__main__":
collect_samples()

scripts/count_pos_values.py Normal file
View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Script to count all different "pos" values in JSONL files using parallel processing.
Analyzes all JSONL files in the raw_data directory and displays frequency counts.
Run it from the project root so the relative "raw_data" path resolves.
"""
import json
import os
import glob
from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
import time
from typing import Dict, List, Tuple
def process_jsonl_file(file_path: str) -> Tuple[str, Counter]:
"""
Process a single JSONL file and count POS values.
Args:
file_path: Path to the JSONL file
Returns:
Tuple of (filename, Counter of POS values)
"""
pos_counter = Counter()
line_count = 0
try:
with open(file_path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line:
continue
try:
data = json.loads(line)
if 'pos' in data and data['pos']:
pos_counter[data['pos']] += 1
line_count += 1
except json.JSONDecodeError as e:
print(f"Warning: JSON decode error in {file_path} at line {line_num}: {e}")
continue
except Exception as e:
print(f"Error processing file {file_path}: {e}")
return file_path, Counter()
print(f"Processed {file_path}: {line_count} lines, {sum(pos_counter.values())} POS entries")
return file_path, pos_counter
def main():
"""Main function to process all JSONL files and display POS statistics."""
# Find all JSONL files in raw_data directory
raw_data_dir = "raw_data"
jsonl_files = glob.glob(os.path.join(raw_data_dir, "*.jsonl"))
if not jsonl_files:
print(f"No JSONL files found in {raw_data_dir}")
return
print(f"Found {len(jsonl_files)} JSONL files to process")
print(f"Using {cpu_count()} CPU cores for parallel processing")
print("-" * 60)
# Process files in parallel
start_time = time.time()
all_pos_counts = Counter()
file_results = {}
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
# Submit all files for processing
future_to_file = {
executor.submit(process_jsonl_file, file_path): file_path
for file_path in jsonl_files
}
# Collect results as they complete
for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
filename, pos_counter = future.result()
file_results[filename] = pos_counter
all_pos_counts.update(pos_counter)
except Exception as e:
print(f"Error processing {file_path}: {e}")
end_time = time.time()
processing_time = end_time - start_time
# Display results
print("\n" + "=" * 80)
print("POS VALUE COUNTS ACROSS ALL FILES")
print("=" * 80)
print(f"Total processing time: {processing_time:.2f} seconds")
print(f"Total POS entries found: {sum(all_pos_counts.values()):,}")
print(f"Unique POS values: {len(all_pos_counts)}")
print("\nTop 50 most common POS values:")
print("-" * 80)
# Sort by frequency (descending)
sorted_pos = sorted(all_pos_counts.items(), key=lambda x: x[1], reverse=True)
for pos, count in sorted_pos[:100]:
percentage = (count / sum(all_pos_counts.values())) * 100
print(f"{pos:<20} {count:>10,} ({percentage:5.2f}%)")
if len(sorted_pos) > 100:
print(f"\n... and {len(sorted_pos) - 100} more POS values")
# Show all unique POS values (alphabetical)
print("\n" + "=" * 80)
print("ALL UNIQUE POS VALUES (ALPHABETICAL)")
print("=" * 80)
for pos, count in sorted(all_pos_counts.items(), key=lambda x: x[0].lower()):
print(f"{pos:<30} {count:>10,}")
# Per-file breakdown
print("\n" + "=" * 80)
print("PER-FILE BREAKDOWN")
print("=" * 80)
for filename, pos_counter in sorted(file_results.items()):
total_entries = sum(pos_counter.values())
if total_entries > 0:
print(f"\n{os.path.basename(filename)}:")
print(f" Total entries: {total_entries:,}")
print(f" Unique POS values: {len(pos_counter)}")
# All POS values for this file (sorted by frequency)
all_pos = sorted(pos_counter.items(), key=lambda x: x[1], reverse=True)
for pos, count in all_pos:
print(f" {pos:<15} {count:>8,}")
print(f"\nProcessing completed in {processing_time:.2f} seconds")
if __name__ == "__main__":
main()

scripts/lang_config.py Normal file
View File

@@ -0,0 +1,401 @@
GERMAN_VERB_CONFIG = {
"clean_prefixes": ["ich", "du", "er/sie/es", "wir", "ihr", "sie"],
"normalization_rules": [
{"field": "pronouns", "match": "ich", "add_tags": ["first-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "du", "add_tags": ["second-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "er", "add_tags": ["third-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "sie", "add_tags": ["third-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "es", "add_tags": ["third-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "wir", "add_tags": ["first-person", "plural", "indicative", "active"]},
{"field": "pronouns", "match": "ihr", "add_tags": ["second-person", "plural", "indicative", "active"]}
],
"properties": [
{
"name": "auxiliary",
"multivalue": True, # <--- CRITICAL CHANGE HERE
"default": ["haben"],
"rules": [
# Check for explicit raw tags
{"value": "sein", "criteria": {"raw_tags": ["Hilfsverb sein"]}},
{"value": "haben", "criteria": {"raw_tags": ["Hilfsverb haben"]}},
# Check for 'common forms' that imply the aux
{"value": "sein", "criteria": {"form_regex": "^sein$", "tags": ["auxiliary", "perfect"]}},
{"value": "haben", "criteria": {"form_regex": "^haben$", "tags": ["auxiliary", "perfect"]}}
]
},
{
"name": "separability",
"default": "inseparable",
"rules": [
{"value": "separable", "criteria": {"tags": ["separable"]}},
{"value": "inseparable", "criteria": {"tags": ["inseparable"]}},
{"value": "separable", "criteria": {"tags": ["participle-2"], "form_regex": "^(?!ge).+ge.+$"}}
]
}
],
"schema": {
"infinitive": {
"type": "single",
"criteria": {"tags": ["infinitive", "present"], "exclude_tags": ["extended", "passive", "reflexive", "zu"]}
},
"participle_perfect": {
"type": "single",
"criteria": {"tags": ["participle-2", "perfect"], "exclude_tags": ["active", "passive", "auxiliary"]}
},
"imperative": {
"type": "list",
"size": 2,
"base_criteria": {"tags": ["imperative", "present", "active"]},
"indices": [
{"index": 0, "tags": ["singular", "second-person"]},
{"index": 1, "tags": ["plural", "second-person"]}
]
},
"present": {
"type": "list",
"size": 6,
"base_criteria": {"tags": ["indicative", "present", "active"], "exclude_tags": ["passive"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"past": {
"type": "list",
"size": 6,
"base_criteria": {"tags": ["indicative", "past", "active"], "exclude_tags": ["passive"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"subjunctive_ii": {
"type": "list",
"size": 6,
"base_criteria": {"tags": ["subjunctive-ii", "past", "active"], "exclude_tags": ["passive"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
}
}
}
FRENCH_VERB_CONFIG = {
"skip_normalization_if_source": False,
# CHANGED: Set to False to prevent crashes on idioms, rare words, and defective verbs
"validate_completeness": False,
"clean_prefixes": [
"qu'", "qu", "que", "j'", "j", "je", "tu",
"il/elle/on", "il", "elle", "on", "nous", "vous", "ils/elles", "ils", "elles"
],
"normalization_rules": [
# Pronoun matches
{"field": "form", "match": r"\bje\b", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
{"field": "form", "match": r"\bj[']", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
{"field": "form", "match": r"\btu\b", "match_mode": "regex", "add_tags": ["second-person", "singular"]},
{"field": "form", "match": r"\b(il|elle|on|il/elle/on)\b", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
{"field": "form", "match": r"\[il/ɛl/ɔ̃\]", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
{"field": "form", "match": r"\bnous\b", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
{"field": "form", "match": r"\bvous\b", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
{"field": "form", "match": r"\b(ils|elles|ils/elles)\b", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
{"field": "form", "match": r"\[il/ɛl\]", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
# Suffix Heuristics
{"field": "form", "match": r"ons$", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
{"field": "form", "match": r"ez$", "match_mode": "regex", "add_tags": ["second-person", "plural"]}
],
"properties": [
{
"name": "auxiliary",
"multivalue": True,
"default": ["avoir"],
"rules": [
{"value": "être", "criteria": {"raw_tags": ["auxiliary être"]}},
{"value": "avoir", "criteria": {"raw_tags": ["auxiliary avoir"]}},
{"value": "être", "criteria": {"tags": ["auxiliary-être"]}},
{"value": "avoir", "criteria": {"tags": ["auxiliary-avoir"]}}
]
},
{
"name": "group",
"default": "unknown",
"rules": [
{"value": "1st-group", "criteria": {"raw_tags": ["1ᵉʳ groupe"]}},
{"value": "2nd-group", "criteria": {"raw_tags": ["2ᵉ groupe"]}},
{"value": "3rd-group", "criteria": {"raw_tags": ["3ᵉ groupe"]}},
{"value": "1st-group", "criteria": {"form_regex": "er$"}},
{"value": "2nd-group", "criteria": {"form_regex": "ir$"}},
{"value": "3rd-group", "criteria": {"form_regex": "(re|oir)$"}}
]
}
],
"schema": {
"infinitive": {
"type": "single",
"criteria": {"tags": ["infinitive", "present"]}
},
"participle_present": {
"type": "single",
"optional": True,
"criteria": {"tags": ["participle", "present"]}
},
"participle_past": {
"type": "single",
"optional": True,
"criteria": {"tags": ["participle", "past"], "exclude_tags": ["multiword-construction"]}
},
# All lists are now marked optional to handle defective verbs (like 'traire') and sparse data
"indicative_present": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["indicative", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_imperfect": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["indicative", "imperfect"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_future": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["indicative", "future"], "exclude_tags": ["perfect"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_simple_past": {
"type": "list", "size": 6, "optional": True, # Traire/clore do not have this
"base_criteria": {"tags": ["indicative", "past"], "exclude_tags": ["multiword-construction", "imperfect", "perfect", "anterior"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"subjunctive_present": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["subjunctive", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"conditional_present": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["conditional", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"imperative": {
"type": "list", "size": 3, "optional": True,
"base_criteria": {"tags": ["imperative", "present"]},
"indices": [
{"index": 0, "tags": ["singular"]},
{"index": 1, "tags": ["plural", "first-person"]},
{"index": 2, "tags": ["plural", "second-person"]},
{"index": 1, "criteria": {"form_regex": r"ons$"}},
{"index": 2, "criteria": {"form_regex": r"ez$"}},
{"index": 0, "criteria": {"form_regex": r"[es]$"}}
]
}
}
}
OLD_FRENCH_VERB_CONFIG = {
"skip_normalization_if_source": False,
"validate_completeness": True,
# --- 1. Normalization ---
"clean_prefixes": [
"qu'", "qu", "que", "j'", "j", "je", "tu",
"il/elle/on", "il", "elle", "on", "nous", "vous", "ils/elles", "ils", "elles"
],
"normalization_rules": [
{"field": "form", "match": r"\bje\b", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
{"field": "form", "match": r"\bj[']", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
{"field": "form", "match": r"\btu\b", "match_mode": "regex", "add_tags": ["second-person", "singular"]},
{"field": "form", "match": r"\b(il|elle|on|il/elle/on)\b", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
{"field": "form", "match": r"\[il/ɛl/ɔ̃\]", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
{"field": "form", "match": r"\bnous\b", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
{"field": "form", "match": r"\bvous\b", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
{"field": "form", "match": r"\b(ils|elles|ils/elles)\b", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
{"field": "form", "match": r"\[il/ɛl\]", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
],
# --- 2. Properties ---
"properties": [
{
"name": "auxiliary",
"multivalue": True,
"default": ["avoir"],
"rules": [
{"value": "être", "criteria": {"raw_tags": ["auxiliary être"]}},
{"value": "avoir", "criteria": {"raw_tags": ["auxiliary avoir"]}},
{"value": "être", "criteria": {"tags": ["auxiliary-être"]}},
{"value": "avoir", "criteria": {"tags": ["auxiliary-avoir"]}}
]
},
{
"name": "group",
"default": "unknown",
"rules": [
{"value": "1st-group", "criteria": {"raw_tags": ["1ᵉʳ groupe"]}},
{"value": "2nd-group", "criteria": {"raw_tags": ["2ᵉ groupe"]}},
{"value": "3rd-group", "criteria": {"raw_tags": ["3ᵉ groupe"]}},
{"value": "1st-group", "criteria": {"form_regex": "er$"}},
{"value": "2nd-group", "criteria": {"form_regex": "ir$"}},
{"value": "3rd-group", "criteria": {"form_regex": "(re|oir)$"}}
]
}
],
# --- 3. Schema ---
"schema": {
"infinitive": {
"type": "single",
"criteria": {"tags": ["infinitive", "present"]}
},
"participle_present": {
"type": "single",
"optional": True, # <--- NEW: Allows missing participle
"criteria": {"tags": ["participle", "present"]}
},
"participle_past": {
"type": "single",
"optional": True, # <--- Often missing in defective verbs
"criteria": {"tags": ["participle", "past"], "exclude_tags": ["multiword-construction"]}
},
"indicative_present": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["indicative", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_imperfect": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["indicative", "imperfect"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_future": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["indicative", "future"], "exclude_tags": ["perfect"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_simple_past": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["indicative", "past"], "exclude_tags": ["multiword-construction", "imperfect", "perfect", "anterior"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"subjunctive_present": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["subjunctive", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"conditional_present": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["conditional", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"imperative": {
"type": "list", "size": 3,
"optional": True, # <--- Often missing for phrases/defective verbs
"base_criteria": {"tags": ["imperative", "present"]},
"indices": [
{"index": 0, "tags": ["singular"]},
{"index": 1, "tags": ["plural", "first-person"]},
{"index": 2, "tags": ["plural", "second-person"]}
]
}
}
}
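# ------------------------------------------------------------------------------
# Illustrative sketch only (assumption: the real matching lives in
# InflectionProcessor, which is not part of this file). It shows how a "criteria"
# block such as {"tags": [...], "exclude_tags": [...], "form_regex": ...} from the
# configs above could be checked against a single wiktextract form entry.
import re
def _matches_criteria_example(form_entry: dict, criteria: dict) -> bool:
    """Return True if the form entry satisfies every constraint in the criteria dict."""
    tags = set(form_entry.get("tags", []))
    raw_tags = set(form_entry.get("raw_tags", []))
    if not set(criteria.get("tags", [])).issubset(tags):
        return False
    if tags & set(criteria.get("exclude_tags", [])):
        return False
    if not set(criteria.get("raw_tags", [])).issubset(raw_tags):
        return False
    pattern = criteria.get("form_regex")
    if pattern and not re.search(pattern, form_entry.get("form", "")):
        return False
    return True
# Example (assuming base_criteria and per-index tags are merged before matching):
#   _matches_criteria_example(
#       {"form": "geht", "tags": ["indicative", "present", "active", "third-person", "singular"]},
#       {"tags": ["indicative", "present", "active", "third-person", "singular"], "exclude_tags": ["passive"]},
#   )  # -> True, i.e. "geht" would fill index 2 of GERMAN_VERB_CONFIG["schema"]["present"]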

scripts/printline.py Normal file
View File

@@ -0,0 +1,38 @@
import json
import pathlib
from datetime import datetime
INPUT_FILE_NAME = "fr_raw-wiktextract-data.jsonl"
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME
# --- Configuration ---
START_LINE = 99 # 1-based index (first line is 1)
NUM_LINES = 99 # Number of lines/objects to write
def extract_lines_to_file(file_path, start_line, num_lines):
# Generate timestamp filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = file_path.parent / f"{timestamp}.json"
with open(file_path, 'r', encoding='utf-8') as infile:
with open(output_file, 'w', encoding='utf-8') as outfile:
for i, line in enumerate(infile, start=1):
if i >= start_line and i < start_line + num_lines:
try:
element = json.loads(line)
outfile.write(json.dumps(element, indent=2, ensure_ascii=False))
outfile.write('\n')
except json.JSONDecodeError:
outfile.write(f"Error: Line {i} is not valid JSON.\n")
print(f"Output written to: {output_file}")
if __name__ == "__main__":
extract_lines_to_file(INPUT_FILE, START_LINE, NUM_LINES)

scripts/search_word.py Normal file
View File

@@ -0,0 +1,110 @@
import json
import pathlib
from datetime import datetime
INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl" # <-- Update this to your file
# --- Dynamic Path Setup ---
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME
# --- Filter Configuration ---
# Set the POS (part of speech) you want to filter for
# Examples: "noun", "verb", "adj", "adv", etc.
# Set to None to skip POS filtering
FILTER_POS = "noun"
# Set the word you want to filter for
# Set to None to skip word filtering
FILTER_WORD = "grenouille"
# Set word prefix to filter for (e.g., "Septem" will match "September")
# Set to None to skip prefix filtering
FILTER_PREFIX = None
# Set word suffix to filter for (e.g., "ber" will match "September")
# Set to None to skip suffix filtering
FILTER_SUFFIX = None
# Maximum number of results to include (set to None for unlimited)
MAX_RESULTS = 5
def matches_filters(entry):
"""Check if an entry matches all active filters."""
# Filter by POS
if FILTER_POS is not None:
if entry.get("pos") != FILTER_POS:
return False
# Filter by exact word
if FILTER_WORD is not None:
if entry.get("word") != FILTER_WORD:
return False
# Filter by prefix
if FILTER_PREFIX is not None:
word = entry.get("word", "")
if not word.startswith(FILTER_PREFIX):
return False
# Filter by suffix
if FILTER_SUFFIX is not None:
word = entry.get("word", "")
if not word.endswith(FILTER_SUFFIX):
return False
return True
def filter_and_save(file_path):
"""Filter JSONL file and save matching entries."""
# Generate output filename with original filename and timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"
match_count = 0
total_lines = 0
with open(file_path, 'r', encoding='utf-8') as infile:
with open(output_file, 'w', encoding='utf-8') as outfile:
for line in infile:
total_lines += 1
try:
entry = json.loads(line)
# Check if entry matches filters
if matches_filters(entry):
outfile.write(json.dumps(entry, ensure_ascii=False))
outfile.write('\n')
match_count += 1
# Stop if we've reached max results
if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
break
except json.JSONDecodeError:
print(f"Warning: Line {total_lines} is not valid JSON.")
print(f"Filtered {match_count} entries from {total_lines} total lines")
print(f"Output written to: {output_file}")
# Print active filters
print("\nActive filters:")
if FILTER_POS:
print(f" - POS: {FILTER_POS}")
if FILTER_WORD:
print(f" - Word (exact): {FILTER_WORD}")
if FILTER_PREFIX:
print(f" - Prefix: {FILTER_PREFIX}")
if FILTER_SUFFIX:
print(f" - Suffix: {FILTER_SUFFIX}")
if __name__ == "__main__":
filter_and_save(INPUT_FILE)

View File

@@ -0,0 +1,419 @@
#!/usr/bin/env python3
"""
Universal Wiktionary Format Transformer
========================================
Transforms any Wiktionary JSON format to a standardized universal schema.
Usage:
python transform_wiktionary.py input.jsonl output.jsonl
python transform_wiktionary.py input.jsonl output.jsonl --validate
"""
import json
import sys
import argparse
from typing import Dict, List, Any, Optional
from pathlib import Path
class WiktionaryTransformer:
"""Transforms Wiktionary entries to universal format."""
def __init__(self, validate: bool = False):
self.validate = validate
self.stats = {
"total": 0,
"successful": 0,
"errors": 0,
"warnings": []
}
def transform_entry(self, raw_entry: Dict[str, Any]) -> Dict[str, Any]:
"""
Transform a single Wiktionary entry to universal format.
Args:
raw_entry: Raw entry from any Wiktionary edition
Returns:
Transformed entry in universal format
"""
# === REQUIRED CORE FIELDS ===
try:
universal = {
"word": raw_entry["word"],
"lang_code": raw_entry["lang_code"],
"pos": raw_entry["pos"],
"senses": raw_entry["senses"]
}
except KeyError as e:
raise ValueError(f"Missing required field: {e}")
# === PHONETICS ===
phonetics = self._extract_phonetics(raw_entry)
if phonetics:
universal["phonetics"] = phonetics
# === HYPHENATION ===
hyphenation = self._extract_hyphenation(raw_entry)
if hyphenation:
universal["hyphenation"] = hyphenation
# === FORMS ===
if "forms" in raw_entry:
universal["forms"] = raw_entry["forms"]
# === GRAMMATICAL FEATURES ===
grammatical = self._extract_grammatical_features(raw_entry)
if grammatical:
universal["grammatical_features"] = grammatical
# === ETYMOLOGY ===
etymology = self._extract_etymology(raw_entry)
if etymology:
universal["etymology"] = etymology
# === RELATIONS ===
relations = self._extract_relations(raw_entry)
if relations:
universal["relations"] = relations
# === TRANSLATIONS ===
if "translations" in raw_entry:
universal["translations"] = raw_entry["translations"]
# === DESCENDANTS ===
if "descendants" in raw_entry:
universal["descendants"] = raw_entry["descendants"]
# === METADATA ===
metadata = self._extract_metadata(raw_entry)
universal["metadata"] = metadata
return universal
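# Shape of the result (keys beyond the required four are added only when the
# source entry provides them; the values shown are placeholders, not real data):
#   {
#     "word": ..., "lang_code": ..., "pos": ..., "senses": [...],        # required
#     "phonetics": {...}, "hyphenation": [...], "forms": [...],          # optional
#     "grammatical_features": {...}, "etymology": {...}, "relations": {...},
#     "translations": [...], "descendants": [...],
#     "metadata": {...}                                                  # always present
#   }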
def _extract_phonetics(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract and normalize phonetic information."""
phonetics = {}
# Process sounds array
if "sounds" in entry and entry["sounds"]:
ipa_variations = []
audio_list = []
homophones = []
for sound in entry["sounds"]:
# IPA transcription with country information
if "ipa" in sound:
ipa_entry = {"ipa": sound["ipa"]}
# Preserve country information from raw_tags
if "raw_tags" in sound:
ipa_entry["raw_tags"] = sound["raw_tags"]
# Clean IPA string by removing special characters at beginning/end
cleaned_ipa = self._clean_ipa_string(sound["ipa"])
ipa_entry["ipa_cleaned"] = cleaned_ipa
ipa_variations.append(ipa_entry)
# Audio files (keep for now, will be removed in filter step)
if "audio" in sound:
audio_obj = {}
# Try multiple URL formats
for url_key in ["ogg_url", "mp3_url", "url"]:
if url_key in sound:
audio_obj["url"] = sound[url_key]
break
audio_obj["text"] = sound.get("audio", "")
if audio_obj:
audio_list.append(audio_obj)
# Homophones
if "homophone" in sound:
homophones.append(sound["homophone"])
if ipa_variations:
phonetics["ipa_variations"] = ipa_variations
if audio_list:
phonetics["audio"] = audio_list
if homophones:
phonetics["homophones"] = homophones
# Handle extra_sounds (some editions)
if "extra_sounds" in entry:
if "pronunciación" in entry["extra_sounds"]:
phonetics["notes"] = entry["extra_sounds"]["pronunciación"]
return phonetics if phonetics else None
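# Worked example (hypothetical wiktextract "sounds" input; the URL is a placeholder):
#   [{"ipa": "[ɡʁə.nuj]", "raw_tags": ["France"],
#     "audio": "Fr-grenouille.ogg", "ogg_url": "https://example.org/Fr-grenouille.ogg"}]
# produces:
#   {"ipa_variations": [{"ipa": "[ɡʁə.nuj]", "raw_tags": ["France"], "ipa_cleaned": "ɡʁə.nuj"}],
#    "audio": [{"url": "https://example.org/Fr-grenouille.ogg", "text": "Fr-grenouille.ogg"}]}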
def _clean_ipa_string(self, ipa_string: str) -> str:
"""Clean IPA string by removing special characters at beginning/end."""
if not ipa_string:
return ipa_string
# Remove leading/trailing special characters: [, ], \, :
cleaned = ipa_string.strip("[]\\:")
return cleaned
def _extract_hyphenation(self, entry: Dict[str, Any]) -> Optional[List[str]]:
"""Extract and normalize hyphenation."""
# Format 1: hyphenations array with parts
if "hyphenations" in entry and entry["hyphenations"]:
parts = []
for h in entry["hyphenations"]:
if isinstance(h, dict) and "parts" in h:
parts.extend(h["parts"])
elif isinstance(h, str):
parts.append(h)
if parts:
return parts
# Format 2: hyphenation string with separator
if "hyphenation" in entry:
# Split on common separators
hyph = entry["hyphenation"]
for sep in ["", "-", "·", ""]:
if sep in hyph:
return hyph.split(sep)
return [hyph]
return None
def _extract_grammatical_features(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract grammatical features and tags."""
if "tags" not in entry:
return None
grammatical = {"tags": entry["tags"]}
# Extract gender from tags
gender_map = {
"masculine": "masculine",
"feminine": "feminine",
"neuter": "neuter",
"common": "common",
"m": "masculine",
"f": "feminine",
"n": "neuter",
"c": "common"
}
for tag in entry["tags"]:
tag_lower = tag.lower()
if tag_lower in gender_map:
grammatical["gender"] = gender_map[tag_lower]
break
# Extract number
number_map = {
"singular": "singular",
"plural": "plural",
"dual": "dual",
"sg": "singular",
"pl": "plural"
}
for tag in entry["tags"]:
tag_lower = tag.lower()
if tag_lower in number_map:
grammatical["number"] = number_map[tag_lower]
break
return grammatical
def _extract_etymology(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract etymology information."""
etymology = {}
if "etymology_text" in entry:
etymology["text"] = entry["etymology_text"]
if "etymology_texts" in entry:
etymology["texts"] = entry["etymology_texts"]
if "etymology_number" in entry:
etymology["number"] = entry["etymology_number"]
return etymology if etymology else None
def _extract_relations(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract semantic and lexical relations."""
relations = {}
# Define all possible relation types
relation_fields = [
"synonyms", "antonyms", "hypernyms", "hyponyms",
"meronyms", "holonyms", "related", "derived",
"coordinate_terms", "troponyms", "compounds"
]
for field in relation_fields:
if field in entry and entry[field]:
relations[field] = entry[field]
return relations if relations else None
def _extract_metadata(self, entry: Dict[str, Any]) -> Dict[str, Any]:
"""Extract metadata and source information."""
metadata = {}
# Source language
if "lang" in entry:
metadata["source_lang"] = entry["lang"]
# Infer source language code if possible
if "lang_code" in entry:
metadata["source_lang_code"] = entry["lang_code"]
# POS title (localized)
if "pos_title" in entry:
metadata["pos_title"] = entry["pos_title"]
elif "pos_text" in entry:
metadata["pos_title"] = entry["pos_text"]
# Categories
if "categories" in entry:
metadata["categories"] = entry["categories"]
# Templates
templates = []
if "head_templates" in entry:
templates.extend(entry["head_templates"])
if "inflection_templates" in entry:
templates.extend(entry["inflection_templates"])
if templates:
metadata["templates"] = templates
# Additional metadata
if "attestations" in entry:
metadata["attestations"] = entry["attestations"]
return metadata
def transform_file(self, input_path: str, output_path: str) -> None:
"""
Transform an entire JSONL file.
Args:
input_path: Path to input JSONL file
output_path: Path to output JSONL file
"""
input_file = Path(input_path)
output_file = Path(output_path)
if not input_file.exists():
raise FileNotFoundError(f"Input file not found: {input_path}")
print(f"Transforming: {input_path}{output_path}")
with open(input_file, 'r', encoding='utf-8') as infile, \
open(output_file, 'w', encoding='utf-8') as outfile:
for line_num, line in enumerate(infile, 1):
line = line.strip()
if not line:
continue
self.stats["total"] += 1
try:
# Parse input
raw_entry = json.loads(line)
# Transform
universal_entry = self.transform_entry(raw_entry)
# Validate if requested
if self.validate:
self._validate_entry(universal_entry)
# Write output
outfile.write(json.dumps(universal_entry, ensure_ascii=False) + '\n')
self.stats["successful"] += 1
except json.JSONDecodeError as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: JSON decode error - {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
except ValueError as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
except Exception as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: Unexpected error - {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
self._print_summary()
def _validate_entry(self, entry: Dict[str, Any]) -> None:
"""Validate a transformed entry."""
required = ["word", "lang_code", "pos", "senses"]
for field in required:
if field not in entry:
raise ValueError(f"Missing required field after transformation: {field}")
def _print_summary(self) -> None:
"""Print transformation summary."""
print("\n" + "="*60)
print("TRANSFORMATION SUMMARY")
print("="*60)
print(f"Total entries: {self.stats['total']}")
print(f"Successful: {self.stats['successful']}")
print(f"Errors: {self.stats['errors']}")
if self.stats['total'] > 0:
success_rate = (self.stats['successful'] / self.stats['total']) * 100
print(f"Success rate: {success_rate:.1f}%")
if self.stats['warnings']:
print(f"\nWarnings: {len(self.stats['warnings'])}")
if len(self.stats['warnings']) <= 10:
for warning in self.stats['warnings']:
print(f" - {warning}")
else:
print(f" (showing first 10 of {len(self.stats['warnings'])})")
for warning in self.stats['warnings'][:10]:
print(f" - {warning}")
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description="Transform Wiktionary JSONL to universal format",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
%(prog)s input.jsonl output.jsonl
%(prog)s data/raw.jsonl data/transformed.jsonl --validate
"""
)
parser.add_argument("input", help="Input JSONL file")
parser.add_argument("output", help="Output JSONL file")
parser.add_argument("--validate", action="store_true",
help="Validate transformed entries")
args = parser.parse_args()
try:
transformer = WiktionaryTransformer(validate=args.validate)
transformer.transform_file(args.input, args.output)
# Exit with error code if there were errors
if transformer.stats["errors"] > 0:
sys.exit(1)
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()