Migrate to gitea

329  scripts/01_filter_dictionary.py  Normal file
@@ -0,0 +1,329 @@
#!/usr/bin/env python3
"""
Transforms dictionary data from kaikki.org JSONL format to the universal
dictionary schema defined in 'universal_dictionary_schema.json'.
Uses ALL system cores for parallel processing.
"""

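# Example invocation (illustrative paths; the flags match the argparse options defined in main() below):
#   python scripts/01_filter_dictionary.py --lang fr \
#       --input raw_data/fr-raw-wiktextract-data.jsonl \
#       --output-dir intermediate --trans-langs en,de,fr
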
import json
import pathlib
import logging
import sys
import argparse
import csv
import multiprocessing
import traceback
from datetime import datetime
from typing import List, Dict, Any, Set, Optional, Tuple

# ==============================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ==============================================================================

try:
    SCRIPT_DIR = pathlib.Path(__file__).parent
    ROOT_DIR = SCRIPT_DIR.parent
except NameError:
    SCRIPT_DIR = pathlib.Path.cwd()
    ROOT_DIR = SCRIPT_DIR.parent

sys.path.insert(0, str(ROOT_DIR))

# --- IMPORTS ---
try:
    from transform_wiktionary import WiktionaryTransformer
    from InflectionProcessor import InflectionProcessor
    # Import language configurations
    try:
        from lang_config import GERMAN_VERB_CONFIG
    except ImportError:
        GERMAN_VERB_CONFIG = {}
    try:
        from lang_config import FRENCH_VERB_CONFIG
    except ImportError:
        FRENCH_VERB_CONFIG = {}
except ImportError:
    pass

DEFAULT_LANG_FILTER = "fr"

DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"

DEFAULT_POS_WHITELIST = set()
DEFAULT_POS_BLACKLIST = {"unknown"}
DEFAULT_IGNORE_FORM_OF = True
DEFAULT_TRANS_LANGS = {"pt", "es", "en", "de", "it", "fr", "nl"}

# ==============================================================================
# --- LOGGING ---
# ==============================================================================
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ==============================================================================
# --- WORKER FUNCTION ---
# ==============================================================================

def process_chunk_filtering(
    chunk_lines: List[str],
    lang_filter: Optional[str],
    pos_whitelist: Set[str],
    pos_blacklist: Set[str],
    ignore_form_of: bool,
    translation_languages: Set[str],
    inflection_configs: Dict
) -> Tuple[List[str], Dict[str, int], List[str]]:

    # Re-instantiate processors inside the worker process
    transformer = WiktionaryTransformer()
    inflection_processor = InflectionProcessor(inflection_configs)

    form_of_tags = {"form-of", "affix", "particle", "suffix", "prefix"}

    results = []
    errors = []
    counters = {"processed": 0, "skipped": 0, "errors": 0}

    for line in chunk_lines:
        if not line.strip():
            continue

        try:
            data = json.loads(line)

            # --- Apply Filters ---
            if lang_filter and data.get("lang_code") != lang_filter:
                counters["skipped"] += 1; continue

            pos = data.get("pos")
            if pos_whitelist and pos not in pos_whitelist:
                counters["skipped"] += 1; continue
            if pos_blacklist and pos in pos_blacklist:
                counters["skipped"] += 1; continue

            if ignore_form_of:
                if set(data.get("tags", [])).intersection(form_of_tags):
                    counters["skipped"] += 1; continue

            # --- Filter Translations ---
            if 'translations' in data:
                data['translations'] = [
                    tr for tr in data['translations']
                    if tr.get('lang_code') in translation_languages
                ]

            # --- 1. Transform Data to Universal Schema ---
            new_entry = transformer.transform_entry(data)

            # --- CLEANUP PHONETICS (Audio & Duplicates) ---
            if 'phonetics' in new_entry:
                # Remove Audio
                if 'audio' in new_entry['phonetics']:
                    del new_entry['phonetics']['audio']

                # Process IPA variations to remove duplicates while preserving country information
                if 'ipa_variations' in new_entry['phonetics'] and isinstance(new_entry['phonetics']['ipa_variations'], list):
                    # Group variations by cleaned IPA to collect all regions for each pronunciation
                    ipa_groups = {}
                    for variation in new_entry['phonetics']['ipa_variations']:
                        ipa_cleaned = variation.get('ipa_cleaned', '')
                        if ipa_cleaned:
                            if ipa_cleaned not in ipa_groups:
                                ipa_groups[ipa_cleaned] = {
                                    "ipa": ipa_cleaned,
                                    "raw_tags": []
                                }
                            # Collect all raw_tags for this IPA
                            if 'raw_tags' in variation:
                                ipa_groups[ipa_cleaned]['raw_tags'].extend(variation['raw_tags'])

                    # Create compressed variations list
                    compressed_variations = []
                    for ipa_cleaned, group_data in ipa_groups.items():
                        variation = {"ipa": ipa_cleaned}
                        if group_data['raw_tags']:
                            # Remove duplicates from raw_tags while preserving order
                            seen_tags = set()
                            unique_tags = []
                            for tag in group_data['raw_tags']:
                                if tag not in seen_tags:
                                    unique_tags.append(tag)
                                    seen_tags.add(tag)
                            variation['raw_tags'] = unique_tags
                        compressed_variations.append(variation)

                    # Create simplified IPA list and compressed variations
                    simplified_ipa = list(ipa_groups.keys())
                    new_entry['phonetics']['ipa'] = simplified_ipa
                    new_entry['phonetics']['ipa_variations'] = compressed_variations

            # --- Filter out unnecessary fields ---
            if 'metadata' in new_entry:
                del new_entry['metadata']
            if 'translations' in new_entry:
                for tr in new_entry['translations']:
                    tr.pop('lang', None)
                    tr.pop('sense', None)

            if 'senses' in new_entry:
                for sense in new_entry['senses']:
                    if 'examples' in sense:
                        sense['examples'] = [ex['text'] for ex in sense['examples'] if 'text' in ex]

            if 'relations' in new_entry and 'derived' in new_entry['relations']:
                del new_entry['relations']['derived']

            # --- 2. Run Inflection Processor ---
            new_entry = inflection_processor.process(new_entry)

            # --- Remove lang_code after processing ---
            if 'lang_code' in new_entry:
                del new_entry['lang_code']

            results.append(json.dumps(new_entry, ensure_ascii=False))
            counters["processed"] += 1

        except ValueError as e:
            counters["skipped"] += 1
            errors.append(f"Value Error: {str(e)}")
        except json.JSONDecodeError:
            counters["errors"] += 1
        except Exception as e:
            counters["errors"] += 1
            errors.append(f"Unexpected Error: {str(e)}")

    return results, counters, errors

# ==============================================================================
# --- MAIN PROCESS ---
# ==============================================================================

def process_file(input_path: pathlib.Path, output_path: pathlib.Path, lang_filter: Optional[str],
                 pos_whitelist: Set[str], pos_blacklist: Set[str], ignore_form_of: bool,
                 translation_languages: Set[str]):

    logger.info("Starting parallel processing...")
    logger.info(f" Input file: {input_path}")
    logger.info(f" Output file: {output_path}")

    if not input_path.exists():
        logger.critical(f"Input file not found: {input_path}")
        sys.exit(1)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Prepare Inflection Configs
    inflection_configs = {
        'de_verb': GERMAN_VERB_CONFIG,
        'fr_verb': FRENCH_VERB_CONFIG
    }

    if lang_filter and f"{lang_filter}_verb" not in inflection_configs:
        logger.warning(f"No inflection configuration found for language '{lang_filter}'. Verbs will remain uncompressed.")

    logger.info("Reading input file into memory...")
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        logger.critical(f"Failed to read input file: {e}")
        sys.exit(1)

    total_lines = len(lines)
    logger.info(f"Total lines to process: {total_lines:,}")

    num_processes = multiprocessing.cpu_count()
    chunk_size = total_lines // num_processes + 1
    chunks = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)]
    logger.info(f"Split data into {len(chunks)} chunks for {num_processes} cores.")

    pool = multiprocessing.Pool(processes=num_processes)

    worker_args = [
        (chunk, lang_filter, pos_whitelist, pos_blacklist, ignore_form_of, translation_languages, inflection_configs)
        for chunk in chunks
    ]

    try:
        all_results = pool.starmap(process_chunk_filtering, worker_args)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        logger.warning("Interrupted by user. Terminating pool...")
        pool.terminate()
        sys.exit(1)
    except Exception as e:
        logger.critical(f"Error during parallel processing: {e}")
        traceback.print_exc()
        sys.exit(1)

    logger.info("Aggregating results and writing to output...")

    final_counters = {"processed": 0, "skipped": 0, "errors": 0}
    error_log_path = output_path.parent / "verb_errors.log"

    with open(output_path, 'w', encoding='utf-8') as out_f, \
         open(error_log_path, 'w', encoding='utf-8') as err_f:

        for result_strings, worker_stats, worker_errors in all_results:
            for k in final_counters:
                final_counters[k] += worker_stats.get(k, 0)

            for json_str in result_strings:
                out_f.write(json_str + "\n")

            for err_msg in worker_errors:
                err_f.write(err_msg + "\n")

    logger.info(f"DONE. Total Read: {total_lines}")
    logger.info(f"Processed: {final_counters['processed']}, Skipped: {final_counters['skipped']}, Errors: {final_counters['errors']}")

def main():
    parser = argparse.ArgumentParser(description="Transform kaikki.org JSONL to universal dictionary format (Parallel).")
    parser.add_argument("--input", type=pathlib.Path, default=DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME,
                        help="Path to the raw input JSONL file.")
    parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR,
                        help="Directory to save the transformed JSONL file.")
    parser.add_argument("--lang", type=str, default=DEFAULT_LANG_FILTER,
                        help="Language code to filter for (e.g., 'de').")
    parser.add_argument("--trans-langs", type=str, default=",".join(DEFAULT_TRANS_LANGS),
                        help="Comma-separated list of translation languages to keep.")

    args = parser.parse_args()

    output_filename = f"{args.lang.capitalize()}_universal.jsonl" if args.lang else "universal.jsonl"
    output_file_path = args.output_dir / output_filename

    trans_langs_set = set(lang.strip() for lang in args.trans_langs.split(",")) if args.trans_langs else set()

    process_file(
        args.input,
        output_file_path,
        args.lang,
        DEFAULT_POS_WHITELIST,
        DEFAULT_POS_BLACKLIST,
        DEFAULT_IGNORE_FORM_OF,
        trans_langs_set
    )

    stats_file = ROOT_DIR / "processing_stats.csv"
    if output_file_path.exists():
        file_size = output_file_path.stat().st_size
    else:
        file_size = 0

    timestamp = datetime.now().isoformat()
    write_header = not stats_file.exists()
    try:
        with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            if write_header:
                writer.writerow(['timestamp', 'output_file', 'size_bytes'])
            writer.writerow([timestamp, str(output_file_path), file_size])
    except Exception as e:
        logger.warning(f"Could not write stats csv: {e}")

if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()
380  scripts/02_create_db.py  Normal file
@@ -0,0 +1,380 @@
import json
import sqlite3
import pathlib
import traceback
import os
import argparse
import sys
import multiprocessing
import csv
import statistics
from datetime import datetime

try:
    import zstandard
except ImportError:
    print("ERROR: zstandard library not found. Please install it: pip install zstandard")
    sys.exit(1)

# ======================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ======================================================================

try:
    SCRIPT_DIR = pathlib.Path(__file__).parent
    ROOT_DIR = SCRIPT_DIR.parent
except NameError:
    SCRIPT_DIR = pathlib.Path.cwd()
    ROOT_DIR = SCRIPT_DIR.parent

DEFAULT_LANG_CODE = "fr"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"
DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"

COMPRESSION_LEVEL = 22
DICTIONARY_SAMPLE_COUNT = 200000
DICTIONARY_MAX_SIZE = 10 * 1024 * 1024  # 10MB

DEFAULT_UNCOMPRESSED_ONLY = False  # change this for compression!
DEFAULT_MINIMAL = False

# ======================================================================

def get_file_size_mb(filepath):
    return os.path.getsize(filepath) / (1024 * 1024)

def count_lines(filepath):
    print("Counting total lines for progress tracking...")
    with open(filepath, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

def process_chunk(chunk, compression_dict_bytes):
    import zstandard
    compression_dict = zstandard.ZstdCompressionDict(compression_dict_bytes)
    local_compressor = zstandard.ZstdCompressor(level=22, dict_data=compression_dict)
    results = []
    for line in chunk:
        if not line.strip(): continue
        try:
            entry = json.loads(line)
            word = entry.get("word")
            pos = entry.get("pos", "")
            if not word: continue
            data_to_compress = entry.copy()
            data_to_compress.pop("word", None)
            data_to_compress.pop("pos", None)
            value_bytes = json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8')
            compressed_blob = local_compressor.compress(value_bytes)
            results.append((word, pos, compressed_blob, len(value_bytes)))
        except Exception:
            pass
    return results

def process_chunk_uncompressed(chunk):
    results = []
    for line in chunk:
        if not line.strip(): continue
        try:
            entry = json.loads(line)
            word = entry.get("word")
            pos = entry.get("pos", "")
            if not word: continue
            data_to_store = entry.copy()
            data_to_store.pop("word", None)
            data_to_store.pop("pos", None)
            value_str = json.dumps(data_to_store, ensure_ascii=False)
            value_bytes = value_str.encode('utf-8')
            results.append((word, pos, value_str, len(value_bytes)))
        except Exception:
            pass
    return results

def train_config(config, lines):
    import zstandard
    sample_count, max_size = config
    step = max(1, len(lines) // sample_count)
    samples = []
    for j in range(0, len(lines), step):
        line = lines[j]
        if not line.strip(): continue
        entry = json.loads(line)
        data_to_compress = entry.copy()
        data_to_compress.pop("word", None)
        data_to_compress.pop("pos", None)
        samples.append(json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8'))
        if len(samples) >= sample_count: break
    if not samples:
        return None
    compression_dict = zstandard.train_dictionary(max_size, samples)
    dict_bytes = compression_dict.as_bytes()
    return (sample_count, max_size, len(dict_bytes), dict_bytes)

def create_database(lang_code, input_file, output_dir, intermediate_dir, uncompressed_only=False, minimal=False):

    database_file = output_dir / f"dictionary_{lang_code}.db"
    dictionary_file = output_dir / f"dictionary_{lang_code}.zstdict"

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Settings:\n - Language: {lang_code}\n - Input: {input_file}\n - DB Output: {database_file}\n - Dict Output: {dictionary_file}")

    if not input_file.exists():
        print(f"Error: Input file not found at {input_file}")
        sys.exit(1)

    total_lines = count_lines(input_file)
    print(f"Total lines to process: {total_lines:,}")

    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    num_processes = multiprocessing.cpu_count()
    chunk_size = len(lines) // num_processes + 1
    chunks = [lines[i:i+chunk_size] for i in range(0, len(lines), chunk_size)]

    # --- Pass 1: Training Compression Dictionary ---
    if not uncompressed_only:
        print("\n--- Pass 1: Training Compression Dictionary ---")
        try:
            if minimal:
                sample_count = DICTIONARY_SAMPLE_COUNT
                max_size = DICTIONARY_MAX_SIZE
                config = (sample_count, max_size)
                result = train_config(config, lines)
                if result is None:
                    print("Error: No valid dictionary trained.")
                    sys.exit(1)
                sample_count, max_size, dict_size, dict_bytes = result
                print(f"Using default configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
            else:
                # Generate 20 configurations to try (varying both sample_count and max_size)
                configs = []
                for i in range(20):
                    sample_count = 100000 + (i % 5) * 200000  # 5 different: 100k, 300k, 500k, 700k, 900k
                    max_size = (3 + (i // 5) * 2) * 1024 * 1024  # 4 different: 3MB, 5MB, 7MB, 9MB
                    configs.append((sample_count, max_size))

                pool = multiprocessing.Pool(processes=min(20, multiprocessing.cpu_count()))
                results = pool.starmap(train_config, [(config, lines) for config in configs])
                pool.close()
                pool.join()

                # Find the best configuration (largest dictionary size)
                valid_results = [r for r in results if r is not None]
                if not valid_results:
                    print("Error: No valid dictionaries trained.")
                    sys.exit(1)

                print("All configurations results:")
                for sample_count, max_size, dict_size, _ in valid_results:
                    print(f" samples={sample_count}, max_size={max_size/1024/1024:.1f}MB -> dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")

                best_result = max(valid_results, key=lambda x: x[2])
                sample_count, max_size, dict_size, dict_bytes = best_result

                print(f"\nBest configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")

            compression_dict = zstandard.ZstdCompressionDict(dict_bytes)

            with open(dictionary_file, "wb") as f:
                f.write(dict_bytes)
            print(f"Saved dictionary to {dictionary_file}")

        except Exception as e:
            print(f"Error during training: {e}")
            traceback.print_exc()
            sys.exit(1)

    if not uncompressed_only:
        # --- Database Setup ---
        if database_file.exists():
            os.remove(database_file)

        conn = sqlite3.connect(database_file)
        conn.execute("PRAGMA journal_mode=WAL;")
        conn.execute("PRAGMA auto_vacuum=full;")
        cursor = conn.cursor()
        compressor = zstandard.ZstdCompressor(level=COMPRESSION_LEVEL, dict_data=compression_dict)

        cursor.execute('''
            CREATE TABLE dictionary_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word TEXT NOT NULL,
                pos TEXT,
                data_blob BLOB,
                uncompressed_size INTEGER
            );
        ''')

        # --- Pass 2: Insert Data ---
        print("\n--- Pass 2: Inserting Data ---")

        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

        print("Processing chunks in parallel for compressed DB...")
        all_results = pool.starmap(process_chunk, zip(chunks, [dict_bytes] * len(chunks)))
        data_to_insert = [item for sublist in all_results for item in sublist]

        print(f"Collected {len(data_to_insert)} items to insert into compressed DB.")
        cursor.executemany("INSERT INTO dictionary_data (word, pos, data_blob, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert)
        word_counter = len(data_to_insert)

        conn.commit()
        print(f"Inserted {word_counter:,} words into compressed DB.")

        # --- Pass 3: FTS & Cleanup ---
        print("Creating FTS4 index...")
        cursor.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
        cursor.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
        conn.commit()

        print("Running VACUUM...")
        cursor.execute('VACUUM')
        conn.commit()
        conn.close()

        db_size_mb = get_file_size_mb(database_file)
        dict_size_mb = get_file_size_mb(dictionary_file)

        print(f"\n{'='*60}")
        print("SUCCESS: Database created.")
        print(f"{'='*60}")
        print(f"Final Database Size: {db_size_mb:.2f} MB ({database_file.name})")
        print(f"Final Dictionary Size: {dict_size_mb:.2f} MB ({dictionary_file.name})")
        print(f"{'='*60}")

    # --- Create Uncompressed Database ---
    print("\n--- Creating Uncompressed Database ---")
    uncompressed_db_file = intermediate_dir / f"dictionary_{lang_code}_uncompressed.db"

    # Ensure intermediate directory exists
    intermediate_dir.mkdir(parents=True, exist_ok=True)

    if uncompressed_db_file.exists():
        os.remove(uncompressed_db_file)

    conn2 = sqlite3.connect(uncompressed_db_file)
    conn2.execute("PRAGMA journal_mode=WAL;")
    conn2.execute("PRAGMA auto_vacuum=full;")
    cursor2 = conn2.cursor()

    cursor2.execute('''
        CREATE TABLE dictionary_data (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            word TEXT NOT NULL,
            pos TEXT,
            data TEXT,
            uncompressed_size INTEGER
        );
    ''')

    # --- Pass 2b: Insert Uncompressed Data ---
    print("\n--- Pass 2b: Inserting Uncompressed Data ---")

    print("Processing chunks in parallel for uncompressed DB...")
    if uncompressed_only:
        pool_uncomp = multiprocessing.Pool(processes=multiprocessing.cpu_count())
        all_results2 = pool_uncomp.map(process_chunk_uncompressed, chunks)
        pool_uncomp.close()
        pool_uncomp.join()
    else:
        all_results2 = pool.map(process_chunk_uncompressed, chunks)
        pool.close()
        pool.join()
    data_to_insert2 = [item for sublist in all_results2 for item in sublist]

    print(f"Collected {len(data_to_insert2)} items to insert into uncompressed DB.")
    cursor2.executemany("INSERT INTO dictionary_data (word, pos, data, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert2)
    word_counter2 = len(data_to_insert2)

    conn2.commit()
    print(f"Inserted {word_counter2:,} words into uncompressed DB.")

    # --- Pass 3b: FTS & Cleanup ---
    print("Creating FTS4 index for uncompressed DB...")
    cursor2.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
    cursor2.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
    conn2.commit()

    print("Running VACUUM on uncompressed DB...")
    cursor2.execute('VACUUM')
    conn2.commit()

    # Compute and print uncompressed_size statistics
    sizes = [row[0] for row in cursor2.execute("SELECT uncompressed_size FROM dictionary_data")]
    if sizes:
        min_size = min(sizes)
        max_size = max(sizes)
        avg_size = statistics.mean(sizes)
        median_size = statistics.median(sizes)
        try:
            stdev_size = statistics.stdev(sizes)
        except statistics.StatisticsError:
            stdev_size = 0.0

        print("\nUncompressed Size Statistics:")
        print(f" Count: {len(sizes):,}")
        print(f" Min: {min_size}")
        print(f" Max: {max_size}")
        print(f" Avg: {avg_size:.2f}")
        print(f" Median: {median_size}")
        print(f" Std Dev: {stdev_size:.2f}")

        # Outliers: top 10 largest entries
        outliers = cursor2.execute("SELECT word, uncompressed_size FROM dictionary_data ORDER BY uncompressed_size DESC LIMIT 10").fetchall()
        print("\nTop 10 largest entries by uncompressed size:")
        for word, size in outliers:
            print(f" {word}: {size:,} bytes")

    conn2.close()

    uncompressed_db_size_mb = get_file_size_mb(uncompressed_db_file)

    print(f"\n{'='*60}")
    print(f"Uncompressed Database Size: {uncompressed_db_size_mb:.2f} MB ({uncompressed_db_file.name})")
    print(f"{'='*60}")

def main():
    parser = argparse.ArgumentParser(description="Compress dictionary JSONL into SQLite DB.")
    parser.add_argument("--lang", type=str, default=DEFAULT_LANG_CODE,
                        help="Language code (e.g., 'de'). Used for naming output files.")
    parser.add_argument("--input", type=pathlib.Path,
                        help="Full path to input JSONL. If omitted, tries to find it in the standard intermediate folder based on lang.")
    parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR,
                        help="Directory to save .db and .zstdict files.")
    parser.add_argument("--intermediate-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR,
                        help="Directory to save uncompressed .db file.")

    args = parser.parse_args()

    # Determine input file if not explicitly provided
    if args.input:
        input_file = args.input
    else:
        # Guess the filename based on the language code, matching script 1's output
        filename = f"{args.lang.capitalize()}_universal.jsonl"
        input_file = DEFAULT_INTERMEDIATE_DIR / filename

    create_database(args.lang, input_file, args.output_dir, args.intermediate_dir, DEFAULT_UNCOMPRESSED_ONLY, DEFAULT_MINIMAL)

    # Log stats to CSV
    stats_file = ROOT_DIR / "processing_stats.csv"
    timestamp = datetime.now().isoformat()
    files_to_log = [
        (args.output_dir / f"dictionary_{args.lang}.db", "compressed_db"),
        (args.output_dir / f"dictionary_{args.lang}.zstdict", "compression_dict"),
        (args.intermediate_dir / f"dictionary_{args.lang}_uncompressed.db", "uncompressed_db")
    ]
    write_header = not stats_file.exists()
    with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['timestamp', 'output_file', 'size_bytes', 'type'])
        for file_path, file_type in files_to_log:
            if file_path.exists():
                size = file_path.stat().st_size
                writer.writerow([timestamp, str(file_path), size, file_type])

if __name__ == "__main__":
    main()
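
# Example invocation (illustrative; consumes the output of 01_filter_dictionary.py):
#   python scripts/02_create_db.py --lang fr --output-dir outputs --intermediate-dir intermediate
# This is expected to produce outputs/dictionary_fr.db and outputs/dictionary_fr.zstdict,
# plus intermediate/dictionary_fr_uncompressed.db.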
108  scripts/03_update_manifest.py  Normal file
@@ -0,0 +1,108 @@
import json
import os
import hashlib
import sys
import pathlib
import re
import argparse
from typing import Dict, Any, Set

# ======================================================================
# --- DEFAULT CONFIGURATION ---
# ======================================================================
try:
    SCRIPT_DIR = pathlib.Path(__file__).parent
    ROOT_DIR = SCRIPT_DIR.parent
except NameError:
    SCRIPT_DIR = pathlib.Path.cwd()
    ROOT_DIR = SCRIPT_DIR.parent

DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"
# ======================================================================

def calculate_sha256(filepath: pathlib.Path, block_size=65536) -> str | None:
    sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            for block in iter(lambda: f.read(block_size), b''):
                sha256.update(block)
    except IOError as e:
        print(f" ERROR: Could not read file '{filepath.name}': {e}")
        return None
    return sha256.hexdigest().upper()

def guess_properties_from_base(base_name: str) -> Dict[str, str]:
    match = re.match(r"dictionary_([a-zA-Z]{2,3})", base_name)
    if match:
        lang_code = match.group(1)
        return {"id": f"{lang_code}_dict", "name": f"Dictionary ({lang_code.upper()})", "lang_code": lang_code}
    return {"id": base_name, "name": f"Dictionary ({base_name})", "lang_code": "xx"}

def create_new_dict_entry(base_name: str, asset_files: list[pathlib.Path]) -> Dict[str, Any]:
    props = guess_properties_from_base(base_name)
    new_entry = {
        "id": props["id"], "name": props["name"], "description": "Auto-generated", "version": "1.0.0", "assets": []
    }
    for file_path in asset_files:
        print(f" -> Adding new asset: '{file_path.name}'")
        csum = calculate_sha256(file_path)
        if csum:
            new_entry["assets"].append({
                "filename": file_path.name, "size_bytes": os.path.getsize(file_path), "checksum_sha256": csum
            })
    return new_entry

def update_manifest(outputs_dir: pathlib.Path):
    manifest_path = outputs_dir / 'manifest.json'
    if not outputs_dir.exists():
        print(f"Error: Outputs directory does not exist: {outputs_dir}")
        sys.exit(1)

    manifest_data = {"files": []}
    if manifest_path.exists():
        try:
            with open(manifest_path, 'r', encoding='utf-8') as f:
                manifest_data = json.load(f)
            if 'files' not in manifest_data: manifest_data['files'] = []
        except Exception as e:
            print(f"Error reading manifest: {e}"); sys.exit(1)

    print(f"Scanning {outputs_dir} for assets...")
    assets_map = {asset['filename']: asset for entry in manifest_data.get('files', []) for asset in entry.get('assets', [])}

    discovered = list(outputs_dir.glob('*.db')) + list(outputs_dir.glob('*.zstdict'))
    new_files, updated_count = [], 0

    for fpath in discovered:
        fname = fpath.name
        if fname in assets_map:
            print(f"Updating: {fname}")
            assets_map[fname]['size_bytes'] = os.path.getsize(fpath)
            assets_map[fname]['checksum_sha256'] = calculate_sha256(fpath)
            updated_count += 1
        else:
            new_files.append(fpath)

    added_count = 0
    if new_files:
        grouped = {}
        for f in new_files:
            grouped.setdefault(f.stem, []).append(f)
        for base, files in grouped.items():
            print(f"Creating new entry for: {base}")
            manifest_data['files'].append(create_new_dict_entry(base, files))
            added_count += 1

    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest_data, f, indent=2, ensure_ascii=False)
    print(f"\nComplete. Updated {updated_count} assets, added {added_count} new entries.")

def main():
    parser = argparse.ArgumentParser(description="Update manifest.json with .db and .zstdict files.")
    parser.add_argument("--outputs-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR,
                        help="Directory containing assets and manifest.json.")
    args = parser.parse_args()
    update_manifest(args.outputs_dir)

if __name__ == "__main__":
    main()
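
# Example invocation (illustrative):
#   python scripts/03_update_manifest.py --outputs-dir outputs
# Scans the outputs directory for *.db and *.zstdict files and refreshes the
# size_bytes and checksum_sha256 fields of the matching entries in manifest.json.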
225  scripts/InflectionProcessor.py  Normal file
@@ -0,0 +1,225 @@
import re

class UniversalInflectionCompressor:
    """
    A generic inflection compressor that uses a configuration dictionary
    to process, partition, and compress verb forms for any language.
    """
    def __init__(self, config: dict):
        self.config = config

    def _matches_criteria(self, form: dict, criteria: dict) -> bool:
        """Helper: Checks if a form matches specific criteria."""
        # Regex Match
        if 'form_regex' in criteria:
            form_str = form.get('form', '')
            if form_str is None: form_str = ''
            if not re.search(criteria['form_regex'], form_str):
                return False

        # Tags Inclusion
        if 'tags' in criteria:
            form_tags = set(form.get('tags', []))
            required = set(criteria['tags'])
            if not required.issubset(form_tags):
                return False

        # Raw Tags Inclusion
        if 'raw_tags' in criteria:
            form_raw = set(form.get('raw_tags', []))
            required_raw = set(criteria['raw_tags'])
            if not required_raw.issubset(form_raw):
                return False

        # Tag Exclusion
        if 'exclude_tags' in criteria:
            form_tags = set(form.get('tags', []))
            if not form_tags.isdisjoint(set(criteria['exclude_tags'])):
                return False

        return True

    def _normalize_forms(self, forms: list) -> list:
        """Enriches forms with tags based on 'normalization_rules'."""
        rules = self.config.get('normalization_rules', [])
        skip_if_source = self.config.get('skip_normalization_if_source', True)

        for form in forms:
            if form.get('source') and skip_if_source:
                continue

            for rule in rules:
                field = rule.get('field')
                value_to_match = rule.get('match')
                match_mode = rule.get('match_mode', 'exact')
                add_tags = rule.get('add_tags', [])

                form_value = form.get(field)
                if form_value is None: continue

                is_match = False
                if match_mode == 'regex':
                    if isinstance(form_value, list):
                        for item in form_value:
                            if re.search(value_to_match, str(item)):
                                is_match = True; break
                    else:
                        if re.search(value_to_match, str(form_value)):
                            is_match = True
                else:
                    if isinstance(form_value, list):
                        is_match = value_to_match in form_value
                    else:
                        is_match = value_to_match == form_value

                if is_match:
                    current_tags = set(form.get('tags', []))
                    current_tags.update(add_tags)
                    form['tags'] = list(current_tags)
        return forms

    def _extract_properties(self, forms: list, entry_context: dict = None) -> dict:
        """Determines global properties (e.g. aux, group)."""
        properties = {}
        candidates = forms.copy()
        if entry_context:
            candidates.append(entry_context)

        for prop_def in self.config.get('properties', []):
            name = prop_def['name']
            default_val = prop_def.get('default')
            is_multivalue = prop_def.get('multivalue', False)

            found_values = set()
            for rule in prop_def.get('rules', []):
                for candidate in candidates:
                    if self._matches_criteria(candidate, rule.get('criteria', {})):
                        found_values.add(rule['value'])
                        if not is_multivalue:
                            break
                if found_values and not is_multivalue:
                    break

            if not found_values:
                if is_multivalue and default_val is not None:
                    properties[name] = default_val if isinstance(default_val, list) else [default_val]
                else:
                    properties[name] = default_val
            elif is_multivalue:
                properties[name] = sorted(list(found_values))
            else:
                properties[name] = list(found_values)[0]

        return properties

    def _clean_verb_string(self, form_string: str) -> str:
        ignored = self.config.get('clean_prefixes', [])
        current_string = form_string.strip()
        changed = True
        while changed:
            changed = False
            for prefix in ignored:
                if prefix.endswith("'") or prefix.endswith("’"):
                    if current_string.startswith(prefix):
                        current_string = current_string[len(prefix):]
                        changed = True
                        break
                else:
                    if current_string.startswith(prefix + " "):
                        current_string = current_string[len(prefix)+1:]
                        changed = True
                        break
        return current_string

    def compress(self, forms_list: list, word: str = None, entry: dict = None) -> dict:
        if not forms_list:
            return None

        # 1. Normalize tags
        normalized_forms = self._normalize_forms(forms_list)

        # 2. Extract Properties
        entry_context = None
        if entry:
            entry_context = {
                'form': entry.get('word', ''),
                'tags': entry.get('tags', []),
                'raw_tags': entry.get('raw_tags', [])
            }
        table_properties = self._extract_properties(normalized_forms, entry_context)

        # 3. Initialize Output
        result = table_properties.copy()

        # 4. Fill Slots
        schema = self.config.get('schema', {})
        for slot_name, slot_def in schema.items():
            slot_type = slot_def.get('type', 'single')

            if slot_type == 'single':
                result[slot_name] = None
                for form in normalized_forms:
                    if self._matches_criteria(form, slot_def.get('criteria', {})):
                        if result[slot_name] is None or (form.get('source') and not result[slot_name]):
                            result[slot_name] = self._clean_verb_string(form['form'])

            elif slot_type == 'list':
                size = slot_def.get('size', 6)
                result[slot_name] = [None] * size
                base_criteria = slot_def.get('base_criteria', {})
                candidates = [f for f in normalized_forms if self._matches_criteria(f, base_criteria)]

                for form in candidates:
                    idx = -1
                    # Iterate through index rules to find where this form belongs
                    for index_rule in slot_def.get('indices', []):
                        # Support full criteria in indices (e.g. form_regex), fallback to 'tags' shortcut
                        rule_criteria = index_rule.get('criteria', {})
                        if 'tags' in index_rule:
                            rule_criteria = rule_criteria.copy()
                            rule_criteria['tags'] = index_rule['tags']

                        if self._matches_criteria(form, rule_criteria):
                            idx = index_rule['index']
                            break

                    if idx >= 0 and idx < size:
                        current_val = result[slot_name][idx]
                        if current_val is None:
                            result[slot_name][idx] = self._clean_verb_string(form['form'])
                        elif form.get('source') and ("Flexion" in form.get('source') or "Conjugaison" in form.get('source')):
                            result[slot_name][idx] = self._clean_verb_string(form['form'])

        # 5. Fallbacks
        if not result.get('infinitive') and word:
            result['infinitive'] = word

        # 6. Validation
        if self.config.get('validate_completeness', False):
            for key, val in result.items():
                slot_config = schema.get(key, {})
                if slot_config.get('optional', False):
                    continue
                if val is None:
                    raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
                if isinstance(val, list):
                    for i, v in enumerate(val):
                        if v is None:
                            raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")

        return result

class InflectionProcessor:
    def __init__(self, configs):
        self.compressors = {k: UniversalInflectionCompressor(v) for k, v in configs.items()}

    def process(self, entry: dict) -> dict:
        key = f"{entry.get('lang_code')}_{entry.get('pos')}"
        if key in self.compressors:
            try:
                compressed = self.compressors[key].compress(entry.get('forms'), entry.get('word'), entry=entry)
                if compressed:
                    entry['forms'] = compressed
            except Exception as e:
                print(f"Error processing {entry.get('word')}: {e}")
        return entry
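
# Illustrative config sketch (NOT the real GERMAN_VERB_CONFIG / FRENCH_VERB_CONFIG from
# lang_config; all values below are made up). It only shows the keys that
# UniversalInflectionCompressor reads: normalization_rules, properties, schema,
# clean_prefixes, skip_normalization_if_source and validate_completeness.
EXAMPLE_VERB_CONFIG = {
    "skip_normalization_if_source": True,
    "normalization_rules": [
        # Add a canonical tag to forms whose raw_tags match a pattern.
        {"field": "raw_tags", "match": "présent", "match_mode": "regex", "add_tags": ["present"]},
    ],
    "properties": [
        # Table-level properties collected from any matching form (or the entry itself).
        {"name": "aux", "default": None, "multivalue": True,
         "rules": [{"criteria": {"form_regex": r"^avoir\b"}, "value": "avoir"}]},
    ],
    "clean_prefixes": ["il", "ils", "j'", "qu'"],
    "schema": {
        "infinitive": {"type": "single", "criteria": {"tags": ["infinitive"]}},
        "present": {"type": "list", "size": 6,
                    "base_criteria": {"tags": ["present", "indicative"]},
                    "indices": [{"tags": ["first-person", "singular"], "index": 0},
                                {"tags": ["second-person", "singular"], "index": 1}]},
    },
    "validate_completeness": False,
}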
358  scripts/Json Analyzer/jsonl_schema_analyzer_hybrid.py  Normal file
@@ -0,0 +1,358 @@
#!/usr/bin/env python3
"""
Hybrid JSONL Schema Analyzer

Intelligently chooses between sequential and parallel processing based on file size.
For small files, uses sequential processing. For large files, uses parallel processing.
"""

import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
import threading
from functools import partial
import gc

# Import the optimized analyzer for parallel processing
sys.path.insert(0, str(Path(__file__).parent))
try:
    from jsonl_schema_analyzer_optimized import OptimizedJSONLSchemaAnalyzer
except ImportError:
    print("Warning: Could not import optimized analyzer, using fallback")
    OptimizedJSONLSchemaAnalyzer = None


class HybridJSONLSchemaAnalyzer:
    """Hybrid analyzer that intelligently chooses processing strategy."""

    def __init__(self, max_samples: int = 1000, max_workers: int = None,
                 parallel_threshold_mb: int = 100, chunk_size: int = 1000):
        """
        Initialize the hybrid analyzer.

        Args:
            max_samples: Maximum number of JSON objects to sample per file
            max_workers: Maximum number of worker processes (default: cpu_count)
            parallel_threshold_mb: File size threshold in MB to use parallel processing
            chunk_size: Number of lines to process in each chunk
        """
        self.max_samples = max_samples
        self.max_workers = max_workers or min(cpu_count(), 8)
        self.parallel_threshold_mb = parallel_threshold_mb
        self.chunk_size = chunk_size

        # Import the original analyzer for small files
        sys.path.insert(0, str(Path(__file__).parent))
        try:
            from jsonl_schema_analyzer import JSONLSchemaAnalyzer
            self.sequential_analyzer = JSONLSchemaAnalyzer(max_samples=max_samples)
        except ImportError:
            print("Warning: Could not import sequential analyzer")
            self.sequential_analyzer = None

        # Initialize optimized analyzer for large files
        if OptimizedJSONLSchemaAnalyzer:
            self.parallel_analyzer = OptimizedJSONLSchemaAnalyzer(
                max_samples=max_samples,
                max_workers=max_workers,
                chunk_size=chunk_size
            )
        else:
            self.parallel_analyzer = None

        print("Hybrid analyzer initialized:")
        print(f"  Parallel threshold: {parallel_threshold_mb} MB")
        print(f"  Max workers: {self.max_workers}")
        print(f"  Chunk size: {self.chunk_size}")

    def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Analyze a JSONL file using the appropriate strategy.

        Args:
            file_path: Path to the JSONL file

        Returns:
            Dictionary containing schema analysis results
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Get file size in MB
        file_size_mb = file_path.stat().st_size / (1024 * 1024)

        print(f"Analyzing {file_path.name} ({file_size_mb:.2f} MB)...")

        # Choose processing strategy
        if file_size_mb >= self.parallel_threshold_mb and self.parallel_analyzer:
            print(f"  Using parallel processing (file >= {self.parallel_threshold_mb} MB)")
            result = self.parallel_analyzer.analyze_jsonl_file(file_path)
            result["processing_strategy"] = "parallel"
        elif self.sequential_analyzer:
            print(f"  Using sequential processing (file < {self.parallel_threshold_mb} MB)")
            result = self.sequential_analyzer.analyze_jsonl_file(file_path)
            result["processing_strategy"] = "sequential"
        else:
            # Fallback to parallel if sequential not available
            print("  Using parallel processing (sequential analyzer unavailable)")
            if self.parallel_analyzer:
                result = self.parallel_analyzer.analyze_jsonl_file(file_path)
                result["processing_strategy"] = "parallel_fallback"
            else:
                raise RuntimeError("No analyzer available")

        # Add hybrid-specific metadata
        result["file_size_mb"] = file_size_mb
        result["parallel_threshold_mb"] = self.parallel_threshold_mb

        return result

    def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
        """
        Analyze all JSONL files in a directory using hybrid processing.

        Args:
            directory_path: Path to directory containing JSONL files
            pattern: File pattern to match (default: *.jsonl)

        Returns:
            Dictionary containing analysis results for all files
        """
        directory_path = Path(directory_path)

        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        # Find all JSONL files
        jsonl_files = list(directory_path.glob(pattern))

        if not jsonl_files:
            print(f"No JSONL files found in {directory_path} with pattern {pattern}")
            return {"files": [], "summary": {}}

        print(f"Found {len(jsonl_files)} JSONL files to analyze...")
        start_time = time.time()

        # Categorize files by size
        small_files = []
        large_files = []

        for file_path in jsonl_files:
            size_mb = file_path.stat().st_size / (1024 * 1024)
            if size_mb >= self.parallel_threshold_mb:
                large_files.append(file_path)
            else:
                small_files.append(file_path)

        print(f"  Small files (< {self.parallel_threshold_mb} MB): {len(small_files)}")
        print(f"  Large files (>= {self.parallel_threshold_mb} MB): {len(large_files)}")

        file_results = {}

        # Process small files sequentially (they're fast anyway)
        if small_files and self.sequential_analyzer:
            print(f"Processing {len(small_files)} small files sequentially...")
            for file_path in small_files:
                try:
                    result = self.analyze_jsonl_file(file_path)
                    file_results[file_path.name] = result
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}

        # Process large files in parallel
        if large_files and self.parallel_analyzer:
            print(f"Processing {len(large_files)} large files in parallel...")

            if len(large_files) == 1:
                # Single large file - just process it directly
                file_path = large_files[0]
                try:
                    result = self.analyze_jsonl_file(file_path)
                    file_results[file_path.name] = result
                except Exception as e:
                    print(f"Error analyzing {file_path.name}: {e}")
                    file_results[file_path.name] = {"error": str(e)}
            else:
                # Multiple large files - process in parallel
                with ThreadPoolExecutor(max_workers=min(len(large_files), self.max_workers)) as executor:
                    future_to_file = {
                        executor.submit(self.analyze_jsonl_file, file_path): file_path
                        for file_path in large_files
                    }

                    for future in as_completed(future_to_file):
                        file_path = future_to_file[future]
                        try:
                            result = future.result()
                            file_results[file_path.name] = result
                        except Exception as e:
                            print(f"Error analyzing {file_path.name}: {e}")
                            file_results[file_path.name] = {"error": str(e)}

        # Create summary
        successful_results = [r for r in file_results.values() if "error" not in r]
        summary = {
            "total_files": len(jsonl_files),
            "small_files": len(small_files),
            "large_files": len(large_files),
            "successfully_analyzed": len(successful_results),
            "total_size_bytes": sum(
                r.get("file_size_bytes", 0) for r in successful_results
            ),
            "total_lines": sum(
                r.get("total_lines", 0) for r in successful_results
            ),
            "total_valid_lines": sum(
                r.get("valid_lines", 0) for r in successful_results
            ),
            "total_processing_time": sum(
                r.get("processing_time_seconds", 0) for r in successful_results
            ),
            "parallel_threshold_mb": self.parallel_threshold_mb,
            "strategies_used": {
                "sequential": len([r for r in successful_results if r.get("processing_strategy") == "sequential"]),
                "parallel": len([r for r in successful_results if r.get("processing_strategy") in ["parallel", "parallel_fallback"]])
            }
        }

        # Calculate processing speed
        if summary["total_processing_time"] > 0:
            total_mb = summary["total_size_bytes"] / (1024 * 1024)
            summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]

        elapsed_time = time.time() - start_time
        summary["total_elapsed_time"] = elapsed_time

        print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
        print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
        print(f"Sequential: {summary['strategies_used']['sequential']}, Parallel: {summary['strategies_used']['parallel']}")
        # Guard against the key being absent when no processing time was recorded.
        print(f"Average speed: {summary.get('average_processing_speed_mb_per_sec', 0.0):.2f} MB/sec")

        return {
            "directory": str(directory_path),
            "pattern": pattern,
            "files": file_results,
            "summary": summary
        }

    def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
        """
        Save analysis results to a JSON file.

        Args:
            results: Analysis results to save
            output_path: Path to save the results
        """
        output_path = Path(output_path)

        try:
            start_time = time.time()
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            save_time = time.time() - start_time
            file_size = output_path.stat().st_size
            print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")

        except Exception as e:
            raise RuntimeError(f"Error saving results to {output_path}: {e}")


def main():
    """Main function for command-line usage."""
    parser = argparse.ArgumentParser(
        description="Hybrid JSONL schema analyzer with intelligent processing strategy"
    )
    parser.add_argument(
        "path",
        help="Path to JSONL file or directory containing JSONL files"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file for analysis results (JSON format)"
    )
    parser.add_argument(
        "-p", "--pattern",
        default="*.jsonl",
        help="File pattern when analyzing directory (default: *.jsonl)"
    )
    parser.add_argument(
        "-s", "--max-samples",
        type=int,
        default=1000,
        help="Maximum number of JSON objects to sample per file (default: 1000)"
    )
    parser.add_argument(
        "-w", "--workers",
        type=int,
        default=None,
        help="Number of worker processes for parallel processing (default: CPU count, max 8)"
    )
    parser.add_argument(
        "-t", "--threshold",
        type=int,
        default=100,
        help="File size threshold in MB for parallel processing (default: 100)"
    )
    parser.add_argument(
        "-c", "--chunk-size",
        type=int,
        default=1000,
        help="Number of lines to process in each chunk (default: 1000)"
    )
    parser.add_argument(
        "--directory",
        action="store_true",
        help="Treat path as directory instead of single file"
    )

    args = parser.parse_args()

    # Initialize hybrid analyzer
    analyzer = HybridJSONLSchemaAnalyzer(
        max_samples=args.max_samples,
        max_workers=args.workers,
        parallel_threshold_mb=args.threshold,
        chunk_size=args.chunk_size
    )

    try:
        start_time = time.time()

        # Analyze file or directory
        if args.directory or Path(args.path).is_dir():
            results = analyzer.analyze_directory(args.path, args.pattern)
        else:
            results = analyzer.analyze_jsonl_file(args.path)

        total_time = time.time() - start_time

        # Save or print results
        if args.output:
            analyzer.save_results(results, args.output)
        else:
            print("\n" + "="*50)
            print("ANALYSIS RESULTS")
            print("="*50)
            print(json.dumps(results, indent=2, ensure_ascii=False))

        print(f"\nTotal analysis time: {total_time:.2f}s")

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
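
# Example invocation (illustrative; the input path follows the naming used by 01_filter_dictionary.py):
#   python "scripts/Json Analyzer/jsonl_schema_analyzer_hybrid.py" intermediate/Fr_universal.jsonl \
#       -o schema_report.json -t 100
# Files below the -t threshold (in MB) go through the sequential analyzer; larger ones use the
# parallel analyzer imported from jsonl_schema_analyzer_optimized.py.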
567  scripts/Json Analyzer/jsonl_schema_analyzer_optimized.py  Normal file
@@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""
Optimized JSONL Schema Analyzer

Analyzes JSONL files to extract and aggregate schema information using multiple cores.
For each JSONL file, it generates a schema showing the JSON structure
and aggregates all possible keys found across all records.
"""

import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count, Manager
import threading
from functools import partial
import gc


class OptimizedJSONLSchemaAnalyzer:
    """Optimized analyzer that uses multiple cores and system resources efficiently."""

    def __init__(self, max_samples: int = 1000, max_workers: int = None, chunk_size: int = 1000):
        """
        Initialize the optimized analyzer.

        Args:
            max_samples: Maximum number of JSON objects to sample per file
            max_workers: Maximum number of worker processes (default: cpu_count)
            chunk_size: Number of lines to process in each chunk
        """
        self.max_samples = max_samples
        self.max_workers = max_workers or min(cpu_count(), 8)  # Limit to 8 to avoid memory issues
        self.chunk_size = chunk_size
        self.schema_cache = {}

        print(f"Initialized analyzer with {self.max_workers} workers, chunk size: {self.chunk_size}")

    def analyze_json_value(self, value: Any, depth: int = 0, max_depth: int = 10) -> Dict[str, Any]:
        """
        Analyze a JSON value and return its type and structure.

        Args:
            value: The JSON value to analyze
            depth: Current depth in the structure
            max_depth: Maximum depth to analyze

        Returns:
            Dictionary describing the value's type and structure
        """
        if depth > max_depth:
            return {"type": "unknown", "note": "max_depth_reached"}

        if value is None:
            return {"type": "null"}
        elif isinstance(value, bool):
            return {"type": "boolean"}
        elif isinstance(value, int):
            return {"type": "integer"}
        elif isinstance(value, float):
            return {"type": "number"}
        elif isinstance(value, str):
            return {"type": "string", "sample_length": len(value)}
        elif isinstance(value, list):
            if not value:
                return {"type": "array", "item_types": [], "length_range": [0, 0]}

            item_types = set()
            item_schemas = []

            # Sample first few items to determine array structure
            sample_size = min(10, len(value))
            for item in value[:sample_size]:
                item_schema = self.analyze_json_value(item, depth + 1, max_depth)
                item_schemas.append(item_schema)
                item_types.add(item_schema["type"])

            return {
                "type": "array",
                "item_types": sorted(list(item_types)),
                "length_range": [len(value), len(value)],
                "sample_items": item_schemas[:3]  # Keep first 3 as examples
            }
        elif isinstance(value, dict):
            if not value:
                return {"type": "object", "properties": {}, "required_keys": []}

            properties = {}
            for key, val in value.items():
                properties[key] = self.analyze_json_value(val, depth + 1, max_depth)

            return {
                "type": "object",
                "properties": properties,
                "required_keys": list(value.keys())
            }
        else:
            return {"type": "unknown", "note": f"unexpected_type: {type(value)}"}

    def merge_schemas(self, schema1: Dict[str, Any], schema2: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge two schemas, combining their information.

        Args:
            schema1: First schema
            schema2: Second schema

        Returns:
            Merged schema
        """
        if schema1["type"] != schema2["type"]:
            # Different types, create a union
            return {
                "type": "union",
                "possible_types": sorted(set([schema1["type"], schema2["type"]])),
                "schemas": [schema1, schema2]
            }

        merged = {"type": schema1["type"]}

        if schema1["type"] == "array":
            # Merge array item types
            item_types = set(schema1.get("item_types", []))
            item_types.update(schema2.get("item_types", []))
            merged["item_types"] = sorted(list(item_types))

            # Merge length ranges
            len1 = schema1.get("length_range", [0, 0])
            len2 = schema2.get("length_range", [0, 0])
            merged["length_range"] = [min(len1[0], len2[0]), max(len1[1], len2[1])]

            # Merge sample items if available
            if "sample_items" in schema1 or "sample_items" in schema2:
                merged["sample_items"] = (
                    schema1.get("sample_items", []) +
                    schema2.get("sample_items", [])
                )[:5]  # Keep max 5 samples

        elif schema1["type"] == "object":
            # Merge object properties
            properties = {}
            all_keys = set()

            # Copy properties from first schema
            for key, val in schema1.get("properties", {}).items():
                properties[key] = val
                all_keys.add(key)

            # Merge properties from second schema
            for key, val in schema2.get("properties", {}).items():
                if key in properties:
                    properties[key] = self.merge_schemas(properties[key], val)
                else:
                    properties[key] = val
                all_keys.add(key)

            merged["properties"] = properties
            merged["required_keys"] = sorted(list(all_keys))
||||
# Copy other fields
|
||||
for key in schema1:
|
||||
if key not in merged and key != "type":
|
||||
merged[key] = schema1[key]
|
||||
|
||||
return merged
|
||||
|
||||
def _extract_all_keys(self, obj: Any, prefix: str = "") -> List[str]:
|
||||
"""
|
||||
Recursively extract all keys from a JSON object.
|
||||
|
||||
Args:
|
||||
obj: JSON object to analyze
|
||||
prefix: Prefix for nested keys
|
||||
|
||||
Returns:
|
||||
List of all keys found
|
||||
"""
|
||||
keys = []
|
||||
|
||||
if isinstance(obj, dict):
|
||||
for key, value in obj.items():
|
||||
full_key = f"{prefix}.{key}" if prefix else key
|
||||
keys.append(full_key)
|
||||
keys.extend(self._extract_all_keys(value, full_key))
|
||||
|
||||
elif isinstance(obj, list):
|
||||
for i, item in enumerate(obj):
|
||||
keys.extend(self._extract_all_keys(item, f"{prefix}[{i}]" if prefix else f"[{i}]"))
|
||||
|
||||
return keys
|
||||
|
||||
def _process_chunk(self, chunk_data: List[str]) -> Tuple[Counter, List[Dict], int, int]:
|
||||
"""
|
||||
Process a chunk of JSONL lines.
|
||||
|
||||
Args:
|
||||
chunk_data: List of JSONL lines to process
|
||||
|
||||
Returns:
|
||||
Tuple of (keys_counter, sample_objects, valid_count, error_count)
|
||||
"""
|
||||
all_keys = Counter()
|
||||
sample_objects = []
|
||||
valid_count = 0
|
||||
error_count = 0
|
||||
|
||||
for line in chunk_data:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
valid_count += 1
|
||||
|
||||
# Collect all keys from this object
|
||||
keys = self._extract_all_keys(obj)
|
||||
all_keys.update(keys)
|
||||
|
||||
# Keep sample objects for schema analysis
|
||||
if len(sample_objects) < self.max_samples:
|
||||
sample_objects.append(obj)
|
||||
|
||||
except json.JSONDecodeError:
|
||||
error_count += 1
|
||||
|
||||
return all_keys, sample_objects, valid_count, error_count
|
||||
|
||||
def _read_file_chunks(self, file_path: Path) -> List[List[str]]:
|
||||
"""
|
||||
Read a JSONL file in chunks for parallel processing.
|
||||
|
||||
Args:
|
||||
file_path: Path to the JSONL file
|
||||
|
||||
Returns:
|
||||
List of chunks, each containing lines to process
|
||||
"""
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
current_chunk.append(line)
|
||||
|
||||
if len(current_chunk) >= self.chunk_size:
|
||||
chunks.append(current_chunk)
|
||||
current_chunk = []
|
||||
|
||||
# Add remaining lines
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk)
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Error reading file {file_path}: {e}")
|
||||
|
||||
return chunks
|
||||
|
||||
def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze a JSONL file and return schema information using parallel processing.
|
||||
|
||||
Args:
|
||||
file_path: Path to the JSONL file
|
||||
|
||||
Returns:
|
||||
Dictionary containing schema analysis results
|
||||
"""
|
||||
file_path = Path(file_path)
|
||||
|
||||
if not file_path.exists():
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
start_time = time.time()
|
||||
file_size = file_path.stat().st_size
|
||||
print(f"Analyzing {file_path.name} ({file_size / (1024*1024*1024):.2f} GB)...")
|
||||
|
||||
# Statistics
|
||||
total_lines = 0
|
||||
valid_lines = 0
|
||||
error_lines = 0
|
||||
all_keys = Counter()
|
||||
merged_schema = None
|
||||
sample_objects = []
|
||||
|
||||
# Read file in chunks and process in parallel
|
||||
chunks = self._read_file_chunks(file_path)
|
||||
|
||||
if len(chunks) == 1 or self.max_workers == 1:
|
||||
# Process sequentially for small files or single worker
|
||||
for chunk in chunks:
|
||||
chunk_keys, chunk_samples, chunk_valid, chunk_errors = self._process_chunk(chunk)
|
||||
all_keys.update(chunk_keys)
|
||||
sample_objects.extend(chunk_samples)
|
||||
valid_lines += chunk_valid
|
||||
error_lines += chunk_errors
|
||||
total_lines += len(chunk)
|
||||
else:
|
||||
# Process chunks in parallel
|
||||
with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
|
||||
# Submit all chunks for processing
|
||||
future_to_chunk = {
|
||||
executor.submit(self._process_chunk, chunk): chunk
|
||||
for chunk in chunks
|
||||
}
|
||||
|
||||
# Collect results as they complete
|
||||
for future in as_completed(future_to_chunk):
|
||||
chunk_keys, chunk_samples, chunk_valid, chunk_errors = future.result()
|
||||
all_keys.update(chunk_keys)
|
||||
sample_objects.extend(chunk_samples)
|
||||
valid_lines += chunk_valid
|
||||
error_lines += chunk_errors
|
||||
total_lines += len(future_to_chunk[future])
|
||||
|
||||
# Limit sample objects
|
||||
if len(sample_objects) >= self.max_samples:
|
||||
sample_objects = sample_objects[:self.max_samples]
|
||||
|
||||
# Analyze schema from sample objects
|
||||
if sample_objects:
|
||||
for obj in sample_objects:
|
||||
obj_schema = self.analyze_json_value(obj)
|
||||
|
||||
if merged_schema is None:
|
||||
merged_schema = obj_schema
|
||||
else:
|
||||
merged_schema = self.merge_schemas(merged_schema, obj_schema)
|
||||
|
||||
# Prepare results
|
||||
elapsed_time = time.time() - start_time
|
||||
results = {
|
||||
"file_path": str(file_path),
|
||||
"file_size_bytes": file_size,
|
||||
"total_lines": total_lines,
|
||||
"valid_lines": valid_lines,
|
||||
"error_lines": error_lines,
|
||||
"sample_count": len(sample_objects),
|
||||
"all_keys": dict(all_keys.most_common()),
|
||||
"unique_key_count": len(all_keys),
|
||||
"schema": merged_schema,
|
||||
"analysis_timestamp": time.time(),
|
||||
"processing_time_seconds": elapsed_time,
|
||||
"workers_used": self.max_workers,
|
||||
"chunks_processed": len(chunks)
|
||||
}
|
||||
|
||||
print(f" Completed in {elapsed_time:.2f}s - {valid_lines:,} valid lines, {error_lines:,} errors")
|
||||
|
||||
# Clean up memory
|
||||
gc.collect()
|
||||
|
||||
return results
|
||||
|
||||
def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze all JSONL files in a directory using parallel processing.
|
||||
|
||||
Args:
|
||||
directory_path: Path to directory containing JSONL files
|
||||
pattern: File pattern to match (default: *.jsonl)
|
||||
|
||||
Returns:
|
||||
Dictionary containing analysis results for all files
|
||||
"""
|
||||
directory_path = Path(directory_path)
|
||||
|
||||
if not directory_path.exists():
|
||||
raise FileNotFoundError(f"Directory not found: {directory_path}")
|
||||
|
||||
# Find all JSONL files
|
||||
jsonl_files = list(directory_path.glob(pattern))
|
||||
|
||||
if not jsonl_files:
|
||||
print(f"No JSONL files found in {directory_path} with pattern {pattern}")
|
||||
return {"files": [], "summary": {}}
|
||||
|
||||
print(f"Found {len(jsonl_files)} JSONL files to analyze using {self.max_workers} workers...")
|
||||
start_time = time.time()
|
||||
|
||||
# Sort files by size (largest first) for better load balancing
|
||||
jsonl_files.sort(key=lambda f: f.stat().st_size, reverse=True)
|
||||
|
||||
# Analyze files in parallel
|
||||
file_results = {}
|
||||
|
||||
if len(jsonl_files) == 1 or self.max_workers == 1:
|
||||
# Process sequentially for single file
|
||||
for file_path in jsonl_files:
|
||||
try:
|
||||
file_results[file_path.name] = self.analyze_jsonl_file(file_path)
|
||||
except Exception as e:
|
||||
print(f"Error analyzing {file_path.name}: {e}")
|
||||
file_results[file_path.name] = {"error": str(e)}
|
||||
else:
|
||||
# Process files in parallel
|
||||
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
||||
# Submit all files for analysis
|
||||
future_to_file = {
|
||||
executor.submit(self.analyze_jsonl_file, file_path): file_path
|
||||
for file_path in jsonl_files
|
||||
}
|
||||
|
||||
# Collect results as they complete
|
||||
for future in as_completed(future_to_file):
|
||||
file_path = future_to_file[future]
|
||||
try:
|
||||
result = future.result()
|
||||
file_results[file_path.name] = result
|
||||
except Exception as e:
|
||||
print(f"Error analyzing {file_path.name}: {e}")
|
||||
file_results[file_path.name] = {"error": str(e)}
|
||||
|
||||
# Create summary
|
||||
successful_results = [r for r in file_results.values() if "error" not in r]
|
||||
summary = {
|
||||
"total_files": len(jsonl_files),
|
||||
"successfully_analyzed": len(successful_results),
|
||||
"total_size_bytes": sum(
|
||||
r.get("file_size_bytes", 0) for r in successful_results
|
||||
),
|
||||
"total_lines": sum(
|
||||
r.get("total_lines", 0) for r in successful_results
|
||||
),
|
||||
"total_valid_lines": sum(
|
||||
r.get("valid_lines", 0) for r in successful_results
|
||||
),
|
||||
"total_processing_time": sum(
|
||||
r.get("processing_time_seconds", 0) for r in successful_results
|
||||
),
|
||||
"average_processing_speed_mb_per_sec": 0
|
||||
}
|
||||
|
||||
# Calculate processing speed
|
||||
if summary["total_processing_time"] > 0:
|
||||
total_mb = summary["total_size_bytes"] / (1024 * 1024)
|
||||
summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
summary["total_elapsed_time"] = elapsed_time
|
||||
|
||||
print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
|
||||
print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
|
||||
print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")
|
||||
|
||||
return {
|
||||
"directory": str(directory_path),
|
||||
"pattern": pattern,
|
||||
"files": file_results,
|
||||
"summary": summary
|
||||
}
|
||||
|
||||
def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
|
||||
"""
|
||||
Save analysis results to a JSON file.
|
||||
|
||||
Args:
|
||||
results: Analysis results to save
|
||||
output_path: Path to save the results
|
||||
"""
|
||||
output_path = Path(output_path)
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||
|
||||
save_time = time.time() - start_time
|
||||
file_size = output_path.stat().st_size
|
||||
print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Error saving results to {output_path}: {e}")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function for command-line usage."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Optimized JSONL schema analyzer using multiple cores"
|
||||
)
|
||||
parser.add_argument(
|
||||
"path",
|
||||
help="Path to JSONL file or directory containing JSONL files"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o", "--output",
|
||||
help="Output file for analysis results (JSON format)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p", "--pattern",
|
||||
default="*.jsonl",
|
||||
help="File pattern when analyzing directory (default: *.jsonl)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-s", "--max-samples",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Maximum number of JSON objects to sample per file (default: 1000)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-w", "--workers",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Number of worker processes (default: CPU count, max 8)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c", "--chunk-size",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of lines to process in each chunk (default: 1000)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--directory",
|
||||
action="store_true",
|
||||
help="Treat path as directory instead of single file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="Enable performance profiling"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Initialize analyzer
|
||||
analyzer = OptimizedJSONLSchemaAnalyzer(
|
||||
max_samples=args.max_samples,
|
||||
max_workers=args.workers,
|
||||
chunk_size=args.chunk_size
|
||||
)
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
|
||||
# Analyze file or directory
|
||||
if args.directory or Path(args.path).is_dir():
|
||||
results = analyzer.analyze_directory(args.path, args.pattern)
|
||||
else:
|
||||
results = analyzer.analyze_jsonl_file(args.path)
|
||||
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Save or print results
|
||||
if args.output:
|
||||
analyzer.save_results(results, args.output)
|
||||
else:
|
||||
print("\n" + "="*50)
|
||||
print("ANALYSIS RESULTS")
|
||||
print("="*50)
|
||||
print(json.dumps(results, indent=2, ensure_ascii=False))
|
||||
|
||||
print(f"\nTotal analysis time: {total_time:.2f}s")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
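# --- Illustrative usage sketch (not part of the original script) ---
# Shows how the analyzer defined above can be driven programmatically instead of
# through the CLI. The input and output paths are placeholders; adjust them to
# files that actually exist in your checkout.
def _example_programmatic_run():  # hypothetical helper, never called by this script
    analyzer = OptimizedJSONLSchemaAnalyzer(max_samples=200, max_workers=2, chunk_size=500)
    results = analyzer.analyze_jsonl_file(Path("raw_data/fr-raw-wiktextract-data.jsonl"))
    analyzer.save_results(results, Path("intermediate/fr_schema_analysis.json"))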
|
||||
212
scripts/Json Analyzer/run_schema_analysis.py
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Run JSONL Schema Analysis with Default Configuration
|
||||
|
||||
This script runs the JSONL schema analyzer using predefined constants,
|
||||
so you don't need to pass any command line arguments.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Get the project root directory (this script lives in scripts/Json Analyzer/, three levels below the root)
|
||||
ROOT_DIR = Path(__file__).parent.parent.parent
|
||||
|
||||
# Configuration constants
|
||||
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
|
||||
DEFAULT_OUTPUT_DIR = ROOT_DIR / "intermediate"
|
||||
DEFAULT_LANG_FILTER = "fr"
|
||||
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
|
||||
DEFAULT_INPUT_FILE = DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME
|
||||
|
||||
# Analyzer configuration
|
||||
DEFAULT_MAX_SAMPLES = 1000
|
||||
DEFAULT_MAX_WORKERS = None # Will use CPU count
|
||||
DEFAULT_PARALLEL_THRESHOLD_MB = 100
|
||||
DEFAULT_CHUNK_SIZE = 1000
|
||||
|
||||
# Output configuration
|
||||
DEFAULT_OUTPUT_FILENAME = f"{DEFAULT_LANG_FILTER}_schema_analysis.json"
|
||||
DEFAULT_OUTPUT_FILE = DEFAULT_OUTPUT_DIR / DEFAULT_OUTPUT_FILENAME
|
||||
|
||||
def main():
|
||||
"""Run the schema analysis with default configuration."""
|
||||
|
||||
print("=" * 60)
|
||||
print("JSONL Schema Analysis - Default Configuration")
|
||||
print("=" * 60)
|
||||
|
||||
# Display configuration
|
||||
print(f"Root directory: {ROOT_DIR}")
|
||||
print(f"Input directory: {DEFAULT_INPUT_DIR}")
|
||||
print(f"Input file: {DEFAULT_INPUT_FILENAME}")
|
||||
print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
|
||||
print(f"Output file: {DEFAULT_OUTPUT_FILENAME}")
|
||||
print(f"Language filter: {DEFAULT_LANG_FILTER}")
|
||||
print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
|
||||
print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
|
||||
print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
|
||||
print(f"Max workers: {DEFAULT_MAX_WORKERS or 'Auto (CPU count)'}")
|
||||
print()
|
||||
|
||||
# Check if input file exists
|
||||
if not DEFAULT_INPUT_FILE.exists():
|
||||
print(f"❌ Input file not found: {DEFAULT_INPUT_FILE}")
|
||||
print()
|
||||
print("Available files in raw_data directory:")
|
||||
|
||||
# List available JSONL files
|
||||
if DEFAULT_INPUT_DIR.exists():
|
||||
jsonl_files = list(DEFAULT_INPUT_DIR.glob("*.jsonl"))
|
||||
if jsonl_files:
|
||||
for i, file in enumerate(sorted(jsonl_files), 1):
|
||||
size_mb = file.stat().st_size / (1024 * 1024)
|
||||
print(f" {i:2d}. {file.name} ({size_mb:.1f} MB)")
|
||||
else:
|
||||
print(" No JSONL files found.")
|
||||
else:
|
||||
print(" raw_data directory not found.")
|
||||
|
||||
print()
|
||||
print("To analyze a different file, modify the constants in this script:")
|
||||
print(f" - DEFAULT_LANG_FILTER (currently: '{DEFAULT_LANG_FILTER}')")
|
||||
print(f" - DEFAULT_INPUT_FILENAME (currently: '{DEFAULT_INPUT_FILENAME}')")
|
||||
return False
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
DEFAULT_OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"✅ Input file found: {DEFAULT_INPUT_FILE.stat().st_size / (1024*1024):.1f} MB")
|
||||
print()
|
||||
|
||||
try:
|
||||
# Import the hybrid analyzer
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer
|
||||
|
||||
# Initialize analyzer with default configuration
|
||||
analyzer = HybridJSONLSchemaAnalyzer(
|
||||
max_samples=DEFAULT_MAX_SAMPLES,
|
||||
max_workers=DEFAULT_MAX_WORKERS,
|
||||
parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
|
||||
chunk_size=DEFAULT_CHUNK_SIZE
|
||||
)
|
||||
|
||||
print("🚀 Starting analysis...")
|
||||
print()
|
||||
|
||||
# Run analysis
|
||||
results = analyzer.analyze_jsonl_file(DEFAULT_INPUT_FILE)
|
||||
|
||||
# Save results
|
||||
analyzer.save_results(results, DEFAULT_OUTPUT_FILE)
|
||||
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("ANALYSIS COMPLETE")
|
||||
print("=" * 60)
|
||||
print(f"📊 Results saved to: {DEFAULT_OUTPUT_FILE}")
|
||||
print(f"📈 Valid lines processed: {results.get('valid_lines', 0):,}")
|
||||
print(f"🔑 Unique keys found: {results.get('unique_key_count', 0):,}")
|
||||
print(f"⏱️ Processing time: {results.get('processing_time_seconds', 0):.2f} seconds")
|
||||
print(f"📁 File size: {results.get('file_size_bytes', 0) / (1024*1024):.1f} MB")
|
||||
|
||||
if results.get('processing_strategy'):
|
||||
print(f"🔧 Strategy used: {results['processing_strategy']}")
|
||||
|
||||
return True
|
||||
|
||||
except ImportError as e:
|
||||
print(f"❌ Error importing analyzer: {e}")
|
||||
print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"❌ Error during analysis: {e}")
|
||||
return False
|
||||
|
||||
def run_directory_analysis():
|
||||
"""Run analysis on entire directory with default configuration."""
|
||||
|
||||
print("=" * 60)
|
||||
print("Directory JSONL Schema Analysis - Default Configuration")
|
||||
print("=" * 60)
|
||||
|
||||
# Display configuration
|
||||
print(f"Input directory: {DEFAULT_INPUT_DIR}")
|
||||
print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
|
||||
print(f"Pattern: *.jsonl")
|
||||
print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
|
||||
print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
|
||||
print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
|
||||
print()
|
||||
|
||||
# Check if input directory exists
|
||||
if not DEFAULT_INPUT_DIR.exists():
|
||||
print(f"❌ Input directory not found: {DEFAULT_INPUT_DIR}")
|
||||
return False
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
DEFAULT_OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
# Import the hybrid analyzer
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer
|
||||
|
||||
# Initialize analyzer with default configuration
|
||||
analyzer = HybridJSONLSchemaAnalyzer(
|
||||
max_samples=DEFAULT_MAX_SAMPLES,
|
||||
max_workers=DEFAULT_MAX_WORKERS,
|
||||
parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
|
||||
chunk_size=DEFAULT_CHUNK_SIZE
|
||||
)
|
||||
|
||||
print("🚀 Starting directory analysis...")
|
||||
print()
|
||||
|
||||
# Run analysis
|
||||
results = analyzer.analyze_directory(DEFAULT_INPUT_DIR, "*.jsonl")
|
||||
|
||||
# Save results
|
||||
output_file = DEFAULT_OUTPUT_DIR / "directory_schema_analysis.json"
|
||||
analyzer.save_results(results, output_file)
|
||||
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("DIRECTORY ANALYSIS COMPLETE")
|
||||
print("=" * 60)
|
||||
print(f"📊 Results saved to: {output_file}")
|
||||
|
||||
summary = results.get('summary', {})
|
||||
print(f"📁 Files analyzed: {summary.get('successfully_analyzed', 0)}")
|
||||
print(f"📈 Total valid lines: {summary.get('total_valid_lines', 0):,}")
|
||||
print(f"⏱️ Total processing time: {summary.get('total_processing_time', 0):.2f} seconds")
|
||||
print(f"📦 Total data: {summary.get('total_size_bytes', 0) / (1024*1024*1024):.2f} GB")
|
||||
print(f"🚀 Average speed: {summary.get('average_processing_speed_mb_per_sec', 0):.2f} MB/sec")
|
||||
|
||||
if summary.get('strategies_used'):
|
||||
strategies = summary['strategies_used']
|
||||
print(f"🔧 Sequential files: {strategies.get('sequential', 0)}")
|
||||
print(f"🔧 Parallel files: {strategies.get('parallel', 0)}")
|
||||
|
||||
return True
|
||||
|
||||
except ImportError as e:
|
||||
print(f"❌ Error importing analyzer: {e}")
|
||||
print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"❌ Error during analysis: {e}")
|
||||
return False
|
||||
|
||||
if __name__ == "__main__":
|
||||
# You can choose what to run by default:
|
||||
|
||||
# Option 1: Analyze single file (based on DEFAULT_LANG_FILTER)
|
||||
success = main()
|
||||
|
||||
# Option 2: Analyze entire directory (comment out the line above and uncomment below)
|
||||
# success = run_directory_analysis()
|
||||
|
||||
if not success:
|
||||
sys.exit(1)
|
||||
152
scripts/collect_samples.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import json
|
||||
import pathlib
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
|
||||
# ==============================================================================
|
||||
# --- CONFIGURATION ---
|
||||
# ==============================================================================
|
||||
|
||||
# --- Paths ---
|
||||
# Try to determine project root relative to this script location
|
||||
try:
|
||||
SCRIPT_DIR = pathlib.Path(__file__).parent
|
||||
ROOT_DIR = SCRIPT_DIR.parent
|
||||
except NameError:
|
||||
SCRIPT_DIR = pathlib.Path.cwd()
|
||||
ROOT_DIR = SCRIPT_DIR.parent
|
||||
|
||||
# Input directory containing the source wiktextract JSONL files
|
||||
RAW_DATA_DIR = ROOT_DIR / "raw_data"
|
||||
|
||||
# The pattern to match source files
|
||||
FILE_PATTERN = "*raw-wiktextract-data.jsonl"
|
||||
|
||||
# Output directory for the collected samples
|
||||
SAMPLES_DIR = ROOT_DIR / "samples"
|
||||
|
||||
# Final output filename
|
||||
OUTPUT_FILENAME = "combined_samples.jsonl"
|
||||
|
||||
# --- Sampling Options ---
|
||||
|
||||
# How many matching entries to take from EACH source file.
|
||||
SAMPLES_PER_FILE = 2
|
||||
|
||||
# Filter by Language Code.
# Leave as an empty set() to include all languages.
# Example: {"en", "de", "fr", "no"}
LANG_FILTER = set()
|
||||
|
||||
# Filter by Part of Speech.
|
||||
# Leave empty set() to include ALL parts of speech.
|
||||
# Example: {"noun", "verb", "adj"}
|
||||
POS_FILTER = {"verb"}
|
||||
|
||||
# Filter to only include entries in their own language (lang_code matches file prefix)
|
||||
OWN_LANG_FILTER = True
|
||||
|
||||
# ==============================================================================
|
||||
# --- END OF CONFIGURATION ---
|
||||
# ==============================================================================
|
||||
|
||||
# Setup simple logging to console
|
||||
logging.basicConfig(level=logging.INFO, format='%(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def collect_samples():
|
||||
# 1. Setup Paths and Directories
|
||||
input_dir = pathlib.Path(RAW_DATA_DIR)
|
||||
output_dir = pathlib.Path(SAMPLES_DIR)
|
||||
output_file = output_dir / OUTPUT_FILENAME
|
||||
|
||||
if not input_dir.exists():
|
||||
logger.error(f"ERROR: Raw data directory not found at: {input_dir}")
|
||||
logger.error("Please ensure your configuration points to the correct folder.")
|
||||
sys.exit(1)
|
||||
|
||||
# Create samples directory if it doesn't exist
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 2. Find all matching input files
|
||||
source_files = list(input_dir.glob(FILE_PATTERN))
|
||||
if not source_files:
|
||||
logger.warning(f"No files matching '{FILE_PATTERN}' found in {input_dir}")
|
||||
sys.exit(0)
|
||||
|
||||
logger.info(f"Found {len(source_files)} source files to sample from.")
|
||||
logger.info(f"Target: {SAMPLES_PER_FILE} samples per file.")
|
||||
logger.info(f"Language Filter: {LANG_FILTER if LANG_FILTER else 'ALL'}")
|
||||
logger.info(f"POS Filter: {POS_FILTER if POS_FILTER else 'ALL'}")
|
||||
logger.info(f"Own Language Filter: {'ENABLED' if OWN_LANG_FILTER else 'DISABLED'}")
|
||||
logger.info("-" * 50)
|
||||
|
||||
total_collected = 0
|
||||
|
||||
# Open the output file once and append samples from all inputs to it
|
||||
try:
|
||||
with open(output_file, 'w', encoding='utf-8') as out_f:
|
||||
|
||||
for src_file in source_files:
|
||||
logger.info(f"Scanning: {src_file.name}...")
|
||||
lang_from_file = src_file.name[:2]
|
||||
file_collected = 0
|
||||
lines_read = 0
|
||||
|
||||
try:
|
||||
with open(src_file, 'r', encoding='utf-8') as in_f:
|
||||
for line in in_f:
|
||||
lines_read += 1
|
||||
|
||||
# Stop reading this file if we have enough samples
|
||||
if file_collected >= SAMPLES_PER_FILE:
|
||||
break
|
||||
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
|
||||
# --- Filtering Logic ---
|
||||
# 1. Language Filter
|
||||
if LANG_FILTER and entry.get('lang_code') not in LANG_FILTER:
|
||||
continue
|
||||
|
||||
# 2. POS Filter
|
||||
if POS_FILTER and entry.get('pos') not in POS_FILTER:
|
||||
continue
|
||||
|
||||
# 3. Own Language Filter
|
||||
if OWN_LANG_FILTER and entry.get('lang_code') != lang_from_file:
|
||||
continue
|
||||
|
||||
# --- If it passed filters, save it ---
|
||||
# We write it exactly as it is in the source
|
||||
json.dump(entry, out_f, ensure_ascii=False)
|
||||
out_f.write('\n')
|
||||
file_collected += 1
|
||||
total_collected += 1
|
||||
|
||||
except json.JSONDecodeError:
|
||||
# Ignore bad lines in source files during sampling
|
||||
continue
|
||||
|
||||
logger.info(f" -> Collected {file_collected} samples (scanned {lines_read} lines)")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f" ERROR reading {src_file.name}: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.critical(f"FATAL ERROR writing output file: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
logger.info("-" * 50)
|
||||
logger.info("SAMPLING COMPLETE")
|
||||
logger.info(f"Total entries collected: {total_collected}")
|
||||
logger.info(f"Output saved to: {output_file}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
collect_samples()
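# Note (illustrative, not taken from the source data): each line of the input
# JSONL is expected to hold one wiktextract entry; the filters above only rely
# on its "lang_code" and "pos" keys, e.g.:
#   {"word": "grenouille", "lang_code": "fr", "pos": "noun", "senses": [...]}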
|
||||
142
scripts/count_pos_values.py
Normal file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to count all different "pos" values in JSONL files using parallel processing.
|
||||
Analyzes all JSONL files in the raw_data directory and displays frequency counts.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
from collections import Counter
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from multiprocessing import cpu_count
|
||||
import time
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
|
||||
def process_jsonl_file(file_path: str) -> Tuple[str, Counter]:
|
||||
"""
|
||||
Process a single JSONL file and count POS values.
|
||||
|
||||
Args:
|
||||
file_path: Path to the JSONL file
|
||||
|
||||
Returns:
|
||||
Tuple of (filename, Counter of POS values)
|
||||
"""
|
||||
pos_counter = Counter()
|
||||
line_count = 0
|
||||
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
for line_num, line in enumerate(f, 1):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if 'pos' in data and data['pos']:
|
||||
pos_counter[data['pos']] += 1
|
||||
line_count += 1
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"Warning: JSON decode error in {file_path} at line {line_num}: {e}")
|
||||
continue
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing file {file_path}: {e}")
|
||||
return file_path, Counter()
|
||||
|
||||
print(f"Processed {file_path}: {line_count} lines, {sum(pos_counter.values())} POS entries")
|
||||
return file_path, pos_counter
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to process all JSONL files and display POS statistics."""
|
||||
# Find all JSONL files in raw_data directory
|
||||
raw_data_dir = "raw_data"
|
||||
jsonl_files = glob.glob(os.path.join(raw_data_dir, "*.jsonl"))
|
||||
|
||||
if not jsonl_files:
|
||||
print(f"No JSONL files found in {raw_data_dir}")
|
||||
return
|
||||
|
||||
print(f"Found {len(jsonl_files)} JSONL files to process")
|
||||
print(f"Using {cpu_count()} CPU cores for parallel processing")
|
||||
print("-" * 60)
|
||||
|
||||
# Process files in parallel
|
||||
start_time = time.time()
|
||||
all_pos_counts = Counter()
|
||||
file_results = {}
|
||||
|
||||
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
|
||||
# Submit all files for processing
|
||||
future_to_file = {
|
||||
executor.submit(process_jsonl_file, file_path): file_path
|
||||
for file_path in jsonl_files
|
||||
}
|
||||
|
||||
# Collect results as they complete
|
||||
for future in as_completed(future_to_file):
|
||||
file_path = future_to_file[future]
|
||||
try:
|
||||
filename, pos_counter = future.result()
|
||||
file_results[filename] = pos_counter
|
||||
all_pos_counts.update(pos_counter)
|
||||
except Exception as e:
|
||||
print(f"Error processing {file_path}: {e}")
|
||||
|
||||
end_time = time.time()
|
||||
processing_time = end_time - start_time
|
||||
|
||||
# Display results
|
||||
print("\n" + "=" * 80)
|
||||
print("POS VALUE COUNTS ACROSS ALL FILES")
|
||||
print("=" * 80)
|
||||
print(f"Total processing time: {processing_time:.2f} seconds")
|
||||
print(f"Total POS entries found: {sum(all_pos_counts.values()):,}")
|
||||
print(f"Unique POS values: {len(all_pos_counts)}")
|
||||
print("\nTop 50 most common POS values:")
|
||||
print("-" * 80)
|
||||
|
||||
# Sort by frequency (descending)
|
||||
sorted_pos = sorted(all_pos_counts.items(), key=lambda x: x[1], reverse=True)
|
||||
|
||||
for pos, count in sorted_pos[:100]:
|
||||
percentage = (count / sum(all_pos_counts.values())) * 100
|
||||
print(f"{pos:<20} {count:>10,} ({percentage:5.2f}%)")
|
||||
|
||||
if len(sorted_pos) > 100:
|
||||
print(f"\n... and {len(sorted_pos) - 100} more POS values")
|
||||
|
||||
# Show all unique POS values (alphabetical)
|
||||
print("\n" + "=" * 80)
|
||||
print("ALL UNIQUE POS VALUES (ALPHABETICAL)")
|
||||
print("=" * 80)
|
||||
|
||||
for pos, count in sorted(all_pos_counts.items(), key=lambda x: x[0].lower()):
|
||||
print(f"{pos:<30} {count:>10,}")
|
||||
|
||||
# Per-file breakdown
|
||||
print("\n" + "=" * 80)
|
||||
print("PER-FILE BREAKDOWN")
|
||||
print("=" * 80)
|
||||
|
||||
for filename, pos_counter in sorted(file_results.items()):
|
||||
total_entries = sum(pos_counter.values())
|
||||
if total_entries > 0:
|
||||
print(f"\n{os.path.basename(filename)}:")
|
||||
print(f" Total entries: {total_entries:,}")
|
||||
print(f" Unique POS values: {len(pos_counter)}")
|
||||
|
||||
# All POS values for this file (sorted by frequency)
|
||||
all_pos = sorted(pos_counter.items(), key=lambda x: x[1], reverse=True)
|
||||
for pos, count in all_pos:
|
||||
print(f" {pos:<15} {count:>8,}")
|
||||
|
||||
print(f"\nProcessing completed in {processing_time:.2f} seconds")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
401
scripts/lang_config.py
Normal file
@@ -0,0 +1,401 @@
|
||||
GERMAN_VERB_CONFIG = {
|
||||
"clean_prefixes": ["ich", "du", "er/sie/es", "wir", "ihr", "sie"],
|
||||
"normalization_rules": [
|
||||
{"field": "pronouns", "match": "ich", "add_tags": ["first-person", "singular", "indicative", "active"]},
|
||||
{"field": "pronouns", "match": "du", "add_tags": ["second-person", "singular", "indicative", "active"]},
|
||||
{"field": "pronouns", "match": "er", "add_tags": ["third-person", "singular", "indicative", "active"]},
|
||||
{"field": "pronouns", "match": "sie", "add_tags": ["third-person", "singular", "indicative", "active"]},
|
||||
{"field": "pronouns", "match": "es", "add_tags": ["third-person", "singular", "indicative", "active"]},
|
||||
{"field": "pronouns", "match": "wir", "add_tags": ["first-person", "plural", "indicative", "active"]},
|
||||
{"field": "pronouns", "match": "ihr", "add_tags": ["second-person", "plural", "indicative", "active"]}
|
||||
],
|
||||
"properties": [
|
||||
{
|
||||
"name": "auxiliary",
|
||||
"multivalue": True, # <--- CRITICAL CHANGE HERE
|
||||
"default": ["haben"],
|
||||
"rules": [
|
||||
# Check for explicit raw tags
|
||||
{"value": "sein", "criteria": {"raw_tags": ["Hilfsverb sein"]}},
|
||||
{"value": "haben", "criteria": {"raw_tags": ["Hilfsverb haben"]}},
|
||||
# Check for 'common forms' that imply the aux
|
||||
{"value": "sein", "criteria": {"form_regex": "^sein$", "tags": ["auxiliary", "perfect"]}},
|
||||
{"value": "haben", "criteria": {"form_regex": "^haben$", "tags": ["auxiliary", "perfect"]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "separability",
|
||||
"default": "inseparable",
|
||||
"rules": [
|
||||
{"value": "separable", "criteria": {"tags": ["separable"]}},
|
||||
{"value": "inseparable", "criteria": {"tags": ["inseparable"]}},
|
||||
{"value": "separable", "criteria": {"tags": ["participle-2"], "form_regex": "^(?!ge).+ge.+$"}}
|
||||
]
|
||||
}
|
||||
],
|
||||
"schema": {
|
||||
"infinitive": {
|
||||
"type": "single",
|
||||
"criteria": {"tags": ["infinitive", "present"], "exclude_tags": ["extended", "passive", "reflexive", "zu"]}
|
||||
},
|
||||
"participle_perfect": {
|
||||
"type": "single",
|
||||
"criteria": {"tags": ["participle-2", "perfect"], "exclude_tags": ["active", "passive", "auxiliary"]}
|
||||
},
|
||||
"imperative": {
|
||||
"type": "list",
|
||||
"size": 2,
|
||||
"base_criteria": {"tags": ["imperative", "present", "active"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["singular", "second-person"]},
|
||||
{"index": 1, "tags": ["plural", "second-person"]}
|
||||
]
|
||||
},
|
||||
"present": {
|
||||
"type": "list",
|
||||
"size": 6,
|
||||
"base_criteria": {"tags": ["indicative", "present", "active"], "exclude_tags": ["passive"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"past": {
|
||||
"type": "list",
|
||||
"size": 6,
|
||||
"base_criteria": {"tags": ["indicative", "past", "active"], "exclude_tags": ["passive"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"subjunctive_ii": {
|
||||
"type": "list",
|
||||
"size": 6,
|
||||
"base_criteria": {"tags": ["subjunctive-ii", "past", "active"], "exclude_tags": ["passive"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
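# --- Illustrative sketch (not part of this commit) ---
# The "criteria" blocks above are consumed by InflectionProcessor, which is not
# defined in this file. The helper below only illustrates the matching semantics
# suggested by the keys used here ("tags", "exclude_tags", "form_regex"); the
# real implementation may differ.
import re

def _example_form_matches(form_entry: dict, criteria: dict) -> bool:
    """Return True if a wiktextract form entry satisfies a criteria block (sketch)."""
    tags = set(form_entry.get("tags", []))
    if not set(criteria.get("tags", [])).issubset(tags):
        return False
    if tags & set(criteria.get("exclude_tags", [])):
        return False
    pattern = criteria.get("form_regex")
    if pattern and not re.search(pattern, form_entry.get("form", "")):
        return False
    return True

# e.g. _example_form_matches(
#          {"form": "gegangen", "tags": ["participle-2", "perfect"]},
#          GERMAN_VERB_CONFIG["schema"]["participle_perfect"]["criteria"])  # -> True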
|
||||
|
||||
FRENCH_VERB_CONFIG = {
|
||||
"skip_normalization_if_source": False,
|
||||
|
||||
# Disabled so that idioms, rare words, and defective verbs with incomplete tables do not fail validation
|
||||
"validate_completeness": False,
|
||||
|
||||
"clean_prefixes": [
|
||||
"qu'", "qu’", "que", "j'", "j’", "je", "tu",
|
||||
"il/elle/on", "il", "elle", "on", "nous", "vous", "ils/elles", "ils", "elles"
|
||||
],
|
||||
|
||||
"normalization_rules": [
|
||||
# Pronoun matches
|
||||
{"field": "form", "match": r"\bje\b", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
|
||||
{"field": "form", "match": r"\bj[’']", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
|
||||
{"field": "form", "match": r"\btu\b", "match_mode": "regex", "add_tags": ["second-person", "singular"]},
|
||||
{"field": "form", "match": r"\b(il|elle|on|il/elle/on)\b", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
|
||||
{"field": "form", "match": r"\[il/ɛl/ɔ̃\]", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
|
||||
{"field": "form", "match": r"\bnous\b", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
|
||||
{"field": "form", "match": r"\bvous\b", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
|
||||
{"field": "form", "match": r"\b(ils|elles|ils/elles)\b", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
|
||||
{"field": "form", "match": r"\[il/ɛl\]", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
|
||||
|
||||
# Suffix Heuristics
|
||||
{"field": "form", "match": r"ons$", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
|
||||
{"field": "form", "match": r"ez$", "match_mode": "regex", "add_tags": ["second-person", "plural"]}
|
||||
],
|
||||
|
||||
"properties": [
|
||||
{
|
||||
"name": "auxiliary",
|
||||
"multivalue": True,
|
||||
"default": ["avoir"],
|
||||
"rules": [
|
||||
{"value": "être", "criteria": {"raw_tags": ["auxiliary être"]}},
|
||||
{"value": "avoir", "criteria": {"raw_tags": ["auxiliary avoir"]}},
|
||||
{"value": "être", "criteria": {"tags": ["auxiliary-être"]}},
|
||||
{"value": "avoir", "criteria": {"tags": ["auxiliary-avoir"]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "group",
|
||||
"default": "unknown",
|
||||
"rules": [
|
||||
{"value": "1st-group", "criteria": {"raw_tags": ["1ᵉʳ groupe"]}},
|
||||
{"value": "2nd-group", "criteria": {"raw_tags": ["2ᵉ groupe"]}},
|
||||
{"value": "3rd-group", "criteria": {"raw_tags": ["3ᵉ groupe"]}},
|
||||
{"value": "1st-group", "criteria": {"form_regex": "er$"}},
|
||||
{"value": "2nd-group", "criteria": {"form_regex": "ir$"}},
|
||||
{"value": "3rd-group", "criteria": {"form_regex": "(re|oir)$"}}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
"schema": {
|
||||
"infinitive": {
|
||||
"type": "single",
|
||||
"criteria": {"tags": ["infinitive", "present"]}
|
||||
},
|
||||
"participle_present": {
|
||||
"type": "single",
|
||||
"optional": True,
|
||||
"criteria": {"tags": ["participle", "present"]}
|
||||
},
|
||||
"participle_past": {
|
||||
"type": "single",
|
||||
"optional": True,
|
||||
"criteria": {"tags": ["participle", "past"], "exclude_tags": ["multiword-construction"]}
|
||||
},
|
||||
# All lists are now marked optional to handle defective verbs (like 'traire') and sparse data
|
||||
"indicative_present": {
|
||||
"type": "list", "size": 6, "optional": True,
|
||||
"base_criteria": {"tags": ["indicative", "present"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"indicative_imperfect": {
|
||||
"type": "list", "size": 6, "optional": True,
|
||||
"base_criteria": {"tags": ["indicative", "imperfect"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"indicative_future": {
|
||||
"type": "list", "size": 6, "optional": True,
|
||||
"base_criteria": {"tags": ["indicative", "future"], "exclude_tags": ["perfect"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"indicative_simple_past": {
|
||||
"type": "list", "size": 6, "optional": True, # Traire/clore do not have this
|
||||
"base_criteria": {"tags": ["indicative", "past"], "exclude_tags": ["multiword-construction", "imperfect", "perfect", "anterior"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"subjunctive_present": {
|
||||
"type": "list", "size": 6, "optional": True,
|
||||
"base_criteria": {"tags": ["subjunctive", "present"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"conditional_present": {
|
||||
"type": "list", "size": 6, "optional": True,
|
||||
"base_criteria": {"tags": ["conditional", "present"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"imperative": {
|
||||
"type": "list", "size": 3, "optional": True,
|
||||
"base_criteria": {"tags": ["imperative", "present"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["singular"]},
|
||||
{"index": 1, "tags": ["plural", "first-person"]},
|
||||
{"index": 2, "tags": ["plural", "second-person"]},
|
||||
{"index": 1, "criteria": {"form_regex": r"ons$"}},
|
||||
{"index": 2, "criteria": {"form_regex": r"ez$"}},
|
||||
{"index": 0, "criteria": {"form_regex": r"[es]$"}}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OLD_FRENCH_VERB_CONFIG = {
|
||||
"skip_normalization_if_source": False,
|
||||
"validate_completeness": True,
|
||||
|
||||
# --- 1. Normalization ---
|
||||
"clean_prefixes": [
|
||||
"qu'", "qu’", "que", "j'", "j’", "je", "tu",
|
||||
"il/elle/on", "il", "elle", "on", "nous", "vous", "ils/elles", "ils", "elles"
|
||||
],
|
||||
|
||||
"normalization_rules": [
|
||||
{"field": "form", "match": r"\bje\b", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
|
||||
{"field": "form", "match": r"\bj[’']", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
|
||||
{"field": "form", "match": r"\btu\b", "match_mode": "regex", "add_tags": ["second-person", "singular"]},
|
||||
{"field": "form", "match": r"\b(il|elle|on|il/elle/on)\b", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
|
||||
{"field": "form", "match": r"\[il/ɛl/ɔ̃\]", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
|
||||
{"field": "form", "match": r"\bnous\b", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
|
||||
{"field": "form", "match": r"\bvous\b", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
|
||||
{"field": "form", "match": r"\b(ils|elles|ils/elles)\b", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
|
||||
{"field": "form", "match": r"\[il/ɛl\]", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
|
||||
],
|
||||
|
||||
# --- 2. Properties ---
|
||||
"properties": [
|
||||
{
|
||||
"name": "auxiliary",
|
||||
"multivalue": True,
|
||||
"default": ["avoir"],
|
||||
"rules": [
|
||||
{"value": "être", "criteria": {"raw_tags": ["auxiliary être"]}},
|
||||
{"value": "avoir", "criteria": {"raw_tags": ["auxiliary avoir"]}},
|
||||
{"value": "être", "criteria": {"tags": ["auxiliary-être"]}},
|
||||
{"value": "avoir", "criteria": {"tags": ["auxiliary-avoir"]}}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "group",
|
||||
"default": "unknown",
|
||||
"rules": [
|
||||
{"value": "1st-group", "criteria": {"raw_tags": ["1ᵉʳ groupe"]}},
|
||||
{"value": "2nd-group", "criteria": {"raw_tags": ["2ᵉ groupe"]}},
|
||||
{"value": "3rd-group", "criteria": {"raw_tags": ["3ᵉ groupe"]}},
|
||||
{"value": "1st-group", "criteria": {"form_regex": "er$"}},
|
||||
{"value": "2nd-group", "criteria": {"form_regex": "ir$"}},
|
||||
{"value": "3rd-group", "criteria": {"form_regex": "(re|oir)$"}}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
# --- 3. Schema ---
|
||||
"schema": {
|
||||
"infinitive": {
|
||||
"type": "single",
|
||||
"criteria": {"tags": ["infinitive", "present"]}
|
||||
},
|
||||
"participle_present": {
|
||||
"type": "single",
|
||||
"optional": True, # <--- NEW: Allows missing participle
|
||||
"criteria": {"tags": ["participle", "present"]}
|
||||
},
|
||||
"participle_past": {
|
||||
"type": "single",
|
||||
"optional": True, # <--- Often missing in defective verbs
|
||||
"criteria": {"tags": ["participle", "past"], "exclude_tags": ["multiword-construction"]}
|
||||
},
|
||||
"indicative_present": {
|
||||
"type": "list", "size": 6,
|
||||
"base_criteria": {"tags": ["indicative", "present"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"indicative_imperfect": {
|
||||
"type": "list", "size": 6,
|
||||
"base_criteria": {"tags": ["indicative", "imperfect"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"indicative_future": {
|
||||
"type": "list", "size": 6,
|
||||
"base_criteria": {"tags": ["indicative", "future"], "exclude_tags": ["perfect"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"indicative_simple_past": {
|
||||
"type": "list", "size": 6,
|
||||
"base_criteria": {"tags": ["indicative", "past"], "exclude_tags": ["multiword-construction", "imperfect", "perfect", "anterior"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"subjunctive_present": {
|
||||
"type": "list", "size": 6,
|
||||
"base_criteria": {"tags": ["subjunctive", "present"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"conditional_present": {
|
||||
"type": "list", "size": 6,
|
||||
"base_criteria": {"tags": ["conditional", "present"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["first-person", "singular"]},
|
||||
{"index": 1, "tags": ["second-person", "singular"]},
|
||||
{"index": 2, "tags": ["third-person", "singular"]},
|
||||
{"index": 3, "tags": ["first-person", "plural"]},
|
||||
{"index": 4, "tags": ["second-person", "plural"]},
|
||||
{"index": 5, "tags": ["third-person", "plural"]}
|
||||
]
|
||||
},
|
||||
"imperative": {
|
||||
"type": "list", "size": 3,
|
||||
"optional": True, # <--- Often missing for phrases/defective verbs
|
||||
"base_criteria": {"tags": ["imperative", "present"]},
|
||||
"indices": [
|
||||
{"index": 0, "tags": ["singular"]},
|
||||
{"index": 1, "tags": ["plural", "first-person"]},
|
||||
{"index": 2, "tags": ["plural", "second-person"]}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
38
scripts/printline.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import json
import pathlib
from datetime import datetime


INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl"
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME


# --- Configuration ---
START_LINE = 99  # 1-based index (first line is 1)
NUM_LINES = 99   # Number of lines/objects to write


def extract_lines_to_file(file_path, start_line, num_lines):
    # Generate timestamped output filename next to the input file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{timestamp}.json"

    with open(file_path, 'r', encoding='utf-8') as infile:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for i, line in enumerate(infile, start=1):
                if i >= start_line and i < start_line + num_lines:
                    try:
                        element = json.loads(line)
                        outfile.write(json.dumps(element, indent=2, ensure_ascii=False))
                        outfile.write('\n')
                    except json.JSONDecodeError:
                        outfile.write(f"Error: Line {i} is not valid JSON.\n")

    print(f"Output written to: {output_file}")


if __name__ == "__main__":
    extract_lines_to_file(INPUT_FILE, START_LINE, NUM_LINES)
110
scripts/search_word.py
Normal file
@@ -0,0 +1,110 @@
import json
import pathlib
from datetime import datetime


INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl"  # <-- Update this to your file

# --- Dynamic Path Setup ---
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME


# --- Filter Configuration ---
# Set the POS (part of speech) you want to filter for
# Examples: "noun", "verb", "adj", "adv", etc.
# Set to None to skip POS filtering
FILTER_POS = "noun"

# Set the word you want to filter for
# Set to None to skip word filtering
FILTER_WORD = "grenouille"

# Set word prefix to filter for (e.g., "Septem" will match "September")
# Set to None to skip prefix filtering
FILTER_PREFIX = None

# Set word suffix to filter for (e.g., "ber" will match "September")
# Set to None to skip suffix filtering
FILTER_SUFFIX = None

# Maximum number of results to include (set to None for unlimited)
MAX_RESULTS = 5


def matches_filters(entry):
    """Check if an entry matches all active filters."""

    # Filter by POS
    if FILTER_POS is not None:
        if entry.get("pos") != FILTER_POS:
            return False

    # Filter by exact word
    if FILTER_WORD is not None:
        if entry.get("word") != FILTER_WORD:
            return False

    # Filter by prefix
    if FILTER_PREFIX is not None:
        word = entry.get("word", "")
        if not word.startswith(FILTER_PREFIX):
            return False

    # Filter by suffix
    if FILTER_SUFFIX is not None:
        word = entry.get("word", "")
        if not word.endswith(FILTER_SUFFIX):
            return False

    return True


def filter_and_save(file_path):
    """Filter JSONL file and save matching entries."""

    # Generate output filename with original filename and timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"

    match_count = 0
    total_lines = 0

    with open(file_path, 'r', encoding='utf-8') as infile:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                total_lines += 1

                try:
                    entry = json.loads(line)

                    # Check if entry matches filters
                    if matches_filters(entry):
                        outfile.write(json.dumps(entry, ensure_ascii=False))
                        outfile.write('\n')
                        match_count += 1

                        # Stop if we've reached max results
                        if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
                            break

                except json.JSONDecodeError:
                    print(f"Warning: Line {total_lines} is not valid JSON.")

    print(f"Filtered {match_count} entries from {total_lines} total lines")
    print(f"Output written to: {output_file}")

    # Print active filters
    print("\nActive filters:")
    if FILTER_POS:
        print(f"  - POS: {FILTER_POS}")
    if FILTER_WORD:
        print(f"  - Word (exact): {FILTER_WORD}")
    if FILTER_PREFIX:
        print(f"  - Prefix: {FILTER_PREFIX}")
    if FILTER_SUFFIX:
        print(f"  - Suffix: {FILTER_SUFFIX}")


if __name__ == "__main__":
    filter_and_save(INPUT_FILE)
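For reference, a minimal sketch of how matches_filters behaves under the default configuration above (FILTER_POS="noun", FILTER_WORD="grenouille"); the sample entries are made up for illustration.

# Sketch: matches_filters() against hand-written entries (illustrative data only).
from search_word import matches_filters

print(matches_filters({"word": "grenouille", "pos": "noun"}))  # True
print(matches_filters({"word": "grenouille", "pos": "verb"}))  # False: POS filter
print(matches_filters({"word": "chat", "pos": "noun"}))        # False: word filter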
419
scripts/transform_wiktionary.py
Normal file
@@ -0,0 +1,419 @@
#!/usr/bin/env python3
"""
Universal Wiktionary Format Transformer
========================================
Transforms any Wiktionary JSON format to a standardized universal schema.

Usage:
    python transform_wiktionary.py input.jsonl output.jsonl
    python transform_wiktionary.py input.jsonl output.jsonl --validate
"""

import json
import sys
import argparse
from typing import Dict, List, Any, Optional
from pathlib import Path


class WiktionaryTransformer:
    """Transforms Wiktionary entries to universal format."""

    def __init__(self, validate: bool = False):
        self.validate = validate
        self.stats = {
            "total": 0,
            "successful": 0,
            "errors": 0,
            "warnings": []
        }

    def transform_entry(self, raw_entry: Dict[str, Any]) -> Dict[str, Any]:
        """
        Transform a single Wiktionary entry to universal format.

        Args:
            raw_entry: Raw entry from any Wiktionary edition

        Returns:
            Transformed entry in universal format
        """
        # === REQUIRED CORE FIELDS ===
        try:
            universal = {
                "word": raw_entry["word"],
                "lang_code": raw_entry["lang_code"],
                "pos": raw_entry["pos"],
                "senses": raw_entry["senses"]
            }
        except KeyError as e:
            raise ValueError(f"Missing required field: {e}")

        # === PHONETICS ===
        phonetics = self._extract_phonetics(raw_entry)
        if phonetics:
            universal["phonetics"] = phonetics

        # === HYPHENATION ===
        hyphenation = self._extract_hyphenation(raw_entry)
        if hyphenation:
            universal["hyphenation"] = hyphenation

        # === FORMS ===
        if "forms" in raw_entry:
            universal["forms"] = raw_entry["forms"]

        # === GRAMMATICAL FEATURES ===
        grammatical = self._extract_grammatical_features(raw_entry)
        if grammatical:
            universal["grammatical_features"] = grammatical

        # === ETYMOLOGY ===
        etymology = self._extract_etymology(raw_entry)
        if etymology:
            universal["etymology"] = etymology

        # === RELATIONS ===
        relations = self._extract_relations(raw_entry)
        if relations:
            universal["relations"] = relations

        # === TRANSLATIONS ===
        if "translations" in raw_entry:
            universal["translations"] = raw_entry["translations"]

        # === DESCENDANTS ===
        if "descendants" in raw_entry:
            universal["descendants"] = raw_entry["descendants"]

        # === METADATA ===
        metadata = self._extract_metadata(raw_entry)
        universal["metadata"] = metadata

        return universal

    def _extract_phonetics(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract and normalize phonetic information."""
        phonetics = {}

        # Process sounds array
        if "sounds" in entry and entry["sounds"]:
            ipa_variations = []
            audio_list = []
            homophones = []

            for sound in entry["sounds"]:
                # IPA transcription with country information
                if "ipa" in sound:
                    ipa_entry = {"ipa": sound["ipa"]}

                    # Preserve country information from raw_tags
                    if "raw_tags" in sound:
                        ipa_entry["raw_tags"] = sound["raw_tags"]

                    # Clean IPA string by removing special characters at beginning/end
                    cleaned_ipa = self._clean_ipa_string(sound["ipa"])
                    ipa_entry["ipa_cleaned"] = cleaned_ipa

                    ipa_variations.append(ipa_entry)

                # Audio files (keep for now, will be removed in filter step)
                if "audio" in sound:
                    audio_obj = {}
                    # Try multiple URL formats
                    for url_key in ["ogg_url", "mp3_url", "url"]:
                        if url_key in sound:
                            audio_obj["url"] = sound[url_key]
                            break
                    audio_obj["text"] = sound.get("audio", "")
                    if audio_obj:
                        audio_list.append(audio_obj)

                # Homophones
                if "homophone" in sound:
                    homophones.append(sound["homophone"])

            if ipa_variations:
                phonetics["ipa_variations"] = ipa_variations
            if audio_list:
                phonetics["audio"] = audio_list
            if homophones:
                phonetics["homophones"] = homophones

        # Handle extra_sounds (some editions)
        if "extra_sounds" in entry:
            if "pronunciación" in entry["extra_sounds"]:
                phonetics["notes"] = entry["extra_sounds"]["pronunciación"]

        return phonetics if phonetics else None

    def _clean_ipa_string(self, ipa_string: str) -> str:
        """Clean IPA string by removing special characters at beginning/end."""
        if not ipa_string:
            return ipa_string

        # Remove leading/trailing special characters: [, ], \, :
        cleaned = ipa_string.strip("[]\\:")
        return cleaned

    def _extract_hyphenation(self, entry: Dict[str, Any]) -> Optional[List[str]]:
        """Extract and normalize hyphenation."""
        # Format 1: hyphenations array with parts
        if "hyphenations" in entry and entry["hyphenations"]:
            parts = []
            for h in entry["hyphenations"]:
                if isinstance(h, dict) and "parts" in h:
                    parts.extend(h["parts"])
                elif isinstance(h, str):
                    parts.append(h)
            if parts:
                return parts

        # Format 2: hyphenation string with separator
        if "hyphenation" in entry:
            # Split on common separators
            hyph = entry["hyphenation"]
            for sep in ["‐", "-", "·", "•"]:
                if sep in hyph:
                    return hyph.split(sep)
            return [hyph]

        return None

    def _extract_grammatical_features(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract grammatical features and tags."""
        if "tags" not in entry:
            return None

        grammatical = {"tags": entry["tags"]}

        # Extract gender from tags
        gender_map = {
            "masculine": "masculine",
            "feminine": "feminine",
            "neuter": "neuter",
            "common": "common",
            "m": "masculine",
            "f": "feminine",
            "n": "neuter",
            "c": "common"
        }

        for tag in entry["tags"]:
            tag_lower = tag.lower()
            if tag_lower in gender_map:
                grammatical["gender"] = gender_map[tag_lower]
                break

        # Extract number
        number_map = {
            "singular": "singular",
            "plural": "plural",
            "dual": "dual",
            "sg": "singular",
            "pl": "plural"
        }

        for tag in entry["tags"]:
            tag_lower = tag.lower()
            if tag_lower in number_map:
                grammatical["number"] = number_map[tag_lower]
                break

        return grammatical

    def _extract_etymology(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract etymology information."""
        etymology = {}

        if "etymology_text" in entry:
            etymology["text"] = entry["etymology_text"]

        if "etymology_texts" in entry:
            etymology["texts"] = entry["etymology_texts"]

        if "etymology_number" in entry:
            etymology["number"] = entry["etymology_number"]

        return etymology if etymology else None

    def _extract_relations(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract semantic and lexical relations."""
        relations = {}

        # Define all possible relation types
        relation_fields = [
            "synonyms", "antonyms", "hypernyms", "hyponyms",
            "meronyms", "holonyms", "related", "derived",
            "coordinate_terms", "troponyms", "compounds"
        ]

        for field in relation_fields:
            if field in entry and entry[field]:
                relations[field] = entry[field]

        return relations if relations else None

    def _extract_metadata(self, entry: Dict[str, Any]) -> Dict[str, Any]:
        """Extract metadata and source information."""
        metadata = {}

        # Source language
        if "lang" in entry:
            metadata["source_lang"] = entry["lang"]

        # Infer source language code if possible
        if "lang_code" in entry:
            metadata["source_lang_code"] = entry["lang_code"]

        # POS title (localized)
        if "pos_title" in entry:
            metadata["pos_title"] = entry["pos_title"]
        elif "pos_text" in entry:
            metadata["pos_title"] = entry["pos_text"]

        # Categories
        if "categories" in entry:
            metadata["categories"] = entry["categories"]

        # Templates
        templates = []
        if "head_templates" in entry:
            templates.extend(entry["head_templates"])
        if "inflection_templates" in entry:
            templates.extend(entry["inflection_templates"])
        if templates:
            metadata["templates"] = templates

        # Additional metadata
        if "attestations" in entry:
            metadata["attestations"] = entry["attestations"]

        return metadata

    def transform_file(self, input_path: str, output_path: str) -> None:
        """
        Transform an entire JSONL file.

        Args:
            input_path: Path to input JSONL file
            output_path: Path to output JSONL file
        """
        input_file = Path(input_path)
        output_file = Path(output_path)

        if not input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        print(f"Transforming: {input_path} → {output_path}")

        with open(input_file, 'r', encoding='utf-8') as infile, \
             open(output_file, 'w', encoding='utf-8') as outfile:

            for line_num, line in enumerate(infile, 1):
                line = line.strip()
                if not line:
                    continue

                self.stats["total"] += 1

                try:
                    # Parse input
                    raw_entry = json.loads(line)

                    # Transform
                    universal_entry = self.transform_entry(raw_entry)

                    # Validate if requested
                    if self.validate:
                        self._validate_entry(universal_entry)

                    # Write output
                    outfile.write(json.dumps(universal_entry, ensure_ascii=False) + '\n')
                    self.stats["successful"] += 1

                except json.JSONDecodeError as e:
                    self.stats["errors"] += 1
                    warning = f"Line {line_num}: JSON decode error - {e}"
                    self.stats["warnings"].append(warning)
                    print(f"⚠ {warning}", file=sys.stderr)

                except ValueError as e:
                    self.stats["errors"] += 1
                    warning = f"Line {line_num}: {e}"
                    self.stats["warnings"].append(warning)
                    print(f"⚠ {warning}", file=sys.stderr)

                except Exception as e:
                    self.stats["errors"] += 1
                    warning = f"Line {line_num}: Unexpected error - {e}"
                    self.stats["warnings"].append(warning)
                    print(f"⚠ {warning}", file=sys.stderr)

        self._print_summary()

    def _validate_entry(self, entry: Dict[str, Any]) -> None:
        """Validate a transformed entry."""
        required = ["word", "lang_code", "pos", "senses"]
        for field in required:
            if field not in entry:
                raise ValueError(f"Missing required field after transformation: {field}")

    def _print_summary(self) -> None:
        """Print transformation summary."""
        print("\n" + "="*60)
        print("TRANSFORMATION SUMMARY")
        print("="*60)
        print(f"Total entries: {self.stats['total']}")
        print(f"Successful: {self.stats['successful']}")
        print(f"Errors: {self.stats['errors']}")

        if self.stats['successful'] > 0:
            success_rate = (self.stats['successful'] / self.stats['total']) * 100
            print(f"Success rate: {success_rate:.1f}%")

        if self.stats['warnings']:
            print(f"\nWarnings: {len(self.stats['warnings'])}")
            if len(self.stats['warnings']) <= 10:
                for warning in self.stats['warnings']:
                    print(f"  - {warning}")
            else:
                print(f"  (showing first 10 of {len(self.stats['warnings'])})")
                for warning in self.stats['warnings'][:10]:
                    print(f"  - {warning}")

def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Transform Wiktionary JSONL to universal format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s input.jsonl output.jsonl
  %(prog)s data/raw.jsonl data/transformed.jsonl --validate
        """
    )

    parser.add_argument("input", help="Input JSONL file")
    parser.add_argument("output", help="Output JSONL file")
    parser.add_argument("--validate", action="store_true",
                        help="Validate transformed entries")

    args = parser.parse_args()

    try:
        transformer = WiktionaryTransformer(validate=args.validate)
        transformer.transform_file(args.input, args.output)

        # Exit with error code if there were errors
        if transformer.stats["errors"] > 0:
            sys.exit(1)

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
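Finally, a small sketch of using WiktionaryTransformer programmatically on a single in-memory entry; the raw entry below is invented for illustration, and real wiktextract records carry many more fields.

# Sketch: transforming one invented raw entry without going through a file.
from transform_wiktionary import WiktionaryTransformer

raw = {
    "word": "grenouille",
    "lang": "Français",
    "lang_code": "fr",
    "pos": "noun",
    "pos_title": "Nom commun",
    "senses": [{"glosses": ["frog"]}],
    "sounds": [{"ipa": "\\ɡʁə.nuj\\"}],
    "tags": ["feminine"],
}

entry = WiktionaryTransformer().transform_entry(raw)
print(entry["phonetics"]["ipa_variations"][0]["ipa_cleaned"])  # ɡʁə.nuj
print(entry["grammatical_features"]["gender"])                 # feminine
print(entry["metadata"]["source_lang_code"])                   # fr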