Migrate to gitea

jonasgaudian
2026-02-13 00:10:40 +01:00
commit 6d06a9e14e
38 changed files with 31427 additions and 0 deletions


@@ -0,0 +1,329 @@
#!/usr/bin/env python3
"""
Transforms dictionary data from kaikki.org JSONL format to the universal
dictionary schema defined in 'universal_dictionary_schema.json'.
Uses ALL system cores for parallel processing.
"""
import json
import pathlib
import logging
import sys
import argparse
import csv
import multiprocessing
import traceback
from datetime import datetime
from typing import List, Dict, Any, Set, Optional, Tuple
# ==============================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ==============================================================================
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
sys.path.insert(0, str(ROOT_DIR))
# --- IMPORTS ---
try:
from transform_wiktionary import WiktionaryTransformer
from InflectionProcessor import InflectionProcessor
# Import language configurations
try:
from lang_config import GERMAN_VERB_CONFIG
except ImportError:
GERMAN_VERB_CONFIG = {}
try:
from lang_config import FRENCH_VERB_CONFIG
except ImportError:
FRENCH_VERB_CONFIG = {}
except ImportError:
pass
DEFAULT_LANG_FILTER = "fr"
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"
DEFAULT_POS_WHITELIST = set()
DEFAULT_POS_BLACKLIST = {"unknown"}
DEFAULT_IGNORE_FORM_OF = True
DEFAULT_TRANS_LANGS = {"pt", "es", "en", "de", "it", "fr", "nl"}
# ==============================================================================
# --- LOGGING ---
# ==============================================================================
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# ==============================================================================
# --- WORKER FUNCTION ---
# ==============================================================================
def process_chunk_filtering(
chunk_lines: List[str],
lang_filter: Optional[str],
pos_whitelist: Set[str],
pos_blacklist: Set[str],
ignore_form_of: bool,
translation_languages: Set[str],
inflection_configs: Dict
) -> Tuple[List[str], Dict[str, int], List[str]]:
# Re-instantiate processors inside the worker process
transformer = WiktionaryTransformer()
inflection_processor = InflectionProcessor(inflection_configs)
form_of_tags = {"form-of", "affix", "particle", "suffix", "prefix"}
results = []
errors = []
counters = {"processed": 0, "skipped": 0, "errors": 0}
for line in chunk_lines:
if not line.strip():
continue
try:
data = json.loads(line)
# --- Apply Filters ---
if lang_filter and data.get("lang_code") != lang_filter:
counters["skipped"] += 1; continue
pos = data.get("pos")
if pos_whitelist and pos not in pos_whitelist:
counters["skipped"] += 1; continue
if pos_blacklist and pos in pos_blacklist:
counters["skipped"] += 1; continue
if ignore_form_of:
if set(data.get("tags", [])).intersection(form_of_tags):
counters["skipped"] += 1; continue
# --- Filter Translations ---
if 'translations' in data:
data['translations'] = [
tr for tr in data['translations']
if tr.get('lang_code') in translation_languages
]
# --- 1. Transform Data to Universal Schema ---
new_entry = transformer.transform_entry(data)
# --- CLEANUP PHONETICS (Audio & Duplicates) ---
if 'phonetics' in new_entry:
# Remove Audio
if 'audio' in new_entry['phonetics']:
del new_entry['phonetics']['audio']
# Process IPA variations to remove duplicates while preserving country information
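# Illustrative example of the grouping below (the region tags are assumptions):
#   [{'ipa_cleaned': '/bɔ̃.ʒuʁ/', 'raw_tags': ['France']},
#    {'ipa_cleaned': '/bɔ̃.ʒuʁ/', 'raw_tags': ['Québec']}]
# becomes a single variation {'ipa': '/bɔ̃.ʒuʁ/', 'raw_tags': ['France', 'Québec']}
# and phonetics['ipa'] is reduced to ['/bɔ̃.ʒuʁ/'].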
if 'ipa_variations' in new_entry['phonetics'] and isinstance(new_entry['phonetics']['ipa_variations'], list):
# Group variations by cleaned IPA to collect all regions for each pronunciation
ipa_groups = {}
for variation in new_entry['phonetics']['ipa_variations']:
ipa_cleaned = variation.get('ipa_cleaned', '')
if ipa_cleaned:
if ipa_cleaned not in ipa_groups:
ipa_groups[ipa_cleaned] = {
"ipa": ipa_cleaned,
"raw_tags": []
}
# Collect all raw_tags for this IPA
if 'raw_tags' in variation:
ipa_groups[ipa_cleaned]['raw_tags'].extend(variation['raw_tags'])
# Create compressed variations list
compressed_variations = []
for ipa_cleaned, group_data in ipa_groups.items():
variation = {"ipa": ipa_cleaned}
if group_data['raw_tags']:
# Remove duplicates from raw_tags while preserving order
seen_tags = set()
unique_tags = []
for tag in group_data['raw_tags']:
if tag not in seen_tags:
unique_tags.append(tag)
seen_tags.add(tag)
variation['raw_tags'] = unique_tags
compressed_variations.append(variation)
# Create simplified IPA list and compressed variations
simplified_ipa = list(ipa_groups.keys())
new_entry['phonetics']['ipa'] = simplified_ipa
new_entry['phonetics']['ipa_variations'] = compressed_variations
# --- Filter out unnecessary fields ---
if 'metadata' in new_entry:
del new_entry['metadata']
if 'translations' in new_entry:
for tr in new_entry['translations']:
tr.pop('lang', None)
tr.pop('sense', None)
if 'senses' in new_entry:
for sense in new_entry['senses']:
if 'examples' in sense:
sense['examples'] = [ex['text'] for ex in sense['examples'] if 'text' in ex]
if 'relations' in new_entry and 'derived' in new_entry['relations']:
del new_entry['relations']['derived']
# --- 2. Run Inflection Processor ---
new_entry = inflection_processor.process(new_entry)
# --- Remove lang_code after processing ---
if 'lang_code' in new_entry:
del new_entry['lang_code']
results.append(json.dumps(new_entry, ensure_ascii=False))
counters["processed"] += 1
except ValueError as e:
counters["skipped"] += 1
errors.append(f"Value Error: {str(e)}")
except json.JSONDecodeError:
counters["errors"] += 1
except Exception as e:
counters["errors"] += 1
errors.append(f"Unexpected Error: {str(e)}")
return results, counters, errors
# ==============================================================================
# --- MAIN PROCESS ---
# ==============================================================================
def process_file(input_path: pathlib.Path, output_path: pathlib.Path, lang_filter: Optional[str],
pos_whitelist: Set[str], pos_blacklist: Set[str], ignore_form_of: bool,
translation_languages: Set[str]):
logger.info(f"Starting parallel processing...")
logger.info(f" Input file: {input_path}")
logger.info(f" Output file: {output_path}")
if not input_path.exists():
logger.critical(f"Input file not found: {input_path}")
sys.exit(1)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Prepare Inflection Configs
inflection_configs = {
'de_verb': GERMAN_VERB_CONFIG,
'fr_verb': FRENCH_VERB_CONFIG
}
if lang_filter and f"{lang_filter}_verb" not in inflection_configs:
logger.warning(f"No inflection configuration found for language '{lang_filter}'. Verbs will remain uncompressed.")
logger.info("Reading input file into memory...")
try:
with open(input_path, 'r', encoding='utf-8') as f:
lines = f.readlines()
except Exception as e:
logger.critical(f"Failed to read input file: {e}")
sys.exit(1)
total_lines = len(lines)
logger.info(f"Total lines to process: {total_lines:,}")
num_processes = multiprocessing.cpu_count()
chunk_size = total_lines // num_processes + 1
chunks = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)]
logger.info(f"Split data into {len(chunks)} chunks for {num_processes} cores.")
pool = multiprocessing.Pool(processes=num_processes)
worker_args = [
(chunk, lang_filter, pos_whitelist, pos_blacklist, ignore_form_of, translation_languages, inflection_configs)
for chunk in chunks
]
try:
all_results = pool.starmap(process_chunk_filtering, worker_args)
pool.close()
pool.join()
except KeyboardInterrupt:
logger.warning("Interrupted by user. Terminating pool...")
pool.terminate()
sys.exit(1)
except Exception as e:
logger.critical(f"Error during parallel processing: {e}")
traceback.print_exc()
sys.exit(1)
logger.info("Aggregating results and writing to output...")
final_counters = {"processed": 0, "skipped": 0, "errors": 0}
error_log_path = output_path.parent / "verb_errors.log"
with open(output_path, 'w', encoding='utf-8') as out_f, \
open(error_log_path, 'w', encoding='utf-8') as err_f:
for result_strings, worker_stats, worker_errors in all_results:
for k in final_counters:
final_counters[k] += worker_stats.get(k, 0)
for json_str in result_strings:
out_f.write(json_str + "\n")
for err_msg in worker_errors:
err_f.write(err_msg + "\n")
logger.info(f"DONE. Total Read: {total_lines}")
logger.info(f"Processed: {final_counters['processed']}, Skipped: {final_counters['skipped']}, Errors: {final_counters['errors']}")
def main():
parser = argparse.ArgumentParser(description="Transform kaikki.org JSONL to universal dictionary format (Parallel).")
parser.add_argument("--input", type=pathlib.Path, default=DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME,
help="Path to the raw input JSONL file.")
parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR,
help="Directory to save the transformed JSONL file.")
parser.add_argument("--lang", type=str, default=DEFAULT_LANG_FILTER,
help="Language code to filter for (e.g., 'de').")
parser.add_argument("--trans-langs", type=str, default=",".join(DEFAULT_TRANS_LANGS),
help="Comma-separated list of translation languages to keep.")
args = parser.parse_args()
output_filename = f"{args.lang.capitalize()}_universal.jsonl" if args.lang else "universal.jsonl"
output_file_path = args.output_dir / output_filename
trans_langs_set = set(lang.strip() for lang in args.trans_langs.split(",")) if args.trans_langs else set()
process_file(
args.input,
output_file_path,
args.lang,
DEFAULT_POS_WHITELIST,
DEFAULT_POS_BLACKLIST,
DEFAULT_IGNORE_FORM_OF,
trans_langs_set
)
stats_file = ROOT_DIR / "processing_stats.csv"
if output_file_path.exists():
file_size = output_file_path.stat().st_size
else:
file_size = 0
timestamp = datetime.now().isoformat()
write_header = not stats_file.exists()
try:
with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
if write_header:
writer.writerow(['timestamp', 'output_file', 'size_bytes'])
writer.writerow([timestamp, str(output_file_path), file_size])
except Exception as e:
logger.warning(f"Could not write stats csv: {e}")
if __name__ == "__main__":
multiprocessing.freeze_support()
main()

scripts/02_create_db.py Normal file

@@ -0,0 +1,380 @@
import json
import sqlite3
import pathlib
import traceback
import os
import argparse
import sys
import multiprocessing
import csv
import statistics
from datetime import datetime
try:
import zstandard
except ImportError:
print("ERROR: zstandard library not found. Please install it: pip install zstandard")
sys.exit(1)
# ======================================================================
# --- DEFAULT CONFIGURATION (Overridable via CLI args) ---
# ======================================================================
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
DEFAULT_LANG_CODE = "fr"
DEFAULT_INTERMEDIATE_DIR = ROOT_DIR / "intermediate"
DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"
COMPRESSION_LEVEL = 22
DICTIONARY_SAMPLE_COUNT = 200000
DICTIONARY_MAX_SIZE = 10 * 1024 * 1024 # 10MB
DEFAULT_UNCOMPRESSED_ONLY = False  # Set to True to skip dictionary training and the compressed DB, building only the uncompressed DB
DEFAULT_MINIMAL = False
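# Example invocation (illustrative; these values match the defaults above):
#   python scripts/02_create_db.py --lang fr --input intermediate/Fr_universal.jsonl --output-dir outputs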
# ======================================================================
def get_file_size_mb(filepath):
return os.path.getsize(filepath) / (1024 * 1024)
def count_lines(filepath):
print("Counting total lines for progress tracking...")
with open(filepath, 'r', encoding='utf-8') as f:
return sum(1 for _ in f)
def process_chunk(chunk, compression_dict_bytes):
import zstandard
compression_dict = zstandard.ZstdCompressionDict(compression_dict_bytes)
local_compressor = zstandard.ZstdCompressor(level=22, dict_data=compression_dict)
results = []
for line in chunk:
if not line.strip(): continue
try:
entry = json.loads(line)
word = entry.get("word")
pos = entry.get("pos", "")
if not word: continue
data_to_compress = entry.copy()
data_to_compress.pop("word", None)
data_to_compress.pop("pos", None)
value_bytes = json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8')
compressed_blob = local_compressor.compress(value_bytes)
results.append((word, pos, compressed_blob, len(value_bytes)))
except Exception:
pass
return results
def process_chunk_uncompressed(chunk):
results = []
for line in chunk:
if not line.strip(): continue
try:
entry = json.loads(line)
word = entry.get("word")
pos = entry.get("pos", "")
if not word: continue
data_to_store = entry.copy()
data_to_store.pop("word", None)
data_to_store.pop("pos", None)
value_str = json.dumps(data_to_store, ensure_ascii=False)
value_bytes = value_str.encode('utf-8')
results.append((word, pos, value_str, len(value_bytes)))
except Exception:
pass
return results
def train_config(config, lines):
import zstandard
sample_count, max_size = config
step = max(1, len(lines) // sample_count)
samples = []
for j in range(0, len(lines), step):
line = lines[j]
if not line.strip(): continue
entry = json.loads(line)
data_to_compress = entry.copy()
data_to_compress.pop("word", None)
data_to_compress.pop("pos", None)
samples.append(json.dumps(data_to_compress, ensure_ascii=False).encode('utf-8'))
if len(samples) >= sample_count: break
if not samples:
return None
compression_dict = zstandard.train_dictionary(max_size, samples)
dict_bytes = compression_dict.as_bytes()
return (sample_count, max_size, len(dict_bytes), dict_bytes)
def create_database(lang_code, input_file, output_dir, intermediate_dir, uncompressed_only=False, minimal=False):
database_file = output_dir / f"dictionary_{lang_code}.db"
dictionary_file = output_dir / f"dictionary_{lang_code}.zstdict"
# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Settings:\n - Language: {lang_code}\n - Input: {input_file}\n - DB Output: {database_file}\n - Dict Output: {dictionary_file}")
if not input_file.exists():
print(f"Error: Input file not found at {input_file}")
sys.exit(1)
total_lines = count_lines(input_file)
print(f"Total lines to process: {total_lines:,}")
with open(input_file, "r", encoding="utf-8") as f:
lines = f.readlines()
num_processes = multiprocessing.cpu_count()
chunk_size = len(lines) // num_processes + 1
chunks = [lines[i:i+chunk_size] for i in range(0, len(lines), chunk_size)]
# --- Pass 1: Training Compression Dictionary ---
if not uncompressed_only:
print(f"\n--- Pass 1: Training Compression Dictionary ---")
try:
if minimal:
sample_count = DICTIONARY_SAMPLE_COUNT
max_size = DICTIONARY_MAX_SIZE
config = (sample_count, max_size)
result = train_config(config, lines)
if result is None:
print("Error: No valid dictionary trained.")
sys.exit(1)
sample_count, max_size, dict_size, dict_bytes = result
print(f"Using default configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
else:
# Generate 20 configurations to try (varying both sample_count and max_size)
configs = []
for i in range(20):
sample_count = 100000 + (i % 5) * 200000 # 5 values: 100k, 300k, 500k, 700k, 900k
max_size = (3 + (i // 5) * 2) * 1024 * 1024 # 4 different: 3MB, 5MB, 7MB, 9MB
configs.append((sample_count, max_size))
pool = multiprocessing.Pool(processes=min(20, multiprocessing.cpu_count()))
results = pool.starmap(train_config, [(config, lines) for config in configs])
pool.close()
pool.join()
# Find the best configuration (largest dictionary size)
valid_results = [r for r in results if r is not None]
if not valid_results:
print("Error: No valid dictionaries trained.")
sys.exit(1)
print("All configurations results:")
for sample_count, max_size, dict_size, _ in valid_results:
print(f" samples={sample_count}, max_size={max_size/1024/1024:.1f}MB -> dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
best_result = max(valid_results, key=lambda x: x[2])
sample_count, max_size, dict_size, dict_bytes = best_result
print(f"\nBest configuration: samples={sample_count}, max_size={max_size/1024/1024:.1f}MB, dict_size={dict_size} bytes ({dict_size/1024:.1f} KB)")
compression_dict = zstandard.ZstdCompressionDict(dict_bytes)
with open(dictionary_file, "wb") as f:
f.write(dict_bytes)
print(f"Saved dictionary to {dictionary_file}")
except Exception as e:
print(f"Error during training: {e}")
traceback.print_exc()
sys.exit(1)
if not uncompressed_only:
# --- Database Setup ---
if database_file.exists():
os.remove(database_file)
conn = sqlite3.connect(database_file)
conn.execute("PRAGMA journal_mode=WAL;")
conn.execute("PRAGMA auto_vacuum=full;")
cursor = conn.cursor()
compressor = zstandard.ZstdCompressor(level=COMPRESSION_LEVEL, dict_data=compression_dict)
cursor.execute('''
CREATE TABLE dictionary_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT NOT NULL,
pos TEXT,
data_blob BLOB,
uncompressed_size INTEGER
);
''')
# --- Pass 2: Insert Data ---
print("\n--- Pass 2: Inserting Data ---")
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
print("Processing chunks in parallel for compressed DB...")
all_results = pool.starmap(process_chunk, zip(chunks, [dict_bytes] * len(chunks)))
data_to_insert = [item for sublist in all_results for item in sublist]
print(f"Collected {len(data_to_insert)} items to insert into compressed DB.")
cursor.executemany("INSERT INTO dictionary_data (word, pos, data_blob, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert)
word_counter = len(data_to_insert)
conn.commit()
print(f"Inserted {word_counter:,} words into compressed DB.")
# --- Pass 3: FTS & Cleanup ---
print("Creating FTS4 index...")
cursor.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
cursor.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
conn.commit()
print("Running VACUUM...")
cursor.execute('VACUUM')
conn.commit()
conn.close()
db_size_mb = get_file_size_mb(database_file)
dict_size_mb = get_file_size_mb(dictionary_file)
print(f"\n{'='*60}")
print(f"SUCCESS: Database created.")
print(f"{'='*60}")
print(f"Final Database Size: {db_size_mb:.2f} MB ({database_file.name})")
print(f"Final Dictionary Size: {dict_size_mb:.2f} MB ({dictionary_file.name})")
print(f"{'='*60}")
# --- Create Uncompressed Database ---
print(f"\n--- Creating Uncompressed Database ---")
uncompressed_db_file = intermediate_dir / f"dictionary_{lang_code}_uncompressed.db"
# Ensure intermediate directory exists
intermediate_dir.mkdir(parents=True, exist_ok=True)
if uncompressed_db_file.exists():
os.remove(uncompressed_db_file)
conn2 = sqlite3.connect(uncompressed_db_file)
conn2.execute("PRAGMA journal_mode=WAL;")
conn2.execute("PRAGMA auto_vacuum=full;")
cursor2 = conn2.cursor()
cursor2.execute('''
CREATE TABLE dictionary_data (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word TEXT NOT NULL,
pos TEXT,
data TEXT,
uncompressed_size INTEGER
);
''')
# --- Pass 2b: Insert Uncompressed Data ---
print("\n--- Pass 2b: Inserting Uncompressed Data ---")
print("Processing chunks in parallel for uncompressed DB...")
if uncompressed_only:
pool_uncomp = multiprocessing.Pool(processes=multiprocessing.cpu_count())
all_results2 = pool_uncomp.map(process_chunk_uncompressed, chunks)
pool_uncomp.close()
pool_uncomp.join()
else:
all_results2 = pool.map(process_chunk_uncompressed, chunks)
pool.close()
pool.join()
data_to_insert2 = [item for sublist in all_results2 for item in sublist]
print(f"Collected {len(data_to_insert2)} items to insert into uncompressed DB.")
cursor2.executemany("INSERT INTO dictionary_data (word, pos, data, uncompressed_size) VALUES (?, ?, ?, ?)", data_to_insert2)
word_counter2 = len(data_to_insert2)
conn2.commit()
print(f"Inserted {word_counter2:,} words into uncompressed DB.")
# --- Pass 3b: FTS & Cleanup ---
print("Creating FTS4 index for uncompressed DB...")
cursor2.execute("CREATE VIRTUAL TABLE dictionary_fts USING fts4(word, pos, content='dictionary_data');")
cursor2.execute("INSERT INTO dictionary_fts(docid, word, pos) SELECT id, word, pos FROM dictionary_data;")
conn2.commit()
print("Running VACUUM on uncompressed DB...")
cursor2.execute('VACUUM')
conn2.commit()
# Compute and print uncompressed_size statistics
sizes = [row[0] for row in cursor2.execute("SELECT uncompressed_size FROM dictionary_data")]
if sizes:
min_size = min(sizes)
max_size = max(sizes)
avg_size = statistics.mean(sizes)
median_size = statistics.median(sizes)
try:
stdev_size = statistics.stdev(sizes)
except statistics.StatisticsError:
stdev_size = 0.0
print(f"\nUncompressed Size Statistics:")
print(f" Count: {len(sizes):,}")
print(f" Min: {min_size}")
print(f" Max: {max_size}")
print(f" Avg: {avg_size:.2f}")
print(f" Median: {median_size}")
print(f" Std Dev: {stdev_size:.2f}")
# Outliers: top 10 largest entries
outliers = cursor2.execute("SELECT word, uncompressed_size FROM dictionary_data ORDER BY uncompressed_size DESC LIMIT 10").fetchall()
print(f"\nTop 10 largest entries by uncompressed size:")
for word, size in outliers:
print(f" {word}: {size:,} bytes")
conn2.close()
uncompressed_db_size_mb = get_file_size_mb(uncompressed_db_file)
print(f"\n{'='*60}")
print(f"Uncompressed Database Size: {uncompressed_db_size_mb:.2f} MB ({uncompressed_db_file.name})")
print(f"{'='*60}")
def main():
parser = argparse.ArgumentParser(description="Compress dictionary JSONL into SQLite DB.")
parser.add_argument("--lang", type=str, default=DEFAULT_LANG_CODE,
help="Language code (e.g., 'de'). Used for naming output files.")
parser.add_argument("--input", type=pathlib.Path,
help="Full path to input JSONL. If omitted, tries to find it in standard intermediate folder based on lang.")
parser.add_argument("--output-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR,
help="Directory to save .db and .zstdict files.")
parser.add_argument("--intermediate-dir", type=pathlib.Path, default=DEFAULT_INTERMEDIATE_DIR,
help="Directory to save uncompressed .db file.")
args = parser.parse_args()
# Determine input file if not explicitly provided
if args.input:
input_file = args.input
else:
# Try to guess the filename based on the language code matching script 1's output
filename = f"{args.lang.capitalize()}_universal.jsonl"
input_file = DEFAULT_INTERMEDIATE_DIR / filename
create_database(args.lang, input_file, args.output_dir, args.intermediate_dir, DEFAULT_UNCOMPRESSED_ONLY, DEFAULT_MINIMAL)
# Log stats to CSV
stats_file = ROOT_DIR / "processing_stats.csv"
timestamp = datetime.now().isoformat()
files_to_log = [
(args.output_dir / f"dictionary_{args.lang}.db", "compressed_db"),
(args.output_dir / f"dictionary_{args.lang}.zstdict", "compression_dict"),
(args.intermediate_dir / f"dictionary_{args.lang}_uncompressed.db", "uncompressed_db")
]
write_header = not stats_file.exists()
with open(stats_file, 'a', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
if write_header:
writer.writerow(['timestamp', 'output_file', 'size_bytes', 'type'])
for file_path, file_type in files_to_log:
if file_path.exists():
size = file_path.stat().st_size
writer.writerow([timestamp, str(file_path), size, file_type])
if __name__ == "__main__":
main()


@@ -0,0 +1,108 @@
import json
import os
import hashlib
import sys
import pathlib
import re
import argparse
from typing import Dict, Any, Set
# ======================================================================
# --- DEFAULT CONFIGURATION ---
# ======================================================================
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
DEFAULT_OUTPUTS_DIR = ROOT_DIR / "outputs"
# ======================================================================
def calculate_sha256(filepath: pathlib.Path, block_size=65536) -> str | None:
sha256 = hashlib.sha256()
try:
with open(filepath, 'rb') as f:
for block in iter(lambda: f.read(block_size), b''):
sha256.update(block)
except IOError as e:
print(f" ERROR: Could not read file '{filepath.name}': {e}")
return None
return sha256.hexdigest().upper()
def guess_properties_from_base(base_name: str) -> Dict[str, str]:
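# Illustrative mapping: "dictionary_fr" -> {"id": "fr_dict", "name": "Dictionary (FR)", "lang_code": "fr"}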
match = re.match(r"dictionary_([a-zA-Z]{2,3})", base_name)
if match:
lang_code = match.group(1)
return {"id": f"{lang_code}_dict", "name": f"Dictionary ({lang_code.upper()})", "lang_code": lang_code}
return {"id": base_name, "name": f"Dictionary ({base_name})", "lang_code": "xx"}
def create_new_dict_entry(base_name: str, asset_files: list[pathlib.Path]) -> Dict[str, Any]:
props = guess_properties_from_base(base_name)
new_entry = {
"id": props["id"], "name": props["name"], "description": "Auto-generated", "version": "1.0.0", "assets": []
}
for file_path in asset_files:
print(f" -> Adding new asset: '{file_path.name}'")
csum = calculate_sha256(file_path)
if csum:
new_entry["assets"].append({
"filename": file_path.name, "size_bytes": os.path.getsize(file_path), "checksum_sha256": csum
})
return new_entry
def update_manifest(outputs_dir: pathlib.Path):
manifest_path = outputs_dir / 'manifest.json'
if not outputs_dir.exists():
print(f"Error: Outputs directory does not exist: {outputs_dir}")
sys.exit(1)
manifest_data = {"files": []}
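# Expected manifest shape (illustrative sketch based on create_new_dict_entry; the values are examples):
# {"files": [{"id": "fr_dict", "name": "Dictionary (FR)", "description": "Auto-generated", "version": "1.0.0",
#             "assets": [{"filename": "dictionary_fr.db", "size_bytes": 12345678, "checksum_sha256": "ABC..."}]}]}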
if manifest_path.exists():
try:
with open(manifest_path, 'r', encoding='utf-8') as f:
manifest_data = json.load(f)
if 'files' not in manifest_data: manifest_data['files'] = []
except Exception as e:
print(f"Error reading manifest: {e}"); sys.exit(1)
print(f"Scanning {outputs_dir} for assets...")
assets_map = {asset['filename']: asset for entry in manifest_data.get('files', []) for asset in entry.get('assets', [])}
discovered = list(outputs_dir.glob('*.db')) + list(outputs_dir.glob('*.zstdict'))
new_files, updated_count = [], 0
for fpath in discovered:
fname = fpath.name
if fname in assets_map:
print(f"Updating: {fname}")
assets_map[fname]['size_bytes'] = os.path.getsize(fpath)
assets_map[fname]['checksum_sha256'] = calculate_sha256(fpath)
updated_count += 1
else:
new_files.append(fpath)
added_count = 0
if new_files:
grouped = {}
for f in new_files:
grouped.setdefault(f.stem, []).append(f)
for base, files in grouped.items():
print(f"Creating new entry for: {base}")
manifest_data['files'].append(create_new_dict_entry(base, files))
added_count += 1
with open(manifest_path, 'w', encoding='utf-8') as f:
json.dump(manifest_data, f, indent=2, ensure_ascii=False)
print(f"\nComplete. Updated {updated_count} assets, added {added_count} new entries.")
def main():
parser = argparse.ArgumentParser(description="Update manifest.json with .db and .zstdict files.")
parser.add_argument("--outputs-dir", type=pathlib.Path, default=DEFAULT_OUTPUTS_DIR,
help="Directory containing assets and manifest.json.")
args = parser.parse_args()
update_manifest(args.outputs_dir)
if __name__ == "__main__":
main()


@@ -0,0 +1,225 @@
import re
class UniversalInflectionCompressor:
"""
A generic inflection compressor that uses a configuration dictionary
to process, partition, and compress verb forms for any language.
"""
def __init__(self, config: dict):
self.config = config
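# Sketch of the expected config shape (illustrative; the real keys come from lang_config, e.g. FRENCH_VERB_CONFIG):
# {
#     'clean_prefixes': ["se", "s'"],
#     'normalization_rules': [{'field': 'source', 'match': 'Conjugaison', 'match_mode': 'regex',
#                              'add_tags': ['conjugated']}],
#     'properties': [{'name': 'aux', 'default': 'avoir',
#                     'rules': [{'criteria': {'tags': ['auxiliary']}, 'value': 'être'}]}],
#     'schema': {'infinitive': {'type': 'single', 'criteria': {'tags': ['infinitive']}},
#                'present': {'type': 'list', 'size': 6, 'base_criteria': {'tags': ['present']},
#                            'indices': [{'tags': ['first-person', 'singular'], 'index': 0}]}},
#     'validate_completeness': False,
# }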
def _matches_criteria(self, form: dict, criteria: dict) -> bool:
"""Helper: Checks if a form matches specific criteria."""
# Regex Match
if 'form_regex' in criteria:
form_str = form.get('form', '')
if form_str is None: form_str = ''
if not re.search(criteria['form_regex'], form_str):
return False
# Tags Inclusion
if 'tags' in criteria:
form_tags = set(form.get('tags', []))
required = set(criteria['tags'])
if not required.issubset(form_tags):
return False
# Raw Tags Inclusion
if 'raw_tags' in criteria:
form_raw = set(form.get('raw_tags', []))
required_raw = set(criteria['raw_tags'])
if not required_raw.issubset(form_raw):
return False
# Tag Exclusion
if 'exclude_tags' in criteria:
form_tags = set(form.get('tags', []))
if not form_tags.isdisjoint(set(criteria['exclude_tags'])):
return False
return True
def _normalize_forms(self, forms: list) -> list:
"""Enriches forms with tags based on 'normalization_rules'."""
rules = self.config.get('normalization_rules', [])
skip_if_source = self.config.get('skip_normalization_if_source', True)
for form in forms:
if form.get('source') and skip_if_source:
continue
for rule in rules:
field = rule.get('field')
value_to_match = rule.get('match')
match_mode = rule.get('match_mode', 'exact')
add_tags = rule.get('add_tags', [])
form_value = form.get(field)
if form_value is None: continue
is_match = False
if match_mode == 'regex':
if isinstance(form_value, list):
for item in form_value:
if re.search(value_to_match, str(item)):
is_match = True; break
else:
if re.search(value_to_match, str(form_value)):
is_match = True
else:
if isinstance(form_value, list):
is_match = value_to_match in form_value
else:
is_match = value_to_match == form_value
if is_match:
current_tags = set(form.get('tags', []))
current_tags.update(add_tags)
form['tags'] = list(current_tags)
return forms
def _extract_properties(self, forms: list, entry_context: dict = None) -> dict:
"""Determines global properties (e.g. aux, group)."""
properties = {}
candidates = forms.copy()
if entry_context:
candidates.append(entry_context)
for prop_def in self.config.get('properties', []):
name = prop_def['name']
default_val = prop_def.get('default')
is_multivalue = prop_def.get('multivalue', False)
found_values = set()
for rule in prop_def.get('rules', []):
for candidate in candidates:
if self._matches_criteria(candidate, rule.get('criteria', {})):
found_values.add(rule['value'])
if not is_multivalue:
break
if found_values and not is_multivalue:
break
if not found_values:
if is_multivalue and default_val is not None:
properties[name] = default_val if isinstance(default_val, list) else [default_val]
else:
properties[name] = default_val
elif is_multivalue:
properties[name] = sorted(list(found_values))
else:
properties[name] = list(found_values)[0]
return properties
def _clean_verb_string(self, form_string: str) -> str:
ignored = self.config.get('clean_prefixes', [])
current_string = form_string.strip()
changed = True
while changed:
changed = False
for prefix in ignored:
# Prefixes ending in an apostrophe (ASCII ' or typographic ’) attach without a trailing space
if prefix.endswith("'") or prefix.endswith("’"):
if current_string.startswith(prefix):
current_string = current_string[len(prefix):]
changed = True
break
else:
if current_string.startswith(prefix + " "):
current_string = current_string[len(prefix)+1:]
changed = True
break
return current_string
def compress(self, forms_list: list, word: str = None, entry: dict = None) -> dict:
if not forms_list:
return None
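# Illustrative result shape for a French verb, assuming a typical FRENCH_VERB_CONFIG
# (actual property and slot names depend entirely on the config):
#   {'aux': 'avoir', 'infinitive': 'parler',
#    'present': ['parle', 'parles', 'parle', 'parlons', 'parlez', 'parlent'], ...}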
# 1. Normalize tags
normalized_forms = self._normalize_forms(forms_list)
# 2. Extract Properties
entry_context = None
if entry:
entry_context = {
'form': entry.get('word', ''),
'tags': entry.get('tags', []),
'raw_tags': entry.get('raw_tags', [])
}
table_properties = self._extract_properties(normalized_forms, entry_context)
# 3. Initialize Output
result = table_properties.copy()
# 4. Fill Slots
schema = self.config.get('schema', {})
for slot_name, slot_def in schema.items():
slot_type = slot_def.get('type', 'single')
if slot_type == 'single':
result[slot_name] = None
for form in normalized_forms:
if self._matches_criteria(form, slot_def.get('criteria', {})):
if result[slot_name] is None or (form.get('source') and not result[slot_name]):
result[slot_name] = self._clean_verb_string(form['form'])
elif slot_type == 'list':
size = slot_def.get('size', 6)
result[slot_name] = [None] * size
base_criteria = slot_def.get('base_criteria', {})
candidates = [f for f in normalized_forms if self._matches_criteria(f, base_criteria)]
for form in candidates:
idx = -1
# Iterate through index rules to find where this form belongs
for index_rule in slot_def.get('indices', []):
# Support full criteria in indices (e.g. form_regex), fallback to 'tags' shortcut
rule_criteria = index_rule.get('criteria', {})
if 'tags' in index_rule:
rule_criteria = rule_criteria.copy()
rule_criteria['tags'] = index_rule['tags']
if self._matches_criteria(form, rule_criteria):
idx = index_rule['index']
break
if idx >= 0 and idx < size:
current_val = result[slot_name][idx]
if current_val is None:
result[slot_name][idx] = self._clean_verb_string(form['form'])
elif form.get('source') and ("Flexion" in form.get('source') or "Conjugaison" in form.get('source')):
result[slot_name][idx] = self._clean_verb_string(form['form'])
# 5. Fallbacks
if not result.get('infinitive') and word:
result['infinitive'] = word
# 6. Validation
if self.config.get('validate_completeness', False):
for key, val in result.items():
slot_config = schema.get(key, {})
if slot_config.get('optional', False):
continue
if val is None:
raise ValueError(f"Inflection Error: Missing required slot '{key}' for word '{word}'.")
if isinstance(val, list):
for i, v in enumerate(val):
if v is None:
raise ValueError(f"Inflection Error: Missing form at index {i} in slot '{key}' for word '{word}'.")
return result
class InflectionProcessor:
def __init__(self, configs):
self.compressors = {k: UniversalInflectionCompressor(v) for k, v in configs.items()}
def process(self, entry: dict) -> dict:
key = f"{entry.get('lang_code')}_{entry.get('pos')}"
if key in self.compressors:
try:
compressed = self.compressors[key].compress(entry.get('forms'), entry.get('word'), entry=entry)
if compressed:
entry['forms'] = compressed
except Exception as e:
print(f"Error processing {entry.get('word')}: {e}")
return entry


@@ -0,0 +1,358 @@
#!/usr/bin/env python3
"""
Hybrid JSONL Schema Analyzer
Chooses between sequential and parallel processing based on file size:
small files are handled sequentially, large files in parallel.
"""
import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
import threading
from functools import partial
import gc
# Import the optimized analyzer for parallel processing
sys.path.insert(0, str(Path(__file__).parent))
try:
from jsonl_schema_analyzer_optimized import OptimizedJSONLSchemaAnalyzer
except ImportError:
print("Warning: Could not import optimized analyzer, using fallback")
OptimizedJSONLSchemaAnalyzer = None
class HybridJSONLSchemaAnalyzer:
"""Hybrid analyzer that intelligently chooses processing strategy."""
def __init__(self, max_samples: int = 1000, max_workers: int = None,
parallel_threshold_mb: int = 100, chunk_size: int = 1000):
"""
Initialize the hybrid analyzer.
Args:
max_samples: Maximum number of JSON objects to sample per file
max_workers: Maximum number of worker processes (default: cpu_count)
parallel_threshold_mb: File size threshold in MB to use parallel processing
chunk_size: Number of lines to process in each chunk
"""
self.max_samples = max_samples
self.max_workers = max_workers or min(cpu_count(), 8)
self.parallel_threshold_mb = parallel_threshold_mb
self.chunk_size = chunk_size
# Import the original analyzer for small files
sys.path.insert(0, str(Path(__file__).parent))
try:
from jsonl_schema_analyzer import JSONLSchemaAnalyzer
self.sequential_analyzer = JSONLSchemaAnalyzer(max_samples=max_samples)
except ImportError:
print("Warning: Could not import sequential analyzer")
self.sequential_analyzer = None
# Initialize optimized analyzer for large files
if OptimizedJSONLSchemaAnalyzer:
self.parallel_analyzer = OptimizedJSONLSchemaAnalyzer(
max_samples=max_samples,
max_workers=max_workers,
chunk_size=chunk_size
)
else:
self.parallel_analyzer = None
print(f"Hybrid analyzer initialized:")
print(f" Parallel threshold: {parallel_threshold_mb} MB")
print(f" Max workers: {self.max_workers}")
print(f" Chunk size: {self.chunk_size}")
def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
"""
Analyze a JSONL file using the appropriate strategy.
Args:
file_path: Path to the JSONL file
Returns:
Dictionary containing schema analysis results
"""
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
# Get file size in MB
file_size_mb = file_path.stat().st_size / (1024 * 1024)
print(f"Analyzing {file_path.name} ({file_size_mb:.2f} MB)...")
# Choose processing strategy
if file_size_mb >= self.parallel_threshold_mb and self.parallel_analyzer:
print(f" Using parallel processing (file >= {self.parallel_threshold_mb} MB)")
result = self.parallel_analyzer.analyze_jsonl_file(file_path)
result["processing_strategy"] = "parallel"
elif self.sequential_analyzer:
print(f" Using sequential processing (file < {self.parallel_threshold_mb} MB)")
result = self.sequential_analyzer.analyze_jsonl_file(file_path)
result["processing_strategy"] = "sequential"
else:
# Fallback to parallel if sequential not available
print(f" Using parallel processing (sequential analyzer unavailable)")
if self.parallel_analyzer:
result = self.parallel_analyzer.analyze_jsonl_file(file_path)
result["processing_strategy"] = "parallel_fallback"
else:
raise RuntimeError("No analyzer available")
# Add hybrid-specific metadata
result["file_size_mb"] = file_size_mb
result["parallel_threshold_mb"] = self.parallel_threshold_mb
return result
def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
"""
Analyze all JSONL files in a directory using hybrid processing.
Args:
directory_path: Path to directory containing JSONL files
pattern: File pattern to match (default: *.jsonl)
Returns:
Dictionary containing analysis results for all files
"""
directory_path = Path(directory_path)
if not directory_path.exists():
raise FileNotFoundError(f"Directory not found: {directory_path}")
# Find all JSONL files
jsonl_files = list(directory_path.glob(pattern))
if not jsonl_files:
print(f"No JSONL files found in {directory_path} with pattern {pattern}")
return {"files": [], "summary": {}}
print(f"Found {len(jsonl_files)} JSONL files to analyze...")
start_time = time.time()
# Categorize files by size
small_files = []
large_files = []
for file_path in jsonl_files:
size_mb = file_path.stat().st_size / (1024 * 1024)
if size_mb >= self.parallel_threshold_mb:
large_files.append(file_path)
else:
small_files.append(file_path)
print(f" Small files (< {self.parallel_threshold_mb} MB): {len(small_files)}")
print(f" Large files (>= {self.parallel_threshold_mb} MB): {len(large_files)}")
file_results = {}
# Process small files sequentially (they're fast anyway)
if small_files and self.sequential_analyzer:
print(f"Processing {len(small_files)} small files sequentially...")
for file_path in small_files:
try:
result = self.analyze_jsonl_file(file_path)
file_results[file_path.name] = result
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
# Process large files in parallel
if large_files and self.parallel_analyzer:
print(f"Processing {len(large_files)} large files in parallel...")
if len(large_files) == 1:
# Single large file - just process it directly
file_path = large_files[0]
try:
result = self.analyze_jsonl_file(file_path)
file_results[file_path.name] = result
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
else:
# Multiple large files - process in parallel
with ThreadPoolExecutor(max_workers=min(len(large_files), self.max_workers)) as executor:
future_to_file = {
executor.submit(self.analyze_jsonl_file, file_path): file_path
for file_path in large_files
}
for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
result = future.result()
file_results[file_path.name] = result
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
# Create summary
successful_results = [r for r in file_results.values() if "error" not in r]
summary = {
"total_files": len(jsonl_files),
"small_files": len(small_files),
"large_files": len(large_files),
"successfully_analyzed": len(successful_results),
"total_size_bytes": sum(
r.get("file_size_bytes", 0) for r in successful_results
),
"total_lines": sum(
r.get("total_lines", 0) for r in successful_results
),
"total_valid_lines": sum(
r.get("valid_lines", 0) for r in successful_results
),
"total_processing_time": sum(
r.get("processing_time_seconds", 0) for r in successful_results
),
"parallel_threshold_mb": self.parallel_threshold_mb,
"strategies_used": {
"sequential": len([r for r in successful_results if r.get("processing_strategy") == "sequential"]),
"parallel": len([r for r in successful_results if r.get("processing_strategy") in ["parallel", "parallel_fallback"]])
}
}
# Calculate processing speed
if summary["total_processing_time"] > 0:
total_mb = summary["total_size_bytes"] / (1024 * 1024)
summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]
elapsed_time = time.time() - start_time
summary["total_elapsed_time"] = elapsed_time
print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
print(f"Sequential: {summary['strategies_used']['sequential']}, Parallel: {summary['strategies_used']['parallel']}")
print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")
return {
"directory": str(directory_path),
"pattern": pattern,
"files": file_results,
"summary": summary
}
def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
"""
Save analysis results to a JSON file.
Args:
results: Analysis results to save
output_path: Path to save the results
"""
output_path = Path(output_path)
try:
start_time = time.time()
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
save_time = time.time() - start_time
file_size = output_path.stat().st_size
print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")
except Exception as e:
raise RuntimeError(f"Error saving results to {output_path}: {e}")
def main():
"""Main function for command-line usage."""
parser = argparse.ArgumentParser(
description="Hybrid JSONL schema analyzer with intelligent processing strategy"
)
parser.add_argument(
"path",
help="Path to JSONL file or directory containing JSONL files"
)
parser.add_argument(
"-o", "--output",
help="Output file for analysis results (JSON format)"
)
parser.add_argument(
"-p", "--pattern",
default="*.jsonl",
help="File pattern when analyzing directory (default: *.jsonl)"
)
parser.add_argument(
"-s", "--max-samples",
type=int,
default=1000,
help="Maximum number of JSON objects to sample per file (default: 1000)"
)
parser.add_argument(
"-w", "--workers",
type=int,
default=None,
help="Number of worker processes for parallel processing (default: CPU count, max 8)"
)
parser.add_argument(
"-t", "--threshold",
type=int,
default=100,
help="File size threshold in MB for parallel processing (default: 100)"
)
parser.add_argument(
"-c", "--chunk-size",
type=int,
default=1000,
help="Number of lines to process in each chunk (default: 1000)"
)
parser.add_argument(
"--directory",
action="store_true",
help="Treat path as directory instead of single file"
)
args = parser.parse_args()
# Initialize hybrid analyzer
analyzer = HybridJSONLSchemaAnalyzer(
max_samples=args.max_samples,
max_workers=args.workers,
parallel_threshold_mb=args.threshold,
chunk_size=args.chunk_size
)
try:
start_time = time.time()
# Analyze file or directory
if args.directory or Path(args.path).is_dir():
results = analyzer.analyze_directory(args.path, args.pattern)
else:
results = analyzer.analyze_jsonl_file(args.path)
total_time = time.time() - start_time
# Save or print results
if args.output:
analyzer.save_results(results, args.output)
else:
print("\n" + "="*50)
print("ANALYSIS RESULTS")
print("="*50)
print(json.dumps(results, indent=2, ensure_ascii=False))
print(f"\nTotal analysis time: {total_time:.2f}s")
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()


@@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""
Optimized JSONL Schema Analyzer
Analyzes JSONL files to extract and aggregate schema information using multiple cores.
For each JSONL file, it generates a schema showing the JSON structure
and aggregates all possible keys found across all records.
"""
import json
import os
import sys
import time
import mmap
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Union, Tuple
import argparse
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count, Manager
import threading
from functools import partial
import gc
class OptimizedJSONLSchemaAnalyzer:
"""Optimized analyzer that uses multiple cores and system resources efficiently."""
def __init__(self, max_samples: int = 1000, max_workers: int = None, chunk_size: int = 1000):
"""
Initialize the optimized analyzer.
Args:
max_samples: Maximum number of JSON objects to sample per file
max_workers: Maximum number of worker processes (default: cpu_count)
chunk_size: Number of lines to process in each chunk
"""
self.max_samples = max_samples
self.max_workers = max_workers or min(cpu_count(), 8) # Limit to 8 to avoid memory issues
self.chunk_size = chunk_size
self.schema_cache = {}
print(f"Initialized analyzer with {self.max_workers} workers, chunk size: {self.chunk_size}")
def analyze_json_value(self, value: Any, depth: int = 0, max_depth: int = 10) -> Dict[str, Any]:
"""
Analyze a JSON value and return its type and structure.
Args:
value: The JSON value to analyze
depth: Current depth in the structure
max_depth: Maximum depth to analyze
Returns:
Dictionary describing the value's type and structure
"""
if depth > max_depth:
return {"type": "unknown", "note": "max_depth_reached"}
if value is None:
return {"type": "null"}
elif isinstance(value, bool):
return {"type": "boolean"}
elif isinstance(value, int):
return {"type": "integer"}
elif isinstance(value, float):
return {"type": "number"}
elif isinstance(value, str):
return {"type": "string", "sample_length": len(value)}
elif isinstance(value, list):
if not value:
return {"type": "array", "item_types": [], "length_range": [0, 0]}
item_types = set()
item_schemas = []
# Sample first few items to determine array structure
sample_size = min(10, len(value))
for item in value[:sample_size]:
item_schema = self.analyze_json_value(item, depth + 1, max_depth)
item_schemas.append(item_schema)
item_types.add(item_schema["type"])
return {
"type": "array",
"item_types": sorted(list(item_types)),
"length_range": [len(value), len(value)],
"sample_items": item_schemas[:3] # Keep first 3 as examples
}
elif isinstance(value, dict):
if not value:
return {"type": "object", "properties": {}, "required_keys": []}
properties = {}
for key, val in value.items():
properties[key] = self.analyze_json_value(val, depth + 1, max_depth)
return {
"type": "object",
"properties": properties,
"required_keys": list(value.keys())
}
else:
return {"type": "unknown", "note": f"unexpected_type: {type(value)}"}
def merge_schemas(self, schema1: Dict[str, Any], schema2: Dict[str, Any]) -> Dict[str, Any]:
"""
Merge two schemas, combining their information.
Args:
schema1: First schema
schema2: Second schema
Returns:
Merged schema
"""
if schema1["type"] != schema2["type"]:
# Different types, create a union
return {
"type": "union",
"possible_types": sorted(set([schema1["type"], schema2["type"]])),
"schemas": [schema1, schema2]
}
merged = {"type": schema1["type"]}
if schema1["type"] == "array":
# Merge array item types
item_types = set(schema1.get("item_types", []))
item_types.update(schema2.get("item_types", []))
merged["item_types"] = sorted(list(item_types))
# Merge length ranges
len1 = schema1.get("length_range", [0, 0])
len2 = schema2.get("length_range", [0, 0])
merged["length_range"] = [min(len1[0], len2[0]), max(len1[1], len2[1])]
# Merge sample items if available
if "sample_items" in schema1 or "sample_items" in schema2:
merged["sample_items"] = (
schema1.get("sample_items", []) +
schema2.get("sample_items", [])
)[:5] # Keep max 5 samples
elif schema1["type"] == "object":
# Merge object properties
properties = {}
all_keys = set()
# Copy properties from first schema
for key, val in schema1.get("properties", {}).items():
properties[key] = val
all_keys.add(key)
# Merge properties from second schema
for key, val in schema2.get("properties", {}).items():
if key in properties:
properties[key] = self.merge_schemas(properties[key], val)
else:
properties[key] = val
all_keys.add(key)
merged["properties"] = properties
merged["required_keys"] = sorted(list(all_keys))
# Copy other fields
for key in schema1:
if key not in merged and key != "type":
merged[key] = schema1[key]
return merged
def _extract_all_keys(self, obj: Any, prefix: str = "") -> List[str]:
"""
Recursively extract all keys from a JSON object.
Args:
obj: JSON object to analyze
prefix: Prefix for nested keys
Returns:
List of all keys found
"""
keys = []
if isinstance(obj, dict):
for key, value in obj.items():
full_key = f"{prefix}.{key}" if prefix else key
keys.append(full_key)
keys.extend(self._extract_all_keys(value, full_key))
elif isinstance(obj, list):
for i, item in enumerate(obj):
keys.extend(self._extract_all_keys(item, f"{prefix}[{i}]" if prefix else f"[{i}]"))
return keys
def _process_chunk(self, chunk_data: List[str]) -> Tuple[Counter, List[Dict], int, int]:
"""
Process a chunk of JSONL lines.
Args:
chunk_data: List of JSONL lines to process
Returns:
Tuple of (keys_counter, sample_objects, valid_count, error_count)
"""
all_keys = Counter()
sample_objects = []
valid_count = 0
error_count = 0
for line in chunk_data:
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
valid_count += 1
# Collect all keys from this object
keys = self._extract_all_keys(obj)
all_keys.update(keys)
# Keep sample objects for schema analysis
if len(sample_objects) < self.max_samples:
sample_objects.append(obj)
except json.JSONDecodeError:
error_count += 1
return all_keys, sample_objects, valid_count, error_count
def _read_file_chunks(self, file_path: Path) -> List[List[str]]:
"""
Read a JSONL file in chunks for parallel processing.
Args:
file_path: Path to the JSONL file
Returns:
List of chunks, each containing lines to process
"""
chunks = []
current_chunk = []
try:
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
current_chunk.append(line)
if len(current_chunk) >= self.chunk_size:
chunks.append(current_chunk)
current_chunk = []
# Add remaining lines
if current_chunk:
chunks.append(current_chunk)
except Exception as e:
raise RuntimeError(f"Error reading file {file_path}: {e}")
return chunks
def analyze_jsonl_file(self, file_path: Union[str, Path]) -> Dict[str, Any]:
"""
Analyze a JSONL file and return schema information using parallel processing.
Args:
file_path: Path to the JSONL file
Returns:
Dictionary containing schema analysis results
"""
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
start_time = time.time()
file_size = file_path.stat().st_size
print(f"Analyzing {file_path.name} ({file_size / (1024*1024*1024):.2f} GB)...")
# Statistics
total_lines = 0
valid_lines = 0
error_lines = 0
all_keys = Counter()
merged_schema = None
sample_objects = []
# Read file in chunks and process in parallel
chunks = self._read_file_chunks(file_path)
if len(chunks) == 1 or self.max_workers == 1:
# Process sequentially for small files or single worker
for chunk in chunks:
chunk_keys, chunk_samples, chunk_valid, chunk_errors = self._process_chunk(chunk)
all_keys.update(chunk_keys)
sample_objects.extend(chunk_samples)
valid_lines += chunk_valid
error_lines += chunk_errors
total_lines += len(chunk)
else:
# Process chunks in parallel
with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all chunks for processing
future_to_chunk = {
executor.submit(self._process_chunk, chunk): chunk
for chunk in chunks
}
# Collect results as they complete
for future in as_completed(future_to_chunk):
chunk_keys, chunk_samples, chunk_valid, chunk_errors = future.result()
all_keys.update(chunk_keys)
sample_objects.extend(chunk_samples)
valid_lines += chunk_valid
error_lines += chunk_errors
total_lines += len(future_to_chunk[future])
# Limit sample objects
if len(sample_objects) >= self.max_samples:
sample_objects = sample_objects[:self.max_samples]
# Analyze schema from sample objects
if sample_objects:
for obj in sample_objects:
obj_schema = self.analyze_json_value(obj)
if merged_schema is None:
merged_schema = obj_schema
else:
merged_schema = self.merge_schemas(merged_schema, obj_schema)
# Prepare results
elapsed_time = time.time() - start_time
results = {
"file_path": str(file_path),
"file_size_bytes": file_size,
"total_lines": total_lines,
"valid_lines": valid_lines,
"error_lines": error_lines,
"sample_count": len(sample_objects),
"all_keys": dict(all_keys.most_common()),
"unique_key_count": len(all_keys),
"schema": merged_schema,
"analysis_timestamp": time.time(),
"processing_time_seconds": elapsed_time,
"workers_used": self.max_workers,
"chunks_processed": len(chunks)
}
print(f" Completed in {elapsed_time:.2f}s - {valid_lines:,} valid lines, {error_lines:,} errors")
# Clean up memory
gc.collect()
return results
def analyze_directory(self, directory_path: Union[str, Path], pattern: str = "*.jsonl") -> Dict[str, Any]:
"""
Analyze all JSONL files in a directory using parallel processing.
Args:
directory_path: Path to directory containing JSONL files
pattern: File pattern to match (default: *.jsonl)
Returns:
Dictionary containing analysis results for all files
"""
directory_path = Path(directory_path)
if not directory_path.exists():
raise FileNotFoundError(f"Directory not found: {directory_path}")
# Find all JSONL files
jsonl_files = list(directory_path.glob(pattern))
if not jsonl_files:
print(f"No JSONL files found in {directory_path} with pattern {pattern}")
return {"files": [], "summary": {}}
print(f"Found {len(jsonl_files)} JSONL files to analyze using {self.max_workers} workers...")
start_time = time.time()
# Sort files by size (largest first) for better load balancing
jsonl_files.sort(key=lambda f: f.stat().st_size, reverse=True)
# Analyze files in parallel
file_results = {}
if len(jsonl_files) == 1 or self.max_workers == 1:
# Process sequentially for single file
for file_path in jsonl_files:
try:
file_results[file_path.name] = self.analyze_jsonl_file(file_path)
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
else:
# Process files in parallel
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all files for analysis
future_to_file = {
executor.submit(self.analyze_jsonl_file, file_path): file_path
for file_path in jsonl_files
}
# Collect results as they complete
for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
result = future.result()
file_results[file_path.name] = result
except Exception as e:
print(f"Error analyzing {file_path.name}: {e}")
file_results[file_path.name] = {"error": str(e)}
# Create summary
successful_results = [r for r in file_results.values() if "error" not in r]
summary = {
"total_files": len(jsonl_files),
"successfully_analyzed": len(successful_results),
"total_size_bytes": sum(
r.get("file_size_bytes", 0) for r in successful_results
),
"total_lines": sum(
r.get("total_lines", 0) for r in successful_results
),
"total_valid_lines": sum(
r.get("valid_lines", 0) for r in successful_results
),
"total_processing_time": sum(
r.get("processing_time_seconds", 0) for r in successful_results
),
"average_processing_speed_mb_per_sec": 0
}
# Calculate processing speed
if summary["total_processing_time"] > 0:
total_mb = summary["total_size_bytes"] / (1024 * 1024)
summary["average_processing_speed_mb_per_sec"] = total_mb / summary["total_processing_time"]
elapsed_time = time.time() - start_time
summary["total_elapsed_time"] = elapsed_time
print(f"\nDirectory analysis completed in {elapsed_time:.2f}s")
print(f"Processed {summary['total_valid_lines']:,} valid lines from {summary['successfully_analyzed']} files")
print(f"Average speed: {summary['average_processing_speed_mb_per_sec']:.2f} MB/sec")
return {
"directory": str(directory_path),
"pattern": pattern,
"files": file_results,
"summary": summary
}
def save_results(self, results: Dict[str, Any], output_path: Union[str, Path]):
"""
Save analysis results to a JSON file.
Args:
results: Analysis results to save
output_path: Path to save the results
"""
output_path = Path(output_path)
try:
start_time = time.time()
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
save_time = time.time() - start_time
file_size = output_path.stat().st_size
print(f"Results saved to {output_path} ({file_size / (1024*1024):.2f} MB) in {save_time:.2f}s")
except Exception as e:
raise RuntimeError(f"Error saving results to {output_path}: {e}")
def main():
"""Main function for command-line usage."""
parser = argparse.ArgumentParser(
description="Optimized JSONL schema analyzer using multiple cores"
)
parser.add_argument(
"path",
help="Path to JSONL file or directory containing JSONL files"
)
parser.add_argument(
"-o", "--output",
help="Output file for analysis results (JSON format)"
)
parser.add_argument(
"-p", "--pattern",
default="*.jsonl",
help="File pattern when analyzing directory (default: *.jsonl)"
)
parser.add_argument(
"-s", "--max-samples",
type=int,
default=1000,
help="Maximum number of JSON objects to sample per file (default: 1000)"
)
parser.add_argument(
"-w", "--workers",
type=int,
default=None,
help="Number of worker processes (default: CPU count, max 8)"
)
parser.add_argument(
"-c", "--chunk-size",
type=int,
default=1000,
help="Number of lines to process in each chunk (default: 1000)"
)
parser.add_argument(
"--directory",
action="store_true",
help="Treat path as directory instead of single file"
)
parser.add_argument(
"--profile",
action="store_true",
help="Enable performance profiling"
)
args = parser.parse_args()
# Initialize analyzer
analyzer = OptimizedJSONLSchemaAnalyzer(
max_samples=args.max_samples,
max_workers=args.workers,
chunk_size=args.chunk_size
)
try:
start_time = time.time()
# Analyze file or directory
if args.directory or Path(args.path).is_dir():
results = analyzer.analyze_directory(args.path, args.pattern)
else:
results = analyzer.analyze_jsonl_file(args.path)
total_time = time.time() - start_time
# Save or print results
if args.output:
analyzer.save_results(results, args.output)
else:
print("\n" + "="*50)
print("ANALYSIS RESULTS")
print("="*50)
print(json.dumps(results, indent=2, ensure_ascii=False))
print(f"\nTotal analysis time: {total_time:.2f}s")
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()
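# Example invocations (illustrative only; the script filename is a placeholder and the
# paths follow the raw_data/ and intermediate/ layout used elsewhere in this repo):
#   python jsonl_schema_analyzer.py raw_data/fr-raw-wiktextract-data.jsonl -o intermediate/fr_schema_analysis.json
#   python jsonl_schema_analyzer.py raw_data --directory -p "*.jsonl" -w 4 -c 2000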

View File

@@ -0,0 +1,212 @@
#!/usr/bin/env python3
"""
Run JSONL Schema Analysis with Default Configuration
This script runs the JSONL schema analyzer using predefined constants,
so you don't need to pass any command line arguments.
"""
import sys
from pathlib import Path
# Get the root directory (assumes this script sits two levels below the project root)
ROOT_DIR = Path(__file__).parent.parent.parent
# Configuration constants
DEFAULT_INPUT_DIR = ROOT_DIR / "raw_data"
DEFAULT_OUTPUT_DIR = ROOT_DIR / "intermediate"
DEFAULT_LANG_FILTER = "fr"
DEFAULT_INPUT_FILENAME = f"{DEFAULT_LANG_FILTER}-raw-wiktextract-data.jsonl"
DEFAULT_INPUT_FILE = DEFAULT_INPUT_DIR / DEFAULT_INPUT_FILENAME
# Analyzer configuration
DEFAULT_MAX_SAMPLES = 1000
DEFAULT_MAX_WORKERS = None # Will use CPU count
DEFAULT_PARALLEL_THRESHOLD_MB = 100
DEFAULT_CHUNK_SIZE = 1000
# Output configuration
DEFAULT_OUTPUT_FILENAME = f"{DEFAULT_LANG_FILTER}_schema_analysis.json"
DEFAULT_OUTPUT_FILE = DEFAULT_OUTPUT_DIR / DEFAULT_OUTPUT_FILENAME
def main():
"""Run the schema analysis with default configuration."""
print("=" * 60)
print("JSONL Schema Analysis - Default Configuration")
print("=" * 60)
# Display configuration
print(f"Root directory: {ROOT_DIR}")
print(f"Input directory: {DEFAULT_INPUT_DIR}")
print(f"Input file: {DEFAULT_INPUT_FILENAME}")
print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
print(f"Output file: {DEFAULT_OUTPUT_FILENAME}")
print(f"Language filter: {DEFAULT_LANG_FILTER}")
print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
print(f"Max workers: {DEFAULT_MAX_WORKERS or 'Auto (CPU count)'}")
print()
# Check if input file exists
if not DEFAULT_INPUT_FILE.exists():
print(f"❌ Input file not found: {DEFAULT_INPUT_FILE}")
print()
print("Available files in raw_data directory:")
# List available JSONL files
if DEFAULT_INPUT_DIR.exists():
jsonl_files = list(DEFAULT_INPUT_DIR.glob("*.jsonl"))
if jsonl_files:
for i, file in enumerate(sorted(jsonl_files), 1):
size_mb = file.stat().st_size / (1024 * 1024)
print(f" {i:2d}. {file.name} ({size_mb:.1f} MB)")
else:
print(" No JSONL files found.")
else:
print(" raw_data directory not found.")
print()
print("To analyze a different file, modify the constants in this script:")
print(f" - DEFAULT_LANG_FILTER (currently: '{DEFAULT_LANG_FILTER}')")
print(f" - DEFAULT_INPUT_FILENAME (currently: '{DEFAULT_INPUT_FILENAME}')")
return False
# Create output directory if it doesn't exist
DEFAULT_OUTPUT_DIR.mkdir(exist_ok=True)
print(f"✅ Input file found: {DEFAULT_INPUT_FILE.stat().st_size / (1024*1024):.1f} MB")
print()
try:
# Import the hybrid analyzer
sys.path.insert(0, str(Path(__file__).parent))
from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer
# Initialize analyzer with default configuration
analyzer = HybridJSONLSchemaAnalyzer(
max_samples=DEFAULT_MAX_SAMPLES,
max_workers=DEFAULT_MAX_WORKERS,
parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
chunk_size=DEFAULT_CHUNK_SIZE
)
print("🚀 Starting analysis...")
print()
# Run analysis
results = analyzer.analyze_jsonl_file(DEFAULT_INPUT_FILE)
# Save results
analyzer.save_results(results, DEFAULT_OUTPUT_FILE)
print()
print("=" * 60)
print("ANALYSIS COMPLETE")
print("=" * 60)
print(f"📊 Results saved to: {DEFAULT_OUTPUT_FILE}")
print(f"📈 Valid lines processed: {results.get('valid_lines', 0):,}")
print(f"🔑 Unique keys found: {results.get('unique_key_count', 0):,}")
print(f"⏱️ Processing time: {results.get('processing_time_seconds', 0):.2f} seconds")
print(f"📁 File size: {results.get('file_size_bytes', 0) / (1024*1024):.1f} MB")
if results.get('processing_strategy'):
print(f"🔧 Strategy used: {results['processing_strategy']}")
return True
except ImportError as e:
print(f"❌ Error importing analyzer: {e}")
print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
return False
except Exception as e:
print(f"❌ Error during analysis: {e}")
return False
def run_directory_analysis():
"""Run analysis on entire directory with default configuration."""
print("=" * 60)
print("Directory JSONL Schema Analysis - Default Configuration")
print("=" * 60)
# Display configuration
print(f"Input directory: {DEFAULT_INPUT_DIR}")
print(f"Output directory: {DEFAULT_OUTPUT_DIR}")
print(f"Pattern: *.jsonl")
print(f"Max samples: {DEFAULT_MAX_SAMPLES:,}")
print(f"Parallel threshold: {DEFAULT_PARALLEL_THRESHOLD_MB} MB")
print(f"Chunk size: {DEFAULT_CHUNK_SIZE}")
print()
# Check if input directory exists
if not DEFAULT_INPUT_DIR.exists():
print(f"❌ Input directory not found: {DEFAULT_INPUT_DIR}")
return False
# Create output directory if it doesn't exist
DEFAULT_OUTPUT_DIR.mkdir(exist_ok=True)
try:
# Import the hybrid analyzer
sys.path.insert(0, str(Path(__file__).parent))
from jsonl_schema_analyzer_hybrid import HybridJSONLSchemaAnalyzer
# Initialize analyzer with default configuration
analyzer = HybridJSONLSchemaAnalyzer(
max_samples=DEFAULT_MAX_SAMPLES,
max_workers=DEFAULT_MAX_WORKERS,
parallel_threshold_mb=DEFAULT_PARALLEL_THRESHOLD_MB,
chunk_size=DEFAULT_CHUNK_SIZE
)
print("🚀 Starting directory analysis...")
print()
# Run analysis
results = analyzer.analyze_directory(DEFAULT_INPUT_DIR, "*.jsonl")
# Save results
output_file = DEFAULT_OUTPUT_DIR / "directory_schema_analysis.json"
analyzer.save_results(results, output_file)
print()
print("=" * 60)
print("DIRECTORY ANALYSIS COMPLETE")
print("=" * 60)
print(f"📊 Results saved to: {output_file}")
summary = results.get('summary', {})
print(f"📁 Files analyzed: {summary.get('successfully_analyzed', 0)}")
print(f"📈 Total valid lines: {summary.get('total_valid_lines', 0):,}")
print(f"⏱️ Total processing time: {summary.get('total_processing_time', 0):.2f} seconds")
print(f"📦 Total data: {summary.get('total_size_bytes', 0) / (1024*1024*1024):.2f} GB")
print(f"🚀 Average speed: {summary.get('average_processing_speed_mb_per_sec', 0):.2f} MB/sec")
if summary.get('strategies_used'):
strategies = summary['strategies_used']
print(f"🔧 Sequential files: {strategies.get('sequential', 0)}")
print(f"🔧 Parallel files: {strategies.get('parallel', 0)}")
return True
except ImportError as e:
print(f"❌ Error importing analyzer: {e}")
print("Make sure jsonl_schema_analyzer_hybrid.py is in the same directory.")
return False
except Exception as e:
print(f"❌ Error during analysis: {e}")
return False
if __name__ == "__main__":
# You can choose what to run by default:
# Option 1: Analyze single file (based on DEFAULT_LANG_FILTER)
success = main()
# Option 2: Analyze entire directory (comment out the line above and uncomment below)
# success = run_directory_analysis()
if not success:
sys.exit(1)

scripts/collect_samples.py Normal file
View File

@@ -0,0 +1,152 @@
import json
import pathlib
import logging
import sys
import os
# ==============================================================================
# --- CONFIGURATION ---
# ==============================================================================
# --- Paths ---
# Try to determine project root relative to this script location
try:
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
except NameError:
SCRIPT_DIR = pathlib.Path.cwd()
ROOT_DIR = SCRIPT_DIR.parent
# Input directory containing the source semua.org files
RAW_DATA_DIR = ROOT_DIR / "raw_data"
# The pattern to match source files
FILE_PATTERN = "*raw-wiktextract-data.jsonl"
# Output directory for the collected samples
SAMPLES_DIR = ROOT_DIR / "samples"
# Final output filename
OUTPUT_FILENAME = "combined_samples.jsonl"
# --- Sampling Options ---
# How many matching entries to take from EACH source file.
SAMPLES_PER_FILE = 2
# Filter by Language Code.
# Leave empty set() to include ALL languages.
# Example: {"en", "de", "fr", "no"}
LANG_FILTER = set()
# Filter by Part of Speech.
# Leave empty set() to include ALL parts of speech.
# Example: {"noun", "verb", "adj"}
POS_FILTER = {"verb"}
# Filter to only include entries in their own language (lang_code matches file prefix)
OWN_LANG_FILTER = True
# ==============================================================================
# --- END OF CONFIGURATION ---
# ==============================================================================
# Setup simple logging to console
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
def collect_samples():
# 1. Setup Paths and Directories
input_dir = pathlib.Path(RAW_DATA_DIR)
output_dir = pathlib.Path(SAMPLES_DIR)
output_file = output_dir / OUTPUT_FILENAME
if not input_dir.exists():
logger.error(f"ERROR: Raw data directory not found at: {input_dir}")
logger.error("Please ensure your configuration points to the correct folder.")
sys.exit(1)
# Create samples directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)
# 2. Find all matching input files
source_files = list(input_dir.glob(FILE_PATTERN))
if not source_files:
logger.warning(f"No files matching '{FILE_PATTERN}' found in {input_dir}")
sys.exit(0)
logger.info(f"Found {len(source_files)} source files to sample from.")
logger.info(f"Target: {SAMPLES_PER_FILE} samples per file.")
logger.info(f"Language Filter: {LANG_FILTER if LANG_FILTER else 'ALL'}")
logger.info(f"POS Filter: {POS_FILTER if POS_FILTER else 'ALL'}")
logger.info(f"Own Language Filter: {'ENABLED' if OWN_LANG_FILTER else 'DISABLED'}")
logger.info("-" * 50)
total_collected = 0
# Open the output file once and append samples from all inputs to it
try:
with open(output_file, 'w', encoding='utf-8') as out_f:
for src_file in source_files:
logger.info(f"Scanning: {src_file.name}...")
lang_from_file = src_file.name[:2]
file_collected = 0
lines_read = 0
try:
with open(src_file, 'r', encoding='utf-8') as in_f:
for line in in_f:
lines_read += 1
# Stop reading this file if we have enough samples
if file_collected >= SAMPLES_PER_FILE:
break
if not line.strip():
continue
try:
entry = json.loads(line)
# --- Filtering Logic ---
# 1. Language Filter
if LANG_FILTER and entry.get('lang_code') not in LANG_FILTER:
continue
# 2. POS Filter
if POS_FILTER and entry.get('pos') not in POS_FILTER:
continue
# 3. Own Language Filter
if OWN_LANG_FILTER and entry.get('lang_code') != lang_from_file:
continue
# --- If it passed filters, save it ---
# We write it exactly as it is in the source
json.dump(entry, out_f, ensure_ascii=False)
out_f.write('\n')
file_collected += 1
total_collected += 1
except json.JSONDecodeError:
# Ignore bad lines in source files during sampling
continue
logger.info(f" -> Collected {file_collected} samples (scanned {lines_read} lines)")
except Exception as e:
logger.error(f" ERROR reading {src_file.name}: {e}")
except Exception as e:
logger.critical(f"FATAL ERROR writing output file: {e}")
sys.exit(1)
logger.info("-" * 50)
logger.info("SAMPLING COMPLETE")
logger.info(f"Total entries collected: {total_collected}")
logger.info(f"Output saved to: {output_file}")
if __name__ == "__main__":
collect_samples()

scripts/count_pos_values.py Normal file
View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Script to count all different "pos" values in JSONL files using parallel processing.
Analyzes all JSONL files in the raw_data directory and displays frequency counts.
Run it from the project root so the relative "raw_data" path resolves.
"""
import json
import os
import glob
from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
import time
from typing import Dict, List, Tuple
def process_jsonl_file(file_path: str) -> Tuple[str, Counter]:
"""
Process a single JSONL file and count POS values.
Args:
file_path: Path to the JSONL file
Returns:
Tuple of (filename, Counter of POS values)
"""
pos_counter = Counter()
line_count = 0
try:
with open(file_path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line:
continue
try:
data = json.loads(line)
if 'pos' in data and data['pos']:
pos_counter[data['pos']] += 1
line_count += 1
except json.JSONDecodeError as e:
print(f"Warning: JSON decode error in {file_path} at line {line_num}: {e}")
continue
except Exception as e:
print(f"Error processing file {file_path}: {e}")
return file_path, Counter()
print(f"Processed {file_path}: {line_count} lines, {sum(pos_counter.values())} POS entries")
return file_path, pos_counter
def main():
"""Main function to process all JSONL files and display POS statistics."""
# Find all JSONL files in raw_data directory
raw_data_dir = "raw_data"
jsonl_files = glob.glob(os.path.join(raw_data_dir, "*.jsonl"))
if not jsonl_files:
print(f"No JSONL files found in {raw_data_dir}")
return
print(f"Found {len(jsonl_files)} JSONL files to process")
print(f"Using {cpu_count()} CPU cores for parallel processing")
print("-" * 60)
# Process files in parallel
start_time = time.time()
all_pos_counts = Counter()
file_results = {}
with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
# Submit all files for processing
future_to_file = {
executor.submit(process_jsonl_file, file_path): file_path
for file_path in jsonl_files
}
# Collect results as they complete
for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
filename, pos_counter = future.result()
file_results[filename] = pos_counter
all_pos_counts.update(pos_counter)
except Exception as e:
print(f"Error processing {file_path}: {e}")
end_time = time.time()
processing_time = end_time - start_time
# Display results
print("\n" + "=" * 80)
print("POS VALUE COUNTS ACROSS ALL FILES")
print("=" * 80)
print(f"Total processing time: {processing_time:.2f} seconds")
print(f"Total POS entries found: {sum(all_pos_counts.values()):,}")
print(f"Unique POS values: {len(all_pos_counts)}")
print("\nTop 50 most common POS values:")
print("-" * 80)
# Sort by frequency (descending)
sorted_pos = sorted(all_pos_counts.items(), key=lambda x: x[1], reverse=True)
for pos, count in sorted_pos[:100]:
percentage = (count / sum(all_pos_counts.values())) * 100
print(f"{pos:<20} {count:>10,} ({percentage:5.2f}%)")
if len(sorted_pos) > 100:
print(f"\n... and {len(sorted_pos) - 100} more POS values")
# Show all unique POS values (alphabetical)
print("\n" + "=" * 80)
print("ALL UNIQUE POS VALUES (ALPHABETICAL)")
print("=" * 80)
for pos, count in sorted(all_pos_counts.items(), key=lambda x: x[0].lower()):
print(f"{pos:<30} {count:>10,}")
# Per-file breakdown
print("\n" + "=" * 80)
print("PER-FILE BREAKDOWN")
print("=" * 80)
for filename, pos_counter in sorted(file_results.items()):
total_entries = sum(pos_counter.values())
if total_entries > 0:
print(f"\n{os.path.basename(filename)}:")
print(f" Total entries: {total_entries:,}")
print(f" Unique POS values: {len(pos_counter)}")
# All POS values for this file (sorted by frequency)
all_pos = sorted(pos_counter.items(), key=lambda x: x[1], reverse=True)
for pos, count in all_pos:
print(f" {pos:<15} {count:>8,}")
print(f"\nProcessing completed in {processing_time:.2f} seconds")
if __name__ == "__main__":
main()

scripts/lang_config.py Normal file
View File

@@ -0,0 +1,401 @@
GERMAN_VERB_CONFIG = {
"clean_prefixes": ["ich", "du", "er/sie/es", "wir", "ihr", "sie"],
"normalization_rules": [
{"field": "pronouns", "match": "ich", "add_tags": ["first-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "du", "add_tags": ["second-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "er", "add_tags": ["third-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "sie", "add_tags": ["third-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "es", "add_tags": ["third-person", "singular", "indicative", "active"]},
{"field": "pronouns", "match": "wir", "add_tags": ["first-person", "plural", "indicative", "active"]},
{"field": "pronouns", "match": "ihr", "add_tags": ["second-person", "plural", "indicative", "active"]}
],
"properties": [
{
"name": "auxiliary",
"multivalue": True, # <--- CRITICAL CHANGE HERE
"default": ["haben"],
"rules": [
# Check for explicit raw tags
{"value": "sein", "criteria": {"raw_tags": ["Hilfsverb sein"]}},
{"value": "haben", "criteria": {"raw_tags": ["Hilfsverb haben"]}},
# Check for 'common forms' that imply the aux
{"value": "sein", "criteria": {"form_regex": "^sein$", "tags": ["auxiliary", "perfect"]}},
{"value": "haben", "criteria": {"form_regex": "^haben$", "tags": ["auxiliary", "perfect"]}}
]
},
{
"name": "separability",
"default": "inseparable",
"rules": [
{"value": "separable", "criteria": {"tags": ["separable"]}},
{"value": "inseparable", "criteria": {"tags": ["inseparable"]}},
{"value": "separable", "criteria": {"tags": ["participle-2"], "form_regex": "^(?!ge).+ge.+$"}}
]
}
],
"schema": {
"infinitive": {
"type": "single",
"criteria": {"tags": ["infinitive", "present"], "exclude_tags": ["extended", "passive", "reflexive", "zu"]}
},
"participle_perfect": {
"type": "single",
"criteria": {"tags": ["participle-2", "perfect"], "exclude_tags": ["active", "passive", "auxiliary"]}
},
"imperative": {
"type": "list",
"size": 2,
"base_criteria": {"tags": ["imperative", "present", "active"]},
"indices": [
{"index": 0, "tags": ["singular", "second-person"]},
{"index": 1, "tags": ["plural", "second-person"]}
]
},
"present": {
"type": "list",
"size": 6,
"base_criteria": {"tags": ["indicative", "present", "active"], "exclude_tags": ["passive"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"past": {
"type": "list",
"size": 6,
"base_criteria": {"tags": ["indicative", "past", "active"], "exclude_tags": ["passive"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"subjunctive_ii": {
"type": "list",
"size": 6,
"base_criteria": {"tags": ["subjunctive-ii", "past", "active"], "exclude_tags": ["passive"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
}
}
}
FRENCH_VERB_CONFIG = {
"skip_normalization_if_source": False,
# CHANGED: Set to False to prevent crashes on idioms, rare words, and defective verbs
"validate_completeness": False,
"clean_prefixes": [
"qu'", "qu", "que", "j'", "j", "je", "tu",
"il/elle/on", "il", "elle", "on", "nous", "vous", "ils/elles", "ils", "elles"
],
"normalization_rules": [
# Pronoun matches
{"field": "form", "match": r"\bje\b", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
{"field": "form", "match": r"\bj[']", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
{"field": "form", "match": r"\btu\b", "match_mode": "regex", "add_tags": ["second-person", "singular"]},
{"field": "form", "match": r"\b(il|elle|on|il/elle/on)\b", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
{"field": "form", "match": r"\[il/ɛl/ɔ̃\]", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
{"field": "form", "match": r"\bnous\b", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
{"field": "form", "match": r"\bvous\b", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
{"field": "form", "match": r"\b(ils|elles|ils/elles)\b", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
{"field": "form", "match": r"\[il/ɛl\]", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
# Suffix Heuristics
{"field": "form", "match": r"ons$", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
{"field": "form", "match": r"ez$", "match_mode": "regex", "add_tags": ["second-person", "plural"]}
],
"properties": [
{
"name": "auxiliary",
"multivalue": True,
"default": ["avoir"],
"rules": [
{"value": "être", "criteria": {"raw_tags": ["auxiliary être"]}},
{"value": "avoir", "criteria": {"raw_tags": ["auxiliary avoir"]}},
{"value": "être", "criteria": {"tags": ["auxiliary-être"]}},
{"value": "avoir", "criteria": {"tags": ["auxiliary-avoir"]}}
]
},
{
"name": "group",
"default": "unknown",
"rules": [
{"value": "1st-group", "criteria": {"raw_tags": ["1ᵉʳ groupe"]}},
{"value": "2nd-group", "criteria": {"raw_tags": ["2ᵉ groupe"]}},
{"value": "3rd-group", "criteria": {"raw_tags": ["3ᵉ groupe"]}},
{"value": "1st-group", "criteria": {"form_regex": "er$"}},
{"value": "2nd-group", "criteria": {"form_regex": "ir$"}},
{"value": "3rd-group", "criteria": {"form_regex": "(re|oir)$"}}
]
}
],
"schema": {
"infinitive": {
"type": "single",
"criteria": {"tags": ["infinitive", "present"]}
},
"participle_present": {
"type": "single",
"optional": True,
"criteria": {"tags": ["participle", "present"]}
},
"participle_past": {
"type": "single",
"optional": True,
"criteria": {"tags": ["participle", "past"], "exclude_tags": ["multiword-construction"]}
},
# All lists are now marked optional to handle defective verbs (like 'traire') and sparse data
"indicative_present": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["indicative", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_imperfect": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["indicative", "imperfect"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_future": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["indicative", "future"], "exclude_tags": ["perfect"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_simple_past": {
"type": "list", "size": 6, "optional": True, # Traire/clore do not have this
"base_criteria": {"tags": ["indicative", "past"], "exclude_tags": ["multiword-construction", "imperfect", "perfect", "anterior"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"subjunctive_present": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["subjunctive", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"conditional_present": {
"type": "list", "size": 6, "optional": True,
"base_criteria": {"tags": ["conditional", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"imperative": {
"type": "list", "size": 3, "optional": True,
"base_criteria": {"tags": ["imperative", "present"]},
"indices": [
{"index": 0, "tags": ["singular"]},
{"index": 1, "tags": ["plural", "first-person"]},
{"index": 2, "tags": ["plural", "second-person"]},
{"index": 1, "criteria": {"form_regex": r"ons$"}},
{"index": 2, "criteria": {"form_regex": r"ez$"}},
{"index": 0, "criteria": {"form_regex": r"[es]$"}}
]
}
}
}
OLD_FRENCH_VERB_CONFIG = {
"skip_normalization_if_source": False,
"validate_completeness": True,
# --- 1. Normalization ---
"clean_prefixes": [
"qu'", "qu", "que", "j'", "j", "je", "tu",
"il/elle/on", "il", "elle", "on", "nous", "vous", "ils/elles", "ils", "elles"
],
"normalization_rules": [
{"field": "form", "match": r"\bje\b", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
{"field": "form", "match": r"\bj[']", "match_mode": "regex", "add_tags": ["first-person", "singular"]},
{"field": "form", "match": r"\btu\b", "match_mode": "regex", "add_tags": ["second-person", "singular"]},
{"field": "form", "match": r"\b(il|elle|on|il/elle/on)\b", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
{"field": "form", "match": r"\[il/ɛl/ɔ̃\]", "match_mode": "regex", "add_tags": ["third-person", "singular"]},
{"field": "form", "match": r"\bnous\b", "match_mode": "regex", "add_tags": ["first-person", "plural"]},
{"field": "form", "match": r"\bvous\b", "match_mode": "regex", "add_tags": ["second-person", "plural"]},
{"field": "form", "match": r"\b(ils|elles|ils/elles)\b", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
{"field": "form", "match": r"\[il/ɛl\]", "match_mode": "regex", "add_tags": ["third-person", "plural"]},
],
# --- 2. Properties ---
"properties": [
{
"name": "auxiliary",
"multivalue": True,
"default": ["avoir"],
"rules": [
{"value": "être", "criteria": {"raw_tags": ["auxiliary être"]}},
{"value": "avoir", "criteria": {"raw_tags": ["auxiliary avoir"]}},
{"value": "être", "criteria": {"tags": ["auxiliary-être"]}},
{"value": "avoir", "criteria": {"tags": ["auxiliary-avoir"]}}
]
},
{
"name": "group",
"default": "unknown",
"rules": [
{"value": "1st-group", "criteria": {"raw_tags": ["1ᵉʳ groupe"]}},
{"value": "2nd-group", "criteria": {"raw_tags": ["2ᵉ groupe"]}},
{"value": "3rd-group", "criteria": {"raw_tags": ["3ᵉ groupe"]}},
{"value": "1st-group", "criteria": {"form_regex": "er$"}},
{"value": "2nd-group", "criteria": {"form_regex": "ir$"}},
{"value": "3rd-group", "criteria": {"form_regex": "(re|oir)$"}}
]
}
],
# --- 3. Schema ---
"schema": {
"infinitive": {
"type": "single",
"criteria": {"tags": ["infinitive", "present"]}
},
"participle_present": {
"type": "single",
"optional": True, # <--- NEW: Allows missing participle
"criteria": {"tags": ["participle", "present"]}
},
"participle_past": {
"type": "single",
"optional": True, # <--- Often missing in defective verbs
"criteria": {"tags": ["participle", "past"], "exclude_tags": ["multiword-construction"]}
},
"indicative_present": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["indicative", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_imperfect": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["indicative", "imperfect"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_future": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["indicative", "future"], "exclude_tags": ["perfect"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"indicative_simple_past": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["indicative", "past"], "exclude_tags": ["multiword-construction", "imperfect", "perfect", "anterior"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"subjunctive_present": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["subjunctive", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"conditional_present": {
"type": "list", "size": 6,
"base_criteria": {"tags": ["conditional", "present"]},
"indices": [
{"index": 0, "tags": ["first-person", "singular"]},
{"index": 1, "tags": ["second-person", "singular"]},
{"index": 2, "tags": ["third-person", "singular"]},
{"index": 3, "tags": ["first-person", "plural"]},
{"index": 4, "tags": ["second-person", "plural"]},
{"index": 5, "tags": ["third-person", "plural"]}
]
},
"imperative": {
"type": "list", "size": 3,
"optional": True, # <--- Often missing for phrases/defective verbs
"base_criteria": {"tags": ["imperative", "present"]},
"indices": [
{"index": 0, "tags": ["singular"]},
{"index": 1, "tags": ["plural", "first-person"]},
{"index": 2, "tags": ["plural", "second-person"]}
]
}
}
}
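# ------------------------------------------------------------------------------
# Illustrative sketch only (assumption: the real matching lives in
# InflectionProcessor, which is not part of this file). It shows how a "criteria"
# block such as {"tags": [...], "exclude_tags": [...], "form_regex": ...} from the
# configs above could be checked against a single wiktextract form entry.
import re
def _matches_criteria_example(form_entry: dict, criteria: dict) -> bool:
    """Return True if the form entry satisfies every constraint in the criteria dict."""
    tags = set(form_entry.get("tags", []))
    raw_tags = set(form_entry.get("raw_tags", []))
    if not set(criteria.get("tags", [])).issubset(tags):
        return False
    if tags & set(criteria.get("exclude_tags", [])):
        return False
    if not set(criteria.get("raw_tags", [])).issubset(raw_tags):
        return False
    pattern = criteria.get("form_regex")
    if pattern and not re.search(pattern, form_entry.get("form", "")):
        return False
    return True
# Example (assuming base_criteria and per-index tags are merged before matching):
#   _matches_criteria_example(
#       {"form": "geht", "tags": ["indicative", "present", "active", "third-person", "singular"]},
#       {"tags": ["indicative", "present", "active", "third-person", "singular"], "exclude_tags": ["passive"]},
#   )  # -> True, i.e. "geht" would fill index 2 of GERMAN_VERB_CONFIG["schema"]["present"]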

scripts/printline.py Normal file
View File

@@ -0,0 +1,38 @@
import json
import pathlib
from datetime import datetime
INPUT_FILE_NAME = "fr_raw-wiktextract-data.jsonl"
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME
# --- Configuration ---
START_LINE = 99 # 1-based index (first line is 1)
NUM_LINES = 99 # Number of lines/objects to write
def extract_lines_to_file(file_path, start_line, num_lines):
# Generate timestamp filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = file_path.parent / f"{timestamp}.json"
with open(file_path, 'r', encoding='utf-8') as infile:
with open(output_file, 'w', encoding='utf-8') as outfile:
for i, line in enumerate(infile, start=1):
if i >= start_line and i < start_line + num_lines:
try:
element = json.loads(line)
outfile.write(json.dumps(element, indent=2, ensure_ascii=False))
outfile.write('\n')
except json.JSONDecodeError:
outfile.write(f"Error: Line {i} is not valid JSON.\n")
print(f"Output written to: {output_file}")
if __name__ == "__main__":
extract_lines_to_file(INPUT_FILE, START_LINE, NUM_LINES)

scripts/search_word.py Normal file
View File

@@ -0,0 +1,110 @@
import json
import pathlib
from datetime import datetime
INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl" # <-- Update this to your file
# --- Dynamic Path Setup ---
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME
# --- Filter Configuration ---
# Set the POS (part of speech) you want to filter for
# Examples: "noun", "verb", "adj", "adv", etc.
# Set to None to skip POS filtering
FILTER_POS = "noun"
# Set the word you want to filter for
# Set to None to skip word filtering
FILTER_WORD = "grenouille"
# Set word prefix to filter for (e.g., "Septem" will match "September")
# Set to None to skip prefix filtering
FILTER_PREFIX = None
# Set word suffix to filter for (e.g., "ber" will match "September")
# Set to None to skip suffix filtering
FILTER_SUFFIX = None
# Maximum number of results to include (set to None for unlimited)
MAX_RESULTS = 5
def matches_filters(entry):
"""Check if an entry matches all active filters."""
# Filter by POS
if FILTER_POS is not None:
if entry.get("pos") != FILTER_POS:
return False
# Filter by exact word
if FILTER_WORD is not None:
if entry.get("word") != FILTER_WORD:
return False
# Filter by prefix
if FILTER_PREFIX is not None:
word = entry.get("word", "")
if not word.startswith(FILTER_PREFIX):
return False
# Filter by suffix
if FILTER_SUFFIX is not None:
word = entry.get("word", "")
if not word.endswith(FILTER_SUFFIX):
return False
return True
def filter_and_save(file_path):
"""Filter JSONL file and save matching entries."""
# Generate output filename with original filename and timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"
match_count = 0
total_lines = 0
with open(file_path, 'r', encoding='utf-8') as infile:
with open(output_file, 'w', encoding='utf-8') as outfile:
for line in infile:
total_lines += 1
try:
entry = json.loads(line)
# Check if entry matches filters
if matches_filters(entry):
outfile.write(json.dumps(entry, ensure_ascii=False))
outfile.write('\n')
match_count += 1
# Stop if we've reached max results
if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
break
except json.JSONDecodeError:
print(f"Warning: Line {total_lines} is not valid JSON.")
print(f"Filtered {match_count} entries from {total_lines} total lines")
print(f"Output written to: {output_file}")
# Print active filters
print("\nActive filters:")
if FILTER_POS:
print(f" - POS: {FILTER_POS}")
if FILTER_WORD:
print(f" - Word (exact): {FILTER_WORD}")
if FILTER_PREFIX:
print(f" - Prefix: {FILTER_PREFIX}")
if FILTER_SUFFIX:
print(f" - Suffix: {FILTER_SUFFIX}")
if __name__ == "__main__":
filter_and_save(INPUT_FILE)

View File

@@ -0,0 +1,419 @@
#!/usr/bin/env python3
"""
Universal Wiktionary Format Transformer
========================================
Transforms any Wiktionary JSON format to a standardized universal schema.
Usage:
python transform_wiktionary.py input.jsonl output.jsonl
python transform_wiktionary.py input.jsonl output.jsonl --validate
"""
import json
import sys
import argparse
from typing import Dict, List, Any, Optional
from pathlib import Path
class WiktionaryTransformer:
"""Transforms Wiktionary entries to universal format."""
def __init__(self, validate: bool = False):
self.validate = validate
self.stats = {
"total": 0,
"successful": 0,
"errors": 0,
"warnings": []
}
def transform_entry(self, raw_entry: Dict[str, Any]) -> Dict[str, Any]:
"""
Transform a single Wiktionary entry to universal format.
Args:
raw_entry: Raw entry from any Wiktionary edition
Returns:
Transformed entry in universal format
"""
# === REQUIRED CORE FIELDS ===
try:
universal = {
"word": raw_entry["word"],
"lang_code": raw_entry["lang_code"],
"pos": raw_entry["pos"],
"senses": raw_entry["senses"]
}
except KeyError as e:
raise ValueError(f"Missing required field: {e}")
# === PHONETICS ===
phonetics = self._extract_phonetics(raw_entry)
if phonetics:
universal["phonetics"] = phonetics
# === HYPHENATION ===
hyphenation = self._extract_hyphenation(raw_entry)
if hyphenation:
universal["hyphenation"] = hyphenation
# === FORMS ===
if "forms" in raw_entry:
universal["forms"] = raw_entry["forms"]
# === GRAMMATICAL FEATURES ===
grammatical = self._extract_grammatical_features(raw_entry)
if grammatical:
universal["grammatical_features"] = grammatical
# === ETYMOLOGY ===
etymology = self._extract_etymology(raw_entry)
if etymology:
universal["etymology"] = etymology
# === RELATIONS ===
relations = self._extract_relations(raw_entry)
if relations:
universal["relations"] = relations
# === TRANSLATIONS ===
if "translations" in raw_entry:
universal["translations"] = raw_entry["translations"]
# === DESCENDANTS ===
if "descendants" in raw_entry:
universal["descendants"] = raw_entry["descendants"]
# === METADATA ===
metadata = self._extract_metadata(raw_entry)
universal["metadata"] = metadata
return universal
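# Shape of the result (keys beyond the required four are added only when the
# source entry provides them; the values shown are placeholders, not real data):
#   {
#     "word": ..., "lang_code": ..., "pos": ..., "senses": [...],        # required
#     "phonetics": {...}, "hyphenation": [...], "forms": [...],          # optional
#     "grammatical_features": {...}, "etymology": {...}, "relations": {...},
#     "translations": [...], "descendants": [...],
#     "metadata": {...}                                                  # always present
#   }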
def _extract_phonetics(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract and normalize phonetic information."""
phonetics = {}
# Process sounds array
if "sounds" in entry and entry["sounds"]:
ipa_variations = []
audio_list = []
homophones = []
for sound in entry["sounds"]:
# IPA transcription with country information
if "ipa" in sound:
ipa_entry = {"ipa": sound["ipa"]}
# Preserve country information from raw_tags
if "raw_tags" in sound:
ipa_entry["raw_tags"] = sound["raw_tags"]
# Clean IPA string by removing special characters at beginning/end
cleaned_ipa = self._clean_ipa_string(sound["ipa"])
ipa_entry["ipa_cleaned"] = cleaned_ipa
ipa_variations.append(ipa_entry)
# Audio files (keep for now, will be removed in filter step)
if "audio" in sound:
audio_obj = {}
# Try multiple URL formats
for url_key in ["ogg_url", "mp3_url", "url"]:
if url_key in sound:
audio_obj["url"] = sound[url_key]
break
audio_obj["text"] = sound.get("audio", "")
if audio_obj:
audio_list.append(audio_obj)
# Homophones
if "homophone" in sound:
homophones.append(sound["homophone"])
if ipa_variations:
phonetics["ipa_variations"] = ipa_variations
if audio_list:
phonetics["audio"] = audio_list
if homophones:
phonetics["homophones"] = homophones
# Handle extra_sounds (some editions)
if "extra_sounds" in entry:
if "pronunciación" in entry["extra_sounds"]:
phonetics["notes"] = entry["extra_sounds"]["pronunciación"]
return phonetics if phonetics else None
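# Worked example (hypothetical wiktextract "sounds" input; the URL is a placeholder):
#   [{"ipa": "[ɡʁə.nuj]", "raw_tags": ["France"],
#     "audio": "Fr-grenouille.ogg", "ogg_url": "https://example.org/Fr-grenouille.ogg"}]
# produces:
#   {"ipa_variations": [{"ipa": "[ɡʁə.nuj]", "raw_tags": ["France"], "ipa_cleaned": "ɡʁə.nuj"}],
#    "audio": [{"url": "https://example.org/Fr-grenouille.ogg", "text": "Fr-grenouille.ogg"}]}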
def _clean_ipa_string(self, ipa_string: str) -> str:
"""Clean IPA string by removing special characters at beginning/end."""
if not ipa_string:
return ipa_string
# Remove leading/trailing special characters: [, ], \, :
cleaned = ipa_string.strip("[]\\:")
return cleaned
def _extract_hyphenation(self, entry: Dict[str, Any]) -> Optional[List[str]]:
"""Extract and normalize hyphenation."""
# Format 1: hyphenations array with parts
if "hyphenations" in entry and entry["hyphenations"]:
parts = []
for h in entry["hyphenations"]:
if isinstance(h, dict) and "parts" in h:
parts.extend(h["parts"])
elif isinstance(h, str):
parts.append(h)
if parts:
return parts
# Format 2: hyphenation string with separator
if "hyphenation" in entry:
# Split on common separators
hyph = entry["hyphenation"]
for sep in ["", "-", "·", ""]:
if sep in hyph:
return hyph.split(sep)
return [hyph]
return None
def _extract_grammatical_features(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract grammatical features and tags."""
if "tags" not in entry:
return None
grammatical = {"tags": entry["tags"]}
# Extract gender from tags
gender_map = {
"masculine": "masculine",
"feminine": "feminine",
"neuter": "neuter",
"common": "common",
"m": "masculine",
"f": "feminine",
"n": "neuter",
"c": "common"
}
for tag in entry["tags"]:
tag_lower = tag.lower()
if tag_lower in gender_map:
grammatical["gender"] = gender_map[tag_lower]
break
# Extract number
number_map = {
"singular": "singular",
"plural": "plural",
"dual": "dual",
"sg": "singular",
"pl": "plural"
}
for tag in entry["tags"]:
tag_lower = tag.lower()
if tag_lower in number_map:
grammatical["number"] = number_map[tag_lower]
break
return grammatical
def _extract_etymology(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract etymology information."""
etymology = {}
if "etymology_text" in entry:
etymology["text"] = entry["etymology_text"]
if "etymology_texts" in entry:
etymology["texts"] = entry["etymology_texts"]
if "etymology_number" in entry:
etymology["number"] = entry["etymology_number"]
return etymology if etymology else None
def _extract_relations(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Extract semantic and lexical relations."""
relations = {}
# Define all possible relation types
relation_fields = [
"synonyms", "antonyms", "hypernyms", "hyponyms",
"meronyms", "holonyms", "related", "derived",
"coordinate_terms", "troponyms", "compounds"
]
for field in relation_fields:
if field in entry and entry[field]:
relations[field] = entry[field]
return relations if relations else None
def _extract_metadata(self, entry: Dict[str, Any]) -> Dict[str, Any]:
"""Extract metadata and source information."""
metadata = {}
# Source language
if "lang" in entry:
metadata["source_lang"] = entry["lang"]
# Infer source language code if possible
if "lang_code" in entry:
metadata["source_lang_code"] = entry["lang_code"]
# POS title (localized)
if "pos_title" in entry:
metadata["pos_title"] = entry["pos_title"]
elif "pos_text" in entry:
metadata["pos_title"] = entry["pos_text"]
# Categories
if "categories" in entry:
metadata["categories"] = entry["categories"]
# Templates
templates = []
if "head_templates" in entry:
templates.extend(entry["head_templates"])
if "inflection_templates" in entry:
templates.extend(entry["inflection_templates"])
if templates:
metadata["templates"] = templates
# Additional metadata
if "attestations" in entry:
metadata["attestations"] = entry["attestations"]
return metadata
def transform_file(self, input_path: str, output_path: str) -> None:
"""
Transform an entire JSONL file.
Args:
input_path: Path to input JSONL file
output_path: Path to output JSONL file
"""
input_file = Path(input_path)
output_file = Path(output_path)
if not input_file.exists():
raise FileNotFoundError(f"Input file not found: {input_path}")
print(f"Transforming: {input_path}{output_path}")
with open(input_file, 'r', encoding='utf-8') as infile, \
open(output_file, 'w', encoding='utf-8') as outfile:
for line_num, line in enumerate(infile, 1):
line = line.strip()
if not line:
continue
self.stats["total"] += 1
try:
# Parse input
raw_entry = json.loads(line)
# Transform
universal_entry = self.transform_entry(raw_entry)
# Validate if requested
if self.validate:
self._validate_entry(universal_entry)
# Write output
outfile.write(json.dumps(universal_entry, ensure_ascii=False) + '\n')
self.stats["successful"] += 1
except json.JSONDecodeError as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: JSON decode error - {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
except ValueError as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
except Exception as e:
self.stats["errors"] += 1
warning = f"Line {line_num}: Unexpected error - {e}"
self.stats["warnings"].append(warning)
print(f"{warning}", file=sys.stderr)
self._print_summary()
def _validate_entry(self, entry: Dict[str, Any]) -> None:
"""Validate a transformed entry."""
required = ["word", "lang_code", "pos", "senses"]
for field in required:
if field not in entry:
raise ValueError(f"Missing required field after transformation: {field}")
def _print_summary(self) -> None:
"""Print transformation summary."""
print("\n" + "="*60)
print("TRANSFORMATION SUMMARY")
print("="*60)
print(f"Total entries: {self.stats['total']}")
print(f"Successful: {self.stats['successful']}")
print(f"Errors: {self.stats['errors']}")
if self.stats['total'] > 0:
success_rate = (self.stats['successful'] / self.stats['total']) * 100
print(f"Success rate: {success_rate:.1f}%")
if self.stats['warnings']:
print(f"\nWarnings: {len(self.stats['warnings'])}")
if len(self.stats['warnings']) <= 10:
for warning in self.stats['warnings']:
print(f" - {warning}")
else:
print(f" (showing first 10 of {len(self.stats['warnings'])})")
for warning in self.stats['warnings'][:10]:
print(f" - {warning}")
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description="Transform Wiktionary JSONL to universal format",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
%(prog)s input.jsonl output.jsonl
%(prog)s data/raw.jsonl data/transformed.jsonl --validate
"""
)
parser.add_argument("input", help="Input JSONL file")
parser.add_argument("output", help="Output JSONL file")
parser.add_argument("--validate", action="store_true",
help="Validate transformed entries")
args = parser.parse_args()
try:
transformer = WiktionaryTransformer(validate=args.validate)
transformer.transform_file(args.input, args.output)
# Exit with error code if there were errors
if transformer.stats["errors"] > 0:
sys.exit(1)
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()