# Wictionary-Data-Parser/scripts/search_word.py

import json
import pathlib
from datetime import datetime


# Name of the JSONL dump to search. It is looked up in the "raw_data"
# directory one level above this script's directory.
INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl"  # <-- Update this to your file

# --- Dynamic Path Setup ---
# Resolve to an absolute path first: when __file__ is a bare relative name
# such as "search_word.py", Path(__file__).parent is "." and the parent of "."
# is still ".", which would silently make ROOT_DIR the scripts directory.
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME


# --- Filter Configuration ---
# Each filter below is active when it is not None. An entry is kept only if
# it passes every active filter (see matches_filters).

# Part of speech to keep, compared for exact equality with the entry's "pos"
# field. Examples: "noun", "verb", "adj", "adv", etc.
# Set to None to skip POS filtering.
FILTER_POS = "noun"

# Word to keep, compared for exact equality with the entry's "word" field.
# Set to None to skip word filtering.
FILTER_WORD = "grenouille"

# Prefix the entry's "word" must start with (e.g., "Septem" matches
# "September"). The comparison is case-sensitive.
# Set to None to skip prefix filtering.
FILTER_PREFIX = None

# Suffix the entry's "word" must end with (e.g., "ber" matches "September").
# The comparison is case-sensitive.
# Set to None to skip suffix filtering.
FILTER_SUFFIX = None

# Maximum number of matching entries to write; scanning stops once this many
# entries have matched. Set to None for unlimited.
MAX_RESULTS = 5


def matches_filters(entry):
    """Return True if *entry* satisfies every active filter.

    A filter is active when its module-level setting (FILTER_POS, FILTER_WORD,
    FILTER_PREFIX, FILTER_SUFFIX) is not None. All active filters must match.
    When no filter is active, every entry matches.

    Args:
        entry: One decoded JSON line. Normally a dict with "pos" and "word"
            keys, but any decoded JSON value is tolerated.

    Returns:
        bool: True when the entry passes all active filters, otherwise False.
    """
    if (
        FILTER_POS is None
        and FILTER_WORD is None
        and FILTER_PREFIX is None
        and FILTER_SUFFIX is None
    ):
        return True

    # A valid JSON line need not be an object (e.g. `[]` or `42`). Such a
    # value has no .get() and cannot satisfy any field-based filter, so it is
    # rejected instead of raising AttributeError.
    if not isinstance(entry, dict):
        return False

    # Filter by POS
    if FILTER_POS is not None and entry.get("pos") != FILTER_POS:
        return False

    word = entry.get("word")

    # Filter by exact word
    if FILTER_WORD is not None and word != FILTER_WORD:
        return False

    # Prefix/suffix tests only make sense on strings; a missing or non-string
    # "word" (e.g. JSON null) is treated as the empty string.
    text = word if isinstance(word, str) else ""

    # Filter by prefix
    if FILTER_PREFIX is not None and not text.startswith(FILTER_PREFIX):
        return False

    # Filter by suffix
    if FILTER_SUFFIX is not None and not text.endswith(FILTER_SUFFIX):
        return False

    return True


def filter_and_save(file_path):
    """Copy the entries of a JSONL file that pass matches_filters to a new file.

    The output is written next to the input as
    ``<input stem>_filtered_<YYYYmmdd_HHMMSS>.jsonl``, one JSON value per
    line. Reading stops as soon as MAX_RESULTS entries have matched (no limit
    when MAX_RESULTS is None; nothing is written when it is 0 or negative).
    Blank lines are skipped, lines that are not valid JSON are reported and
    skipped. A summary and the active filters are printed at the end.

    Args:
        file_path: Path of the input JSONL file (``str`` or ``pathlib.Path``).

    Returns:
        pathlib.Path: Location of the output file that was written.

    Raises:
        OSError: If the input cannot be opened or the output cannot be
            created (e.g. FileNotFoundError for a missing input).
    """
    # Accept plain strings as well as Path objects.
    file_path = pathlib.Path(file_path)

    # Generate output filename with original filename and timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"

    match_count = 0
    total_lines = 0  # lines read so far; less than the file length on early stop

    # The input is opened first so that a missing input file does not leave an
    # empty output file behind.
    with open(file_path, 'r', encoding='utf-8') as infile:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                # Checked before processing so that a limit of 0 (or less)
                # writes nothing at all.
                if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
                    break

                total_lines += 1

                # Blank lines (e.g. a trailing newline) are not malformed
                # records; skip them without a warning.
                if not line.strip():
                    continue

                # Only the decode step can raise JSONDecodeError, so only it
                # is guarded.
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Warning: Line {total_lines} is not valid JSON.")
                    continue

                if matches_filters(entry):
                    outfile.write(json.dumps(entry, ensure_ascii=False))
                    outfile.write('\n')
                    match_count += 1

    print(f"Filtered {match_count} entries from {total_lines} total lines")
    print(f"Output written to: {output_file}")

    # Print active filters. "Active" means "is not None", the same test that
    # matches_filters applies, so an empty-string filter is reported too.
    print("\nActive filters:")
    filter_settings = (
        ("POS", FILTER_POS),
        ("Word (exact)", FILTER_WORD),
        ("Prefix", FILTER_PREFIX),
        ("Suffix", FILTER_SUFFIX),
    )
    for label, value in filter_settings:
        if value is not None:
            print(f"  - {label}: {value}")

    return output_file


# Run the search on the configured input file only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    filter_and_save(INPUT_FILE)