Files
Wictionary-Data-Parser/scripts/search_word.py
2026-02-13 00:10:40 +01:00

111 lines
3.3 KiB
Python

import json
import pathlib
from datetime import datetime
# Name of the JSONL input file to search; change this to match your file.
INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl"

# --- Paths, resolved relative to this script's location ---
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR.joinpath("raw_data", INPUT_FILE_NAME)

# --- Filter settings (every filter is disabled by setting it to None) ---

# Part of speech an entry must have, e.g. "noun", "verb", "adj", "adv".
FILTER_POS = "noun"

# Exact word an entry must have.
FILTER_WORD = "grenouille"

# Required beginning of the word (e.g. "Septem" matches "September").
FILTER_PREFIX = None

# Required ending of the word (e.g. "ber" matches "September").
FILTER_SUFFIX = None

# Upper bound on the number of entries written; None means unlimited.
MAX_RESULTS = 5
def matches_filters(entry):
    """Check whether a decoded JSONL entry passes every active filter.

    The filters are the module-level settings ``FILTER_POS``, ``FILTER_WORD``,
    ``FILTER_PREFIX`` and ``FILTER_SUFFIX``; a setting of ``None`` disables
    that filter.

    Args:
        entry: One decoded JSON value from the input file, normally a dict
            carrying ``"pos"`` and ``"word"`` keys.

    Returns:
        True if the entry satisfies all active filters, otherwise False.
        A value without a ``get`` method (a JSON list, string, number or
        null) only passes when no filter is active at all, and an entry
        whose ``"word"`` is not a string never passes a prefix/suffix filter.
    """
    lookup = getattr(entry, "get", None)
    if lookup is None:
        # A valid JSON line is not necessarily an object. Such a value has no
        # fields to inspect, so it can only pass when nothing is filtered.
        return all(
            setting is None
            for setting in (FILTER_POS, FILTER_WORD, FILTER_PREFIX, FILTER_SUFFIX)
        )

    if FILTER_POS is not None and lookup("pos") != FILTER_POS:
        return False

    if FILTER_WORD is not None and lookup("word") != FILTER_WORD:
        return False

    if FILTER_PREFIX is not None or FILTER_SUFFIX is not None:
        word = lookup("word", "")
        # "word" may be null or another non-string value; that cannot match
        # a prefix or suffix and must not raise.
        if not isinstance(word, str):
            return False
        if FILTER_PREFIX is not None and not word.startswith(FILTER_PREFIX):
            return False
        if FILTER_SUFFIX is not None and not word.endswith(FILTER_SUFFIX):
            return False

    return True
def filter_and_save(file_path):
    """Copy the entries of a JSONL file that pass ``matches_filters`` to a new file.

    The output is created in the same directory as the input and is named
    ``<input stem>_filtered_<YYYYmmdd_HHMMSS>.jsonl``. Reading stops once
    ``MAX_RESULTS`` entries have been written (``None`` means no limit).
    Afterwards a summary and the list of active filters are printed.

    Args:
        file_path: Location of the JSONL input file, either a
            ``pathlib.Path`` or a plain string.

    Raises:
        OSError: If the input cannot be opened for reading or the output
            cannot be opened for writing.
    """
    # Accept plain strings as well as Path objects.
    file_path = pathlib.Path(file_path)

    # Output name is built from the input's stem plus a timestamp.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"

    match_count = 0
    total_lines = 0  # lines read so far; not the whole file if we stop early

    with open(file_path, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Checked before the line is counted, so a limit of 0 writes
            # nothing and larger limits stop right after the last match.
            if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
                break

            total_lines += 1

            # Blank lines carry no entry; skip them without a warning.
            if not line.strip():
                continue

            # Only the decoding step can raise JSONDecodeError.
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Line {total_lines} is not valid JSON.")
                continue

            if matches_filters(entry):
                outfile.write(json.dumps(entry, ensure_ascii=False))
                outfile.write('\n')
                match_count += 1

    print(f"Filtered {match_count} entries from {total_lines} total lines")
    print(f"Output written to: {output_file}")

    # Report the filters using the same "is not None" rule that
    # matches_filters applies, so the printout reflects what was enforced.
    print("\nActive filters:")
    if FILTER_POS is not None:
        print(f" - POS: {FILTER_POS}")
    if FILTER_WORD is not None:
        print(f" - Word (exact): {FILTER_WORD}")
    if FILTER_PREFIX is not None:
        print(f" - Prefix: {FILTER_PREFIX}")
    if FILTER_SUFFIX is not None:
        print(f" - Suffix: {FILTER_SUFFIX}")
# Script entry point: filter the configured input file when this module is
# executed directly; nothing runs when it is merely imported.
if __name__ == "__main__":
    filter_and_save(INPUT_FILE)