Migrate to Gitea
scripts/search_word.py · new file · 110 lines added
@@ -0,0 +1,110 @@
import json
import pathlib
from datetime import datetime


INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl"  # <-- Update this to your file

# --- Dynamic Path Setup ---
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME


# --- Filter Configuration ---
# Set the POS (part of speech) you want to filter for
# Examples: "noun", "verb", "adj", "adv", etc.
# Set to None to skip POS filtering
FILTER_POS = "noun"

# Set the word you want to filter for
# Set to None to skip word filtering
FILTER_WORD = "grenouille"

# Set word prefix to filter for (e.g., "Septem" will match "September")
# Set to None to skip prefix filtering
FILTER_PREFIX = None

# Set word suffix to filter for (e.g., "ber" will match "September")
# Set to None to skip suffix filtering
FILTER_SUFFIX = None

# Maximum number of results to include (set to None for unlimited)
MAX_RESULTS = 5


def matches_filters(entry):
    """Check if an entry matches all active filters."""

    # Filter by POS
    if FILTER_POS is not None:
        if entry.get("pos") != FILTER_POS:
            return False

    # Filter by exact word
    if FILTER_WORD is not None:
        if entry.get("word") != FILTER_WORD:
            return False

    # Filter by prefix
    if FILTER_PREFIX is not None:
        word = entry.get("word", "")
        if not word.startswith(FILTER_PREFIX):
            return False

    # Filter by suffix
    if FILTER_SUFFIX is not None:
        word = entry.get("word", "")
        if not word.endswith(FILTER_SUFFIX):
            return False

    return True


def filter_and_save(file_path):
    """Filter JSONL file and save matching entries."""

    # Generate output filename with original filename and timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"

    match_count = 0
    total_lines = 0

    with open(file_path, 'r', encoding='utf-8') as infile:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                total_lines += 1

                try:
                    entry = json.loads(line)

                    # Check if entry matches filters
                    if matches_filters(entry):
                        outfile.write(json.dumps(entry, ensure_ascii=False))
                        outfile.write('\n')
                        match_count += 1

                        # Stop if we've reached max results
                        if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
                            break

                except json.JSONDecodeError:
                    print(f"Warning: Line {total_lines} is not valid JSON.")

    print(f"Filtered {match_count} entries from {total_lines} total lines")
    print(f"Output written to: {output_file}")

    # Print active filters
    print("\nActive filters:")
    if FILTER_POS:
        print(f" - POS: {FILTER_POS}")
    if FILTER_WORD:
        print(f" - Word (exact): {FILTER_WORD}")
    if FILTER_PREFIX:
        print(f" - Prefix: {FILTER_PREFIX}")
    if FILTER_SUFFIX:
        print(f" - Suffix: {FILTER_SUFFIX}")


if __name__ == "__main__":
    filter_and_save(INPUT_FILE)
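For context, each line of the wiktextract JSONL dump is a standalone JSON object, and the filters above only read its "word" and "pos" keys. A minimal sketch of how one line is matched, assuming matches_filters from the script above is in scope (the sample entry is hypothetical and trimmed; real dump lines carry many more keys):

import json

# Hypothetical, trimmed wiktextract-style line.
sample_line = '{"word": "grenouille", "pos": "noun"}'

entry = json.loads(sample_line)
# With FILTER_POS = "noun" and FILTER_WORD = "grenouille", this entry passes.
print(matches_filters(entry))  # -> True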