# File listing metadata (not part of the program): 111 lines, 3.3 KiB, Python.
import json
|
|
import pathlib
|
|
from datetime import datetime
|
|
|
|
|
|
INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl"  # <-- Update this to your file

# --- Dynamic Path Setup ---
# Resolve the script location first: when __file__ is a relative path
# (e.g. the main script on Python < 3.9), Path("script.py").parent.parent
# collapses to "." and ROOT_DIR would silently point at the wrong directory.
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME

# --- Filter Configuration ---
# Every filter below is optional; set it to None to disable it.
# An entry must pass all enabled filters to be kept.

# Part of speech to keep, compared against the entry's "pos" field.
# Examples: "noun", "verb", "adj", "adv", etc.
FILTER_POS = "noun"

# Exact word to keep, compared against the entry's "word" field.
FILTER_WORD = "grenouille"

# Word prefix to keep (e.g., "Septem" will match "September").
FILTER_PREFIX = None

# Word suffix to keep (e.g., "ber" will match "September").
FILTER_SUFFIX = None

# Maximum number of results to include (set to None for unlimited).
MAX_RESULTS = 5
|
|
|
|
|
|
def matches_filters(entry):
    """Check if an entry matches all active filters.

    A filter is active when its module-level setting (``FILTER_POS``,
    ``FILTER_WORD``, ``FILTER_PREFIX``, ``FILTER_SUFFIX``) is not ``None``.

    Args:
        entry: One parsed JSON value from the input file; expected to be a
            dict with optional "pos" and "word" keys.

    Returns:
        True if ``entry`` is a dict that passes every active filter,
        False otherwise. Values that are not dicts are never accepted.
    """
    # A JSONL line can hold any valid JSON value (list, string, null, ...).
    # Those have no "pos"/"word" fields, so reject them up front instead of
    # failing with AttributeError on .get().
    if not isinstance(entry, dict):
        return False

    # Filter by POS
    if FILTER_POS is not None and entry.get("pos") != FILTER_POS:
        return False

    # Filter by exact word
    if FILTER_WORD is not None and entry.get("word") != FILTER_WORD:
        return False

    # Nothing left to check unless a prefix or suffix filter is active.
    if FILTER_PREFIX is None and FILTER_SUFFIX is None:
        return True

    # A missing word is treated as the empty string; a non-string word
    # (e.g. null) cannot satisfy a prefix/suffix filter.
    word = entry.get("word", "")
    if not isinstance(word, str):
        return False

    # Filter by prefix
    if FILTER_PREFIX is not None and not word.startswith(FILTER_PREFIX):
        return False

    # Filter by suffix
    if FILTER_SUFFIX is not None and not word.endswith(FILTER_SUFFIX):
        return False

    return True
|
|
|
|
|
|
def filter_and_save(file_path):
    """Filter a JSONL file and save the matching entries.

    Reads ``file_path`` line by line, parses each line as JSON and writes
    every entry accepted by ``matches_filters`` to a new file named
    ``<input stem>_filtered_<YYYYmmdd_HHMMSS>.jsonl`` in the same directory
    as the input. Reading stops once ``MAX_RESULTS`` entries have been
    written (no limit when ``MAX_RESULTS`` is ``None``). A summary and the
    list of active filters are printed when done.

    Args:
        file_path: Path of the input JSONL file (``str`` or
            ``pathlib.Path``).
    """
    # Accept plain strings as well as Path objects.
    file_path = pathlib.Path(file_path)

    # Generate output filename with original filename and timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"

    match_count = 0
    total_lines = 0

    with open(file_path, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Check the limit before doing any work so that a limit of 0
            # writes nothing (checking only after a write let one through).
            if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
                break

            total_lines += 1

            # Blank lines (e.g. a trailing newline) are not entries; skip
            # them quietly instead of reporting them as invalid JSON.
            if not line.strip():
                continue

            # Keep the try body minimal: only parsing can raise here.
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Line {total_lines} is not valid JSON.")
                continue

            # Check if entry matches filters
            if matches_filters(entry):
                outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')
                match_count += 1

    # total_lines counts the lines read so far, which is fewer than the
    # whole file when the result limit stopped the scan early.
    print(f"Filtered {match_count} entries from {total_lines} total lines")
    print(f"Output written to: {output_file}")

    # Print active filters. Use the same "is not None" test as
    # matches_filters so every filter that was applied is also listed.
    print("\nActive filters:")
    if FILTER_POS is not None:
        print(f" - POS: {FILTER_POS}")
    if FILTER_WORD is not None:
        print(f" - Word (exact): {FILTER_WORD}")
    if FILTER_PREFIX is not None:
        print(f" - Prefix: {FILTER_PREFIX}")
    if FILTER_SUFFIX is not None:
        print(f" - Suffix: {FILTER_SUFFIX}")
|
|
|
|
|
|
if __name__ == "__main__":
    # INPUT_FILE comes from the configuration at the top of the file, which
    # the user is expected to edit; report a missing file in one readable
    # line rather than a traceback from open().
    if not INPUT_FILE.is_file():
        raise SystemExit(f"Input file not found: {INPUT_FILE}")
    filter_and_save(INPUT_FILE)
|