import json
import pathlib
from datetime import datetime

INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl"  # <-- Update this to your file

# --- Dynamic Path Setup ---
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME

# --- Filter Configuration ---
# Set the POS (part of speech) you want to filter for
# Examples: "noun", "verb", "adj", "adv", etc.
# Set to None to skip POS filtering
FILTER_POS = "noun"

# Set the word you want to filter for
# Set to None to skip word filtering
FILTER_WORD = "grenouille"

# Set word prefix to filter for (e.g., "Septem" will match "September")
# Set to None to skip prefix filtering
FILTER_PREFIX = None

# Set word suffix to filter for (e.g., "ber" will match "September")
# Set to None to skip suffix filtering
FILTER_SUFFIX = None

# Maximum number of results to include (set to None for unlimited)
MAX_RESULTS = 5


def matches_filters(entry):
    """Check if an entry matches all active filters.

    A filter is active when its module-level setting (``FILTER_POS``,
    ``FILTER_WORD``, ``FILTER_PREFIX``, ``FILTER_SUFFIX``) is not ``None``.

    Args:
        entry: One decoded JSON value from the input file.

    Returns:
        True if ``entry`` is a dict that satisfies every active filter,
        otherwise False. Values that are not dicts never match.
    """
    # A JSONL line can hold any JSON value (list, string, null, ...); only
    # objects have the "pos"/"word" keys the filters look at.
    if not isinstance(entry, dict):
        return False

    # Filter by POS
    if FILTER_POS is not None and entry.get("pos") != FILTER_POS:
        return False

    # Filter by exact word
    if FILTER_WORD is not None and entry.get("word") != FILTER_WORD:
        return False

    # Filter by prefix / suffix
    if FILTER_PREFIX is not None or FILTER_SUFFIX is not None:
        word = entry.get("word", "")
        # A present-but-non-string "word" (e.g. null) cannot match a
        # prefix or suffix; reject it instead of raising AttributeError.
        if not isinstance(word, str):
            return False
        if FILTER_PREFIX is not None and not word.startswith(FILTER_PREFIX):
            return False
        if FILTER_SUFFIX is not None and not word.endswith(FILTER_SUFFIX):
            return False

    return True


def filter_and_save(file_path):
    """Filter a JSONL file and save the matching entries.

    Reads ``file_path`` line by line, keeps the entries accepted by
    ``matches_filters`` (at most ``MAX_RESULTS`` of them when that setting
    is not ``None``) and writes them, one JSON document per line, to
    ``<stem>_filtered_<YYYYmmdd_HHMMSS>.jsonl`` next to the input file.
    A summary and the list of active filters are printed afterwards.

    Blank lines are skipped silently; lines that are not valid JSON are
    reported with a warning and skipped.

    Args:
        file_path: Path of the input JSONL file (``pathlib.Path`` or ``str``).

    Returns:
        The ``pathlib.Path`` of the output file that was written.
    """
    file_path = pathlib.Path(file_path)

    # Generate output filename with original filename and timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"

    match_count = 0
    total_lines = 0
    stopped_early = False

    with open(file_path, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            # Check the limit before processing so MAX_RESULTS = 0 writes
            # nothing and unprocessed lines are not counted.
            if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
                stopped_early = True
                break

            total_lines += 1

            # Blank lines (e.g. a trailing newline) are not errors.
            if not line.strip():
                continue

            # Keep the try body minimal: only decoding can raise here.
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Line {total_lines} is not valid JSON.")
                continue

            if not matches_filters(entry):
                continue

            outfile.write(json.dumps(entry, ensure_ascii=False))
            outfile.write("\n")
            match_count += 1

    print(f"Filtered {match_count} entries from {total_lines} total lines")
    if stopped_early:
        # total_lines only counts lines scanned before the limit was hit.
        print(f"Stopped early after reaching MAX_RESULTS={MAX_RESULTS}")
    print(f"Output written to: {output_file}")

    # Print active filters (same "is not None" rule as matches_filters)
    print("\nActive filters:")
    if FILTER_POS is not None:
        print(f" - POS: {FILTER_POS}")
    if FILTER_WORD is not None:
        print(f" - Word (exact): {FILTER_WORD}")
    if FILTER_PREFIX is not None:
        print(f" - Prefix: {FILTER_PREFIX}")
    if FILTER_SUFFIX is not None:
        print(f" - Suffix: {FILTER_SUFFIX}")

    return output_file


if __name__ == "__main__":
    filter_and_save(INPUT_FILE)