"""Collect a few sample entries from each raw wiktextract JSONL file.

Every file in RAW_DATA_DIR matching FILE_PATTERN is scanned line by line.
The first SAMPLES_PER_FILE entries that pass the configured filters are
written to a single combined JSONL file in SAMPLES_DIR.
"""
import json
import logging
import os
import pathlib
import sys

# ==============================================================================
# --- CONFIGURATION ---
# ==============================================================================

# --- Paths ---
# Try to determine the project root relative to this script's location.
try:
    SCRIPT_DIR = pathlib.Path(__file__).parent
    ROOT_DIR = SCRIPT_DIR.parent
except NameError:
    # __file__ is undefined (e.g. interactive session): fall back to the CWD.
    SCRIPT_DIR = pathlib.Path.cwd()
    ROOT_DIR = SCRIPT_DIR.parent

# Input directory containing the source wiktextract JSONL files.
RAW_DATA_DIR = ROOT_DIR / "raw_data"

# The glob pattern used to match source files.
FILE_PATTERN = "*raw-wiktextract-data.jsonl"

# Output directory for the collected samples.
SAMPLES_DIR = ROOT_DIR / "samples"

# Final output filename.
OUTPUT_FILENAME = "combined_samples.jsonl"

# --- Sampling Options ---
# How many matching entries to take from EACH source file.
SAMPLES_PER_FILE = 2

# Filter by language code.
# Use None or an empty set() to include all languages.
# Accepts a single code ("en") or a collection of codes ({"en", "de"}).
LANG_FILTER = set()

# Filter by part of speech.
# Use None or an empty set() to include ALL parts of speech.
# Accepts a single value ("verb") or a collection ({"noun", "verb", "adj"}).
POS_FILTER = {"verb"}

# Only include entries whose lang_code equals the first two characters of the
# source file's name.
OWN_LANG_FILTER = True

# ==============================================================================
# --- END OF CONFIGURATION ---
# ==============================================================================

# Set up simple logging to the console.
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)


def _normalize_filter(value):
    """Return a filter setting as a frozenset; an empty result means "no filter".

    A bare string is treated as a single allowed value rather than as an
    iterable of characters, so both "en" and {"en"} select the same entries.
    """
    if not value:
        return frozenset()
    if isinstance(value, str):
        return frozenset({value})
    return frozenset(value)


def _entry_passes_filters(entry, lang_codes, pos_values, own_lang):
    """Return True when *entry* satisfies every active filter.

    Args:
        entry: Parsed JSON object (dict) for one source line.
        lang_codes: Allowed 'lang_code' values; empty disables the filter.
        pos_values: Allowed 'pos' values; empty disables the filter.
        own_lang: Required 'lang_code' derived from the file name, or None
            when the own-language filter is disabled.
    """
    lang_code = entry.get('lang_code')
    if lang_codes and lang_code not in lang_codes:
        return False
    if pos_values and entry.get('pos') not in pos_values:
        return False
    if own_lang is not None and lang_code != own_lang:
        return False
    return True


def collect_samples():
    """Sample matching entries from every source file into one JSONL file.

    Reads the module-level configuration at call time. Exits the process with
    status 1 if RAW_DATA_DIR does not exist or the output file cannot be
    written, and with status 0 if no source file matches FILE_PATTERN. A
    failure while reading one source file is logged and the next file is
    processed.
    """
    # 1. Set up paths and directories.
    input_dir = pathlib.Path(RAW_DATA_DIR)
    output_dir = pathlib.Path(SAMPLES_DIR)
    output_file = output_dir / OUTPUT_FILENAME

    if not input_dir.exists():
        logger.error(f"ERROR: Raw data directory not found at: {input_dir}")
        logger.error("Please ensure your configuration points to the correct folder.")
        sys.exit(1)

    # Create the samples directory if it doesn't exist.
    output_dir.mkdir(parents=True, exist_ok=True)

    # 2. Find all matching input files. glob() yields them in an arbitrary,
    # filesystem-dependent order; sorting keeps the output reproducible.
    source_files = sorted(input_dir.glob(FILE_PATTERN))

    if not source_files:
        logger.warning(f"No files matching '{FILE_PATTERN}' found in {input_dir}")
        sys.exit(0)

    lang_codes = _normalize_filter(LANG_FILTER)
    pos_values = _normalize_filter(POS_FILTER)

    logger.info(f"Found {len(source_files)} source files to sample from.")
    logger.info(f"Target: {SAMPLES_PER_FILE} samples per file.")
    logger.info(f"Language Filter: {LANG_FILTER if LANG_FILTER else 'ALL'}")
    logger.info(f"POS Filter: {POS_FILTER if POS_FILTER else 'ALL'}")
    logger.info(f"Own Language Filter: {'ENABLED' if OWN_LANG_FILTER else 'DISABLED'}")
    logger.info("-" * 50)

    total_collected = 0

    # Open the output file once and write the samples from all inputs to it.
    try:
        with open(output_file, 'w', encoding='utf-8') as out_f:
            for src_file in source_files:
                logger.info(f"Scanning: {src_file.name}...")

                # NOTE(review): assumes file names start with a two-letter
                # language code - confirm against the actual data files.
                lang_from_file = src_file.name[:2]
                own_lang = lang_from_file if OWN_LANG_FILTER else None

                file_collected = 0
                lines_read = 0

                try:
                    with open(src_file, 'r', encoding='utf-8') as in_f:
                        for line in in_f:
                            # Stop before counting another line once the
                            # quota for this file is reached.
                            if file_collected >= SAMPLES_PER_FILE:
                                break
                            lines_read += 1

                            if not line.strip():
                                continue

                            try:
                                entry = json.loads(line)
                            except json.JSONDecodeError:
                                # Ignore bad lines in source files during sampling.
                                continue

                            # A valid JSON value that is not an object has no
                            # fields to filter on; skip it like a bad line
                            # instead of abandoning the rest of the file.
                            if not isinstance(entry, dict):
                                continue

                            if not _entry_passes_filters(
                                entry, lang_codes, pos_values, own_lang
                            ):
                                continue

                            # The entry is re-serialised from the parsed
                            # object, one JSON document per line.
                            json.dump(entry, out_f, ensure_ascii=False)
                            out_f.write('\n')

                            file_collected += 1
                            total_collected += 1

                    logger.info(f"  -> Collected {file_collected} samples (scanned {lines_read} lines)")

                except Exception as e:
                    # Per-file boundary: report the problem and continue.
                    logger.error(f"  ERROR reading {src_file.name}: {e}")

    except Exception as e:
        logger.critical(f"FATAL ERROR writing output file: {e}")
        sys.exit(1)

    logger.info("-" * 50)
    logger.info("SAMPLING COMPLETE")
    logger.info(f"Total entries collected: {total_collected}")
    logger.info(f"Output saved to: {output_file}")


if __name__ == "__main__":
    collect_samples()