""" VocabListGenerator — Single-run script ---------------------------------------- Reads configuration from 'conf', calls the LLM to generate vocabulary word pairs, writes a Polly-compatible JSON import file into the configured output folder, and updates the manifest. Usage: python generate.py """ import os import sys import xml.etree.ElementTree as ET import yaml from typing import Dict, Any, Optional from config import Config from llm_client import LLMClient from models import VocabRequest from first import generate_vocabulary_import from manifest_manager import update_manifest, print_manifest, prune_missing_files # --------------------------------------------------------------------------- # Language map helper # --------------------------------------------------------------------------- def load_language_map(xml_path: str = "languages.xml") -> Dict[int, str]: """ Parse languages.xml and return a mapping of { language_id: language_name }. E.g. {1: 'English', 2: 'Mandarin', 15: 'German', ...} """ language_map: Dict[int, str] = {} try: tree = ET.parse(xml_path) root = tree.getroot() for elem in root.iter("string"): name_attr = elem.get("name", "") if name_attr.startswith("language_"): try: lang_id = int(name_attr.split("_", 1)[1]) if elem.text: language_map[lang_id] = elem.text.strip() except (ValueError, AttributeError): pass except FileNotFoundError: print(f"ERROR: '{xml_path}' not found.") sys.exit(1) except ET.ParseError as e: print(f"ERROR: Could not parse '{xml_path}': {e}") sys.exit(1) return language_map def load_language_code_map(xml_path: str = "languages.xml") -> Dict[int, str]: """ Parse languages.xml and return a mapping of { language_id: iso_code }. E.g. {1: 'en', 7: 'pt', 15: 'de', ...} Parsed from the string-array items like "de,DE,15". """ code_map: Dict[int, str] = {} try: tree = ET.parse(xml_path) root = tree.getroot() for array in root.iter("string-array"): if array.get("name") == "language_codes": for item in array.iter("item"): if item.text: parts = item.text.strip().split(",") if len(parts) >= 3: try: lang_id = int(parts[2]) code_map[lang_id] = parts[0].lower() except ValueError: pass except (FileNotFoundError, ET.ParseError): pass return code_map def load_language_instructions(yaml_path: str = "language_instructions.yaml") -> Dict[int, Dict[str, str]]: """ Load language-specific instructions from YAML file. Returns a mapping of { language_id: { key: instruction } }. """ instructions: Dict[int, Dict[str, str]] = {} try: with open(yaml_path, "r", encoding="utf-8") as f: data = yaml.safe_load(f) if data: for lang_id_str, lang_data in data.items(): # Skip comments/strings if isinstance(lang_id_str, str) and lang_id_str.startswith("#"): continue try: lang_id = int(lang_id_str) if isinstance(lang_data, dict): instructions[lang_id] = lang_data except (ValueError, TypeError): pass except FileNotFoundError: print(f"WARNING: '{yaml_path}' not found. Using default instructions.") except yaml.YAMLError as e: print(f"WARNING: Could not parse '{yaml_path}': {e}") return instructions def get_language_instruction_text(lang_id: int, language_instructions: Dict[int, Dict[str, str]]) -> str: """ Get the instruction text for a specific language. Returns a formatted string with transcription/variant instructions. 
""" if lang_id not in language_instructions: return "" lang_data = language_instructions[lang_id] parts = [] # Add transcription instruction if present if "transcription" in lang_data: parts.append(lang_data["transcription"]) # Add variant instruction if present if "variant" in lang_data: parts.append(f"Use {lang_data['variant']}.") # Add special instruction if present if "special" in lang_data: parts.append(lang_data["special"]) return " ".join(parts) def merge_instructions( base_instructions: str, lang_first_id: int, lang_second_id: int, language_instructions: Dict[int, Dict[str, str]] ) -> str: """ Merge base instructions with language-specific instructions. Language-specific instructions are appended to the base instructions. """ lang1_instr = get_language_instruction_text(lang_first_id, language_instructions) lang2_instr = get_language_instruction_text(lang_second_id, language_instructions) # Collect all instruction parts all_parts = [] if base_instructions: all_parts.append(base_instructions) if lang1_instr: all_parts.append(lang1_instr) if lang2_instr: all_parts.append(lang2_instr) return " ".join(all_parts) # --------------------------------------------------------------------------- # Core generation function (reused by batch_generate.py) # --------------------------------------------------------------------------- def run_generation( llm: LLMClient, language_map: Dict[int, str], lang_first_id: int, lang_second_id: int, amount: int, category: str, name: str, description: str, instructions: str, output_file_path: str, # absolute path including filename manifest_path: str, # absolute path to manifest JSON emoji: str = "", level: str = "A2", language_instructions: Optional[Dict[int, Dict[str, str]]] = None, ) -> bool: """ Generate one vocabulary list and update the manifest. Returns True on success, False on failure. """ lang_first_name = language_map.get(lang_first_id) lang_second_name = language_map.get(lang_second_id) if not lang_first_name: print(f" ERROR: Language ID {lang_first_id} not found in languages.xml") return False if not lang_second_name: print(f" ERROR: Language ID {lang_second_id} not found in languages.xml") return False # Merge base instructions with language-specific instructions final_instructions = instructions if language_instructions: final_instructions = merge_instructions( instructions, lang_first_id, lang_second_id, language_instructions ) print(f" Languages : {lang_first_name} (ID {lang_first_id}) → {lang_second_name} (ID {lang_second_id})") print(f" Amount : {amount} word pairs") if final_instructions: preview = final_instructions if len(final_instructions) <= 90 else final_instructions[:87] + "..." print(f" Instructions: {preview}") print() request = VocabRequest( amount=amount, lang_first_id=lang_first_id, lang_second_id=lang_second_id, lang_first_name=lang_first_name, lang_second_name=lang_second_name, category=category, instructions=final_instructions, level=level, ) print(" Generating vocabulary via LLM …") word_pairs = llm.generate_vocabulary(request) if not word_pairs: print(" ERROR: No vocabulary pairs were generated.") return False print(f" Generated {len(word_pairs)} word pairs.") preview_count = min(3, len(word_pairs)) for i, (w1, w2) in enumerate(word_pairs[:preview_count], 1): print(f" {i}. 
{w1} → {w2}") if len(word_pairs) > preview_count: print(f" … and {len(word_pairs) - preview_count} more") print() # Ensure output directory exists os.makedirs(os.path.dirname(output_file_path), exist_ok=True) # Write Polly import file (pass absolute path directly) generate_vocabulary_import( category_name=category, lang_first_id=lang_first_id, lang_second_id=lang_second_id, word_pairs=word_pairs, output_filename=output_file_path, ) # Update manifest update_manifest( manifest_path=manifest_path, vocab_file_path=output_file_path, lang_first_id=lang_first_id, lang_second_id=lang_second_id, category=category, item_count=len(word_pairs), name=name, description=description, emoji=emoji, level=level, ) return True # --------------------------------------------------------------------------- # Main (single-run entry point) # --------------------------------------------------------------------------- def main() -> None: print("=" * 50) print(" VocabListGenerator") print("=" * 50) print() config = Config() vocab_cfg = config.vocab_config manifest_cfg = config.manifest_config language_map = load_language_map() # Resolve paths script_dir = os.path.dirname(os.path.abspath(__file__)) output_dir = os.path.join(script_dir, manifest_cfg.get("output_dir", "output")) manifest_path = os.path.join(output_dir, manifest_cfg.get("filename", "vocab_manifest.json")) output_filename = vocab_cfg.get("output_filename", "vocab_output.json") vocab_file_path = os.path.join(output_dir, output_filename) os.makedirs(output_dir, exist_ok=True) # Prune stale manifest entries before generating if os.path.isfile(manifest_path): prune_missing_files(manifest_path, output_dir) # Read parameters lang_ids = vocab_cfg["languages"] category = vocab_cfg["category"] name = vocab_cfg.get("name", "").strip() description = vocab_cfg.get("description", "").strip() instructions = vocab_cfg.get("instructions", "").strip() emoji = vocab_cfg.get("emoji", "").strip() level = vocab_cfg.get("level", "A2").strip().upper() amount = vocab_cfg["amount"] print(f" Category : {category}") print(f" Level : {level}") print(f" Output dir : {output_dir}") print(f" Manifest : {manifest_path}") llm = LLMClient(config) success = run_generation( llm=llm, language_map=language_map, lang_first_id=lang_ids[0], lang_second_id=lang_ids[1], amount=amount, category=category, name=name, description=description, instructions=instructions, output_file_path=vocab_file_path, manifest_path=manifest_path, emoji=emoji, level=level, ) if not success: sys.exit(1) print_manifest(manifest_path) if __name__ == "__main__": main()
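
# ---------------------------------------------------------------------------
# Example: reusing run_generation from a batch driver
# ---------------------------------------------------------------------------
# A minimal sketch of how a batch script (such as batch_generate.py, which
# this module says reuses run_generation) might call it in a loop. The job
# list, paths, and amounts below are illustrative assumptions, not the actual
# batch_generate.py implementation:
#
#   llm = LLMClient(Config())
#   language_map = load_language_map()
#   language_instructions = load_language_instructions()
#   for (first_id, second_id), category in [((1, 15), "Food"), ((1, 7), "Travel")]:
#       run_generation(
#           llm=llm,
#           language_map=language_map,
#           lang_first_id=first_id,
#           lang_second_id=second_id,
#           amount=30,
#           category=category,
#           name=f"{category} basics",
#           description="",
#           instructions="",
#           output_file_path=os.path.join("output", f"{category.lower()}.json"),
#           manifest_path=os.path.join("output", "vocab_manifest.json"),
#           language_instructions=language_instructions,
#       )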