"""
VocabListGenerator — Single-run script
----------------------------------------

Reads configuration from 'conf', calls the LLM to generate vocabulary word pairs,
writes a Polly-compatible JSON import file into the configured output folder,
and updates the manifest.

Usage:
    python generate.py
"""
|
|
|
|
import os
|
|
import sys
|
|
import xml.etree.ElementTree as ET
|
|
import yaml
|
|
from typing import Dict, Any, Optional
|
|
|
|
from config import Config
|
|
from llm_client import LLMClient
|
|
from models import VocabRequest
|
|
from first import generate_vocabulary_import
|
|
from manifest_manager import update_manifest, print_manifest, prune_missing_files
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Language map helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def load_language_map(xml_path: str = "languages.xml") -> Dict[int, str]:
    """
    Build a { language_id: language_name } lookup from languages.xml.

    Entries come from <string name="language_N">Name</string> elements,
    e.g. {1: 'English', 2: 'Mandarin', 15: 'German', ...}.  Elements whose
    suffix is not an integer or whose text is empty are skipped.  Exits the
    process with status 1 if the file is missing or is not well-formed XML.
    """
    names: Dict[int, str] = {}
    try:
        doc_root = ET.parse(xml_path).getroot()
    except FileNotFoundError:
        print(f"ERROR: '{xml_path}' not found.")
        sys.exit(1)
    except ET.ParseError as e:
        print(f"ERROR: Could not parse '{xml_path}': {e}")
        sys.exit(1)
    for node in doc_root.iter("string"):
        attr = node.get("name", "")
        if not attr.startswith("language_"):
            continue
        try:
            key = int(attr.split("_", 1)[1])
        except (ValueError, AttributeError):
            continue  # e.g. "language_codes" or other non-numeric suffixes
        if node.text:
            names[key] = node.text.strip()
    return names
|
|
|
|
|
|
def load_language_code_map(xml_path: str = "languages.xml") -> Dict[int, str]:
    """
    Build a { language_id: iso_code } lookup from languages.xml.

    Reads the "language_codes" <string-array>, whose <item> entries look
    like "de,DE,15" (code, region, id), e.g. {1: 'en', 7: 'pt', 15: 'de'}.
    Malformed items are skipped; a missing or unparseable file yields an
    empty mapping (best-effort, unlike load_language_map).
    """
    codes: Dict[int, str] = {}
    try:
        root = ET.parse(xml_path).getroot()
    except (FileNotFoundError, ET.ParseError):
        return codes
    code_arrays = (
        arr for arr in root.iter("string-array")
        if arr.get("name") == "language_codes"
    )
    for arr in code_arrays:
        for entry in arr.iter("item"):
            if not entry.text:
                continue
            fields = entry.text.strip().split(",")
            if len(fields) < 3:
                continue
            try:
                codes[int(fields[2])] = fields[0].lower()
            except ValueError:
                pass  # non-numeric ID field; skip the item
    return codes
|
|
|
|
|
|
def load_language_instructions(yaml_path: str = "language_instructions.yaml") -> Dict[int, Dict[str, str]]:
    """
    Load language-specific prompt instructions from a YAML file.

    Returns { language_id: { key: instruction } }.  Keys that are not
    integers and values that are not dicts are silently skipped.  A
    missing or unparseable file only prints a warning and yields an
    empty mapping, so callers fall back to default instructions.
    """
    result: Dict[int, Dict[str, str]] = {}
    try:
        with open(yaml_path, "r", encoding="utf-8") as fh:
            loaded = yaml.safe_load(fh)
    except FileNotFoundError:
        print(f"WARNING: '{yaml_path}' not found. Using default instructions.")
        return result
    except yaml.YAMLError as e:
        print(f"WARNING: Could not parse '{yaml_path}': {e}")
        return result
    for raw_key, value in (loaded or {}).items():
        # Defensive: ignore stray string keys that look like comments.
        if isinstance(raw_key, str) and raw_key.startswith("#"):
            continue
        try:
            key = int(raw_key)
        except (ValueError, TypeError):
            continue
        if isinstance(value, dict):
            result[key] = value
    return result
|
|
|
|
|
|
def get_language_instruction_text(lang_id: int, language_instructions: Dict[int, Dict[str, str]]) -> str:
    """
    Assemble the instruction sentence(s) for a single language.

    Concatenates, in order: the 'transcription' text, a "Use <variant>."
    sentence, and the 'special' text — whichever keys exist — separated
    by single spaces.  Returns "" when the language has no entry.
    """
    if lang_id not in language_instructions:
        return ""

    entry = language_instructions[lang_id]
    collected = []
    # Each slot is (key, template); only keys present in the entry render.
    for key, template in (
        ("transcription", "{}"),
        ("variant", "Use {}."),
        ("special", "{}"),
    ):
        if key in entry:
            collected.append(template.format(entry[key]))
    return " ".join(collected)
|
|
|
|
|
|
def merge_instructions(
    base_instructions: str,
    lang_first_id: int,
    lang_second_id: int,
    language_instructions: Dict[int, Dict[str, str]]
) -> str:
    """
    Append per-language instruction text to the base instructions.

    Resolves the instruction text for both languages and joins the
    non-empty segments — base first, then first language, then second —
    with single spaces.
    """
    segments = [
        base_instructions,
        get_language_instruction_text(lang_first_id, language_instructions),
        get_language_instruction_text(lang_second_id, language_instructions),
    ]
    # Empty strings are dropped so we never emit doubled separators.
    return " ".join(seg for seg in segments if seg)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Core generation function (reused by batch_generate.py)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def run_generation(
    llm: LLMClient,
    language_map: Dict[int, str],
    lang_first_id: int,
    lang_second_id: int,
    amount: int,
    category: str,
    name: str,
    description: str,
    instructions: str,
    output_file_path: str,  # absolute path including filename
    manifest_path: str,  # absolute path to manifest JSON
    emoji: str = "",
    level: str = "A2",
    language_instructions: Optional[Dict[int, Dict[str, str]]] = None,
) -> bool:
    """
    Generate one vocabulary list and update the manifest.

    Resolves both language names from ``language_map``, merges optional
    per-language prompt instructions into the base instructions, asks the
    LLM for word pairs, writes a Polly import file to ``output_file_path``,
    and records the result in the manifest at ``manifest_path``.

    Args:
        llm: Configured LLM client used to generate the vocabulary.
        language_map: { language_id: language_name }, from languages.xml.
        lang_first_id: First (source) language ID.
        lang_second_id: Second (target) language ID.
        amount: Number of word pairs to request.
        category: Category name written to the import file and manifest.
        name: Display name recorded in the manifest.
        description: Description recorded in the manifest.
        instructions: Base prompt instructions.
        output_file_path: Absolute path of the JSON import file to write.
        manifest_path: Absolute path of the manifest JSON to update.
        emoji: Optional emoji recorded in the manifest.
        level: CEFR level string (default "A2").
        language_instructions: Optional { language_id: { key: text } }
            with per-language prompt additions; merged when provided.

    Returns:
        True on success, False on failure (unknown language ID, or the
        LLM returned no pairs).
    """
    lang_first_name = language_map.get(lang_first_id)
    lang_second_name = language_map.get(lang_second_id)

    # Fail fast if either language ID has no name in languages.xml.
    if not lang_first_name:
        print(f" ERROR: Language ID {lang_first_id} not found in languages.xml")
        return False
    if not lang_second_name:
        print(f" ERROR: Language ID {lang_second_id} not found in languages.xml")
        return False

    # Merge base instructions with language-specific instructions
    final_instructions = instructions
    if language_instructions:
        final_instructions = merge_instructions(
            instructions,
            lang_first_id,
            lang_second_id,
            language_instructions
        )

    print(f" Languages : {lang_first_name} (ID {lang_first_id}) → {lang_second_name} (ID {lang_second_id})")
    print(f" Amount : {amount} word pairs")
    if final_instructions:
        # Truncate long instruction strings so the console line stays short.
        preview = final_instructions if len(final_instructions) <= 90 else final_instructions[:87] + "..."
        print(f" Instructions: {preview}")
    print()

    request = VocabRequest(
        amount=amount,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        lang_first_name=lang_first_name,
        lang_second_name=lang_second_name,
        category=category,
        instructions=final_instructions,
        level=level,
    )

    print(" Generating vocabulary via LLM …")
    word_pairs = llm.generate_vocabulary(request)

    if not word_pairs:
        print(" ERROR: No vocabulary pairs were generated.")
        return False

    # Preview at most the first three generated pairs.
    print(f" Generated {len(word_pairs)} word pairs.")
    preview_count = min(3, len(word_pairs))
    for i, (w1, w2) in enumerate(word_pairs[:preview_count], 1):
        print(f" {i}. {w1} → {w2}")
    if len(word_pairs) > preview_count:
        print(f" … and {len(word_pairs) - preview_count} more")
    print()

    # Ensure output directory exists
    # NOTE(review): assumes output_file_path has a directory component
    # (os.makedirs("") would raise) — holds for the absolute paths callers pass.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Write Polly import file (pass absolute path directly)
    generate_vocabulary_import(
        category_name=category,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        word_pairs=word_pairs,
        output_filename=output_file_path,
    )

    # Update manifest
    update_manifest(
        manifest_path=manifest_path,
        vocab_file_path=output_file_path,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        category=category,
        item_count=len(word_pairs),
        name=name,
        description=description,
        emoji=emoji,
        level=level,
    )
    return True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main (single-run entry point)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main() -> None:
    """
    Single-run entry point.

    Loads configuration and the language map, resolves output/manifest
    paths relative to this script, prunes stale manifest entries, runs
    one generation pass, and prints the resulting manifest.  Exits with
    status 1 on invalid configuration or generation failure.
    """
    print("=" * 50)
    print(" VocabListGenerator")
    print("=" * 50)
    print()

    config = Config()
    vocab_cfg = config.vocab_config
    manifest_cfg = config.manifest_config

    language_map = load_language_map()

    # Resolve paths relative to this script so the tool works from any CWD.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, manifest_cfg.get("output_dir", "output"))
    manifest_path = os.path.join(output_dir, manifest_cfg.get("filename", "vocab_manifest.json"))
    output_filename = vocab_cfg.get("output_filename", "vocab_output.json")
    vocab_file_path = os.path.join(output_dir, output_filename)

    os.makedirs(output_dir, exist_ok=True)

    # Prune stale manifest entries before generating
    if os.path.isfile(manifest_path):
        prune_missing_files(manifest_path, output_dir)

    # Read parameters ('languages', 'category' and 'amount' are required).
    lang_ids = vocab_cfg["languages"]
    category = vocab_cfg["category"]
    name = vocab_cfg.get("name", "").strip()
    description = vocab_cfg.get("description", "").strip()
    instructions = vocab_cfg.get("instructions", "").strip()
    emoji = vocab_cfg.get("emoji", "").strip()
    level = vocab_cfg.get("level", "A2").strip().upper()
    amount = vocab_cfg["amount"]

    # FIX: validate up front instead of crashing with an IndexError below
    # when the config lists fewer than two language IDs.
    if not isinstance(lang_ids, (list, tuple)) or len(lang_ids) < 2:
        print("ERROR: 'languages' in conf must contain two language IDs (first, second).")
        sys.exit(1)

    print(f" Category : {category}")
    print(f" Level : {level}")
    print(f" Output dir : {output_dir}")
    print(f" Manifest : {manifest_path}")

    llm = LLMClient(config)

    # NOTE(review): load_language_instructions() is never called here, so the
    # single-run path uses base instructions only — confirm whether that is
    # intentional (batch_generate.py may be the only caller that passes them).
    success = run_generation(
        llm=llm,
        language_map=language_map,
        lang_first_id=lang_ids[0],
        lang_second_id=lang_ids[1],
        amount=amount,
        category=category,
        name=name,
        description=description,
        instructions=instructions,
        output_file_path=vocab_file_path,
        manifest_path=manifest_path,
        emoji=emoji,
        level=level,
    )

    if not success:
        sys.exit(1)

    print_manifest(manifest_path)
|
|
|
|
|
|
# Script entry point: run a single generation pass.
if __name__ == "__main__":
    main()
|