welcome gitea
This commit is contained in:
334
generate.py
Normal file
334
generate.py
Normal file
@@ -0,0 +1,334 @@
|
||||
"""
|
||||
VocabListGenerator — Single-run script
|
||||
----------------------------------------
|
||||
Reads configuration from 'conf', calls the LLM to generate vocabulary word pairs,
|
||||
writes a Polly-compatible JSON import file into the configured output folder,
|
||||
and updates the manifest.
|
||||
|
||||
Usage:
|
||||
python generate.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
import yaml
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from config import Config
|
||||
from llm_client import LLMClient
|
||||
from models import VocabRequest
|
||||
from first import generate_vocabulary_import
|
||||
from manifest_manager import update_manifest, print_manifest, prune_missing_files
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Language map helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_language_map(xml_path: str = "languages.xml") -> Dict[int, str]:
    """
    Parse languages.xml and return a mapping of { language_id: language_name }.
    E.g. {1: 'English', 2: 'Mandarin', 15: 'German', ...}

    Exits the process with status 1 if the file is missing or not valid XML.
    """
    mapping: Dict[int, str] = {}
    try:
        document = ET.parse(xml_path)
    except FileNotFoundError:
        print(f"ERROR: '{xml_path}' not found.")
        sys.exit(1)
    except ET.ParseError as e:
        print(f"ERROR: Could not parse '{xml_path}': {e}")
        sys.exit(1)
    for node in document.getroot().iter("string"):
        attr = node.get("name", "")
        if not attr.startswith("language_"):
            continue
        try:
            identifier = int(attr.split("_", 1)[1])
        except (ValueError, AttributeError):
            continue
        if node.text:
            mapping[identifier] = node.text.strip()
    return mapping
def load_language_code_map(xml_path: str = "languages.xml") -> Dict[int, str]:
    """
    Parse languages.xml and return a mapping of { language_id: iso_code }.
    E.g. {1: 'en', 7: 'pt', 15: 'de', ...}
    Entries come from the "language_codes" string-array items like "de,DE,15".
    Returns an empty mapping when the file is missing or malformed.
    """
    codes: Dict[int, str] = {}
    try:
        root = ET.parse(xml_path).getroot()
    except (FileNotFoundError, ET.ParseError):
        return codes
    code_arrays = (
        arr for arr in root.iter("string-array")
        if arr.get("name") == "language_codes"
    )
    for arr in code_arrays:
        for entry in arr.iter("item"):
            if not entry.text:
                continue
            fields = entry.text.strip().split(",")
            if len(fields) < 3:
                continue
            try:
                codes[int(fields[2])] = fields[0].lower()
            except ValueError:
                # Third field isn't a numeric language ID — skip the item.
                pass
    return codes
def load_language_instructions(yaml_path: str = "language_instructions.yaml") -> Dict[int, Dict[str, str]]:
    """
    Load language-specific instructions from a YAML file.
    Returns a mapping of { language_id: { key: instruction } }.
    A missing or unparsable file yields an empty mapping (with a warning).
    """
    result: Dict[int, Dict[str, str]] = {}
    try:
        with open(yaml_path, "r", encoding="utf-8") as fh:
            parsed = yaml.safe_load(fh)
    except FileNotFoundError:
        print(f"WARNING: '{yaml_path}' not found. Using default instructions.")
        return result
    except yaml.YAMLError as e:
        print(f"WARNING: Could not parse '{yaml_path}': {e}")
        return result
    for raw_key, payload in (parsed or {}).items():
        # Defensive skip of comment-looking keys (YAML normally strips comments).
        if isinstance(raw_key, str) and raw_key.startswith("#"):
            continue
        try:
            lang_id = int(raw_key)
        except (ValueError, TypeError):
            continue
        if isinstance(payload, dict):
            result[lang_id] = payload
    return result
def get_language_instruction_text(lang_id: int, language_instructions: Dict[int, Dict[str, str]]) -> str:
    """
    Build the instruction text for a single language.

    Joins (space-separated, in this fixed order) the entry's
    "transcription" text, a "Use <variant>." sentence, and any
    "special" text. Returns "" when the language has no entry.
    """
    if lang_id not in language_instructions:
        return ""
    entry = language_instructions[lang_id]

    # (key, renderer) pairs define both the output order and formatting.
    order = (
        ("transcription", lambda v: v),
        ("variant", lambda v: f"Use {v}."),
        ("special", lambda v: v),
    )
    fragments = [render(entry[key]) for key, render in order if key in entry]
    return " ".join(fragments)
def merge_instructions(
    base_instructions: str,
    lang_first_id: int,
    lang_second_id: int,
    language_instructions: Dict[int, Dict[str, str]]
) -> str:
    """
    Combine base instructions with language-specific instructions.

    The base text comes first, then the first language's instructions,
    then the second language's; empty pieces are dropped.
    """
    per_language = (
        get_language_instruction_text(lang_first_id, language_instructions),
        get_language_instruction_text(lang_second_id, language_instructions),
    )
    chunks = [base_instructions, *per_language]
    return " ".join(chunk for chunk in chunks if chunk)
def run_generation(
    llm: LLMClient,
    language_map: Dict[int, str],
    lang_first_id: int,
    lang_second_id: int,
    amount: int,
    category: str,
    name: str,
    description: str,
    instructions: str,
    output_file_path: str,  # absolute path including filename
    manifest_path: str,     # absolute path to manifest JSON
    emoji: str = "",
    level: str = "A2",
    language_instructions: Optional[Dict[int, Dict[str, str]]] = None,
) -> bool:
    """
    Generate one vocabulary list and update the manifest.

    Steps: resolve both language names from `language_map`, merge any
    language-specific instructions into the base instructions, ask the
    LLM for `amount` word pairs, write a Polly import file to
    `output_file_path`, and record the result in the manifest at
    `manifest_path`.

    Returns True on success, False on failure (unknown language ID or
    an empty LLM response).
    """
    lang_first_name = language_map.get(lang_first_id)
    lang_second_name = language_map.get(lang_second_id)

    if not lang_first_name:
        print(f"  ERROR: Language ID {lang_first_id} not found in languages.xml")
        return False
    if not lang_second_name:
        print(f"  ERROR: Language ID {lang_second_id} not found in languages.xml")
        return False

    # Merge base instructions with language-specific instructions.
    final_instructions = instructions
    if language_instructions:
        final_instructions = merge_instructions(
            instructions,
            lang_first_id,
            lang_second_id,
            language_instructions
        )

    print(f"  Languages   : {lang_first_name} (ID {lang_first_id}) → {lang_second_name} (ID {lang_second_id})")
    print(f"  Amount      : {amount} word pairs")
    if final_instructions:
        # Truncate long instruction text to a single-line console preview.
        preview = final_instructions if len(final_instructions) <= 90 else final_instructions[:87] + "..."
        print(f"  Instructions: {preview}")
    print()

    request = VocabRequest(
        amount=amount,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        lang_first_name=lang_first_name,
        lang_second_name=lang_second_name,
        category=category,
        instructions=final_instructions,
        level=level,
    )

    print("  Generating vocabulary via LLM …")
    word_pairs = llm.generate_vocabulary(request)

    if not word_pairs:
        print("  ERROR: No vocabulary pairs were generated.")
        return False

    print(f"  Generated {len(word_pairs)} word pairs.")
    preview_count = min(3, len(word_pairs))
    for i, (w1, w2) in enumerate(word_pairs[:preview_count], 1):
        print(f"    {i}. {w1} → {w2}")
    if len(word_pairs) > preview_count:
        print(f"    … and {len(word_pairs) - preview_count} more")
    print()

    # Ensure the output directory exists. Guard against a bare filename:
    # os.path.dirname("") is "", and os.makedirs("") raises FileNotFoundError.
    out_dir = os.path.dirname(output_file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write Polly import file (pass absolute path directly)
    generate_vocabulary_import(
        category_name=category,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        word_pairs=word_pairs,
        output_filename=output_file_path,
    )

    # Update manifest
    update_manifest(
        manifest_path=manifest_path,
        vocab_file_path=output_file_path,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        category=category,
        item_count=len(word_pairs),
        name=name,
        description=description,
        emoji=emoji,
        level=level,
    )
    return True
def main() -> None:
    """
    Single-run entry point.

    Reads settings from Config, resolves output paths relative to this
    script, prunes stale manifest entries, runs one generation, and
    prints the resulting manifest. Exits with status 1 on invalid
    configuration or generation failure.
    """
    print("=" * 50)
    print(" VocabListGenerator")
    print("=" * 50)
    print()

    config = Config()
    vocab_cfg = config.vocab_config
    manifest_cfg = config.manifest_config

    language_map = load_language_map()

    # Resolve paths relative to the script location so cwd doesn't matter.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, manifest_cfg.get("output_dir", "output"))
    manifest_path = os.path.join(output_dir, manifest_cfg.get("filename", "vocab_manifest.json"))
    output_filename = vocab_cfg.get("output_filename", "vocab_output.json")
    vocab_file_path = os.path.join(output_dir, output_filename)

    os.makedirs(output_dir, exist_ok=True)

    # Prune stale manifest entries before generating
    if os.path.isfile(manifest_path):
        prune_missing_files(manifest_path, output_dir)

    # Read parameters ('languages', 'category' and 'amount' are required keys).
    lang_ids = vocab_cfg["languages"]
    # Fail with a clear message instead of a bare IndexError when the config
    # lists fewer than two language IDs.
    if not isinstance(lang_ids, (list, tuple)) or len(lang_ids) < 2:
        print("ERROR: 'languages' must list at least two language IDs.")
        sys.exit(1)
    category = vocab_cfg["category"]
    name = vocab_cfg.get("name", "").strip()
    description = vocab_cfg.get("description", "").strip()
    instructions = vocab_cfg.get("instructions", "").strip()
    emoji = vocab_cfg.get("emoji", "").strip()
    level = vocab_cfg.get("level", "A2").strip().upper()
    amount = vocab_cfg["amount"]

    print(f"  Category    : {category}")
    print(f"  Level       : {level}")
    print(f"  Output dir  : {output_dir}")
    print(f"  Manifest    : {manifest_path}")

    llm = LLMClient(config)

    # NOTE(review): load_language_instructions() exists in this module but is
    # never wired into the single-run path (language_instructions stays None)
    # — presumably batch_generate.py uses it; confirm whether this entry point
    # should load it too.
    success = run_generation(
        llm=llm,
        language_map=language_map,
        lang_first_id=lang_ids[0],
        lang_second_id=lang_ids[1],
        amount=amount,
        category=category,
        name=name,
        description=description,
        instructions=instructions,
        output_file_path=vocab_file_path,
        manifest_path=manifest_path,
        emoji=emoji,
        level=level,
    )

    if not success:
        sys.exit(1)

    print_manifest(manifest_path)
# Script entry point: run one generation when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user