Commit eabe2e2969 — "welcome gitea"
jonasgaudian, 2026-02-19 17:18:23 +01:00
717 changed files with 654575 additions and 0 deletions

generate.py (new file, 334 lines)

"""
VocabListGenerator — Single-run script
----------------------------------------
Reads configuration from 'conf', calls the LLM to generate vocabulary word pairs,
writes a Polly-compatible JSON import file into the configured output folder,
and updates the manifest.
Usage:
python generate.py
"""
import os
import sys
import xml.etree.ElementTree as ET
import yaml
from typing import Dict, Optional
from config import Config
from llm_client import LLMClient
from models import VocabRequest
from first import generate_vocabulary_import
from manifest_manager import update_manifest, print_manifest, prune_missing_files
# ---------------------------------------------------------------------------
# Language map helper
# ---------------------------------------------------------------------------
def load_language_map(xml_path: str = "languages.xml") -> Dict[int, str]:
"""
Parse languages.xml and return a mapping of { language_id: language_name }.
E.g. {1: 'English', 2: 'Mandarin', 15: 'German', ...}
"""
language_map: Dict[int, str] = {}
try:
tree = ET.parse(xml_path)
root = tree.getroot()
for elem in root.iter("string"):
name_attr = elem.get("name", "")
if name_attr.startswith("language_"):
try:
lang_id = int(name_attr.split("_", 1)[1])
if elem.text:
language_map[lang_id] = elem.text.strip()
except (ValueError, AttributeError):
pass
except FileNotFoundError:
print(f"ERROR: '{xml_path}' not found.")
sys.exit(1)
except ET.ParseError as e:
print(f"ERROR: Could not parse '{xml_path}': {e}")
sys.exit(1)
return language_map
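
# For reference, a minimal sketch of the languages.xml shape this parser
# accepts — Android-style string resources are assumed here, based on the
# 'string' elements and 'name' attributes the code iterates over:
#
#   <resources>
#       <string name="language_1">English</string>
#       <string name="language_15">German</string>
#   </resources>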
def load_language_code_map(xml_path: str = "languages.xml") -> Dict[int, str]:
"""
Parse languages.xml and return a mapping of { language_id: iso_code }.
E.g. {1: 'en', 7: 'pt', 15: 'de', ...}
Parsed from the string-array items like "de,DE,15".
"""
code_map: Dict[int, str] = {}
try:
tree = ET.parse(xml_path)
root = tree.getroot()
for array in root.iter("string-array"):
if array.get("name") == "language_codes":
for item in array.iter("item"):
if item.text:
parts = item.text.strip().split(",")
if len(parts) >= 3:
try:
lang_id = int(parts[2])
code_map[lang_id] = parts[0].lower()
except ValueError:
pass
except (FileNotFoundError, ET.ParseError):
pass
return code_map
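
# Likewise, the language_codes string-array this function reads — items are
# "code,REGION,id" triples per the "de,DE,15" example in the docstring; the
# enclosing markup and the English entry's region are assumptions:
#
#   <string-array name="language_codes">
#       <item>en,US,1</item>
#       <item>de,DE,15</item>
#   </string-array>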
def load_language_instructions(yaml_path: str = "language_instructions.yaml") -> Dict[int, Dict[str, str]]:
"""
Load language-specific instructions from YAML file.
Returns a mapping of { language_id: { key: instruction } }.
"""
instructions: Dict[int, Dict[str, str]] = {}
try:
with open(yaml_path, "r", encoding="utf-8") as f:
data = yaml.safe_load(f)
if data:
for lang_id_str, lang_data in data.items():
                    # Skip non-numeric keys (yaml.safe_load never yields YAML
                    # comments, but placeholder string keys may appear)
                    if isinstance(lang_id_str, str) and lang_id_str.startswith("#"):
                        continue
try:
lang_id = int(lang_id_str)
if isinstance(lang_data, dict):
instructions[lang_id] = lang_data
except (ValueError, TypeError):
pass
except FileNotFoundError:
print(f"WARNING: '{yaml_path}' not found. Using default instructions.")
except yaml.YAMLError as e:
print(f"WARNING: Could not parse '{yaml_path}': {e}")
return instructions
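
# A sketch of the language_instructions.yaml layout this loader expects,
# inferred from the keys consumed by get_language_instruction_text() below;
# the instruction texts themselves are illustrative, not from the real file:
#
#   2:                      # Mandarin
#     transcription: "Include pinyin for every Mandarin word."
#   15:                     # German
#     variant: "standard High German"
#     special: "Include the article with every noun."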
def get_language_instruction_text(lang_id: int, language_instructions: Dict[int, Dict[str, str]]) -> str:
"""
Get the instruction text for a specific language.
Returns a formatted string with transcription/variant instructions.
"""
if lang_id not in language_instructions:
return ""
lang_data = language_instructions[lang_id]
parts = []
# Add transcription instruction if present
if "transcription" in lang_data:
parts.append(lang_data["transcription"])
# Add variant instruction if present
if "variant" in lang_data:
parts.append(f"Use {lang_data['variant']}.")
# Add special instruction if present
if "special" in lang_data:
parts.append(lang_data["special"])
return " ".join(parts)
def merge_instructions(
base_instructions: str,
lang_first_id: int,
lang_second_id: int,
language_instructions: Dict[int, Dict[str, str]]
) -> str:
"""
Merge base instructions with language-specific instructions.
Language-specific instructions are appended to the base instructions.
"""
lang1_instr = get_language_instruction_text(lang_first_id, language_instructions)
lang2_instr = get_language_instruction_text(lang_second_id, language_instructions)
# Collect all instruction parts
all_parts = []
if base_instructions:
all_parts.append(base_instructions)
if lang1_instr:
all_parts.append(lang1_instr)
if lang2_instr:
all_parts.append(lang2_instr)
return " ".join(all_parts)
# ---------------------------------------------------------------------------
# Core generation function (reused by batch_generate.py)
# ---------------------------------------------------------------------------
def run_generation(
llm: LLMClient,
language_map: Dict[int, str],
lang_first_id: int,
lang_second_id: int,
amount: int,
category: str,
name: str,
description: str,
instructions: str,
output_file_path: str, # absolute path including filename
manifest_path: str, # absolute path to manifest JSON
emoji: str = "",
level: str = "A2",
language_instructions: Optional[Dict[int, Dict[str, str]]] = None,
) -> bool:
"""
Generate one vocabulary list and update the manifest.
Returns True on success, False on failure.
"""
lang_first_name = language_map.get(lang_first_id)
lang_second_name = language_map.get(lang_second_id)
if not lang_first_name:
print(f" ERROR: Language ID {lang_first_id} not found in languages.xml")
return False
if not lang_second_name:
print(f" ERROR: Language ID {lang_second_id} not found in languages.xml")
return False
# Merge base instructions with language-specific instructions
final_instructions = instructions
if language_instructions:
final_instructions = merge_instructions(
instructions,
lang_first_id,
lang_second_id,
language_instructions
)
print(f" Languages : {lang_first_name} (ID {lang_first_id}) → {lang_second_name} (ID {lang_second_id})")
print(f" Amount : {amount} word pairs")
if final_instructions:
preview = final_instructions if len(final_instructions) <= 90 else final_instructions[:87] + "..."
print(f" Instructions: {preview}")
print()
request = VocabRequest(
amount=amount,
lang_first_id=lang_first_id,
lang_second_id=lang_second_id,
lang_first_name=lang_first_name,
lang_second_name=lang_second_name,
category=category,
instructions=final_instructions,
level=level,
)
print(" Generating vocabulary via LLM …")
word_pairs = llm.generate_vocabulary(request)
if not word_pairs:
print(" ERROR: No vocabulary pairs were generated.")
return False
print(f" Generated {len(word_pairs)} word pairs.")
preview_count = min(3, len(word_pairs))
for i, (w1, w2) in enumerate(word_pairs[:preview_count], 1):
print(f" {i}. {w1}{w2}")
if len(word_pairs) > preview_count:
print(f" … and {len(word_pairs) - preview_count} more")
print()
# Ensure output directory exists
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
# Write Polly import file (pass absolute path directly)
generate_vocabulary_import(
category_name=category,
lang_first_id=lang_first_id,
lang_second_id=lang_second_id,
word_pairs=word_pairs,
output_filename=output_file_path,
)
# Update manifest
update_manifest(
manifest_path=manifest_path,
vocab_file_path=output_file_path,
lang_first_id=lang_first_id,
lang_second_id=lang_second_id,
category=category,
item_count=len(word_pairs),
name=name,
description=description,
emoji=emoji,
level=level,
)
return True
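
# Sketch of driving run_generation() directly, roughly as batch_generate.py
# (referenced above, not shown here) might do it — IDs, paths, and values
# are illustrative only:
#
#   llm = LLMClient(Config())
#   ok = run_generation(
#       llm=llm,
#       language_map=load_language_map(),
#       lang_first_id=1,
#       lang_second_id=15,
#       amount=50,
#       category="Travel",
#       name="Travel Basics",
#       description="Everyday travel words",
#       instructions="",
#       output_file_path="/abs/path/output/travel_en_de.json",
#       manifest_path="/abs/path/output/vocab_manifest.json",
#       language_instructions=load_language_instructions(),
#   )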
# ---------------------------------------------------------------------------
# Main (single-run entry point)
# ---------------------------------------------------------------------------
def main() -> None:
print("=" * 50)
print(" VocabListGenerator")
print("=" * 50)
print()
config = Config()
vocab_cfg = config.vocab_config
manifest_cfg = config.manifest_config
language_map = load_language_map()
# Resolve paths
script_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(script_dir, manifest_cfg.get("output_dir", "output"))
manifest_path = os.path.join(output_dir, manifest_cfg.get("filename", "vocab_manifest.json"))
output_filename = vocab_cfg.get("output_filename", "vocab_output.json")
vocab_file_path = os.path.join(output_dir, output_filename)
os.makedirs(output_dir, exist_ok=True)
# Prune stale manifest entries before generating
if os.path.isfile(manifest_path):
prune_missing_files(manifest_path, output_dir)
# Read parameters
lang_ids = vocab_cfg["languages"]
category = vocab_cfg["category"]
name = vocab_cfg.get("name", "").strip()
description = vocab_cfg.get("description", "").strip()
instructions = vocab_cfg.get("instructions", "").strip()
emoji = vocab_cfg.get("emoji", "").strip()
level = vocab_cfg.get("level", "A2").strip().upper()
amount = vocab_cfg["amount"]
print(f" Category : {category}")
print(f" Level : {level}")
print(f" Output dir : {output_dir}")
print(f" Manifest : {manifest_path}")
llm = LLMClient(config)
success = run_generation(
llm=llm,
language_map=language_map,
lang_first_id=lang_ids[0],
lang_second_id=lang_ids[1],
amount=amount,
category=category,
name=name,
description=description,
instructions=instructions,
output_file_path=vocab_file_path,
manifest_path=manifest_path,
emoji=emoji,
level=level,
)
if not success:
sys.exit(1)
print_manifest(manifest_path)

if __name__ == "__main__":
main()