"""
VocabListGenerator — Single-run script
----------------------------------------

Reads configuration from 'conf', calls the LLM to generate vocabulary word pairs,
writes a Polly-compatible JSON import file into the configured output folder,
and updates the manifest.

Usage:
    python generate.py
"""
|
|
|
|
import os
|
|
import sys
|
|
import xml.etree.ElementTree as ET
|
|
import yaml
|
|
from typing import Dict, Any, Optional
|
|
|
|
from config import Config
|
|
from llm_client import LLMClient
|
|
from models import VocabRequest
|
|
from first import generate_vocabulary_import
|
|
from manifest_manager import update_manifest, print_manifest, prune_missing_files
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Language map helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def load_language_map(xml_path: str = "languages.xml") -> Dict[int, str]:
    """
    Build a { language_id: language_name } lookup from languages.xml.

    Entries come from <string name="language_N">Name</string> elements,
    e.g. {1: 'English', 2: 'Mandarin', 15: 'German', ...}.  Elements whose
    suffix is not an integer or whose text is empty are skipped.  Exits the
    process with status 1 if the file is missing or is not well-formed XML.
    """
    names: Dict[int, str] = {}
    try:
        doc_root = ET.parse(xml_path).getroot()
    except FileNotFoundError:
        print(f"ERROR: '{xml_path}' not found.")
        sys.exit(1)
    except ET.ParseError as e:
        print(f"ERROR: Could not parse '{xml_path}': {e}")
        sys.exit(1)
    for node in doc_root.iter("string"):
        attr = node.get("name", "")
        if not attr.startswith("language_"):
            continue
        try:
            key = int(attr.split("_", 1)[1])
        except (ValueError, AttributeError):
            continue  # e.g. "language_codes" or other non-numeric suffixes
        if node.text:
            names[key] = node.text.strip()
    return names
|
|
|
|
|
|
def load_language_code_map(xml_path: str = "languages.xml") -> Dict[int, str]:
    """
    Build a { language_id: iso_code } lookup from languages.xml.

    Reads the "language_codes" <string-array>, whose <item> entries look
    like "de,DE,15" (code, region, id), e.g. {1: 'en', 7: 'pt', 15: 'de'}.
    Malformed items are skipped; a missing or unparseable file yields an
    empty mapping (best-effort, unlike load_language_map).
    """
    codes: Dict[int, str] = {}
    try:
        root = ET.parse(xml_path).getroot()
    except (FileNotFoundError, ET.ParseError):
        return codes
    code_arrays = (
        arr for arr in root.iter("string-array")
        if arr.get("name") == "language_codes"
    )
    for arr in code_arrays:
        for entry in arr.iter("item"):
            if not entry.text:
                continue
            fields = entry.text.strip().split(",")
            if len(fields) < 3:
                continue
            try:
                codes[int(fields[2])] = fields[0].lower()
            except ValueError:
                pass  # non-numeric ID field; skip the item
    return codes
|
|
|
|
|
|
def load_language_instructions(yaml_path: str = "language_instructions.yaml") -> Dict[int, Dict[str, str]]:
    """
    Load language-specific prompt instructions from a YAML file.

    Returns { language_id: { key: instruction } }.  Keys that are not
    integers and values that are not dicts are silently skipped.  A
    missing or unparseable file only prints a warning and yields an
    empty mapping, so callers fall back to default instructions.
    """
    result: Dict[int, Dict[str, str]] = {}
    try:
        with open(yaml_path, "r", encoding="utf-8") as fh:
            loaded = yaml.safe_load(fh)
    except FileNotFoundError:
        print(f"WARNING: '{yaml_path}' not found. Using default instructions.")
        return result
    except yaml.YAMLError as e:
        print(f"WARNING: Could not parse '{yaml_path}': {e}")
        return result
    for raw_key, value in (loaded or {}).items():
        # Defensive: ignore stray string keys that look like comments.
        if isinstance(raw_key, str) and raw_key.startswith("#"):
            continue
        try:
            key = int(raw_key)
        except (ValueError, TypeError):
            continue
        if isinstance(value, dict):
            result[key] = value
    return result
|
|
|
|
|
|
def get_language_instruction_text(lang_id: int, language_instructions: Dict[int, Dict[str, str]]) -> str:
    """
    Assemble the instruction sentence(s) for a single language.

    Concatenates, in order: the 'transcription' text, a "Use <variant>."
    sentence, and the 'special' text — whichever keys exist — separated
    by single spaces.  Returns "" when the language has no entry.
    """
    if lang_id not in language_instructions:
        return ""

    entry = language_instructions[lang_id]
    collected = []
    # Each slot is (key, template); only keys present in the entry render.
    for key, template in (
        ("transcription", "{}"),
        ("variant", "Use {}."),
        ("special", "{}"),
    ):
        if key in entry:
            collected.append(template.format(entry[key]))
    return " ".join(collected)
|
|
|
|
|
|
def merge_instructions(
    base_instructions: str,
    lang_first_id: int,
    lang_second_id: int,
    language_instructions: Dict[int, Dict[str, str]]
) -> str:
    """
    Append per-language instruction text to the base instructions.

    Resolves the instruction text for both languages and joins the
    non-empty segments — base first, then first language, then second —
    with single spaces.
    """
    segments = [
        base_instructions,
        get_language_instruction_text(lang_first_id, language_instructions),
        get_language_instruction_text(lang_second_id, language_instructions),
    ]
    # Empty strings are dropped so we never emit doubled separators.
    return " ".join(seg for seg in segments if seg)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Core generation function (reused by batch_generate.py)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def run_generation(
    llm: LLMClient,
    language_map: Dict[int, str],
    lang_first_id: int,
    lang_second_id: int,
    amount: int,
    category: str,
    name: str,
    description: str,
    instructions: str,
    output_file_path: str,  # absolute path including filename
    manifest_path: str,  # absolute path to manifest JSON
    emoji: str = "",
    level: str = "A2",
    language_instructions: Optional[Dict[int, Dict[str, str]]] = None,
) -> bool:
    """
    Generate one vocabulary list and update the manifest.

    Resolves both language names from ``language_map``, merges optional
    per-language prompt instructions into the base instructions, asks the
    LLM for word pairs, writes a Polly import file to ``output_file_path``,
    and records the result in the manifest at ``manifest_path``.

    Args:
        llm: Configured LLM client used to generate the vocabulary.
        language_map: { language_id: language_name }, from languages.xml.
        lang_first_id: First (source) language ID.
        lang_second_id: Second (target) language ID.
        amount: Number of word pairs to request.
        category: Category name written to the import file and manifest.
        name: Display name recorded in the manifest.
        description: Description recorded in the manifest.
        instructions: Base prompt instructions.
        output_file_path: Absolute path of the JSON import file to write.
        manifest_path: Absolute path of the manifest JSON to update.
        emoji: Optional emoji recorded in the manifest.
        level: CEFR level string (default "A2").
        language_instructions: Optional { language_id: { key: text } }
            with per-language prompt additions; merged when provided.

    Returns:
        True on success, False on failure (unknown language ID, or the
        LLM returned no pairs).
    """
    lang_first_name = language_map.get(lang_first_id)
    lang_second_name = language_map.get(lang_second_id)

    # Fail fast if either language ID has no name in languages.xml.
    if not lang_first_name:
        print(f" ERROR: Language ID {lang_first_id} not found in languages.xml")
        return False
    if not lang_second_name:
        print(f" ERROR: Language ID {lang_second_id} not found in languages.xml")
        return False

    # Merge base instructions with language-specific instructions
    final_instructions = instructions
    if language_instructions:
        final_instructions = merge_instructions(
            instructions,
            lang_first_id,
            lang_second_id,
            language_instructions
        )

    print(f" Languages : {lang_first_name} (ID {lang_first_id}) → {lang_second_name} (ID {lang_second_id})")
    print(f" Amount : {amount} word pairs")
    if final_instructions:
        # Truncate long instruction strings so the console line stays short.
        preview = final_instructions if len(final_instructions) <= 90 else final_instructions[:87] + "..."
        print(f" Instructions: {preview}")
    print()

    request = VocabRequest(
        amount=amount,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        lang_first_name=lang_first_name,
        lang_second_name=lang_second_name,
        category=category,
        instructions=final_instructions,
        level=level,
    )

    print(" Generating vocabulary via LLM …")
    word_pairs = llm.generate_vocabulary(request)

    if not word_pairs:
        print(" ERROR: No vocabulary pairs were generated.")
        return False

    # Preview at most the first three generated pairs.
    print(f" Generated {len(word_pairs)} word pairs.")
    preview_count = min(3, len(word_pairs))
    for i, (w1, w2) in enumerate(word_pairs[:preview_count], 1):
        print(f" {i}. {w1} → {w2}")
    if len(word_pairs) > preview_count:
        print(f" … and {len(word_pairs) - preview_count} more")
    print()

    # Ensure output directory exists
    # NOTE(review): assumes output_file_path has a directory component
    # (os.makedirs("") would raise) — holds for the absolute paths callers pass.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Write Polly import file (pass absolute path directly)
    generate_vocabulary_import(
        category_name=category,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        word_pairs=word_pairs,
        output_filename=output_file_path,
    )

    # Update manifest
    update_manifest(
        manifest_path=manifest_path,
        vocab_file_path=output_file_path,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        category=category,
        item_count=len(word_pairs),
        name=name,
        description=description,
        emoji=emoji,
        level=level,
    )
    return True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main (single-run entry point)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main() -> None:
    """
    Single-run entry point.

    Loads configuration and the language map, resolves output/manifest
    paths relative to this script, prunes stale manifest entries, runs
    one generation pass, and prints the resulting manifest.  Exits with
    status 1 on invalid configuration or generation failure.
    """
    print("=" * 50)
    print(" VocabListGenerator")
    print("=" * 50)
    print()

    config = Config()
    vocab_cfg = config.vocab_config
    manifest_cfg = config.manifest_config

    language_map = load_language_map()

    # Resolve paths relative to this script so the tool works from any CWD.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, manifest_cfg.get("output_dir", "output"))
    manifest_path = os.path.join(output_dir, manifest_cfg.get("filename", "vocab_manifest.json"))
    output_filename = vocab_cfg.get("output_filename", "vocab_output.json")
    vocab_file_path = os.path.join(output_dir, output_filename)

    os.makedirs(output_dir, exist_ok=True)

    # Prune stale manifest entries before generating
    if os.path.isfile(manifest_path):
        prune_missing_files(manifest_path, output_dir)

    # Read parameters ('languages', 'category' and 'amount' are required).
    lang_ids = vocab_cfg["languages"]
    category = vocab_cfg["category"]
    name = vocab_cfg.get("name", "").strip()
    description = vocab_cfg.get("description", "").strip()
    instructions = vocab_cfg.get("instructions", "").strip()
    emoji = vocab_cfg.get("emoji", "").strip()
    level = vocab_cfg.get("level", "A2").strip().upper()
    amount = vocab_cfg["amount"]

    # FIX: validate up front instead of crashing with an IndexError below
    # when the config lists fewer than two language IDs.
    if not isinstance(lang_ids, (list, tuple)) or len(lang_ids) < 2:
        print("ERROR: 'languages' in conf must contain two language IDs (first, second).")
        sys.exit(1)

    print(f" Category : {category}")
    print(f" Level : {level}")
    print(f" Output dir : {output_dir}")
    print(f" Manifest : {manifest_path}")

    llm = LLMClient(config)

    # NOTE(review): load_language_instructions() is never called here, so the
    # single-run path uses base instructions only — confirm whether that is
    # intentional (batch_generate.py may be the only caller that passes them).
    success = run_generation(
        llm=llm,
        language_map=language_map,
        lang_first_id=lang_ids[0],
        lang_second_id=lang_ids[1],
        amount=amount,
        category=category,
        name=name,
        description=description,
        instructions=instructions,
        output_file_path=vocab_file_path,
        manifest_path=manifest_path,
        emoji=emoji,
        level=level,
    )

    if not success:
        sys.exit(1)

    print_manifest(manifest_path)
|
|
|
|
|
|
# Script entry point: run a single generation pass.
if __name__ == "__main__":
    main()
|