welcome gitea
This commit is contained in:
334
generate.py
Normal file
334
generate.py
Normal file
@@ -0,0 +1,334 @@
|
||||
"""
|
||||
VocabListGenerator — Single-run script
|
||||
----------------------------------------
|
||||
Reads configuration from 'conf', calls the LLM to generate vocabulary word pairs,
|
||||
writes a Polly-compatible JSON import file into the configured output folder,
|
||||
and updates the manifest.
|
||||
|
||||
Usage:
|
||||
python generate.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
import yaml
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from config import Config
|
||||
from llm_client import LLMClient
|
||||
from models import VocabRequest
|
||||
from first import generate_vocabulary_import
|
||||
from manifest_manager import update_manifest, print_manifest, prune_missing_files
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Language map helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_language_map(xml_path: str = "languages.xml") -> Dict[int, str]:
    """
    Parse languages.xml and return a mapping of { language_id: language_name }.
    E.g. {1: 'English', 2: 'Mandarin', 15: 'German', ...}

    Exits the process with status 1 if the file is missing or not valid XML.
    """
    mapping: Dict[int, str] = {}
    try:
        document = ET.parse(xml_path)
    except FileNotFoundError:
        print(f"ERROR: '{xml_path}' not found.")
        sys.exit(1)
    except ET.ParseError as e:
        print(f"ERROR: Could not parse '{xml_path}': {e}")
        sys.exit(1)
    for node in document.getroot().iter("string"):
        attr = node.get("name", "")
        if not attr.startswith("language_"):
            continue
        try:
            identifier = int(attr.split("_", 1)[1])
        except (ValueError, AttributeError):
            continue
        if node.text:
            mapping[identifier] = node.text.strip()
    return mapping
def load_language_code_map(xml_path: str = "languages.xml") -> Dict[int, str]:
    """
    Parse languages.xml and return a mapping of { language_id: iso_code }.
    E.g. {1: 'en', 7: 'pt', 15: 'de', ...}
    Entries come from the "language_codes" string-array items like "de,DE,15".
    Returns an empty mapping when the file is missing or malformed.
    """
    codes: Dict[int, str] = {}
    try:
        root = ET.parse(xml_path).getroot()
    except (FileNotFoundError, ET.ParseError):
        return codes
    code_arrays = (
        arr for arr in root.iter("string-array")
        if arr.get("name") == "language_codes"
    )
    for arr in code_arrays:
        for entry in arr.iter("item"):
            if not entry.text:
                continue
            fields = entry.text.strip().split(",")
            if len(fields) < 3:
                continue
            try:
                codes[int(fields[2])] = fields[0].lower()
            except ValueError:
                # Third field isn't a numeric language ID — skip the item.
                pass
    return codes
def load_language_instructions(yaml_path: str = "language_instructions.yaml") -> Dict[int, Dict[str, str]]:
    """
    Load language-specific instructions from a YAML file.
    Returns a mapping of { language_id: { key: instruction } }.
    A missing or unparsable file yields an empty mapping (with a warning).
    """
    result: Dict[int, Dict[str, str]] = {}
    try:
        with open(yaml_path, "r", encoding="utf-8") as fh:
            parsed = yaml.safe_load(fh)
    except FileNotFoundError:
        print(f"WARNING: '{yaml_path}' not found. Using default instructions.")
        return result
    except yaml.YAMLError as e:
        print(f"WARNING: Could not parse '{yaml_path}': {e}")
        return result
    for raw_key, payload in (parsed or {}).items():
        # Defensive skip of comment-looking keys (YAML normally strips comments).
        if isinstance(raw_key, str) and raw_key.startswith("#"):
            continue
        try:
            lang_id = int(raw_key)
        except (ValueError, TypeError):
            continue
        if isinstance(payload, dict):
            result[lang_id] = payload
    return result
def get_language_instruction_text(lang_id: int, language_instructions: Dict[int, Dict[str, str]]) -> str:
    """
    Build the instruction text for a single language.

    Joins (space-separated, in this fixed order) the entry's
    "transcription" text, a "Use <variant>." sentence, and any
    "special" text. Returns "" when the language has no entry.
    """
    if lang_id not in language_instructions:
        return ""
    entry = language_instructions[lang_id]

    # (key, renderer) pairs define both the output order and formatting.
    order = (
        ("transcription", lambda v: v),
        ("variant", lambda v: f"Use {v}."),
        ("special", lambda v: v),
    )
    fragments = [render(entry[key]) for key, render in order if key in entry]
    return " ".join(fragments)
def merge_instructions(
    base_instructions: str,
    lang_first_id: int,
    lang_second_id: int,
    language_instructions: Dict[int, Dict[str, str]]
) -> str:
    """
    Combine base instructions with language-specific instructions.

    The base text comes first, then the first language's instructions,
    then the second language's; empty pieces are dropped.
    """
    per_language = (
        get_language_instruction_text(lang_first_id, language_instructions),
        get_language_instruction_text(lang_second_id, language_instructions),
    )
    chunks = [base_instructions, *per_language]
    return " ".join(chunk for chunk in chunks if chunk)
def run_generation(
    llm: LLMClient,
    language_map: Dict[int, str],
    lang_first_id: int,
    lang_second_id: int,
    amount: int,
    category: str,
    name: str,
    description: str,
    instructions: str,
    output_file_path: str,  # absolute path including filename
    manifest_path: str,     # absolute path to manifest JSON
    emoji: str = "",
    level: str = "A2",
    language_instructions: Optional[Dict[int, Dict[str, str]]] = None,
) -> bool:
    """
    Generate one vocabulary list and update the manifest.

    Steps: resolve both language names from `language_map`, merge any
    language-specific instructions into the base instructions, ask the
    LLM for `amount` word pairs, write a Polly import file to
    `output_file_path`, and record the result in the manifest at
    `manifest_path`.

    Returns True on success, False on failure (unknown language ID or
    an empty LLM response).
    """
    lang_first_name = language_map.get(lang_first_id)
    lang_second_name = language_map.get(lang_second_id)

    if not lang_first_name:
        print(f"  ERROR: Language ID {lang_first_id} not found in languages.xml")
        return False
    if not lang_second_name:
        print(f"  ERROR: Language ID {lang_second_id} not found in languages.xml")
        return False

    # Merge base instructions with language-specific instructions.
    final_instructions = instructions
    if language_instructions:
        final_instructions = merge_instructions(
            instructions,
            lang_first_id,
            lang_second_id,
            language_instructions
        )

    print(f"  Languages   : {lang_first_name} (ID {lang_first_id}) → {lang_second_name} (ID {lang_second_id})")
    print(f"  Amount      : {amount} word pairs")
    if final_instructions:
        # Truncate long instruction text to a single-line console preview.
        preview = final_instructions if len(final_instructions) <= 90 else final_instructions[:87] + "..."
        print(f"  Instructions: {preview}")
    print()

    request = VocabRequest(
        amount=amount,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        lang_first_name=lang_first_name,
        lang_second_name=lang_second_name,
        category=category,
        instructions=final_instructions,
        level=level,
    )

    print("  Generating vocabulary via LLM …")
    word_pairs = llm.generate_vocabulary(request)

    if not word_pairs:
        print("  ERROR: No vocabulary pairs were generated.")
        return False

    print(f"  Generated {len(word_pairs)} word pairs.")
    preview_count = min(3, len(word_pairs))
    for i, (w1, w2) in enumerate(word_pairs[:preview_count], 1):
        print(f"    {i}. {w1} → {w2}")
    if len(word_pairs) > preview_count:
        print(f"    … and {len(word_pairs) - preview_count} more")
    print()

    # Ensure the output directory exists. Guard against a bare filename:
    # os.path.dirname("") is "", and os.makedirs("") raises FileNotFoundError.
    out_dir = os.path.dirname(output_file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write Polly import file (pass absolute path directly)
    generate_vocabulary_import(
        category_name=category,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        word_pairs=word_pairs,
        output_filename=output_file_path,
    )

    # Update manifest
    update_manifest(
        manifest_path=manifest_path,
        vocab_file_path=output_file_path,
        lang_first_id=lang_first_id,
        lang_second_id=lang_second_id,
        category=category,
        item_count=len(word_pairs),
        name=name,
        description=description,
        emoji=emoji,
        level=level,
    )
    return True
def main() -> None:
    """
    Single-run entry point.

    Reads settings from Config, resolves output paths relative to this
    script, prunes stale manifest entries, runs one generation, and
    prints the resulting manifest. Exits with status 1 on invalid
    configuration or generation failure.
    """
    print("=" * 50)
    print(" VocabListGenerator")
    print("=" * 50)
    print()

    config = Config()
    vocab_cfg = config.vocab_config
    manifest_cfg = config.manifest_config

    language_map = load_language_map()

    # Resolve paths relative to the script location so cwd doesn't matter.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, manifest_cfg.get("output_dir", "output"))
    manifest_path = os.path.join(output_dir, manifest_cfg.get("filename", "vocab_manifest.json"))
    output_filename = vocab_cfg.get("output_filename", "vocab_output.json")
    vocab_file_path = os.path.join(output_dir, output_filename)

    os.makedirs(output_dir, exist_ok=True)

    # Prune stale manifest entries before generating
    if os.path.isfile(manifest_path):
        prune_missing_files(manifest_path, output_dir)

    # Read parameters ('languages', 'category' and 'amount' are required keys).
    lang_ids = vocab_cfg["languages"]
    # Fail with a clear message instead of a bare IndexError when the config
    # lists fewer than two language IDs.
    if not isinstance(lang_ids, (list, tuple)) or len(lang_ids) < 2:
        print("ERROR: 'languages' must list at least two language IDs.")
        sys.exit(1)
    category = vocab_cfg["category"]
    name = vocab_cfg.get("name", "").strip()
    description = vocab_cfg.get("description", "").strip()
    instructions = vocab_cfg.get("instructions", "").strip()
    emoji = vocab_cfg.get("emoji", "").strip()
    level = vocab_cfg.get("level", "A2").strip().upper()
    amount = vocab_cfg["amount"]

    print(f"  Category    : {category}")
    print(f"  Level       : {level}")
    print(f"  Output dir  : {output_dir}")
    print(f"  Manifest    : {manifest_path}")

    llm = LLMClient(config)

    # NOTE(review): load_language_instructions() exists in this module but is
    # never wired into the single-run path (language_instructions stays None)
    # — presumably batch_generate.py uses it; confirm whether this entry point
    # should load it too.
    success = run_generation(
        llm=llm,
        language_map=language_map,
        lang_first_id=lang_ids[0],
        lang_second_id=lang_ids[1],
        amount=amount,
        category=category,
        name=name,
        description=description,
        instructions=instructions,
        output_file_path=vocab_file_path,
        manifest_path=manifest_path,
        emoji=emoji,
        level=level,
    )

    if not success:
        sys.exit(1)

    print_manifest(manifest_path)
# Script entry point: run one generation when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user