welcome gitea

2026-02-19 17:18:23 +01:00
commit eabe2e2969
717 changed files with 654575 additions and 0 deletions
--- a/llm_client.py
+++ b/llm_client.py
@@ -0,0 +1,176 @@
+"""
+OpenAI-compatible LLM client for VocabListGenerator
+Adapted from ResourceTranslate/llm_client.py
+"""
+
+import json
+import openai
+from typing import List, Tuple
+
+from models import VocabRequest, CEFR_DESCRIPTIONS
+from config import Config
+
+
+class LLMClient:
+    """OpenAI-compatible LLM client (LM Studio / DeepSeek).
+
+    Sends a vocabulary-generation prompt to a chat-completions endpoint and
+    parses the JSON array in the reply into (first, second) word pairs.
+    """
+
+    def __init__(self, config: Config):
+        """Build the OpenAI client from ``config.llm_config``.
+
+        ``base_url`` and ``model`` are required keys. ``api_key`` defaults to
+        ``'not-needed'``, ``timeout`` to 30 and ``max_retries`` to 3.
+        """
+        self.config = config
+        self.client = openai.OpenAI(
+            base_url=config.llm_config['base_url'],
+            api_key=config.llm_config.get('api_key', 'not-needed')
+        )
+        self.model = config.llm_config['model']
+        # Passed as the per-request ``timeout`` of every completion call.
+        self.timeout = config.llm_config.get('timeout', 30)
+        # Number of attempts generate_vocabulary() makes before giving up.
+        self.max_retries = config.llm_config.get('max_retries', 3)
+
+    def generate_vocabulary(self, request: VocabRequest) -> List[Tuple[str, str]]:
+        """
+        Ask the LLM to generate vocabulary word pairs for the given request.
+
+        Up to ``self.max_retries`` attempts are made. An attempt counts as
+        failed when the API call raises, when the response carries no usable
+        content, or when no word pairs can be parsed from the content.
+
+        Returns a list of (word_in_lang_first, word_in_lang_second) tuples,
+        or an empty list when every attempt failed.
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                    "You are an expert language teacher and vocabulary specialist. "
+                    "Generate accurate, natural vocabulary word pairs exactly as instructed. "
+                    "Your response must be a valid JSON array and nothing else."
+                ),
+            },
+            {"role": "user", "content": self._build_vocab_prompt(request)},
+        ]
+
+        for attempt in range(self.max_retries):
+            tag = f"  [attempt {attempt + 1}]"
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model,
+                    messages=messages,
+                    temperature=0.7,
+                    timeout=self.timeout
+                )
+
+                if not response or not response.choices:
+                    print(f"{tag} Empty response from LLM")
+                    continue
+
+                message = getattr(response.choices[0], 'message', None)
+                if not message:
+                    print(f"{tag} Invalid response structure")
+                    continue
+
+                content = message.content
+                if content is None:
+                    print(f"{tag} Empty content in response")
+                    continue
+
+                pairs = self._parse_vocab_response(content)
+                if pairs:
+                    return pairs
+                print(f"{tag} Could not parse valid pairs from response")
+
+            except Exception as e:
+                # Deliberate best-effort boundary: whatever the API call or
+                # the response object raises is logged and counted as one
+                # failed attempt instead of aborting the whole generation.
+                print(f"{tag} Failed: {e}")
+
+        # Reached only when no attempt returned pairs, regardless of whether
+        # the last attempt failed by exception or by an unusable reply.
+        print("All attempts exhausted.")
+        return []
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _build_vocab_prompt(self, request: VocabRequest) -> str:
+        """Build the vocabulary generation prompt.
+
+        Reads ``amount``, ``category``, ``level``, ``lang_first_name``,
+        ``lang_second_name`` and ``instructions`` from the request. The level
+        description from ``CEFR_DESCRIPTIONS`` is appended when one exists,
+        and the instructions section is emitted only when it is non-blank.
+        """
+        level_desc = CEFR_DESCRIPTIONS.get(request.level, "")
+        lines = [
+            f"Generate exactly {request.amount} vocabulary word pairs for the topic: \"{request.category}\".",
+            "",
+            f"Proficiency level: {request.level}" + (f" — {level_desc}" if level_desc else ""),
+            "",
+            f"First language  (wordFirst):  {request.lang_first_name}",
+            f"Second language (wordSecond): {request.lang_second_name}",
+        ]
+
+        if request.instructions and request.instructions.strip():
+            lines += [
+                "",
+                "Additional instructions (follow these carefully):",
+                request.instructions.strip(),
+            ]
+
+        lines += [
+            "",
+            "Rules:",
+            "- Choose vocabulary appropriate for the specified proficiency level.",
+            "- Return ONLY a JSON array. No markdown, no explanation, no extra text.",
+            "- Each element is a 2-item array: [word_in_first_language, word_in_second_language].",
+            f"- The array must contain exactly {request.amount} unique pairs.",
+            "- NO DUPLICATES: Each word in the first language must appear only once.",
+            "- VARY the vocabulary: avoid repeating similar words.",
+            "",
+            "Example format:",
+            '[',
+            '  ["Krankenhaus", "hospital"],',
+            '  ["Arzt", "médico"]',
+            ']',
+        ]
+
+        return "\n".join(lines)
+
+    @staticmethod
+    def _load_json_array(text: str) -> object:
+        """Decode the JSON array contained in the stripped reply ``text``.
+
+        Returns the decoded value, or ``None`` when ``text`` holds no
+        ``[`` ... ``]`` span at all. Raises ``json.JSONDecodeError`` when a
+        candidate span exists but is not valid JSON.
+        """
+        if text.startswith('['):
+            try:
+                return json.loads(text)
+            except json.JSONDecodeError:
+                end = text.rfind(']') + 1
+                # Nothing follows the last bracket (or there is none), so
+                # trimming cannot help: surface the original decode error.
+                if end == 0 or end == len(text):
+                    raise
+                # The model appended chatter after the array; retry on the
+                # text up to and including the last closing bracket.
+                return json.loads(text[:end])
+
+        # Array is wrapped in prose or a markdown fence: cut it out.
+        start = text.find('[')
+        end = text.rfind(']') + 1
+        if start == -1 or end == 0:
+            return None
+        return json.loads(text[start:end])
+
+    def _parse_vocab_response(self, response: str) -> List[Tuple[str, str]]:
+        """Parse the LLM response into a list of word-pair tuples.
+
+        The reply may be a bare JSON array or an array surrounded by other
+        text. Elements that are not sequences of at least two items, that
+        contain ``null``, or whose words are empty after stripping are
+        skipped. When a first-language word repeats, only its first
+        occurrence is kept and a warning is printed.
+
+        Returns an empty list when the response is blank or cannot be parsed.
+        """
+        if not response or not response.strip():
+            return []
+
+        try:
+            data = self._load_json_array(response.strip())
+            if data is None:
+                print(f"Could not locate JSON array in response:\n{response[:500]}")
+                return []
+
+            # Insertion-ordered map: first-language word -> first translation.
+            seen_first = {}
+            duplicates = []
+            for item in data:
+                if not isinstance(item, (list, tuple)) or len(item) < 2:
+                    continue
+                # str(None) would yield the literal word "None"; drop it.
+                if item[0] is None or item[1] is None:
+                    continue
+
+                word_first = str(item[0]).strip()
+                word_second = str(item[1]).strip()
+                if not word_first or not word_second:
+                    continue
+
+                if word_first in seen_first:
+                    duplicates.append(word_first)
+                else:
+                    seen_first[word_first] = word_second
+
+            # Keeping only first occurrences is a no-op without duplicates.
+            pairs: List[Tuple[str, str]] = list(seen_first.items())
+
+            if duplicates:
+                # dict.fromkeys keeps encounter order, so the log is stable.
+                unique_dups = list(dict.fromkeys(duplicates))
+                print(f"  ⚠ Warning: Found {len(duplicates)} duplicate first-language words: {unique_dups[:5]}{'...' if len(unique_dups) > 5 else ''}")
+                print(f"  → Using {len(pairs)} unique pairs after deduplication")
+
+            return pairs
+
+        except json.JSONDecodeError as e:
+            print(f"JSON parse error: {e}")
+            print(f"Raw response (first 800 chars):\n{response[:800]}")
+            print(f"Raw response (last 200 chars):\n{response[-200:]}")
+            return []
+        except Exception as e:
+            print(f"Unexpected error parsing response: {e}")
+            print(f"Response preview:\n{response[:500]}")
+            return []