"""
Manifest manager for VocabListGenerator
-----------------------------------------
Maintains vocab_manifest.json — the index file the app fetches to discover
all available vocabulary lists, their metadata, and download info.

Manifest entry schema
---------------------
{
  "id": "verbs_beginners",              // filename stem (no .json)
  "name": "Verbs for Beginners (DE-PT)",
  "description": "...",
  "filename": "verbs_beginners.json",   // file the app downloads
  "language_ids": [15, 7],              // [lang_first_id, lang_second_id]
  "category": "Verbs for beginners",
  "item_count": 104,
  "level": "A1",
  "emoji": "🏃",
  "version": 1,
  "size_bytes": 45312,
  "checksum_sha256": "A1B2C3...",
  "created_at": "2026-02-18T20:53:54Z", // first generation
  "updated_at": "2026-02-18T21:10:00Z"  // last re-generation
}
"""

import hashlib
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

MANIFEST_VERSION = "1.0"


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def prune_missing_files(manifest_path: str, output_dir: str) -> int:
    """
    Remove entries from the manifest whose vocab file no longer exists in
    *output_dir*.

    Entries that carry no usable ``filename`` are treated as stale as well,
    since there is no file they could refer to.  The manifest is saved only
    when at least one entry is removed.

    Returns the number of entries that were pruned.
    """
    manifest = _load_manifest(manifest_path)
    output_path = Path(output_dir)

    surviving = []
    for entry in manifest["lists"]:
        if _entry_file_exists(output_path, entry):
            surviving.append(entry)
        else:
            # .get() keeps the log line usable for entries missing a key.
            print(f" [manifest] Pruned missing file: {entry.get('filename')} (id={entry.get('id')})")

    removed = len(manifest["lists"]) - len(surviving)
    if removed:
        manifest["lists"] = surviving
        manifest["updated_at"] = _utc_now()
        _save_manifest(manifest_path, manifest)
        print(f" [manifest] Pruned {removed} stale entr{'y' if removed == 1 else 'ies'} → {manifest_path}")
    return removed


def update_manifest(
    manifest_path: str,
    vocab_file_path: str,
    lang_first_id: int,
    lang_second_id: int,
    category: str,
    item_count: int,
    name: str = "",
    description: str = "",
    emoji: str = "",
    level: str = "",
) -> None:
    """
    Compute size + checksum of *vocab_file_path*, then upsert an entry in the
    manifest at *manifest_path*.  Creates the manifest if it does not exist
    yet.

    If an entry with the same ``id`` (the vocab file's stem) already exists it
    is updated in-place: ``created_at`` and ``version`` are preserved,
    ``updated_at`` is refreshed, and empty ``name`` / ``description`` /
    ``level`` / ``emoji`` arguments keep the previously stored values.

    Entries whose files are no longer present next to *vocab_file_path* are
    dropped before saving.  If *vocab_file_path* itself does not exist, a
    warning is printed and the manifest is left untouched.
    """
    vocab_path = Path(vocab_file_path)
    if not vocab_path.is_file():
        print(f" [manifest] WARNING: vocab file not found: {vocab_file_path}")
        return

    entry_id = vocab_path.stem   # e.g. "verbs_beginners"
    filename = vocab_path.name   # e.g. "verbs_beginners.json"
    size_bytes = _file_size(vocab_path)
    checksum_sha256 = _sha256(vocab_path)
    now_iso = _utc_now()

    # Load manifest; drop any entries whose files have since been deleted
    manifest = _load_manifest(manifest_path)
    manifest["lists"] = [
        e for e in manifest["lists"] if _entry_file_exists(vocab_path.parent, e)
    ]

    # Find existing entry (if any)
    existing = _find_entry(manifest["lists"], entry_id)

    if existing is None:
        # Brand-new entry
        entry: Dict[str, Any] = {
            "id": entry_id,
            "name": name or entry_id,
            "description": description,
            "filename": filename,
            "language_ids": [lang_first_id, lang_second_id],
            "category": category,
            "item_count": item_count,
            "level": level,
            "emoji": emoji,
            "version": 1,
            "size_bytes": size_bytes,
            "checksum_sha256": checksum_sha256,
            "created_at": now_iso,
            "updated_at": now_iso,
        }
        manifest["lists"].append(entry)
        print(f" [manifest] Added new entry: {entry_id}")
    else:
        # Update mutable fields; keep created_at and version
        existing["name"] = name or existing.get("name", entry_id)
        existing["description"] = description or existing.get("description", "")
        existing["filename"] = filename
        existing["language_ids"] = [lang_first_id, lang_second_id]
        existing["category"] = category
        existing["item_count"] = item_count
        existing["level"] = level or existing.get("level", "")
        existing["emoji"] = emoji or existing.get("emoji", "")
        existing.setdefault("version", 1)  # preserve existing version if already set
        existing["size_bytes"] = size_bytes
        existing["checksum_sha256"] = checksum_sha256
        # Repair entries that were written without a creation timestamp.
        existing.setdefault("created_at", now_iso)
        existing["updated_at"] = now_iso
        print(f" [manifest] Updated existing entry: {entry_id}")

    # Sort list alphabetically by id for stable output; entries without an
    # id sort first instead of raising.
    manifest["lists"].sort(key=lambda e: str(e.get("id", "")))
    manifest["updated_at"] = now_iso

    _save_manifest(manifest_path, manifest)
    print(f" [manifest] Saved → {manifest_path}")


def print_manifest(manifest_path: str) -> None:
    """
    Pretty-print a summary of the manifest to stdout.

    Fields missing from an entry are shown as ``?`` rather than aborting the
    whole listing.
    """
    manifest = _load_manifest(manifest_path)
    lists = manifest["lists"]
    print(f"\nManifest: {manifest_path} ({len(lists)} lists)")
    print("-" * 60)
    for entry in lists:
        lang_ids = ", ".join(str(i) for i in entry.get("language_ids", []))
        print(
            f" [{entry.get('id', '?')}] {entry.get('name', '?')}\n"
            f" category={entry.get('category', '?')} "
            f"items={entry.get('item_count', '?')} "
            f"langs=[{lang_ids}] "
            f"size={entry.get('size_bytes', '?')} B\n"
            f" updated={entry.get('updated_at', '?')}"
        )
    print()


# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------

def _entry_file_exists(directory: Path, entry: Dict[str, Any]) -> bool:
    """Return True if *entry* names a file that exists inside *directory*."""
    filename = entry.get("filename")
    if not isinstance(filename, str) or not filename:
        return False
    return (directory / filename).is_file()


def _load_manifest(path: str) -> Dict[str, Any]:
    """
    Load existing manifest or return a fresh skeleton.

    The returned dict always has a ``"lists"`` key holding a list of dicts, so
    callers can index it without further checks.  An unreadable file, invalid
    JSON, or a JSON root that is not an object all fall back to the skeleton
    after printing a warning.
    """
    if os.path.isfile(path):
        try:
            with open(path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f" [manifest] WARNING: could not read manifest ({e}), starting fresh.")
        else:
            if isinstance(loaded, dict):
                raw_lists = loaded.get("lists")
                if not isinstance(raw_lists, list):
                    raw_lists = []
                entries = [item for item in raw_lists if isinstance(item, dict)]
                dropped = len(raw_lists) - len(entries)
                if dropped:
                    print(f" [manifest] WARNING: ignored {dropped} malformed manifest entries.")
                loaded["lists"] = entries
                return loaded
            print(" [manifest] WARNING: manifest root is not a JSON object, starting fresh.")
    return {
        "manifest_version": MANIFEST_VERSION,
        "updated_at": _utc_now(),
        "lists": [],
    }


def _save_manifest(path: str, manifest: Dict[str, Any]) -> None:
    """
    Write *manifest* to *path* as indented UTF-8 JSON.

    The data is written to a sibling temporary file first and then moved into
    place with ``os.replace``, so an interrupted write cannot leave a
    truncated manifest behind.  The parent directory is created if needed.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; after a failure
        # it is removed so no partial file is left lying around.
        try:
            os.remove(tmp_path)
        except OSError:
            pass


def _find_entry(
    lists: List[Dict[str, Any]], entry_id: str
) -> Optional[Dict[str, Any]]:
    """Return the first entry whose ``id`` equals *entry_id*, or None."""
    for entry in lists:
        if entry.get("id") == entry_id:
            return entry
    return None


def _file_size(path: Path) -> int:
    """Return the size of *path* in bytes."""
    return path.stat().st_size


def _sha256(path: Path) -> str:
    """Return the upper-case hex SHA-256 digest of the file at *path*."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        # Read in 64 KiB chunks so large files are never held in memory whole.
        for chunk in iter(lambda: f.read(65536), b""):
            h.update(chunk)
    return h.hexdigest().upper()


def _utc_now() -> str:
    """Return the current UTC time formatted like ``2026-02-18T20:53:54Z``."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")