This commit is contained in:
jonasgaudian
2026-02-20 11:05:53 +01:00
parent 0c61d0731d
commit bf116584ef
603 changed files with 488445 additions and 9550 deletions

View File

@@ -62,6 +62,44 @@ def sanitize_for_filename(name: str) -> str:
return name
def strip_date_prefix(filename: str) -> str:
    """
    Return ``filename`` without a leading ``YYYY_MM_DD_`` date stamp.

    A name that does not begin with such a stamp is returned unchanged.
    Only a stamp at the very start of the name is removed, and at most one.

    Example: '2026_02_19_verbs_en_de_A1.json' -> 'verbs_en_de_A1.json'
    """
    # Four digits, two digits, two digits, each followed by an underscore,
    # anchored at the beginning of the name.
    stamp = re.match(r'^\d{4}_\d{2}_\d{2}_', filename)
    if stamp is None:
        return filename
    # Keep everything after the matched stamp.
    return filename[stamp.end():]
def find_existing_file_ignoring_date(output_dir: str, target_filename: str) -> str | None:
    """
    Check if a file with the same name (ignoring date prefix) already exists.

    Both the target name and every directory entry are compared after
    passing them through ``strip_date_prefix``, so a file written under a
    different date prefix still counts as existing.

    Only regular files are reported: a sub-directory whose name happens to
    match is ignored. Entries are examined in sorted order so the same path
    is returned on every run when several matching files exist.

    Args:
        output_dir: The directory to search in
        target_filename: The target filename (may include date prefix)

    Returns:
        The path to the existing file if found, None otherwise (including
        when ``output_dir`` is not an existing directory).
    """
    if not os.path.isdir(output_dir):
        return None

    # Strip the date prefix from our target filename
    target_without_date = strip_date_prefix(target_filename)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # reported match stable across runs and platforms.
    for existing_name in sorted(os.listdir(output_dir)):
        if strip_date_prefix(existing_name) != target_without_date:
            continue
        candidate = os.path.join(output_dir, existing_name)
        # os.listdir also yields directories; only a regular file means the
        # output has already been generated.
        if os.path.isfile(candidate):
            return candidate
    return None
def generate_output_filename(
entry: Dict[str, Any],
code_map: Dict[int, str],
@@ -271,7 +309,7 @@ def main() -> None:
else:
already = sum(
1 for b in active
if os.path.isfile(os.path.join(output_dir, b["output_filename"]))
if find_existing_file_ignoring_date(output_dir, b["output_filename"]) is not None
)
if already:
print(f" Resuming : {already} existing file(s) will be skipped (use --force to override)")
@@ -331,8 +369,10 @@ def main() -> None:
print(f" ETA : {eta_str} ({int(avg_time_per_item)}s/item)")
# Skip if already generated (unless --force)
if not args.force and os.path.isfile(vocab_file_path):
print(f" ✔ Already exists — skipping (use --force to regenerate)")
existing_file = find_existing_file_ignoring_date(output_dir, output_filename)
if not args.force and existing_file is not None:
existing_filename = os.path.basename(existing_file)
print(f" ✔ Already exists ({existing_filename}) — skipping (use --force to regenerate)")
print("-" * 60)
skipped_existing += 1
continue