more
This commit is contained in:
@@ -62,6 +62,44 @@ def sanitize_for_filename(name: str) -> str:
|
||||
return name
|
||||
|
||||
|
||||
def strip_date_prefix(filename: str) -> str:
    """
    Remove a leading ``YYYY_MM_DD_`` date prefix from a filename.

    Only a prefix at the very start of the name is removed; a filename
    without such a prefix is returned unchanged.

    Example: '2026_02_19_verbs_en_de_A1.json' -> 'verbs_en_de_A1.json'
    """
    # Anchored at the start: four digits, two digits, two digits, each
    # followed by an underscore.
    prefix_match = re.match(r'^\d{4}_\d{2}_\d{2}_', filename)
    if prefix_match is None:
        return filename
    # Keep everything after the matched prefix.
    return filename[prefix_match.end():]
|
||||
|
||||
|
||||
def find_existing_file_ignoring_date(output_dir: str, target_filename: str) -> str | None:
    """
    Find an existing file with the same name as the target, ignoring date prefixes.

    The target and every directory entry are compared after passing each
    through ``strip_date_prefix``.

    Args:
        output_dir: The directory to search in
        target_filename: The target filename (may include date prefix)

    Returns:
        The path (``output_dir`` joined with the entry name) of the first
        matching regular file in sorted name order, or None when
        ``output_dir`` is not a directory or no regular file matches.
    """
    # Strip the date prefix from our target filename
    target_without_date = strip_date_prefix(target_filename)

    if not os.path.isdir(output_dir):
        return None

    # os.listdir returns entries in arbitrary order; sorting makes the
    # reported file the same on every run when several dated copies exist.
    for existing_file in sorted(os.listdir(output_dir)):
        if strip_date_prefix(existing_file) != target_without_date:
            continue
        candidate = os.path.join(output_dir, existing_file)
        # listdir also yields sub-directories; only a regular file counts
        # as an existing output file.
        if os.path.isfile(candidate):
            return candidate

    return None
|
||||
|
||||
|
||||
def generate_output_filename(
|
||||
entry: Dict[str, Any],
|
||||
code_map: Dict[int, str],
|
||||
@@ -271,7 +309,7 @@ def main() -> None:
|
||||
else:
|
||||
already = sum(
|
||||
1 for b in active
|
||||
if os.path.isfile(os.path.join(output_dir, b["output_filename"]))
|
||||
if find_existing_file_ignoring_date(output_dir, b["output_filename"]) is not None
|
||||
)
|
||||
if already:
|
||||
print(f" Resuming : {already} existing file(s) will be skipped (use --force to override)")
|
||||
@@ -331,8 +369,10 @@ def main() -> None:
|
||||
print(f" ETA : {eta_str} ({int(avg_time_per_item)}s/item)")
|
||||
|
||||
# Skip if already generated (unless --force)
|
||||
if not args.force and os.path.isfile(vocab_file_path):
|
||||
print(f" ✔ Already exists — skipping (use --force to regenerate)")
|
||||
existing_file = find_existing_file_ignoring_date(output_dir, output_filename)
|
||||
if not args.force and existing_file is not None:
|
||||
existing_filename = os.path.basename(existing_file)
|
||||
print(f" ✔ Already exists ({existing_filename}) — skipping (use --force to regenerate)")
|
||||
print("-" * 60)
|
||||
skipped_existing += 1
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user