|
|
|
|
|
""" |
|
|
Clean up script for removing training runs without exported models. |
|
|
Removes all directories in runs/ folder that don't have a corresponding exported model file. |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import os |
|
|
import shutil |
|
|
from pathlib import Path |
|
|
import glob |
|
|
|
|
|
|
|
|
def find_exported_models():
    """Find all exported model files in the current directory.

    Scans the working directory for ``.joblib`` files whose names end in a
    ``_YYYYMMDD_HHMMSS`` timestamp and returns one record per file.

    Returns:
        List of dicts with keys:
            "file": path of the exported model file (relative to cwd)
            "timestamp": trailing "YYYYMMDD_HHMMSS" portion of the filename,
                used to match the file to its runs/<timestamp> directory.
    """
    exported_models = []
    seen_files = set()  # guard against duplicates if patterns ever overlap

    # Exported models are named <prefix>_YYYYMMDD_HHMMSS.joblib, where the
    # date begins with "20".
    patterns = [
        "*_20[0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].joblib"
    ]

    for pattern in patterns:
        for filepath in glob.glob(pattern):
            if filepath in seen_files:
                continue
            seen_files.add(filepath)

            filename = os.path.basename(filepath)
            parts = filename.replace(".joblib", "").split("_")
            # Any filename matching the glob has at least a prefix, a date,
            # and a time component (3 parts). The previous ">= 4" check
            # silently dropped single-word prefixes such as
            # "model_20240101_123456.joblib", causing their runs to be
            # wrongly deleted.
            if len(parts) >= 3:
                # The timestamp is the final "YYYYMMDD_HHMMSS" pair.
                timestamp = "_".join(parts[-2:])
                exported_models.append({
                    "file": filepath,
                    "timestamp": timestamp
                })

    return exported_models
|
|
|
|
|
|
|
|
def find_all_runs():
    """Collect every run directory under ./runs.

    Returns:
        A list of ``{"path": Path, "timestamp": str}`` records, one per
        subdirectory of runs/; the directory name doubles as the run's
        timestamp. Empty list when runs/ does not exist.
    """
    runs_dir = Path("runs")
    if not runs_dir.exists():
        return []

    # Each immediate subdirectory is one run; plain files are ignored.
    return [
        {"path": entry, "timestamp": entry.name}
        for entry in runs_dir.iterdir()
        if entry.is_dir()
    ]
|
|
|
|
|
|
|
|
def clean_runs(dry_run=False, verbose=False):
    """
    Remove all run directories that don't have exported models.

    A run is kept when its directory name (a "YYYYMMDD_HHMMSS" timestamp)
    matches the timestamp embedded in any exported .joblib filename in the
    current directory; every other runs/ subdirectory is deleted.

    Args:
        dry_run: If True, only show what would be deleted without actually deleting
        verbose: If True, show detailed information

    Returns:
        Tuple of (runs_to_keep, runs_to_delete)
    """
    exported_models = find_exported_models()
    exported_timestamps = {model["timestamp"] for model in exported_models}

    all_runs = find_all_runs()

    # Partition runs by whether an exported model shares their timestamp.
    runs_to_keep = []
    runs_to_delete = []
    for run in all_runs:
        if run["timestamp"] in exported_timestamps:
            runs_to_keep.append(run)
        else:
            runs_to_delete.append(run)

    print(f"Found {len(all_runs)} total runs")
    print(f"Found {len(exported_models)} exported models")
    print(f"Runs to keep: {len(runs_to_keep)}")
    print(f"Runs to delete: {len(runs_to_delete)}")

    if verbose and exported_models:
        print("\nExported models found:")
        for model in exported_models:
            print(f" - {model['file']} (timestamp: {model['timestamp']})")

    if verbose and runs_to_keep:
        print("\nRuns with exported models (will be kept):")
        for run in runs_to_keep:
            print(f" - {run['path']}")

    if runs_to_delete:
        print("\nRuns without exported models (will be deleted):")
        for run in runs_to_delete:
            print(f" - {run['path']}")
            if verbose:
                _print_run_metadata(run["path"])

    # Report how much disk space deletion would reclaim.
    total_size = 0
    for run in runs_to_delete:
        total_size += sum(f.stat().st_size for f in run["path"].rglob("*") if f.is_file())

    if total_size > 0:
        size_mb = total_size / (1024 * 1024)
        print(f"\nTotal space to be freed: {size_mb:.2f} MB")

    if not dry_run and runs_to_delete:
        deleted_count = 0
        for run in runs_to_delete:
            try:
                shutil.rmtree(run["path"])
                deleted_count += 1
                if verbose:
                    print(f"Deleted: {run['path']}")
            except Exception as e:
                # Best-effort: report the failure and keep deleting the rest.
                print(f"Error deleting {run['path']}: {e}")

        print(f"\nSuccessfully deleted {deleted_count} run(s)")
    elif dry_run and runs_to_delete:
        print("\nDry run mode - no files were deleted")
        print("Run without --dry-run to actually delete these directories")
    elif not runs_to_delete:
        print("\nNo runs to delete - all runs have exported models or no runs found")

    return runs_to_keep, runs_to_delete


def _print_run_metadata(run_path):
    """Print a one-line model/dataset/accuracy summary from <run>/metadata.json.

    Best-effort: silently does nothing when the file is absent or unparseable.
    """
    import json  # local import: json is only needed for this verbose display

    metadata_path = run_path / "metadata.json"
    if not metadata_path.exists():
        return
    try:
        with open(metadata_path) as f:
            metadata = json.load(f)
        # test_accuracy may be missing or non-numeric; formatting a string
        # with ":.4f" raises ValueError, which the handler below does NOT
        # catch — so guard the format instead (fixes a crash in verbose mode).
        accuracy = metadata.get('test_accuracy')
        accuracy_str = f"{accuracy:.4f}" if isinstance(accuracy, (int, float)) else "N/A"
        print(f" Model: {metadata.get('model_name', 'unknown')}, "
              f"Dataset: {metadata.get('dataset', 'unknown')}, "
              f"Accuracy: {accuracy_str}")
    except (json.JSONDecodeError, KeyError):
        pass
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: preview the cleanup, confirm, then delete."""
    parser = argparse.ArgumentParser(
        description="Clean up training runs without exported models"
    )
    # All three options are simple boolean flags; declare them data-driven.
    flag_specs = [
        (("--dry-run",), "Show what would be deleted without actually deleting"),
        (("--verbose", "-v"), "Show detailed information about runs"),
        (("--yes", "-y"), "Skip confirmation prompt"),
    ]
    for names, help_text in flag_specs:
        parser.add_argument(*names, action="store_true", help=help_text)

    args = parser.parse_args()

    # Nothing to do without a runs/ directory.
    if not Path("runs").exists():
        print("No 'runs' directory found. Nothing to clean.")
        return

    print("Analyzing runs directory...\n")

    # First pass is always a dry run so the user sees the plan before
    # anything is removed.
    _, runs_to_delete = clean_runs(dry_run=True, verbose=args.verbose)

    # Stop here when there is nothing to delete, or when the user only
    # asked for a preview.
    if not runs_to_delete or args.dry_run:
        return

    # Ask for confirmation unless --yes was given.
    if not args.yes:
        print("\n" + "=" * 60)
        answer = input(f"Are you sure you want to delete {len(runs_to_delete)} run(s)? [y/N]: ")
        if answer.lower() != 'y':
            print("Cleanup cancelled")
            return

    print("\nPerforming cleanup...")
    clean_runs(dry_run=False, verbose=args.verbose)
|
|
|
|
|
|
|
|
# Script entry point: only run the cleanup when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()