#!/usr/bin/env python3
"""
Clean up script for removing training runs without exported models.

Removes all directories in runs/ folder that don't have a corresponding
exported model file.
"""

import argparse
import glob
import json
import os
import shutil
from pathlib import Path


def find_exported_models():
    """Find all exported model files in the current directory.

    Returns:
        A list of dicts, one per matching file, with keys ``"file"`` (the path
        as returned by ``glob``) and ``"timestamp"`` (``YYYYMMDD_HHMMSS``
        taken from the end of the file name).
    """
    # Look for pattern: *_YYYYMMDD_HHMMSS.joblib
    patterns = [
        "*_20[0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].joblib"
    ]

    exported_models = []
    seen_files = set()  # Guards against duplicates if several patterns overlap
    for pattern in patterns:
        for filepath in glob.glob(pattern):
            if filepath in seen_files:
                continue
            seen_files.add(filepath)

            # Expected name: dataset_sentiment_YYYYMMDD_HHMMSS.joblib
            stem = os.path.splitext(os.path.basename(filepath))[0]
            parts = stem.split("_")
            # NOTE: names with fewer than four "_"-separated parts (for
            # example "model_YYYYMMDD_HHMMSS.joblib") are ignored, so their
            # runs are treated as having no exported model.
            if len(parts) >= 4:
                # The last two parts are the date and the time
                exported_models.append({
                    "file": filepath,
                    "timestamp": "_".join(parts[-2:]),
                })

    return exported_models


def find_all_runs():
    """Find all run directories in the runs folder.

    Returns:
        A list of dicts with keys ``"path"`` (a ``Path`` to the directory) and
        ``"timestamp"`` (the directory name, expected to be
        ``YYYYMMDD_HHMMSS``). Empty when ``runs`` does not exist.
    """
    runs_dir = Path("runs")
    if not runs_dir.exists():
        return []

    return [
        {"path": run_path, "timestamp": run_path.name}
        for run_path in runs_dir.iterdir()
        if run_path.is_dir()
    ]


def _format_accuracy(value):
    """Render an accuracy value for display; non-numeric values never raise."""
    if isinstance(value, (int, float)):
        return f"{value:.4f}"
    if value is None:
        return "N/A"
    return str(value)


def _describe_run(run_path):
    """Return a one-line summary from the run's metadata.json, or None.

    This is purely informational, so an absent, unreadable, malformed or
    non-object metadata file yields None instead of an error.
    """
    metadata_path = run_path / "metadata.json"
    if not metadata_path.exists():
        return None

    try:
        with open(metadata_path, encoding="utf-8") as f:
            metadata = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        return None

    if not isinstance(metadata, dict):
        return None

    return (
        f" Model: {metadata.get('model_name', 'unknown')}, "
        f"Dataset: {metadata.get('dataset', 'unknown')}, "
        f"Accuracy: {_format_accuracy(metadata.get('test_accuracy'))}"
    )


def _directory_size(path):
    """Return the total size in bytes of the regular files below *path*."""
    total = 0
    for entry in path.rglob("*"):
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            # The size is only an estimate; skip entries that cannot be read
            continue
    return total


def clean_runs(dry_run=False, verbose=False):
    """
    Remove all run directories that don't have exported models.

    A run is kept when its directory name equals the timestamp of at least one
    exported model file found in the current directory.

    Args:
        dry_run: If True, only show what would be deleted without actually deleting
        verbose: If True, show detailed information

    Returns:
        Tuple of (runs_to_keep, runs_to_delete)
    """
    exported_models = find_exported_models()
    exported_timestamps = {model["timestamp"] for model in exported_models}

    all_runs = find_all_runs()

    # Categorize runs
    runs_to_keep = []
    runs_to_delete = []
    for run in all_runs:
        if run["timestamp"] in exported_timestamps:
            runs_to_keep.append(run)
        else:
            runs_to_delete.append(run)

    # Show summary
    print(f"Found {len(all_runs)} total runs")
    print(f"Found {len(exported_models)} exported models")
    print(f"Runs to keep: {len(runs_to_keep)}")
    print(f"Runs to delete: {len(runs_to_delete)}")

    if verbose and exported_models:
        print("\nExported models found:")
        for model in exported_models:
            print(f" - {model['file']} (timestamp: {model['timestamp']})")

    if verbose and runs_to_keep:
        print("\nRuns with exported models (will be kept):")
        for run in runs_to_keep:
            print(f" - {run['path']}")

    if runs_to_delete:
        print("\nRuns without exported models (will be deleted):")
        for run in runs_to_delete:
            print(f" - {run['path']}")
            if verbose:
                description = _describe_run(run["path"])
                if description is not None:
                    print(description)

        # Calculate space to be freed
        total_size = sum(_directory_size(run["path"]) for run in runs_to_delete)
        if total_size > 0:
            size_mb = total_size / (1024 * 1024)
            print(f"\nTotal space to be freed: {size_mb:.2f} MB")

    if not runs_to_delete:
        print("\nNo runs to delete - all runs have exported models or no runs found")
    elif dry_run:
        print("\nDry run mode - no files were deleted")
        print("Run without --dry-run to actually delete these directories")
    else:
        deleted_count = 0
        for run in runs_to_delete:
            try:
                shutil.rmtree(run["path"])
            except OSError as e:
                # Keep going so one stubborn directory does not block the rest
                print(f"Error deleting {run['path']}: {e}")
            else:
                deleted_count += 1
                if verbose:
                    print(f"Deleted: {run['path']}")
        print(f"\nSuccessfully deleted {deleted_count} run(s)")

    return runs_to_keep, runs_to_delete


def main():
    """Parse command-line options, preview the cleanup, confirm, then delete."""
    parser = argparse.ArgumentParser(
        description="Clean up training runs without exported models"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed information about runs",
    )
    parser.add_argument(
        "--yes",
        "-y",
        action="store_true",
        help="Skip confirmation prompt",
    )
    args = parser.parse_args()

    # Check if runs directory exists
    if not Path("runs").exists():
        print("No 'runs' directory found. Nothing to clean.")
        return

    print("Analyzing runs directory...\n")

    # Do a dry run first to show what will be deleted
    _, runs_to_delete = clean_runs(dry_run=True, verbose=args.verbose)

    if not runs_to_delete or args.dry_run:
        return

    # Ask for confirmation unless --yes was given
    if not args.yes:
        print("\n" + "=" * 60)
        try:
            response = input(
                f"Are you sure you want to delete {len(runs_to_delete)} run(s)? [y/N]: "
            )
        except EOFError:
            # No interactive stdin: take the safe default and do not delete
            response = ""
        if response.strip().lower() not in ("y", "yes"):
            print("Cleanup cancelled")
            return

    print("\nPerforming cleanup...")
    clean_runs(dry_run=False, verbose=args.verbose)


if __name__ == "__main__":
    main()