# pulse_core_1 / clean.py
# Author: Vu Anh
# Commit 08bbb4c: Add VLSP2016 dataset support and comprehensive evaluation updates
#!/usr/bin/env python3
"""
Clean up script for removing training runs without exported models.
Removes all directories in runs/ folder that don't have a corresponding exported model file.
"""
import argparse
import os
import shutil
from pathlib import Path
import glob
def find_exported_models():
    """Find all exported model files in the current working directory.

    Scans for files matching ``*_YYYYMMDD_HHMMSS.joblib`` and extracts the
    trailing timestamp, which links each export back to the run directory
    of the same name under ``runs/``.

    Returns:
        List of dicts with keys ``file`` (relative file path) and
        ``timestamp`` (the ``YYYYMMDD_HHMMSS`` suffix).
    """
    exported_models = []
    seen_files = set()  # Guards against duplicates should patterns ever overlap
    # Look for pattern: *_YYYYMMDD_HHMMSS.joblib
    # This matches any exported model with timestamp format
    patterns = [
        "*_20[0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].joblib"
    ]
    for pattern in patterns:
        for filepath in glob.glob(pattern):
            # Skip if we've already seen this file
            if filepath in seen_files:
                continue
            seen_files.add(filepath)
            # Extract timestamp from filename
            # Format: <prefix>_YYYYMMDD_HHMMSS.joblib
            filename = os.path.basename(filepath)
            parts = filename.replace(".joblib", "").split("_")
            # The glob guarantees at least "<prefix>_<date>_<time>", i.e. three
            # parts. The previous ">= 4" check silently dropped valid exports
            # whose prefix contained no underscore (e.g. "mini_<ts>.joblib").
            if len(parts) >= 3:
                # The last two parts are the date and time components.
                timestamp = "_".join(parts[-2:])
                exported_models.append({
                    "file": filepath,
                    "timestamp": timestamp
                })
    return exported_models
def find_all_runs():
    """Collect every run directory under ``runs/``.

    Returns:
        List of dicts with keys ``path`` (a ``Path`` to the run directory)
        and ``timestamp`` (the directory's name, expected to be in
        ``YYYYMMDD_HHMMSS`` form). Empty list when ``runs/`` is absent.
    """
    runs_dir = Path("runs")
    if not runs_dir.exists():
        return []
    # Run directories are named by timestamp, so the directory name itself
    # serves as the run's identifier; plain files are ignored.
    return [
        {"path": entry, "timestamp": entry.name}
        for entry in runs_dir.iterdir()
        if entry.is_dir()
    ]
def clean_runs(dry_run=False, verbose=False):
    """
    Remove all run directories that don't have exported models.

    A run is kept when an exported ``*_<timestamp>.joblib`` file exists whose
    timestamp matches the run directory's name; every other run directory
    under ``runs/`` is deleted (unless ``dry_run``).

    Args:
        dry_run: If True, only show what would be deleted without actually deleting
        verbose: If True, show detailed information

    Returns:
        Tuple of (runs_to_keep, runs_to_delete)
    """
    import json  # only needed for the verbose metadata preview below

    # Find all exported models and index them by timestamp for O(1) lookups.
    exported_models = find_exported_models()
    exported_timestamps = {model["timestamp"] for model in exported_models}

    # Find all runs
    all_runs = find_all_runs()

    # Categorize runs: a run survives iff an export shares its timestamp.
    runs_to_keep = []
    runs_to_delete = []
    for run in all_runs:
        if run["timestamp"] in exported_timestamps:
            runs_to_keep.append(run)
        else:
            runs_to_delete.append(run)

    # Show summary
    print(f"Found {len(all_runs)} total runs")
    print(f"Found {len(exported_models)} exported models")
    print(f"Runs to keep: {len(runs_to_keep)}")
    print(f"Runs to delete: {len(runs_to_delete)}")

    if verbose and exported_models:
        print("\nExported models found:")
        for model in exported_models:
            print(f" - {model['file']} (timestamp: {model['timestamp']})")

    if verbose and runs_to_keep:
        print("\nRuns with exported models (will be kept):")
        for run in runs_to_keep:
            print(f" - {run['path']}")

    if runs_to_delete:
        print("\nRuns without exported models (will be deleted):")
        for run in runs_to_delete:
            print(f" - {run['path']}")
            if verbose:
                # Best-effort metadata preview; a malformed or unreadable
                # metadata.json must never abort the cleanup.
                metadata_path = run["path"] / "metadata.json"
                if metadata_path.exists():
                    try:
                        with open(metadata_path) as f:
                            metadata = json.load(f)
                        # BUG FIX: the old code applied ``:.4f`` directly to the
                        # ``'N/A'`` fallback string, raising an uncaught
                        # ValueError whenever test_accuracy was missing or
                        # non-numeric. Format only when the value is a number.
                        accuracy = metadata.get("test_accuracy")
                        accuracy_text = (
                            f"{accuracy:.4f}"
                            if isinstance(accuracy, (int, float))
                            else "N/A"
                        )
                        print(f" Model: {metadata.get('model_name', 'unknown')}, "
                              f"Dataset: {metadata.get('dataset', 'unknown')}, "
                              f"Accuracy: {accuracy_text}")
                    except (OSError, json.JSONDecodeError, KeyError):
                        pass

    # Calculate space to be freed
    total_size = 0
    for run in runs_to_delete:
        total_size += sum(f.stat().st_size for f in run["path"].rglob("*") if f.is_file())

    if total_size > 0:
        size_mb = total_size / (1024 * 1024)
        print(f"\nTotal space to be freed: {size_mb:.2f} MB")

    # Perform deletion if not dry run
    if not dry_run and runs_to_delete:
        deleted_count = 0
        for run in runs_to_delete:
            try:
                shutil.rmtree(run["path"])
                deleted_count += 1
                if verbose:
                    print(f"Deleted: {run['path']}")
            except Exception as e:
                # Report but keep going; one bad directory shouldn't stop the rest.
                print(f"Error deleting {run['path']}: {e}")
        print(f"\nSuccessfully deleted {deleted_count} run(s)")
    elif dry_run and runs_to_delete:
        print("\nDry run mode - no files were deleted")
        print("Run without --dry-run to actually delete these directories")
    elif not runs_to_delete:
        print("\nNo runs to delete - all runs have exported models or no runs found")

    return runs_to_keep, runs_to_delete
def main():
    """CLI entry point: analyze ``runs/`` and delete runs lacking exports."""
    parser = argparse.ArgumentParser(
        description="Clean up training runs without exported models"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed information about runs",
    )
    parser.add_argument(
        "--yes",
        "-y",
        action="store_true",
        help="Skip confirmation prompt",
    )
    args = parser.parse_args()

    # Nothing to clean when the runs directory is absent.
    if not Path("runs").exists():
        print("No 'runs' directory found. Nothing to clean.")
        return

    print("Analyzing runs directory...\n")

    # First pass is always a dry run so the user sees the full plan
    # before anything is removed.
    _, runs_to_delete = clean_runs(dry_run=True, verbose=args.verbose)
    if not runs_to_delete:
        return

    # Interactive confirmation, skipped for --dry-run and --yes.
    if not args.dry_run and not args.yes:
        print("\n" + "=" * 60)
        response = input(f"Are you sure you want to delete {len(runs_to_delete)} run(s)? [y/N]: ")
        if response.lower() != 'y':
            print("Cleanup cancelled")
            return

    # Second pass performs the actual deletion.
    if not args.dry_run:
        print("\nPerforming cleanup...")
        clean_runs(dry_run=False, verbose=args.verbose)


if __name__ == "__main__":
    main()