# pulse_core_1 / clean.py
# Author: Vu Anh
# Commit 08bbb4c: Add VLSP2016 dataset support and comprehensive evaluation updates
#!/usr/bin/env python3
"""
Clean up script for removing training runs without exported models.
Removes all directories in runs/ folder that don't have a corresponding exported model file.
"""
import argparse
import os
import shutil
from pathlib import Path
import glob
def find_exported_models():
    """Find all exported model files in the current working directory.

    Scans for files matching ``*_YYYYMMDD_HHMMSS.joblib`` and extracts the
    trailing timestamp, which links each export back to the run directory
    of the same name under ``runs/``.

    Returns:
        List of dicts with keys ``file`` (relative file path) and
        ``timestamp`` (the ``YYYYMMDD_HHMMSS`` suffix).
    """
    exported_models = []
    seen_files = set()  # Guards against duplicates should patterns ever overlap
    # Look for pattern: *_YYYYMMDD_HHMMSS.joblib
    # This matches any exported model with timestamp format
    patterns = [
        "*_20[0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].joblib"
    ]
    for pattern in patterns:
        for filepath in glob.glob(pattern):
            # Skip if we've already seen this file
            if filepath in seen_files:
                continue
            seen_files.add(filepath)
            # Extract timestamp from filename
            # Format: <prefix>_YYYYMMDD_HHMMSS.joblib
            filename = os.path.basename(filepath)
            parts = filename.replace(".joblib", "").split("_")
            # The glob guarantees at least "<prefix>_<date>_<time>", i.e. three
            # parts. The previous ">= 4" check silently dropped valid exports
            # whose prefix contained no underscore (e.g. "mini_<ts>.joblib").
            if len(parts) >= 3:
                # The last two parts are the date and time components.
                timestamp = "_".join(parts[-2:])
                exported_models.append({
                    "file": filepath,
                    "timestamp": timestamp
                })
    return exported_models
def find_all_runs():
    """Collect every run directory under ``runs/``.

    Returns:
        List of dicts with keys ``path`` (a ``Path`` to the run directory)
        and ``timestamp`` (the directory's name, expected to be in
        ``YYYYMMDD_HHMMSS`` form). Empty list when ``runs/`` is absent.
    """
    runs_dir = Path("runs")
    if not runs_dir.exists():
        return []
    # Run directories are named by timestamp, so the directory name itself
    # serves as the run's identifier; plain files are ignored.
    return [
        {"path": entry, "timestamp": entry.name}
        for entry in runs_dir.iterdir()
        if entry.is_dir()
    ]
def clean_runs(dry_run=False, verbose=False):
    """
    Remove all run directories that don't have exported models.

    A run is kept when an exported ``*_<timestamp>.joblib`` file exists whose
    timestamp matches the run directory's name; every other run directory
    under ``runs/`` is deleted (unless ``dry_run``).

    Args:
        dry_run: If True, only show what would be deleted without actually deleting
        verbose: If True, show detailed information

    Returns:
        Tuple of (runs_to_keep, runs_to_delete)
    """
    import json  # only needed for the verbose metadata preview below

    # Find all exported models and index them by timestamp for O(1) lookups.
    exported_models = find_exported_models()
    exported_timestamps = {model["timestamp"] for model in exported_models}

    # Find all runs
    all_runs = find_all_runs()

    # Categorize runs: a run survives iff an export shares its timestamp.
    runs_to_keep = []
    runs_to_delete = []
    for run in all_runs:
        if run["timestamp"] in exported_timestamps:
            runs_to_keep.append(run)
        else:
            runs_to_delete.append(run)

    # Show summary
    print(f"Found {len(all_runs)} total runs")
    print(f"Found {len(exported_models)} exported models")
    print(f"Runs to keep: {len(runs_to_keep)}")
    print(f"Runs to delete: {len(runs_to_delete)}")

    if verbose and exported_models:
        print("\nExported models found:")
        for model in exported_models:
            print(f" - {model['file']} (timestamp: {model['timestamp']})")

    if verbose and runs_to_keep:
        print("\nRuns with exported models (will be kept):")
        for run in runs_to_keep:
            print(f" - {run['path']}")

    if runs_to_delete:
        print("\nRuns without exported models (will be deleted):")
        for run in runs_to_delete:
            print(f" - {run['path']}")
            if verbose:
                # Best-effort metadata preview; a malformed or unreadable
                # metadata.json must never abort the cleanup.
                metadata_path = run["path"] / "metadata.json"
                if metadata_path.exists():
                    try:
                        with open(metadata_path) as f:
                            metadata = json.load(f)
                        # BUG FIX: the old code applied ``:.4f`` directly to the
                        # ``'N/A'`` fallback string, raising an uncaught
                        # ValueError whenever test_accuracy was missing or
                        # non-numeric. Format only when the value is a number.
                        accuracy = metadata.get("test_accuracy")
                        accuracy_text = (
                            f"{accuracy:.4f}"
                            if isinstance(accuracy, (int, float))
                            else "N/A"
                        )
                        print(f" Model: {metadata.get('model_name', 'unknown')}, "
                              f"Dataset: {metadata.get('dataset', 'unknown')}, "
                              f"Accuracy: {accuracy_text}")
                    except (OSError, json.JSONDecodeError, KeyError):
                        pass

    # Calculate space to be freed
    total_size = 0
    for run in runs_to_delete:
        total_size += sum(f.stat().st_size for f in run["path"].rglob("*") if f.is_file())

    if total_size > 0:
        size_mb = total_size / (1024 * 1024)
        print(f"\nTotal space to be freed: {size_mb:.2f} MB")

    # Perform deletion if not dry run
    if not dry_run and runs_to_delete:
        deleted_count = 0
        for run in runs_to_delete:
            try:
                shutil.rmtree(run["path"])
                deleted_count += 1
                if verbose:
                    print(f"Deleted: {run['path']}")
            except Exception as e:
                # Report but keep going; one bad directory shouldn't stop the rest.
                print(f"Error deleting {run['path']}: {e}")
        print(f"\nSuccessfully deleted {deleted_count} run(s)")
    elif dry_run and runs_to_delete:
        print("\nDry run mode - no files were deleted")
        print("Run without --dry-run to actually delete these directories")
    elif not runs_to_delete:
        print("\nNo runs to delete - all runs have exported models or no runs found")

    return runs_to_keep, runs_to_delete
def main():
    """CLI entry point: analyze ``runs/`` and delete runs lacking exports."""
    parser = argparse.ArgumentParser(
        description="Clean up training runs without exported models"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed information about runs",
    )
    parser.add_argument(
        "--yes",
        "-y",
        action="store_true",
        help="Skip confirmation prompt",
    )
    args = parser.parse_args()

    # Nothing to clean when the runs directory is absent.
    if not Path("runs").exists():
        print("No 'runs' directory found. Nothing to clean.")
        return

    print("Analyzing runs directory...\n")

    # First pass is always a dry run so the user sees the full plan
    # before anything is removed.
    _, runs_to_delete = clean_runs(dry_run=True, verbose=args.verbose)
    if not runs_to_delete:
        return

    # Interactive confirmation, skipped for --dry-run and --yes.
    if not args.dry_run and not args.yes:
        print("\n" + "=" * 60)
        response = input(f"Are you sure you want to delete {len(runs_to_delete)} run(s)? [y/N]: ")
        if response.lower() != 'y':
            print("Cleanup cancelled")
            return

    # Second pass performs the actual deletion.
    if not args.dry_run:
        print("\nPerforming cleanup...")
        clean_runs(dry_run=False, verbose=args.verbose)


if __name__ == "__main__":
    main()