"""Training script for Vietnamese sentiment classification.

Trains TF-IDF + classical machine learning models on the VLSP2016
sentiment dataset or the UTS2017_Bank aspect-sentiment dataset.
"""

import argparse
import json
import logging
import os
import sys
import time
from datetime import datetime

import joblib
import numpy as np
from datasets import load_dataset
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def setup_logging(run_name):
    """Set up logging so console output and a log file are saved to the runs folder."""
    runs_dir = "runs"
    os.makedirs(runs_dir, exist_ok=True)

    run_dir = os.path.join(runs_dir, run_name)
    os.makedirs(run_dir, exist_ok=True)

    log_file = os.path.join(run_dir, "training.log")

    # force=True (Python 3.8+) replaces handlers left over from a previous
    # run, so each run in compare mode logs to its own file.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
        force=True,
    )

    return run_dir


def load_uts2017_data(split_ratio=0.2, random_state=42, n_samples=None):
    """Load and prepare the UTS2017_Bank aspect sentiment dataset.

    Args:
        split_ratio: Ratio for the train/test split
        random_state: Random seed for reproducibility
        n_samples: Optional limit on the number of samples

    Returns:
        Tuple of (X_train, y_train), (X_test, y_test)
    """
    print("Loading UTS2017_Bank aspect sentiment dataset from Hugging Face...")

    dataset = load_dataset("undertheseanlp/UTS2017_Bank", "aspect_sentiment")

    train_data = dataset["train"]

    texts = []
    labels = []

    # Keep only items with at least one aspect annotation, and encode the
    # first aspect as a single "aspect#sentiment" label.
    for item in train_data:
        text = item["text"]
        aspect_data = item["aspects"]

        if aspect_data:
            aspect = aspect_data[0]["aspect"]
            sentiment = aspect_data[0]["sentiment"]

            texts.append(text)
            labels.append(f"{aspect}#{sentiment}")

    # Optionally subsample for quick experiments.
    if n_samples and n_samples < len(texts):
        indices = np.arange(len(texts))
        np.random.seed(random_state)
        np.random.shuffle(indices)
        indices = indices[:n_samples]
        texts = [texts[i] for i in indices]
        labels = [labels[i] for i in indices]

    X = np.array(texts)
    y = np.array(labels)

    # Stratification requires at least two samples per class.
    min_samples_per_class = 2
    _, class_counts = np.unique(y, return_counts=True)
    can_stratify = all(count >= min_samples_per_class for count in class_counts)

    if can_stratify:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split_ratio, random_state=random_state, stratify=y
        )
    else:
        print(
            f"Warning: Some classes have fewer than {min_samples_per_class} samples. Disabling stratification."
        )
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split_ratio, random_state=random_state
        )

    print(f"Dataset loaded: {len(X_train)} train samples, {len(X_test)} test samples")
    print(f"Number of unique labels: {len(set(y))}")

    return (X_train, y_train), (X_test, y_test)
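
# Quick sanity check (a sketch, not part of the training flow): load a small
# subset and inspect the first few labels.
#
#   (X_tr, y_tr), (X_te, y_te) = load_uts2017_data(n_samples=500)
#   print(y_tr[:3])  # labels of the form "aspect#sentiment"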


def load_vlsp2016_data(use_predefined_split=True, split_ratio=0.2, random_state=42, n_samples=None):
    """Load and prepare the VLSP2016 sentiment dataset.

    Args:
        use_predefined_split: If True, use the predefined train/test split from the dataset
        split_ratio: Ratio for the train/test split (only used if use_predefined_split is False)
        random_state: Random seed for reproducibility
        n_samples: Optional limit on the number of samples

    Returns:
        Tuple of (X_train, y_train), (X_test, y_test)
    """
    print("Loading VLSP2016 sentiment dataset from Hugging Face...")

    dataset = load_dataset("ura-hcmut/vlsp2016")

    if use_predefined_split:
        train_data = dataset["train"]
        test_data = dataset["test"]

        X_train = [item["Data"] for item in train_data]
        y_train = [item["Class"] for item in train_data]
        X_test = [item["Data"] for item in test_data]
        y_test = [item["Class"] for item in test_data]

        # Optionally subsample both splits for quick experiments; the test set
        # is capped at roughly 20% of the requested sample budget (at least 1).
        if n_samples:
            if n_samples < len(X_train):
                indices = np.arange(len(X_train))
                np.random.seed(random_state)
                np.random.shuffle(indices)
                indices = indices[:n_samples]
                X_train = [X_train[i] for i in indices]
                y_train = [y_train[i] for i in indices]
            if n_samples < len(X_test):
                test_samples = max(1, int(n_samples * 0.2))
                indices = np.arange(len(X_test))
                np.random.seed(random_state)
                np.random.shuffle(indices)
                indices = indices[:test_samples]
                X_test = [X_test[i] for i in indices]
                y_test = [y_test[i] for i in indices]

        X_train = np.array(X_train)
        y_train = np.array(y_train)
        X_test = np.array(X_test)
        y_test = np.array(y_test)
    else:
        # Pool both predefined splits, then re-split below.
        all_data = list(dataset["train"]) + list(dataset["test"])

        texts = [item["Data"] for item in all_data]
        labels = [item["Class"] for item in all_data]

        # Note: this path truncates rather than shuffles when subsampling.
        if n_samples and n_samples < len(texts):
            texts = texts[:n_samples]
            labels = labels[:n_samples]

        X = np.array(texts)
        y = np.array(labels)

        # Stratification requires at least two samples per class.
        min_samples_per_class = 2
        _, class_counts = np.unique(y, return_counts=True)
        can_stratify = all(count >= min_samples_per_class for count in class_counts)

        if can_stratify:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=split_ratio, random_state=random_state, stratify=y
            )
        else:
            print(
                f"Warning: Some classes have fewer than {min_samples_per_class} samples. Disabling stratification."
            )
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=split_ratio, random_state=random_state
            )

    print(f"Dataset loaded: {len(X_train)} train samples, {len(X_test)} test samples")
    print(f"Number of unique labels: {len(set(y_train))}")
    print(f"Labels: {sorted(set(y_train))}")

    return (X_train, y_train), (X_test, y_test)


def get_available_models():
    """Return the available classifier options, keyed by CLI model name."""
    return {
        # Linear and probabilistic models
        "logistic": LogisticRegression(max_iter=1000, random_state=42),
        "svc_linear": SVC(kernel="linear", random_state=42, probability=True),
        "svc_rbf": SVC(kernel="rbf", random_state=42, probability=True, gamma="scale"),
        "naive_bayes": MultinomialNB(),
        # Tree-based models
        "decision_tree": DecisionTreeClassifier(random_state=42, max_depth=10),
        "random_forest": RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1),
        # Boosting ensembles
        "gradient_boost": GradientBoostingClassifier(n_estimators=100, random_state=42, max_depth=5),
        "ada_boost": AdaBoostClassifier(n_estimators=100, random_state=42),
        # Neural network
        "mlp": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42, early_stopping=True),
    }
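
# The dictionary keys above are exactly the model names accepted by the
# --model and --compare-models options defined in main().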


def load_data(dataset_name="vlsp2016", split_ratio=0.2, random_state=42, n_samples=None):
    """Load data from the specified dataset.

    Args:
        dataset_name: Name of the dataset to load ('vlsp2016' or 'uts2017')
        split_ratio: Ratio for train/test split
        random_state: Random seed for reproducibility
        n_samples: Optional limit on number of samples

    Returns:
        Tuple of (X_train, y_train), (X_test, y_test), dataset_display_name
    """
    if dataset_name.lower() == "vlsp2016":
        (X_train, y_train), (X_test, y_test) = load_vlsp2016_data(
            use_predefined_split=True, split_ratio=split_ratio,
            random_state=random_state, n_samples=n_samples
        )
        display_name = "VLSP2016_Sentiment"
    elif dataset_name.lower() == "uts2017":
        (X_train, y_train), (X_test, y_test) = load_uts2017_data(
            split_ratio=split_ratio, random_state=random_state, n_samples=n_samples
        )
        display_name = "UTS2017_Bank_AspectSentiment"
    else:
        raise ValueError(f"Unknown dataset: {dataset_name}. Choose 'vlsp2016' or 'uts2017'")

    return (X_train, y_train), (X_test, y_test), display_name
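
# Dispatch sketch: both loaders return the same shapes, so callers can switch
# datasets by name, e.g.
#
#   (X_tr, y_tr), (X_te, y_te), name = load_data("uts2017", n_samples=1000)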


def train_model(
    dataset="vlsp2016",
    model_name="logistic",
    max_features=20000,
    ngram_range=(1, 2),
    split_ratio=0.2,
    n_samples=None,
    export_model=False,
):
    """Train a single model with the specified parameters.

    Args:
        dataset: Name of the dataset to use ('vlsp2016' or 'uts2017')
        model_name: Name of the model to train (any key from get_available_models())
        max_features: Maximum number of features for the TF-IDF vectorizer
        ngram_range: N-gram range for feature extraction
        split_ratio: Train/test split ratio
        n_samples: Optional limit on the number of samples
        export_model: Whether to export the model for distribution

    Returns:
        Dictionary containing training results
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = setup_logging(timestamp)

    logging.info(f"Starting training run: {timestamp}")
    logging.info(f"Dataset: {dataset}")
    logging.info(f"Model: {model_name}")
    logging.info(f"Max features: {max_features}")
    logging.info(f"N-gram range: {ngram_range}")
    if n_samples:
        logging.info(f"Sample limit: {n_samples}")

    output_folder = os.path.join(run_dir, "models")
    os.makedirs(output_folder, exist_ok=True)

    # Load the data and log the label distribution.
    logging.info(f"Loading {dataset} dataset...")
    (X_train, y_train), (X_test, y_test), dataset_name = load_data(
        dataset_name=dataset, split_ratio=split_ratio, random_state=42, n_samples=n_samples
    )

    unique_labels = sorted(set(y_train))
    label_counts_train = {label: np.sum(y_train == label) for label in unique_labels}
    label_counts_test = {label: np.sum(y_test == label) for label in unique_labels}

    logging.info(f"Train samples: {len(X_train)}")
    logging.info(f"Test samples: {len(X_test)}")
    logging.info(f"Unique labels: {len(unique_labels)}")
    logging.info(f"Label distribution (train): {label_counts_train}")
    logging.info(f"Label distribution (test): {label_counts_test}")

    # Resolve the classifier by name.
    available_models = get_available_models()
    if model_name not in available_models:
        raise ValueError(
            f"Model '{model_name}' not available. Choose from: {list(available_models.keys())}"
        )

    classifier = available_models[model_name]
    clf_name = classifier.__class__.__name__
    logging.info(f"Selected classifier: {clf_name}")

    config_name = f"{dataset_name}_{clf_name}_feat{max_features // 1000}k_ngram{ngram_range[0]}-{ngram_range[1]}"

    logging.info("=" * 60)
    logging.info(f"Training: {config_name}")
    logging.info("=" * 60)

    # Build the feature-extraction + classifier pipeline:
    # raw text -> token counts -> TF-IDF weights -> classifier.
    logging.info(
        f"Creating pipeline with max_features={max_features}, ngram_range={ngram_range}"
    )

    text_clf = Pipeline(
        [
            (
                "vect",
                CountVectorizer(max_features=max_features, ngram_range=ngram_range),
            ),
            ("tfidf", TfidfTransformer(use_idf=True)),
            ("clf", classifier),
        ]
    )

    # Fit on the training split.
    logging.info("Training model...")
    start_time = time.time()
    text_clf.fit(X_train, y_train)
    train_time = time.time() - start_time
    logging.info(f"Training completed in {train_time:.2f} seconds")

    # Evaluate on the training set (useful for spotting overfitting).
    logging.info("Evaluating on training set...")
    train_predictions = text_clf.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_predictions)
    logging.info(f"Training accuracy: {train_accuracy:.4f}")

    # Evaluate on the held-out test set.
    logging.info("Evaluating on test set...")
    start_time = time.time()
    test_predictions = text_clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    prediction_time = time.time() - start_time
    logging.info(f"Test accuracy: {test_accuracy:.4f}")
    logging.info(f"Prediction time: {prediction_time:.2f} seconds")

    logging.info("Classification Report:")
    report = classification_report(y_test, test_predictions, zero_division=0)
    logging.info(report)
    print("\nClassification Report:")
    print(report)

    report_dict = classification_report(
        y_test, test_predictions, zero_division=0, output_dict=True
    )

    cm = confusion_matrix(y_test, test_predictions, labels=unique_labels)
    logging.info(f"Confusion Matrix shape: {cm.shape}")

    # Save the fitted pipeline under both a generic name and a
    # configuration-specific name.
    model_path = os.path.join(output_folder, "model.joblib")
    joblib.dump(text_clf, model_path)
    logging.info(f"Model saved to {model_path}")
    print(f"Model saved to {model_path}")

    config_model_path = os.path.join(output_folder, f"{config_name}.joblib")
    joblib.dump(text_clf, config_model_path)
    logging.info(f"Model also saved as {config_model_path}")

    # Optionally export a copy to the project root for distribution.
    if export_model:
        run_id = os.path.basename(run_dir)
        dataset_prefix = dataset.lower()
        export_filename = f"{dataset_prefix}_sentiment_{run_id}.joblib"
        export_path = os.path.join(".", export_filename)
        joblib.dump(text_clf, export_path)
        logging.info(f"Model exported as {export_path}")
        print(f"Model exported for distribution: {export_filename}")

    # Persist the label set, one label per line.
    label_mapping_path = os.path.join(output_folder, "labels.txt")
    with open(label_mapping_path, "w", encoding="utf-8") as f:
        for label in unique_labels:
            f.write(f"{label}\n")
    logging.info(f"Label mapping saved to {label_mapping_path}")

    # Collect everything needed to reproduce and compare this run.
    metadata = {
        "timestamp": timestamp,
        "dataset": dataset,
        "dataset_name": dataset_name,
        "config_name": config_name,
        "model_name": model_name,
        "classifier": clf_name,
        "max_features": max_features,
        "ngram_range": list(ngram_range),
        "split_ratio": split_ratio,
        "n_samples": n_samples,
        "train_samples": len(X_train),
        "test_samples": len(X_test),
        "unique_labels": len(unique_labels),
        "labels": unique_labels,
        "train_accuracy": float(train_accuracy),
        "test_accuracy": float(test_accuracy),
        "train_time": train_time,
        "prediction_time": prediction_time,
        "classification_report": report_dict,
        "confusion_matrix": cm.tolist(),
    }

    metadata_path = os.path.join(run_dir, "metadata.json")
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    logging.info(f"Metadata saved to {metadata_path}")

    print("\n" + "=" * 60)
    print("Training Summary")
    print("=" * 60)
    print(f"Model: {clf_name}")
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    print(f"Number of classes: {len(unique_labels)}")
    print(f"Training accuracy: {train_accuracy:.4f}")
    print(f"Test accuracy: {test_accuracy:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    print(f"Model saved to: {model_path}")
    print("=" * 60)

    return metadata
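
# Programmatic usage sketch: train one model on a small subsample and read the
# headline metric from the returned metadata.
#
#   metadata = train_model(dataset="vlsp2016", model_name="logistic", n_samples=2000)
#   print(metadata["test_accuracy"])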


def train_all_configurations(dataset="vlsp2016", models=None, num_rows=None):
    """Train multiple model configurations and compare the results."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = setup_logging(timestamp)

    logging.info(f"Starting comparison run: {timestamp}")
    logging.info(f"Dataset: {dataset}")
    if num_rows:
        logging.info(f"Sample limit: {num_rows}")

    if models is None:
        available_models = get_available_models()
        models = list(available_models.keys())

    logging.info(f"Models to compare: {models}")

    # Slower models get a smaller feature budget to keep runtimes manageable.
    configurations = []
    for model_name in models:
        if model_name in ["svc_rbf", "gradient_boost", "ada_boost", "mlp"]:
            configurations.append({
                "dataset": dataset,
                "model_name": model_name,
                "max_features": 10000,
                "ngram_range": (1, 2),
                "n_samples": num_rows,
            })
        else:
            configurations.append({
                "dataset": dataset,
                "model_name": model_name,
                "max_features": 20000,
                "ngram_range": (1, 2),
                "n_samples": num_rows,
            })

    results = []

    for config in configurations:
        print(f"\nTraining configuration: {config}")
        try:
            result = train_model(**config)
            results.append(result)
        except Exception as e:
            logging.error(f"Failed to train with config {config}: {e}")
            print(f"Error training configuration: {e}")

    comparison_path = os.path.join(run_dir, "comparison_results.json")
    with open(comparison_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Guard against every configuration failing, which would otherwise crash
    # the max() call below.
    if not results:
        print("No configurations trained successfully.")
        return results

    # Print a leaderboard sorted by test accuracy.
    print("\n" + "=" * 80)
    print("Model Comparison Results")
    print("=" * 80)
    print(
        f"{'Model':<10} {'Features':<10} {'N-gram':<10} {'Train Acc':<12} {'Test Acc':<12}"
    )
    print("-" * 80)

    for result in sorted(results, key=lambda x: x["test_accuracy"], reverse=True):
        model = result["classifier"][:8]
        features = f"{result['max_features'] // 1000}k"
        ngram = f"{result['ngram_range'][0]}-{result['ngram_range'][1]}"
        train_acc = result["train_accuracy"]
        test_acc = result["test_accuracy"]
        print(
            f"{model:<10} {features:<10} {ngram:<10} {train_acc:<12.4f} {test_acc:<12.4f}"
        )

    print("=" * 80)

    best_model = max(results, key=lambda x: x["test_accuracy"])
    print(f"\nBest model: {best_model['config_name']}")
    print(f"Test accuracy: {best_model['test_accuracy']:.4f}")

    return results
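
# Command-line sketch: compare a few models on a 5000-row subsample.
#
#   python train.py --compare-models logistic random_forest --num-rows 5000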


def train_notebook(dataset="vlsp2016", model_name="logistic", max_features=20000, ngram_min=1, ngram_max=2,
                   split_ratio=0.2, n_samples=None, compare=False, export_model=False):
    """Convenience function for training in Jupyter/Colab notebooks without argparse.

    Example usage:
        from train import train_notebook
        train_notebook(dataset="vlsp2016", model_name="logistic", max_features=20000, export_model=True)
    """
    if compare:
        print(f"Training and comparing multiple configurations on {dataset}...")
        # Pass the sample limit through so notebook users can run quick comparisons.
        return train_all_configurations(dataset=dataset, num_rows=n_samples)
    else:
        print(f"Training {model_name} model on {dataset} dataset...")
        print(f"Configuration: max_features={max_features}, ngram=({ngram_min}, {ngram_max})")

        return train_model(
            dataset=dataset,
            model_name=model_name,
            max_features=max_features,
            ngram_range=(ngram_min, ngram_max),
            split_ratio=split_ratio,
            n_samples=n_samples,
            export_model=export_model,
        )


def main():
    """Main function with argument parsing."""
    # Detect interactive environments so stray kernel arguments can be ignored.
    in_notebook = hasattr(sys, 'ps1') or 'ipykernel' in sys.modules or 'google.colab' in sys.modules

    parser = argparse.ArgumentParser(
        description="Train Vietnamese sentiment classification model on various datasets"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        choices=["vlsp2016", "uts2017"],
        default="vlsp2016",
        help="Dataset to use for training (default: vlsp2016)",
    )
    parser.add_argument(
        "--model",
        type=str,
        choices=["logistic", "svc_linear", "svc_rbf", "naive_bayes", "decision_tree", "random_forest", "gradient_boost", "ada_boost", "mlp"],
        default="logistic",
        help="Model type to train (default: logistic)",
    )
    parser.add_argument(
        "--max-features",
        type=int,
        default=20000,
        help="Maximum number of features for TF-IDF (default: 20000)",
    )
    parser.add_argument(
        "--ngram-min", type=int, default=1, help="Minimum n-gram size (default: 1)"
    )
    parser.add_argument(
        "--ngram-max", type=int, default=2, help="Maximum n-gram size (default: 2)"
    )
    parser.add_argument(
        "--split-ratio", type=float, default=0.2, help="Test split ratio (default: 0.2)"
    )
    parser.add_argument(
        "--num-rows",
        type=int,
        default=None,
        help="Limit number of rows/samples for quick testing (default: None - use all data)",
    )
    parser.add_argument(
        "--compare",
        action="store_true",
        help="Train and compare multiple configurations",
    )
    parser.add_argument(
        "--compare-models",
        nargs="+",
        help="List of specific models to compare (e.g., --compare-models logistic random_forest svc_rbf)",
        choices=["logistic", "svc_linear", "svc_rbf", "naive_bayes", "decision_tree", "random_forest", "gradient_boost", "ada_boost", "mlp"],
    )
    parser.add_argument(
        "--export-model",
        action="store_true",
        help="Export a copy of the trained model to the project root for distribution/publishing",
    )

    # parse_known_args lets the script run inside notebooks, where the kernel
    # injects its own command-line arguments.
    args, unknown = parser.parse_known_args()

    if in_notebook and unknown:
        print(f"Note: Running in Jupyter/Colab environment. Ignoring kernel arguments: {unknown}")

    if args.compare or args.compare_models:
        if args.compare_models:
            print(f"Training and comparing selected models: {args.compare_models}")
            print(f"Dataset: {args.dataset}")
            if args.num_rows:
                print(f"Using {args.num_rows} rows")
            train_all_configurations(dataset=args.dataset, models=args.compare_models, num_rows=args.num_rows)
        else:
            print("Training and comparing all available models...")
            print(f"Dataset: {args.dataset}")
            if args.num_rows:
                print(f"Using {args.num_rows} rows")
            train_all_configurations(dataset=args.dataset, num_rows=args.num_rows)
    else:
        print(f"Training {args.model} model on {args.dataset} dataset...")
        print(
            f"Configuration: max_features={args.max_features}, ngram=({args.ngram_min}, {args.ngram_max})"
        )

        train_model(
            dataset=args.dataset,
            model_name=args.model,
            max_features=args.max_features,
            ngram_range=(args.ngram_min, args.ngram_max),
            split_ratio=args.split_ratio,
            n_samples=args.num_rows,
            export_model=args.export_model,
        )


if __name__ == "__main__":
    main()