{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "b2b05c00", "metadata": {}, "outputs": [], "source": [ "!pip install -q transformers torch huggingface_hub pandas numpy kaggle\n", "\n", "import os\n", "from pathlib import Path\n", "\n", "kaggle_json_path = Path.home() / '.kaggle' / 'kaggle.json'\n", "\n", "if not kaggle_json_path.exists():\n", " print(\"Kaggle credentials not found.\")\n", " print(\"\\nIf you have kaggle.json in the current directory:\")\n", " if Path('kaggle.json').exists():\n", " kaggle_json_path.parent.mkdir(exist_ok=True, parents=True)\n", " import shutil\n", " shutil.copy('kaggle.json', kaggle_json_path)\n", " kaggle_json_path.chmod(0o600)\n", " print(\"Kaggle credentials configured\")\n", " else:\n", " print(\"\\nPlease upload kaggle.json to this directory, then re-run this cell.\")\n", " print(\"Download from: https://www.kaggle.com/settings\")\n", "else:\n", " print(\"Kaggle credentials found\")\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import torch\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", "from torch.cuda.amp import autocast\n", "from huggingface_hub import hf_hub_download\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "class Config:\n", " HF_REPO_ID = \"YOUR_USERNAME/emotion-classifier-deberta-v3\"\n", " COMPETITION_NAME = \"2025-sep-dl-gen-ai-project\"\n", " LABELS = [\"anger\", \"fear\", \"joy\", \"sadness\", \"surprise\"]\n", " MAX_LEN = 128\n", " BATCH_SIZE = 32\n", " TEST_CSV = \"/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\"\n", " SUBMISSION_PATH = \"submission.csv\"\n", "\n", "CONFIG = Config()\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "print(f\"Using device: {device}\")\n", "if torch.cuda.is_available():\n", " print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n", "\n", "print(f\"Loading model from HuggingFace: {CONFIG.HF_REPO_ID}\")\n", "\n", "try:\n", " print(\" Loading model...\")\n", " model = AutoModelForSequenceClassification.from_pretrained(\n", " CONFIG.HF_REPO_ID,\n", " num_labels=len(CONFIG.LABELS),\n", " problem_type=\"multi_label_classification\"\n", " )\n", " model.to(device)\n", " model.eval()\n", " print(\" Model loaded\")\n", " \n", " print(\" Loading tokenizer...\")\n", " tokenizer = AutoTokenizer.from_pretrained(CONFIG.HF_REPO_ID)\n", " print(\" Tokenizer loaded\")\n", " \n", " print(\" Loading optimized thresholds...\")\n", " try:\n", " threshold_path = hf_hub_download(\n", " repo_id=CONFIG.HF_REPO_ID,\n", " filename=\"best_thresholds.npy\"\n", " )\n", " best_thresholds = np.load(threshold_path)\n", " print(\" Optimized thresholds loaded\")\n", " print(f\"\\n Thresholds per label:\")\n", " for i, label in enumerate(CONFIG.LABELS):\n", " print(f\" {label}: {best_thresholds[i]:.3f}\")\n", " except Exception as e:\n", " print(f\" Could not load thresholds: {e}\")\n", " print(\" Using default thresholds of 0.5\")\n", " best_thresholds = np.array([0.5] * len(CONFIG.LABELS))\n", " \n", " print(\"\\nModel setup complete\")\n", " \n", "except Exception as e:\n", " print(f\"\\nError loading model: {e}\")\n", " print(\"\\nPlease ensure:\")\n", " print(\"1. You've updated CONFIG.HF_REPO_ID with your actual repository ID\")\n", " print(\"2. The model was successfully uploaded in the training notebook\")\n", " print(\"3. 
    "\n",
    "def ensure_text_column(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"Return df with its text column named 'text', renaming a common alias if needed.\"\"\"\n",
    "    if \"text\" in df.columns:\n",
    "        return df\n",
    "    for c in [\"comment_text\", \"sentence\", \"content\", \"review\"]:\n",
    "        if c in df.columns:\n",
    "            return df.rename(columns={c: \"text\"})\n",
    "    raise ValueError(\"No text column found. Add/rename your text column to 'text'.\")\n",
    "\n",
    "class EmotionDS(torch.utils.data.Dataset):\n",
    "    \"\"\"Tokenizes one text per item; padding to max_len keeps batch shapes fixed.\"\"\"\n",
    "\n",
    "    def __init__(self, texts, tokenizer, max_len):\n",
    "        self.texts = texts\n",
    "        self.tok = tokenizer\n",
    "        self.max_len = max_len\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.texts)\n",
    "\n",
    "    def __getitem__(self, i):\n",
    "        enc = self.tok(\n",
    "            self.texts[i],\n",
    "            truncation=True,\n",
    "            padding=\"max_length\",\n",
    "            max_length=self.max_len,\n",
    "            return_tensors=\"pt\",\n",
    "        )\n",
    "        return {k: v.squeeze(0) for k, v in enc.items()}\n",
    "\n",
    "print(f\"Loading test data from: {CONFIG.TEST_CSV}\")\n",
    "\n",
    "if not os.path.exists(CONFIG.TEST_CSV):\n",
    "    print(\"Test CSV not found. Please check the path.\")\n",
    "    print(\"\\nIf you're running locally, make sure you have the test data.\")\n",
    "    print(\"On Kaggle, ensure you've added the competition data as input.\")\n",
    "    raise FileNotFoundError(CONFIG.TEST_CSV)\n",
    "\n",
    "df_test = pd.read_csv(CONFIG.TEST_CSV)\n",
    "df_test = ensure_text_column(df_test)\n",
    "\n",
    "print(f\"Test data loaded: {len(df_test)} samples\")\n",
    "print(f\"\\nColumns: {df_test.columns.tolist()}\")\n",
    "print(\"\\nFirst few rows:\")\n",
    "print(df_test.head())\n",
    "\n",
    "print(\"\\nGenerating predictions...\\n\")\n",
    "\n",
    "test_texts = df_test[\"text\"].tolist()\n",
    "test_dataset = EmotionDS(test_texts, tokenizer, CONFIG.MAX_LEN)\n",
    "test_loader = torch.utils.data.DataLoader(\n",
    "    test_dataset,\n",
    "    batch_size=CONFIG.BATCH_SIZE,\n",
    "    shuffle=False,  # row order must match df_test for the submission\n",
    "    num_workers=2,\n",
    "    pin_memory=True,\n",
    ")\n",
    "\n",
    "all_preds = []\n",
    "\n",
    "with torch.no_grad():\n",
    "    for batch_idx, batch in enumerate(test_loader):\n",
    "        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}\n",
    "\n",
    "        # Mixed precision only helps on GPU; disable it on CPU.\n",
    "        with torch.autocast(device_type=device.type, enabled=device.type == \"cuda\"):\n",
    "            outputs = model(\n",
    "                input_ids=batch[\"input_ids\"],\n",
    "                attention_mask=batch[\"attention_mask\"],\n",
    "            )\n",
    "\n",
    "        # Sigmoid per label (multi-label), not softmax across labels.\n",
    "        probs = torch.sigmoid(outputs.logits).float().cpu().numpy()\n",
    "        all_preds.append(probs)\n",
    "\n",
    "        if (batch_idx + 1) % 10 == 0:\n",
    "            progress = (batch_idx + 1) * CONFIG.BATCH_SIZE\n",
    "            print(f\"  Processed {min(progress, len(df_test))}/{len(df_test)} samples...\")\n",
    "\n",
    "all_probs = np.vstack(all_preds)\n",
    "\n",
    "print(f\"\\nPredictions generated for {len(all_probs)} samples\")\n",
    "print(f\"Shape: {all_probs.shape}\")\n",
    "\n",
    "print(\"\\nApplying optimized thresholds...\\n\")\n",
    "\n",
    "# Broadcast: each column is compared against its own tuned threshold.\n",
    "final_predictions = (all_probs >= best_thresholds).astype(int)\n",
    "\n",
    "print(\"Thresholds applied\")\n",
    "print(\"\\nPrediction distribution:\")\n",
    "for i, label in enumerate(CONFIG.LABELS):\n",
    "    count = final_predictions[:, i].sum()\n",
    "    percentage = (count / len(final_predictions)) * 100\n",
    "    print(f\"  {label:<12} {count:>6} samples ({percentage:>5.1f}%)\")\n",
    "\n",
    "avg_labels_per_sample = final_predictions.sum(axis=1).mean()\n",
    "print(f\"\\nAverage labels per sample: {avg_labels_per_sample:.2f}\")\n",
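    "\n",
    "# Optional post-processing sketch: rows where no probability crossed its\n",
    "# threshold come out all-zero. Whether empty rows hurt depends on the\n",
    "# competition metric (an assumption here); a common fallback is to force\n",
    "# the single highest-probability label on those rows. Uncomment to apply.\n",
    "empty_rows = final_predictions.sum(axis=1) == 0\n",
    "print(f\"Rows with no predicted label: {empty_rows.sum()}\")\n",
    "# final_predictions[np.where(empty_rows)[0], all_probs[empty_rows].argmax(axis=1)] = 1\n",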
    "\n",
    "print(\"\\nCreating submission file...\\n\")\n",
    "\n",
    "submission = pd.DataFrame()\n",
    "\n",
    "# Reuse the test ids when present; otherwise fall back to a 0..N-1 range.\n",
    "if \"id\" in df_test.columns:\n",
    "    submission[\"id\"] = df_test[\"id\"]\n",
    "else:\n",
    "    submission[\"id\"] = np.arange(len(df_test))\n",
    "\n",
    "for i, label in enumerate(CONFIG.LABELS):\n",
    "    submission[label] = final_predictions[:, i]\n",
    "\n",
    "submission.to_csv(CONFIG.SUBMISSION_PATH, index=False)\n",
    "\n",
    "print(f\"Submission file saved to: {CONFIG.SUBMISSION_PATH}\")\n",
    "print(\"\\nSubmission preview:\")\n",
    "print(submission.head(10))\n",
    "print(f\"\\nTotal rows: {len(submission)}\")\n",
    "print(f\"Columns: {submission.columns.tolist()}\")\n",
    "\n",
    "print(\"Verifying submission format...\\n\")\n",
    "\n",
    "required_columns = [\"id\"] + CONFIG.LABELS\n",
    "submission_columns = submission.columns.tolist()\n",
    "\n",
    "if submission_columns == required_columns:\n",
    "    print(\"Submission format is correct\")\n",
    "    print(f\"  Columns: {submission_columns}\")\n",
    "\n",
    "    if submission[CONFIG.LABELS].isin([0, 1]).all().all():\n",
    "        print(\"All predictions are binary (0 or 1)\")\n",
    "    else:\n",
    "        print(\"Warning: some predictions are not binary\")\n",
    "\n",
    "    if not submission.isnull().any().any():\n",
    "        print(\"No missing values\")\n",
    "    else:\n",
    "        print(\"Missing values detected\")\n",
    "        print(submission.isnull().sum())\n",
    "else:\n",
    "    print(\"Submission format is incorrect\")\n",
    "    print(f\"  Expected: {required_columns}\")\n",
    "    print(f\"  Got: {submission_columns}\")\n",
    "\n",
    "print(\"\\nSubmitting to Kaggle...\\n\")\n",
    "\n",
    "submission_message = f\"DeBERTa-v3 with optimized thresholds - HF: {CONFIG.HF_REPO_ID}\"\n",
    "\n",
    "try:\n",
    "    # Importing kaggle authenticates against ~/.kaggle/kaggle.json,\n",
    "    # so the import itself raises if credentials are missing.\n",
    "    import kaggle\n",
    "\n",
    "    kaggle.api.competition_submit(\n",
    "        file_name=CONFIG.SUBMISSION_PATH,\n",
    "        message=submission_message,\n",
    "        competition=CONFIG.COMPETITION_NAME,\n",
    "    )\n",
    "\n",
    "    print(\"Submission successful\")\n",
    "    print(f\"\\nSubmission message: {submission_message}\")\n",
    "    print(\"\\nView your submission at:\")\n",
    "    print(f\"  https://www.kaggle.com/c/{CONFIG.COMPETITION_NAME}/submissions\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"Submission failed: {e}\")\n",
    "    print(\"\\nPossible reasons:\")\n",
    "    print(\"1. Kaggle API credentials not configured\")\n",
    "    print(\"2. Competition name is incorrect\")\n",
    "    print(\"3. You've reached the daily submission limit\")\n",
    "    print(\"4. The competition has ended\")\n",
    "    print(\"\\nYou can manually upload the submission.csv file to Kaggle.\")\n",
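    "\n",
    "# Optional: persist the raw probabilities next to the binary submission so\n",
    "# thresholds can be re-tuned (e.g. with tune_thresholds above) or runs\n",
    "# ensembled later without repeating inference. The file name is arbitrary.\n",
    "np.save(\"test_probs.npy\", all_probs)\n",
    "print(\"Raw probabilities saved to test_probs.npy\")\n",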
    "\n",
    "print(\"\\n\" + \"=\" * 60)\n",
    "print(\"PREDICTION STATISTICS\")\n",
    "print(\"=\" * 60)\n",
    "\n",
    "labels_per_sample = final_predictions.sum(axis=1)\n",
    "print(\"\\nLabels per sample distribution:\")\n",
    "for i in range(len(CONFIG.LABELS) + 1):  # from 0 labels up to all of them\n",
    "    count = (labels_per_sample == i).sum()\n",
    "    percentage = (count / len(labels_per_sample)) * 100\n",
    "    print(f\"  {i} labels: {count:>6} samples ({percentage:>5.1f}%)\")\n",
    "\n",
    "print(\"\\nMost common label combinations:\")\n",
    "label_combinations = []\n",
    "for pred in final_predictions:\n",
    "    active_labels = [CONFIG.LABELS[i] for i, val in enumerate(pred) if val == 1]\n",
    "    if active_labels:\n",
    "        label_combinations.append(\", \".join(sorted(active_labels)))\n",
    "    else:\n",
    "        label_combinations.append(\"(none)\")\n",
    "\n",
    "combo_counts = Counter(label_combinations)\n",
    "for combo, count in combo_counts.most_common(10):\n",
    "    percentage = (count / len(label_combinations)) * 100\n",
    "    print(f\"  {combo:<30} {count:>6} ({percentage:>5.1f}%)\")\n",
    "\n",
    "print(\"\\nAverage probability per label:\")\n",
    "for i, label in enumerate(CONFIG.LABELS):\n",
    "    avg_prob = all_probs[:, i].mean()\n",
    "    std_prob = all_probs[:, i].std()\n",
    "    print(f\"  {label:<12} {avg_prob:.4f} +/- {std_prob:.4f}\")\n",
    "\n",
    "print(\"\\n\" + \"=\" * 60)\n",
    "print(\"SUBMISSION COMPLETE\")\n",
    "print(\"=\" * 60)\n",
    "print(f\"\\nSubmission file: {CONFIG.SUBMISSION_PATH}\")\n",
    "print(f\"Model used: {CONFIG.HF_REPO_ID}\")\n",
    "print(f\"Optimized thresholds: {best_thresholds}\")\n",
    "print(\"\\nCheck Kaggle leaderboard for your score\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}