{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "025d2c7e",
   "metadata": {},
   "source": [
    "# Bi-LSTM Submission\n",
    "Generate predictions using the trained Bi-LSTM model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c33463c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q transformers torch pandas numpy huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8c71e09",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from transformers import AutoTokenizer\n",
    "from huggingface_hub import hf_hub_download\n",
    "\n",
    "# Config\n",
    "LABELS = [\"anger\", \"fear\", \"joy\", \"sadness\", \"surprise\"]\n",
    "MAX_LEN = 100\n",
    "BATCH_SIZE = 32\n",
    "EMBED_DIM = 100\n",
    "HIDDEN_DIM = 128\n",
    "DROPOUT = 0.3\n",
    "DEVICE = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# Paths\n",
    "TEST_CSV = \"/kaggle/input/2025-sep-dl-gen-ai-project/test.csv\"\n",
    "SUBMISSION_CSV = \"submission.csv\"\n",
    "MODEL_FILE = \"model.pth\"\n",
    "\n",
    "# HF Repo (Replace with your repo ID from training notebook)\n",
    "HF_REPO_ID = \"hrshlgunjal/emotion-classifier-bilstm\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c30485d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download Model if needed\n",
    "if not os.path.exists(MODEL_FILE):\n",
    "    try:\n",
    "        print(f\"Downloading model from {HF_REPO_ID}...\")\n",
    "        model_path = hf_hub_download(repo_id=HF_REPO_ID, filename=\"pytorch_model.bin\")\n",
    "        import shutil\n",
    "        shutil.copy(model_path, MODEL_FILE)\n",
    "        print(\"Model downloaded.\")\n",
    "    except Exception as e:\n",
    "        print(f\"Could not download model: {e}\")\n",
    "        print(\"Please ensure HF_REPO_ID is correct or upload 'model.pth' manually.\")\n",
    "else:\n",
    "    print(\"Model file found locally.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0ba4952",
   "metadata": {},
   "outputs": [],
   "source": [
    "class BiLSTM(nn.Module):\n",
    "    def __init__(self, vocab_size):\n",
    "        super().__init__()\n",
    "        self.embedding = nn.Embedding(vocab_size, EMBED_DIM)\n",
    "        self.lstm = nn.LSTM(EMBED_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True, dropout=DROPOUT, num_layers=2)\n",
    "        self.fc = nn.Linear(HIDDEN_DIM * 2, len(LABELS))\n",
    "        \n",
    "    def forward(self, x):\n",
    "        x = self.embedding(x)\n",
    "        _, (hidden, _) = self.lstm(x)\n",
    "        x = torch.cat((hidden[-2], hidden[-1]), dim=1)\n",
    "        return self.fc(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5594b26",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict():\n",
    "    if not os.path.exists(TEST_CSV):\n",
    "        print(\"Test data not found.\")\n",
    "        return\n",
    "\n",
    "    # Load Data\n",
    "    df = pd.read_csv(TEST_CSV)\n",
    "    if \"text\" not in df.columns: df = df.rename(columns={\"comment_text\": \"text\"})\n",
    "    \n",
    "    # Tokenizer\n",
    "    tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "    \n",
    "    # Dataset\n",
    "    class TestDS(Dataset):\n",
    "        def __init__(self, df, tokenizer):\n",
    "            self.texts = df['text'].tolist()\n",
    "            self.tokenizer = tokenizer\n",
    "        def __len__(self): return len(self.texts)\n",
    "        def __getitem__(self, idx):\n",
    "            enc = self.tokenizer(self.texts[idx], truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')\n",
    "            return enc['input_ids'].squeeze(0)\n",
    "\n",
    "    loader = DataLoader(TestDS(df, tokenizer), batch_size=BATCH_SIZE)\n",
    "\n",
    "    # Load Model\n",
    "    model = BiLSTM(tokenizer.vocab_size).to(DEVICE)\n",
    "    \n",
    "    if os.path.exists(MODEL_FILE):\n",
    "        model.load_state_dict(torch.load(MODEL_FILE, map_location=DEVICE))\n",
    "        print(f\"Loaded weights from {MODEL_FILE}\")\n",
    "    else:\n",
    "        print(\"No model weights found! Predictions will be random.\")\n",
    "            \n",
    "    model.eval()\n",
    "    \n",
    "    # Inference\n",
    "    all_preds = []\n",
    "    print(\"Predicting...\")\n",
    "    with torch.no_grad():\n",
    "        for batch in loader:\n",
    "            logits = model(batch.to(DEVICE))\n",
    "            probs = torch.sigmoid(logits)\n",
    "            all_preds.append(probs.cpu().numpy())\n",
    "            \n",
    "    all_preds = np.vstack(all_preds)\n",
    "    \n",
    "    # Convert to binary (0/1) as per submission requirement\n",
    "    binary_preds = (all_preds >= 0.5).astype(int)\n",
    "    \n",
    "    # Create Submission\n",
    "    submission = pd.DataFrame(binary_preds, columns=LABELS)\n",
    "    submission['id'] = df['id']\n",
    "    submission = submission[['id'] + LABELS]\n",
    "    submission.to_csv(SUBMISSION_CSV, index=False)\n",
    "    print(f\"Saved submission to {SUBMISSION_CSV}\")\n",
    "    print(submission.head())\n",
    "\n",
    "predict()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}