# Bi-LSTM Submission
Generate predictions using the trained Bi-LSTM model.

In [None]:
!pip install -q transformers torch pandas numpy huggingface_hub

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download

# --- Inference configuration ---
# Emotion classes; one output column per entry, in this order.
LABELS = [
    "anger",
    "fear",
    "joy",
    "sadness",
    "surprise",
]
MAX_LEN = 100  # max_length handed to the tokenizer (truncate / pad to this)
BATCH_SIZE = 32  # DataLoader batch size
EMBED_DIM = 100  # width of the embedding vectors
HIDDEN_DIM = 128  # LSTM hidden size
DROPOUT = 0.3  # dropout value passed to nn.LSTM

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# --- File locations ---
TEST_CSV = "/kaggle/input/2025-sep-dl-gen-ai-project/test.csv"
SUBMISSION_CSV = "submission.csv"
MODEL_FILE = "model.pth"

# Hugging Face Hub repository holding the trained weights
# (replace with your repo ID from the training notebook).
HF_REPO_ID = "hrshlgunjal/emotion-classifier-bilstm"

In [None]:
# Make sure the trained weights exist at MODEL_FILE, pulling them from the
# Hugging Face Hub when there is no local copy. Failures are reported but not
# re-raised, so the notebook keeps running and the weights can be supplied
# manually instead.
if os.path.exists(MODEL_FILE):
    print("Model file found locally.")
else:
    try:
        print(f"Downloading model from {HF_REPO_ID}...")
        model_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            filename="pytorch_model.bin",
        )
        import shutil

        # Copy the downloaded file to the location the rest of the notebook reads.
        shutil.copy(model_path, MODEL_FILE)
        print("Model downloaded.")
    except Exception as e:
        print(f"Could not download model: {e}")
        print("Please ensure HF_REPO_ID is correct or upload 'model.pth' manually.")

In [None]:
class BiLSTM(nn.Module):
    """Two-layer bidirectional LSTM classifier emitting one logit per label.

    Layer sizes are taken from the module-level EMBED_DIM, HIDDEN_DIM and
    DROPOUT constants; the output width is ``len(LABELS)``.
    """

    def __init__(self, vocab_size):
        """Create the embedding, LSTM and output layers.

        Args:
            vocab_size: Number of entries in the embedding table.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, EMBED_DIM)
        self.lstm = nn.LSTM(
            input_size=EMBED_DIM,
            hidden_size=HIDDEN_DIM,
            num_layers=2,
            dropout=DROPOUT,
            bidirectional=True,
            batch_first=True,
        )
        # forward() feeds the concatenation of two hidden-state slices into
        # the output layer, hence 2 * HIDDEN_DIM input features.
        n_classes = len(LABELS)
        self.fc = nn.Linear(2 * HIDDEN_DIM, n_classes)

    def forward(self, x):
        """Return the raw (pre-sigmoid) logits for a batch of token ids.

        Args:
            x: Token-id tensor; assumed to be (batch, seq_len) because the
                LSTM is built with ``batch_first=True`` -- TODO confirm
                against the caller.

        Returns:
            Tensor with ``len(LABELS)`` logits per example.
        """
        embedded = self.embedding(x)
        lstm_result = self.lstm(embedded)
        # nn.LSTM returns (output, (h_n, c_n)); only h_n is needed here.
        final_hidden = lstm_result[1][0]
        # The last two slices of h_n are, per the nn.LSTM layout, the two
        # directions of the top layer.
        sentence_vec = torch.cat([final_hidden[-2], final_hidden[-1]], dim=1)
        return self.fc(sentence_vec)

In [None]:
def predict():
    """Score the test set with the Bi-LSTM and write the submission CSV.

    Reads TEST_CSV, tokenizes its ``text`` column (falling back to
    ``comment_text``) with the ``bert-base-uncased`` tokenizer, runs the
    BiLSTM model over it, thresholds the sigmoid probabilities at 0.5 and
    saves an ``id`` column plus one 0/1 column per entry of LABELS to
    SUBMISSION_CSV.

    Returns:
        None. Returns early, after printing a message, when TEST_CSV does
        not exist.

    Raises:
        KeyError: If the test file lacks an ``id`` column or has neither a
            ``text`` nor a ``comment_text`` column.
    """
    if not os.path.exists(TEST_CSV):
        print("Test data not found.")
        return

    # Load Data
    df = pd.read_csv(TEST_CSV)
    if "text" not in df.columns:
        df = df.rename(columns={"comment_text": "text"})

    # Fail before the (slow) inference pass rather than after it.
    missing = [col for col in ("id", "text") if col not in df.columns]
    if missing:
        raise KeyError(f"{TEST_CSV} is missing required column(s): {missing}")

    # read_csv turns empty cells into NaN (a float) and may parse numeric-only
    # cells as numbers; the tokenizer only accepts str, so normalise here.
    texts = df["text"].fillna("").astype(str).tolist()

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Dataset
    class TestDS(Dataset):
        """Yields the padded/truncated token ids for one text at a time."""

        def __init__(self, texts, tokenizer):
            self.texts = texts
            self.tokenizer = tokenizer

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            enc = self.tokenizer(
                self.texts[idx],
                truncation=True,
                padding='max_length',
                max_length=MAX_LEN,
                return_tensors='pt',
            )
            # return_tensors='pt' adds a leading batch dimension of size 1.
            return enc['input_ids'].squeeze(0)

    loader = DataLoader(TestDS(texts, tokenizer), batch_size=BATCH_SIZE)

    # Load Model
    model = BiLSTM(tokenizer.vocab_size).to(DEVICE)

    if os.path.exists(MODEL_FILE):
        # NOTE: torch.load unpickles the file -- only load checkpoints that
        # come from a source you trust.
        model.load_state_dict(torch.load(MODEL_FILE, map_location=DEVICE))
        print(f"Loaded weights from {MODEL_FILE}")
    else:
        print("No model weights found! Predictions will be random.")

    model.eval()

    # Inference
    batch_probs = []
    print("Predicting...")
    with torch.no_grad():
        for batch in loader:
            logits = model(batch.to(DEVICE))
            probs = torch.sigmoid(logits)
            batch_probs.append(probs.cpu().numpy())

    if batch_probs:
        all_preds = np.vstack(batch_probs)
    else:
        # An empty test file yields no batches and np.vstack rejects an empty
        # list; emit a zero-row matrix so a header-only submission is written.
        all_preds = np.empty((0, len(LABELS)), dtype=np.float32)

    # Convert to binary (0/1) as per submission requirement
    binary_preds = (all_preds >= 0.5).astype(int)

    # Create Submission
    submission = pd.DataFrame(binary_preds, columns=LABELS)
    submission['id'] = df['id']
    submission = submission[['id'] + LABELS]
    submission.to_csv(SUBMISSION_CSV, index=False)
    print(f"Saved submission to {SUBMISSION_CSV}")
    print(submission.head())

# Run inference on the test set and write the submission file.
predict()