Spaces:
Sleeping
Sleeping
File size: 1,301 Bytes
ab62db9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import save_to_pickle
from data_loader import load_and_clean_data
import os
# Paths
# Raw dataset CSV; passed as the first argument to load_and_clean_data.
data_path = "data/books_summary.csv"
# Second argument to load_and_clean_data (presumably where the cleaned CSV is
# written -- confirm against data_loader).
cleaned_data_path = "data/cleaned_books_summary.csv"
# Destination handed to save_to_pickle for the fitted TfidfVectorizer.
vectorizer_path = "model/tfidf_vectorizer.pkl"
# Destination handed to save_to_pickle for the matrix returned by fit_transform.
tfidf_matrix_path = "model/tfidf_matrix.pkl"
def train_tfidf_model(data):
    """
    Fit a TF-IDF vectorizer on ``data["combined_text"]`` and persist the results.

    The vectorizer is limited to 10,000 features and uses scikit-learn's
    built-in English stop-word list. Both the fitted vectorizer and the
    resulting document-term matrix are written with ``save_to_pickle`` to the
    module-level ``vectorizer_path`` and ``tfidf_matrix_path``.

    Args:
        data: Mapping-like object indexable by ``"combined_text"``
            (presumably a pandas DataFrame -- confirm against data_loader).

    Returns:
        None. Progress is reported on stdout.
    """
    model = TfidfVectorizer(stop_words="english", max_features=10000)

    print("Training TF-IDF vectorizer...")
    # The column lookup stays after the progress message so a missing column
    # fails at the same point it always has.
    corpus = data["combined_text"]
    matrix = model.fit_transform(corpus)
    print(f"TF-IDF matrix shape: {matrix.shape}")

    # Persist the vectorizer first, then the matrix.
    artifacts = (
        (model, vectorizer_path),
        (matrix, tfidf_matrix_path),
    )
    for artifact, destination in artifacts:
        save_to_pickle(artifact, destination)

    summary = f"TF-IDF vectorizer and matrix saved to: {vectorizer_path} and {tfidf_matrix_path}"
    print(summary)
def main():
    """
    Run the TF-IDF training pipeline end to end.

    Creates the directories that will hold the pickled artifacts, obtains the
    dataset by calling ``load_and_clean_data(data_path, cleaned_data_path)``,
    and passes the result to ``train_tfidf_model``.
    """
    # Derive the output directories from the configured artifact paths rather
    # than hard-coding "model", so editing the path constants cannot leave the
    # save step pointing at a directory that was never created.
    for artifact_path in (vectorizer_path, tfidf_matrix_path):
        output_dir = os.path.dirname(artifact_path)
        if output_dir:  # a bare filename lives in the current directory
            os.makedirs(output_dir, exist_ok=True)

    # Load, clean, and prepare data
    books_df = load_and_clean_data(data_path, cleaned_data_path)
    train_tfidf_model(books_df)
# Only run the pipeline when this file is executed as a script; importing the
# module must not trigger training or file writes.
if __name__ == "__main__":
    main()
|