Spaces:

Ajayan
/

book_title_recomender

Sleeping

App Files Files Community

Ajayan committed on Dec 16, 2024

Commit

ab62db9

1 Parent(s): 4333ac7

src directory added

Browse files

Files changed (9) hide show

src/__init__.py +0 -0
src/__pycache__/__init__.cpython-39.pyc +0 -0
src/__pycache__/data_loader.cpython-39.pyc +0 -0
src/__pycache__/utils.cpython-39.pyc +0 -0
src/data_loader.py +32 -0
src/feature_engineering.py +42 -0
src/recommeder.py +12 -0
src/similarity_calculator.py +0 -0
src/utils.py +95 -0

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (136 Bytes). View file

src/__pycache__/data_loader.cpython-39.pyc ADDED Viewed

Binary file (1.09 kB). View file

src/__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (2.59 kB). View file

src/data_loader.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import pandas as pd
+from utils import clean_text
+def load_and_clean_data(data_path, cleaned_data_path):
+    """
+    Load dataset, aggregate categories, drop duplicates, and preprocess text.
+    """
+    # Load the dataset
+    books_df = pd.read_csv(data_path)
+    print(f"Original dataset shape: {books_df.shape}")
+    # Group by 'book_name' and 'book_summary', aggregate 'book_tags'
+    books_df = books_df.groupby(["book_name", "summaries"], as_index=False).agg(
+        {"categories": lambda tags: ", ".join(set(tags.dropna()))}
+    )  # Remove duplicates within tags
+    print(f"After aggregating categories and removing duplicates: {books_df.shape}")
+    books_df = books_df.drop_duplicates(subset=["book_name", "summaries"], keep="first")
+    # Combine 'book_summary' and 'book_tags' into a single text field
+    books_df["combined_text"] = (
+        books_df["summaries"].fillna("") + " " + books_df["categories"].fillna("")
+    )
+    # Clean the combined text
+    books_df["combined_text"] = books_df["combined_text"].apply(clean_text)
+    # Save the cleaned dataset
+    books_df.to_csv(cleaned_data_path, index=False)
+    print(f"Cleaned dataset saved to: {cleaned_data_path}")
+    return books_df

src/feature_engineering.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from sklearn.feature_extraction.text import TfidfVectorizer
+from utils import save_to_pickle
+from data_loader import load_and_clean_data
+import os
+# Paths (all relative, so they resolve against the current working directory)
+# Raw input CSV handed to load_and_clean_data() by main().
+data_path = "data/books_summary.csv"
+# Destination of the cleaned CSV written by load_and_clean_data().
+cleaned_data_path = "data/cleaned_books_summary.csv"
+# Pickle file that receives the fitted TfidfVectorizer.
+vectorizer_path = "model/tfidf_vectorizer.pkl"
+# Pickle file that receives the TF-IDF matrix produced by fit_transform.
+tfidf_matrix_path = "model/tfidf_matrix.pkl"
+def train_tfidf_model(data):
+    """
+    Train a TF-IDF vectorizer on the combined text data and save the model.
+    """
+    # Initialize TF-IDF vectorizer
+    vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
+    # Fit and transform the combined text
+    print("Training TF-IDF vectorizer...")
+    tfidf_matrix = vectorizer.fit_transform(data["combined_text"])
+    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
+    # Save the TF-IDF vectorizer and matrix
+    save_to_pickle(vectorizer, vectorizer_path)
+    save_to_pickle(tfidf_matrix, tfidf_matrix_path)
+    print(
+        f"TF-IDF vectorizer and matrix saved to: {vectorizer_path} and {tfidf_matrix_path}"
+    )
+def main():
+    # Ensure the model directory exists
+    os.makedirs("model", exist_ok=True)
+    # Load, clean, and prepare data
+    books_df = load_and_clean_data(data_path, cleaned_data_path)
+    train_tfidf_model(books_df)
+if __name__ == "__main__":
+    main()

src/recommeder.py ADDED Viewed

	@@ -0,0 +1,12 @@

+def recommend_books(book_title, df, similarity_matrix, top_n=5):
+    if book_title not in df["book_name"].values:
+        return "Book not found. Please check the title."
+    index = df.index[df["book_name"] == book_title][0]
+    similarity_scores = list(enumerate(similarity_matrix[index]))
+    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
+    recommendations = []
+    for idx, _ in sorted_scores[1 : top_n + 1]:  # Exclude input book
+        recommendations.append(df["book_name"].iloc[idx])
+    return recommendations

src/similarity_calculator.py ADDED Viewed

File without changes

src/utils.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import re
+import string
+import pickle
+import os
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
+# 1. Text Cleaning Function
+def clean_text(text):
+    """
+    Preprocesses the input text by removing special characters, punctuation,
+    converting to lowercase, and removing stopwords.
+    Args:
+        text (str): Input text string.
+    Returns:
+        str: Cleaned and preprocessed text.
+    """
+    if not isinstance(text, str):
+        return ""  # Handle cases where text might not be a string
+    # Convert text to lowercase
+    text = text.lower()
+    # Remove punctuation and special characters
+    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
+    # Remove digits
+    text = re.sub(r"\d+", "", text)
+    # Remove extra whitespaces
+    text = re.sub(r"\s+", " ", text).strip()
+    # Remove stopwords
+    words = text.split()
+    cleaned_words = [word for word in words if word not in ENGLISH_STOP_WORDS]
+    return " ".join(cleaned_words)
+# 2. Stopwords Loader (Optional, if using a custom stopwords list)
+def load_stopwords(file_path="data/custom_stopwords.txt"):
+    """
+    Loads custom stopwords from a file.
+    Args:
+        file_path (str): Path to the stopwords file.
+    Returns:
+        set: Set of stopwords.
+    """
+    if os.path.exists(file_path):
+        with open(file_path, "r") as file:
+            stopwords = set(file.read().splitlines())
+        return stopwords
+    return set()
+# 3. Save to Pickle
+def save_to_pickle(obj, file_path):
+    """
+    Saves an object to a pickle file.
+    Args:
+        obj: Object to save.
+        file_path (str): Path to save the pickle file.
+    """
+    with open(file_path, "wb") as file:
+        pickle.dump(obj, file)
+# 4. Load from Pickle
+def load_from_pickle(file_path):
+    """
+    Loads an object from a pickle file.
+    Args:
+        file_path (str): Path to the pickle file.
+    Returns:
+        The loaded object.
+    """
+    if os.path.exists(file_path):
+        with open(file_path, "rb") as file:
+            return pickle.load(file)
+    else:
+        raise FileNotFoundError(f"Pickle file not found at {file_path}")
+# 5. Input Validation
+def validate_input(book_title, book_list):
+    """
+    Validates if the book title exists in the dataset.
+    Args:
+        book_title (str): Input book title.
+        book_list (list): List of all book titles.
+    Returns:
+        bool: True if book exists, else False.
+    """
+    return book_title in book_list