Ajayan committed on
Commit ab62db9 · 1 Parent(s): 4333ac7

src directory added
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (136 Bytes)
src/__pycache__/data_loader.cpython-39.pyc ADDED
Binary file (1.09 kB)
src/__pycache__/utils.cpython-39.pyc ADDED
Binary file (2.59 kB)
src/data_loader.py ADDED
@@ -0,0 +1,32 @@
+ import pandas as pd
+ from utils import clean_text
+
+
+ def load_and_clean_data(data_path, cleaned_data_path):
+     """
+     Load dataset, aggregate categories, drop duplicates, and preprocess text.
+     """
+     # Load the dataset
+     books_df = pd.read_csv(data_path)
+     print(f"Original dataset shape: {books_df.shape}")
+
+     # Group by 'book_name' and 'summaries', aggregating the 'categories' column
+     books_df = books_df.groupby(["book_name", "summaries"], as_index=False).agg(
+         {"categories": lambda tags: ", ".join(set(tags.dropna()))}
+     )  # set() removes duplicate tags within each group
+
+     print(f"After aggregating categories and removing duplicates: {books_df.shape}")
+     books_df = books_df.drop_duplicates(subset=["book_name", "summaries"], keep="first")
+     # Combine 'summaries' and 'categories' into a single text field
+     books_df["combined_text"] = (
+         books_df["summaries"].fillna("") + " " + books_df["categories"].fillna("")
+     )
+
+     # Clean the combined text
+     books_df["combined_text"] = books_df["combined_text"].apply(clean_text)
+
+     # Save the cleaned dataset
+     books_df.to_csv(cleaned_data_path, index=False)
+     print(f"Cleaned dataset saved to: {cleaned_data_path}")
+
+     return books_df
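For reference, a minimal smoke test for this loader (a sketch: the paths match the constants defined in feature_engineering.py below, and it assumes the script is run from the repository root with src/ on the path):

from data_loader import load_and_clean_data

# Paths taken from feature_engineering.py; the data file itself is assumed to exist.
books_df = load_and_clean_data("data/books_summary.csv", "data/cleaned_books_summary.csv")
print(books_df[["book_name", "combined_text"]].head())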
src/feature_engineering.py ADDED
@@ -0,0 +1,42 @@
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from utils import save_to_pickle
+ from data_loader import load_and_clean_data
+ import os
+
+ # Paths
+ data_path = "data/books_summary.csv"
+ cleaned_data_path = "data/cleaned_books_summary.csv"
+ vectorizer_path = "model/tfidf_vectorizer.pkl"
+ tfidf_matrix_path = "model/tfidf_matrix.pkl"
+
+
+ def train_tfidf_model(data):
+     """
+     Train a TF-IDF vectorizer on the combined text data and save the model.
+     """
+     # Initialize TF-IDF vectorizer
+     vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
+
+     # Fit and transform the combined text
+     print("Training TF-IDF vectorizer...")
+     tfidf_matrix = vectorizer.fit_transform(data["combined_text"])
+     print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
+
+     # Save the TF-IDF vectorizer and matrix
+     save_to_pickle(vectorizer, vectorizer_path)
+     save_to_pickle(tfidf_matrix, tfidf_matrix_path)
+     print(
+         f"TF-IDF vectorizer and matrix saved to: {vectorizer_path} and {tfidf_matrix_path}"
+     )
+
+
+ def main():
+     # Ensure the model directory exists
+     os.makedirs("model", exist_ok=True)
+     # Load, clean, and prepare data
+     books_df = load_and_clean_data(data_path, cleaned_data_path)
+     train_tfidf_model(books_df)
+
+
+ if __name__ == "__main__":
+     main()
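A quick way to sanity-check the saved artifacts after running this script (a sketch; get_feature_names_out assumes scikit-learn >= 1.0):

from utils import load_from_pickle

# Paths come from the constants above; run from the repository root.
vectorizer = load_from_pickle("model/tfidf_vectorizer.pkl")
tfidf_matrix = load_from_pickle("model/tfidf_matrix.pkl")
print(tfidf_matrix.shape, len(vectorizer.get_feature_names_out()))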
src/recommeder.py ADDED
@@ -0,0 +1,12 @@
+ def recommend_books(book_title, df, similarity_matrix, top_n=5):
+     if book_title not in df["book_name"].values:
+         return "Book not found. Please check the title."
+
+     index = df.index[df["book_name"] == book_title][0]  # assumes a default RangeIndex
+     similarity_scores = list(enumerate(similarity_matrix[index]))
+     sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
+
+     recommendations = []
+     for idx, _ in sorted_scores[1 : top_n + 1]:  # Skip the input book itself
+         recommendations.append(df["book_name"].iloc[idx])
+     return recommendations
src/similarity_calculator.py ADDED
File without changes
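src/similarity_calculator.py is added empty here; one plausible way to produce the similarity_matrix that recommend_books expects (a sketch, not the author's implementation) is cosine similarity over the saved TF-IDF matrix:

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from utils import load_from_pickle
from recommeder import recommend_books

# Paths and columns match the other files in this commit; the title is a placeholder.
books_df = pd.read_csv("data/cleaned_books_summary.csv")
tfidf_matrix = load_from_pickle("model/tfidf_matrix.pkl")
similarity_matrix = cosine_similarity(tfidf_matrix)
print(recommend_books("Some Book Title", books_df, similarity_matrix))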
src/utils.py ADDED
@@ -0,0 +1,95 @@
+ import re
+ import string
+ import pickle
+ import os
+
+ from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
+
+
+ # 1. Text Cleaning Function
+ def clean_text(text):
+     """
+     Preprocesses the input text by removing special characters, punctuation,
+     converting to lowercase, and removing stopwords.
+     Args:
+         text (str): Input text string.
+     Returns:
+         str: Cleaned and preprocessed text.
+     """
+     if not isinstance(text, str):
+         return ""  # Handle cases where text might not be a string
+
+     # Convert text to lowercase
+     text = text.lower()
+
+     # Remove punctuation and special characters
+     text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
+
+     # Remove digits
+     text = re.sub(r"\d+", "", text)
+
+     # Remove extra whitespace
+     text = re.sub(r"\s+", " ", text).strip()
+
+     # Remove stopwords
+     words = text.split()
+     cleaned_words = [word for word in words if word not in ENGLISH_STOP_WORDS]
+
+     return " ".join(cleaned_words)
+
+
+ # 2. Stopwords Loader (Optional, if using a custom stopwords list)
+ def load_stopwords(file_path="data/custom_stopwords.txt"):
+     """
+     Loads custom stopwords from a file.
+     Args:
+         file_path (str): Path to the stopwords file.
+     Returns:
+         set: Set of stopwords.
+     """
+     if os.path.exists(file_path):
+         with open(file_path, "r") as file:
+             stopwords = set(file.read().splitlines())
+         return stopwords
+     return set()
+
+
+ # 3. Save to Pickle
+ def save_to_pickle(obj, file_path):
+     """
+     Saves an object to a pickle file.
+     Args:
+         obj: Object to save.
+         file_path (str): Path to save the pickle file.
+     """
+     with open(file_path, "wb") as file:
+         pickle.dump(obj, file)
+
+
+ # 4. Load from Pickle
+ def load_from_pickle(file_path):
+     """
+     Loads an object from a pickle file.
+     Args:
+         file_path (str): Path to the pickle file.
+     Returns:
+         The loaded object.
+     """
+     if os.path.exists(file_path):
+         with open(file_path, "rb") as file:
+             return pickle.load(file)
+     else:
+         raise FileNotFoundError(f"Pickle file not found at {file_path}")
+
+
+ # 5. Input Validation
+ def validate_input(book_title, book_list):
+     """
+     Validates if the book title exists in the dataset.
+     Args:
+         book_title (str): Input book title.
+         book_list (list): List of all book titles.
+     Returns:
+         bool: True if book exists, else False.
+     """
+     return book_title in book_list
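As a quick illustration of what clean_text produces (the sample string is made up):

from utils import clean_text

# Lowercased, punctuation and digits stripped, stopwords ("the") dropped.
print(clean_text("The 3 Musketeers!!"))  # -> "musketeers"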