book_title_recomender / src /data_loader.py
Ajayan's picture
src directory added
ab62db9
import pandas as pd
from utils import clean_text
def _join_unique_tags(tags):
    """Join the distinct non-null values of *tags* into one ", "-separated string."""
    # Series.unique() keeps first-appearance order, so the result is the same
    # on every run. Iterating a set of strings is not reproducible because of
    # per-process hash randomization. astype(str) keeps str.join from raising
    # TypeError when a category value is not already a string.
    return ", ".join(tags.dropna().astype(str).unique())


def load_and_clean_data(data_path, cleaned_data_path) -> pd.DataFrame:
    """
    Load the books CSV, merge categories per book, and build a cleaned text field.

    Rows sharing the same ("book_name", "summaries") pair are collapsed into
    one row whose "categories" value is the ", "-joined list of that pair's
    distinct categories (in order of first appearance). A "combined_text"
    column is then built from the summary and the joined categories and passed
    through ``clean_text``. The result is written to ``cleaned_data_path`` and
    returned.

    Args:
        data_path: Location of the raw CSV; it must contain the columns
            "book_name", "summaries" and "categories".
        cleaned_data_path: Destination for the cleaned CSV (written without
            the index column).

    Returns:
        The aggregated DataFrame with columns "book_name", "summaries",
        "categories" and "combined_text".

    Raises:
        KeyError: If any of the required columns is missing from the CSV.
    """
    # Load the dataset
    books_df = pd.read_csv(data_path)
    print(f"Original dataset shape: {books_df.shape}")

    # Fail early with a message that names every missing column.
    required_columns = ("book_name", "summaries", "categories")
    missing_columns = [col for col in required_columns if col not in books_df.columns]
    if missing_columns:
        raise KeyError(f"Missing required column(s): {missing_columns}")

    # Group by 'book_name' and 'summaries', merging each group's 'categories'.
    # The result has exactly one row per (book_name, summaries) pair, so no
    # separate drop_duplicates pass over those keys is needed afterwards.
    # NOTE: with pandas' default groupby settings, rows whose 'book_name' or
    # 'summaries' is NaN are excluded from the result.
    books_df = books_df.groupby(["book_name", "summaries"], as_index=False).agg(
        {"categories": _join_unique_tags}
    )
    print(f"After aggregating categories and removing duplicates: {books_df.shape}")

    # Combine 'summaries' and 'categories' into a single text field
    books_df["combined_text"] = (
        books_df["summaries"].fillna("") + " " + books_df["categories"].fillna("")
    )

    # Clean the combined text
    books_df["combined_text"] = books_df["combined_text"].apply(clean_text)

    # Save the cleaned dataset
    books_df.to_csv(cleaned_data_path, index=False)
    print(f"Cleaned dataset saved to: {cleaned_data_path}")
    return books_df