Spaces:
Sleeping
Sleeping
| import re | |
| import string | |
| import pickle | |
| import os | |
| from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS | |
| # 1. Text Cleaning Function | |
| def clean_text(text): | |
| """ | |
| Preprocesses the input text by removing special characters, punctuation, | |
| converting to lowercase, and removing stopwords. | |
| Args: | |
| text (str): Input text string. | |
| Returns: | |
| str: Cleaned and preprocessed text. | |
| """ | |
| if not isinstance(text, str): | |
| return "" # Handle cases where text might not be a string | |
| # Convert text to lowercase | |
| text = text.lower() | |
| # Remove punctuation and special characters | |
| text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) | |
| # Remove digits | |
| text = re.sub(r"\d+", "", text) | |
| # Remove extra whitespaces | |
| text = re.sub(r"\s+", " ", text).strip() | |
| # Remove stopwords | |
| words = text.split() | |
| cleaned_words = [word for word in words if word not in ENGLISH_STOP_WORDS] | |
| return " ".join(cleaned_words) | |
| # 2. Stopwords Loader (Optional, if using a custom stopwords list) | |
| def load_stopwords(file_path="data/custom_stopwords.txt"): | |
| """ | |
| Loads custom stopwords from a file. | |
| Args: | |
| file_path (str): Path to the stopwords file. | |
| Returns: | |
| set: Set of stopwords. | |
| """ | |
| if os.path.exists(file_path): | |
| with open(file_path, "r") as file: | |
| stopwords = set(file.read().splitlines()) | |
| return stopwords | |
| return set() | |
| # 3. Save to Pickle | |
| def save_to_pickle(obj, file_path): | |
| """ | |
| Saves an object to a pickle file. | |
| Args: | |
| obj: Object to save. | |
| file_path (str): Path to save the pickle file. | |
| """ | |
| with open(file_path, "wb") as file: | |
| pickle.dump(obj, file) | |
| # 4. Load from Pickle | |
| def load_from_pickle(file_path): | |
| """ | |
| Loads an object from a pickle file. | |
| Args: | |
| file_path (str): Path to the pickle file. | |
| Returns: | |
| The loaded object. | |
| """ | |
| if os.path.exists(file_path): | |
| with open(file_path, "rb") as file: | |
| return pickle.load(file) | |
| else: | |
| raise FileNotFoundError(f"Pickle file not found at {file_path}") | |
| # 5. Input Validation | |
| def validate_input(book_title, book_list): | |
| """ | |
| Validates if the book title exists in the dataset. | |
| Args: | |
| book_title (str): Input book title. | |
| book_list (list): List of all book titles. | |
| Returns: | |
| bool: True if book exists, else False. | |
| """ | |
| return book_title in book_list | |