import re
import pickle

import spacy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Remove dateline patterns like "COUNTRY or STATE NAME (Reuters) -" or just "(Reuters)"
    text = re.sub(
        r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", text
    )
    # Remove credit lines like "Featured image via author name / image place."
    text = re.sub(r"Featured image via .+?\.($|\s)", "", text)
    # Process the cleaned text with spaCy
    doc = nlp(text)
    lemmatized_text = []
    for token in doc:
        # Preserve named entities in their original form
        if token.ent_type_:
            lemmatized_text.append(token.text)
        # Lemmatize the remaining tokens, keeping only alphabetic non-stop words
        elif token.is_alpha and not token.is_stop:
            lemmatized_text.append(token.lemma_.lower())
    return " ".join(lemmatized_text)

def load_tokenizer(tokenizer_path):
    with open(tokenizer_path, "rb") as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

def prepare_data(texts, tokenizer, max_length=500):
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length)
    return padded
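

# --- Example usage (illustrative sketch, not part of the original pipeline) ---
# The tokenizer path and sample headline below are hypothetical placeholders;
# point them at your own pickled Keras Tokenizer and input texts.
if __name__ == "__main__":
    tokenizer = load_tokenizer("tokenizer.pickle")  # hypothetical path
    raw_texts = [
        "WASHINGTON (Reuters) - Lawmakers debated the new spending bill on Tuesday."
    ]
    cleaned = [preprocess_text(t) for t in raw_texts]
    model_input = prepare_data(cleaned, tokenizer)
    # With max_length=500 this yields a (1, 500) integer array ready to feed a model
    print(model_input.shape)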