zibaatak's picture
Upload 17 files
fdfb2b8
raw
history blame
545 Bytes
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Request the NLTK 'punkt' resource as a module-level side effect, i.e. every
# time this module is imported.
# NOTE(review): presumably needed by word_tokenize (imported above) — confirm.
nltk.download('punkt')
class TextPreprocessor:
    """Clean-up helpers for raw text before further processing."""

    # Parenthesised reference, with or without a leading "see ":
    # "(Figure 1.2)", "(Figure 1.2.3)", "(see Figure 1.2)".
    # At least one ".N" component is required, so "(Figure 3)" is left alone.
    _PAREN_FIGURE_RE = re.compile(r'\((?:see )?Figure \d+(?:\.\d+)+\)')

    # Inline reference: "see Figure 1.2" or "see Figure 1.2.3".
    _SEE_FIGURE_RE = re.compile(r'see Figure \d+(?:\.\d+)+')

    def __init__(self):
        """Create a preprocessor; it holds no per-instance state."""
        pass

    def remove_figure_references(self, text):
        """Return *text* with figure cross-references removed.

        Removed forms (the figure number must have at least two dotted
        components, e.g. ``1.2`` or ``1.2.3``):

        * ``(Figure X.Y)``
        * ``(see Figure X.Y)`` -- removed together with its parentheses so
          that no empty ``()`` is left behind
        * ``see Figure X.Y``

        Matching is case-sensitive. Whitespace surrounding a removed
        reference is left untouched.

        Args:
            text: The string to clean.

        Returns:
            The cleaned string.
        """
        # Handle the parenthesised forms first; otherwise the inline pattern
        # would strip only the inner "see Figure X.Y" and leave "()".
        text = self._PAREN_FIGURE_RE.sub('', text)
        text = self._SEE_FIGURE_RE.sub('', text)
        return text