Spaces:
Sleeping
Sleeping
| --- File: /home/sk/Desktop/chat-with-pdf/app.py --- | |
| import streamlit as st | |
| import os | |
| from utils.pdf_utils import PDFProcessor | |
| from utils.embeddings_utils import EmbeddingsManager | |
| from utils.qa_utils import QASystem | |
| from dotenv import load_dotenv | |
| import openai | |
| import time | |
def initialize_session_state():
    """Seed ``st.session_state`` with every key the app reads later.

    Keys that are already present are left untouched; missing keys get a
    fresh default value.
    """
    # Factories rather than shared instances, so each missing key receives
    # its own new set/list object.
    default_factories = {
        'pdf_processor': lambda: None,
        'embeddings_manager': lambda: None,
        'qa_system': lambda: None,
        'processed_pdfs': set,
        'all_text_chunks': list,
    }
    for key, make_default in default_factories.items():
        if key not in st.session_state:
            st.session_state[key] = make_default()
# NOTE(review): the non-ASCII prefixes in the UI strings below (e.g. "π", "β")
# look like mis-decoded emoji in this copy of the file. They are kept verbatim;
# restore the intended characters from a correctly encoded original.


def _render_sidebar():
    """Render the sidebar: usage steps and, when recorded, the token counter."""
    with st.sidebar:
        st.header("π How to Use")
        st.markdown("""
        1. Upload PDF document(s)
        2. Ask questions about the content
        3. View answers and relevant context
        """)
        # Only shown once 'total_tokens_used' has been stored in the session.
        if 'total_tokens_used' in st.session_state:
            st.markdown("---")
            st.markdown("### π Usage Statistics")
            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")


def _ensure_services(api_key):
    """Create the PDF, embeddings and QA helpers in the session if missing."""
    if not st.session_state['pdf_processor']:
        st.session_state['pdf_processor'] = PDFProcessor()
    if not st.session_state['embeddings_manager']:
        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
    if not st.session_state['qa_system']:
        st.session_state['qa_system'] = QASystem(api_key)


def _process_new_pdfs(uploaded_files):
    """Chunk not-yet-processed uploads and rebuild the embeddings.

    Files are identified by name via ``st.session_state['processed_pdfs']``.

    Returns:
        False when embedding generation failed (the caller should stop
        rendering the page), True otherwise.
    """
    new_files = [
        f for f in uploaded_files
        if f.name not in st.session_state['processed_pdfs']
    ]
    if not new_files:
        return True

    processor = st.session_state['pdf_processor']
    with st.spinner("Processing PDFs..."):
        for pdf_file in new_files:
            try:
                pages = processor.extract_text(pdf_file)
                # Collect this file's chunks locally and commit them only if
                # the whole file succeeds; otherwise a failure midway would
                # leave partial chunks behind and duplicate them on retry.
                file_chunks = []
                for page_text in pages.values():
                    if not page_text:
                        continue  # page without extractable text
                    file_chunks.extend(processor.chunk_text(page_text))
                st.session_state['all_text_chunks'].extend(file_chunks)
                st.session_state['processed_pdfs'].add(pdf_file.name)
            except Exception as e:
                st.error(f"Error processing {pdf_file.name}: {str(e)}")
                continue

    if not st.session_state['all_text_chunks']:
        # Nothing to embed (every file failed or contained no text); skip the
        # API call instead of surfacing an opaque indexing error.
        st.warning("No extractable text was found in the uploaded PDFs.")
        return True

    with st.spinner("Generating embeddings..."):
        try:
            st.session_state['embeddings_manager'].generate_embeddings(
                st.session_state['all_text_chunks']
            )
            st.success("β Documents processed!")
        except Exception as e:
            st.error(f"Error generating embeddings: {str(e)}")
            return False
    return True


def _render_question_section():
    """Show the question box and, for a non-empty question, answer + sources."""
    st.write("---")
    st.subheader("β Ask Questions About Your Documents")
    question = st.text_input("Enter your question:")
    if not question:
        return
    try:
        with st.spinner("Searching for relevant information..."):
            relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
                question,
                k=3
            )
            answer = st.session_state['qa_system'].generate_answer(
                question,
                relevant_chunks
            )
        st.markdown("### π€ Answer:")
        st.write(answer)
        with st.expander("π View Source Context"):
            for i, chunk in enumerate(relevant_chunks, 1):
                st.markdown(f"**Context {i}:**")
                st.write(chunk)
                st.markdown("---")
    # The 1.x OpenAI client (the one providing `openai.chat.completions`)
    # exposes this exception at the package top level; `openai.error` no
    # longer exists there, so referencing it would raise AttributeError.
    except openai.RateLimitError:
        st.error("Rate limit exceeded. Please try again later.")
    except Exception as e:
        st.error(f"Error: {str(e)}")


def main():
    """Run the Streamlit page: configure, ingest uploaded PDFs, answer questions.

    Reads ``OPENAI_API_KEY`` from the environment (after ``load_dotenv()``)
    and stops with an error message when it is missing.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with PDF", layout="wide")
    st.title("ππ¬ Chat with PDF")
    initialize_session_state()
    _render_sidebar()

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error("OpenAI API key not found in .env file!")
        return
    openai.api_key = api_key
    _ensure_services(api_key)

    st.subheader("π€ Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload PDF documents",
        type=['pdf'],
        accept_multiple_files=True
    )
    if uploaded_files and not _process_new_pdfs(uploaded_files):
        return

    if st.session_state['all_text_chunks']:
        _render_question_section()


if __name__ == "__main__":
    main()
| --- File: /home/sk/Desktop/chat-with-pdf/requirements.txt --- | |
| streamlit | |
| PyPDF2 | |
| openai | |
| python-dotenv | |
| faiss-cpu | |
| numpy | |
| pdf2image | |
| Pillow | |
| --- File: /home/sk/Desktop/chat-with-pdf/.env --- | |
| OPENAI_API_KEY=<REDACTED-revoke-and-rotate-this-key-and-keep-.env-out-of-version-control> | |
| --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/app.py --- | |
| import streamlit as st | |
| import os | |
| from utils.pdf_utils import PDFProcessor | |
| from utils.embeddings_utils import EmbeddingsManager | |
| from utils.qa_utils import QASystem | |
| from dotenv import load_dotenv | |
| import openai | |
| import time | |
def initialize_session_state():
    """Seed ``st.session_state`` with every key the app reads later.

    Keys that are already present are left untouched; missing keys get a
    fresh default value.
    """
    # Factories rather than shared instances, so each missing key receives
    # its own new set/list object.
    default_factories = {
        'pdf_processor': lambda: None,
        'embeddings_manager': lambda: None,
        'qa_system': lambda: None,
        'processed_pdfs': set,
        'all_text_chunks': list,
    }
    for key, make_default in default_factories.items():
        if key not in st.session_state:
            st.session_state[key] = make_default()
# NOTE(review): the non-ASCII prefixes in the UI strings below (e.g. "π", "β")
# look like mis-decoded emoji in this copy of the file. They are kept verbatim;
# restore the intended characters from a correctly encoded original.


def _render_sidebar():
    """Render the sidebar: usage steps and, when recorded, the token counter."""
    with st.sidebar:
        st.header("π How to Use")
        st.markdown("""
        1. Upload PDF document(s)
        2. Ask questions about the content
        3. View answers and relevant context
        """)
        # Only shown once 'total_tokens_used' has been stored in the session.
        if 'total_tokens_used' in st.session_state:
            st.markdown("---")
            st.markdown("### π Usage Statistics")
            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")


def _ensure_services(api_key):
    """Create the PDF, embeddings and QA helpers in the session if missing."""
    if not st.session_state['pdf_processor']:
        st.session_state['pdf_processor'] = PDFProcessor()
    if not st.session_state['embeddings_manager']:
        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
    if not st.session_state['qa_system']:
        st.session_state['qa_system'] = QASystem(api_key)


def _process_new_pdfs(uploaded_files):
    """Chunk not-yet-processed uploads and rebuild the embeddings.

    Files are identified by name via ``st.session_state['processed_pdfs']``.

    Returns:
        False when embedding generation failed (the caller should stop
        rendering the page), True otherwise.
    """
    new_files = [
        f for f in uploaded_files
        if f.name not in st.session_state['processed_pdfs']
    ]
    if not new_files:
        return True

    processor = st.session_state['pdf_processor']
    with st.spinner("Processing PDFs..."):
        for pdf_file in new_files:
            try:
                pages = processor.extract_text(pdf_file)
                # Collect this file's chunks locally and commit them only if
                # the whole file succeeds; otherwise a failure midway would
                # leave partial chunks behind and duplicate them on retry.
                file_chunks = []
                for page_text in pages.values():
                    if not page_text:
                        continue  # page without extractable text
                    file_chunks.extend(processor.chunk_text(page_text))
                st.session_state['all_text_chunks'].extend(file_chunks)
                st.session_state['processed_pdfs'].add(pdf_file.name)
            except Exception as e:
                st.error(f"Error processing {pdf_file.name}: {str(e)}")
                continue

    if not st.session_state['all_text_chunks']:
        # Nothing to embed (every file failed or contained no text); skip the
        # API call instead of surfacing an opaque indexing error.
        st.warning("No extractable text was found in the uploaded PDFs.")
        return True

    with st.spinner("Generating embeddings..."):
        try:
            st.session_state['embeddings_manager'].generate_embeddings(
                st.session_state['all_text_chunks']
            )
            st.success("β Documents processed!")
        except Exception as e:
            st.error(f"Error generating embeddings: {str(e)}")
            return False
    return True


def _render_question_section():
    """Show the question box and, for a non-empty question, answer + sources."""
    st.write("---")
    st.subheader("β Ask Questions About Your Documents")
    question = st.text_input("Enter your question:")
    if not question:
        return
    try:
        with st.spinner("Searching for relevant information..."):
            relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
                question,
                k=3
            )
            answer = st.session_state['qa_system'].generate_answer(
                question,
                relevant_chunks
            )
        st.markdown("### π€ Answer:")
        st.write(answer)
        with st.expander("π View Source Context"):
            for i, chunk in enumerate(relevant_chunks, 1):
                st.markdown(f"**Context {i}:**")
                st.write(chunk)
                st.markdown("---")
    # The 1.x OpenAI client (the one providing `openai.chat.completions`)
    # exposes this exception at the package top level; `openai.error` no
    # longer exists there, so referencing it would raise AttributeError.
    except openai.RateLimitError:
        st.error("Rate limit exceeded. Please try again later.")
    except Exception as e:
        st.error(f"Error: {str(e)}")


def main():
    """Run the Streamlit page: configure, ingest uploaded PDFs, answer questions.

    Reads ``OPENAI_API_KEY`` from the environment (after ``load_dotenv()``)
    and stops with an error message when it is missing.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with PDF", layout="wide")
    st.title("ππ¬ Chat with PDF")
    initialize_session_state()
    _render_sidebar()

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error("OpenAI API key not found in .env file!")
        return
    openai.api_key = api_key
    _ensure_services(api_key)

    st.subheader("π€ Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload PDF documents",
        type=['pdf'],
        accept_multiple_files=True
    )
    if uploaded_files and not _process_new_pdfs(uploaded_files):
        return

    if st.session_state['all_text_chunks']:
        _render_question_section()


if __name__ == "__main__":
    main()
| --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/requirements.txt --- | |
| streamlit | |
| PyPDF2 | |
| openai | |
| python-dotenv | |
| faiss-cpu | |
| numpy | |
| pdf2image | |
| Pillow | |
| --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.gitattributes --- | |
| *.7z filter=lfs diff=lfs merge=lfs -text | |
| *.arrow filter=lfs diff=lfs merge=lfs -text | |
| *.bin filter=lfs diff=lfs merge=lfs -text | |
| *.bz2 filter=lfs diff=lfs merge=lfs -text | |
| *.ckpt filter=lfs diff=lfs merge=lfs -text | |
| *.ftz filter=lfs diff=lfs merge=lfs -text | |
| *.gz filter=lfs diff=lfs merge=lfs -text | |
| *.h5 filter=lfs diff=lfs merge=lfs -text | |
| *.joblib filter=lfs diff=lfs merge=lfs -text | |
| *.lfs.* filter=lfs diff=lfs merge=lfs -text | |
| *.mlmodel filter=lfs diff=lfs merge=lfs -text | |
| *.model filter=lfs diff=lfs merge=lfs -text | |
| *.msgpack filter=lfs diff=lfs merge=lfs -text | |
| *.npy filter=lfs diff=lfs merge=lfs -text | |
| *.npz filter=lfs diff=lfs merge=lfs -text | |
| *.onnx filter=lfs diff=lfs merge=lfs -text | |
| *.ot filter=lfs diff=lfs merge=lfs -text | |
| *.parquet filter=lfs diff=lfs merge=lfs -text | |
| *.pb filter=lfs diff=lfs merge=lfs -text | |
| *.pickle filter=lfs diff=lfs merge=lfs -text | |
| *.pkl filter=lfs diff=lfs merge=lfs -text | |
| *.pt filter=lfs diff=lfs merge=lfs -text | |
| *.pth filter=lfs diff=lfs merge=lfs -text | |
| *.rar filter=lfs diff=lfs merge=lfs -text | |
| *.safetensors filter=lfs diff=lfs merge=lfs -text | |
| saved_model/**/* filter=lfs diff=lfs merge=lfs -text | |
| *.tar.* filter=lfs diff=lfs merge=lfs -text | |
| *.tar filter=lfs diff=lfs merge=lfs -text | |
| *.tflite filter=lfs diff=lfs merge=lfs -text | |
| *.tgz filter=lfs diff=lfs merge=lfs -text | |
| *.wasm filter=lfs diff=lfs merge=lfs -text | |
| *.xz filter=lfs diff=lfs merge=lfs -text | |
| *.zip filter=lfs diff=lfs merge=lfs -text | |
| *.zst filter=lfs diff=lfs merge=lfs -text | |
| *tfevents* filter=lfs diff=lfs merge=lfs -text | |
| --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.env --- | |
| OPENAI_API_KEY=<REDACTED-revoke-and-rotate-this-key-and-keep-.env-out-of-version-control> | |
| --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py --- | |
| import openai | |
| from typing import List | |
class QASystem:
    """Answer questions about supplied context via the OpenAI chat API."""

    def __init__(self, api_key: str, model: str = "gpt-4",
                 temperature: float = 0, max_tokens: int = 500):
        """Configure the OpenAI key and the completion settings.

        Args:
            api_key: OpenAI API key; stored on the ``openai`` module because
                the request below goes through the module-level client.
            model: Chat model name sent with each request.
            temperature: Sampling temperature sent with each request.
            max_tokens: Upper bound on the length of the generated answer.
        """
        openai.api_key = api_key
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

    @staticmethod
    def _build_prompt(question: str, context: List[str]) -> str:
        """Assemble the user prompt from the question and context chunks."""
        joined_context = ' '.join(context)
        # Built line by line so source-code indentation never leaks into
        # the text sent to the model.
        return (
            "Based on the context provided below, answer the question.\n"
            "If the answer is not in the context, respond with "
            "\"The answer is not in the provided context.\"\n"
            "Context:\n"
            f"{joined_context}\n"
            f"Question: {question}\n"
        )

    def generate_answer(self, question: str, context: List[str]) -> str:
        """Ask the chat model to answer ``question`` using only ``context``.

        Args:
            question: The user's question.
            context: Text chunks to ground the answer; joined with spaces.

        Returns:
            The model's reply text, or an empty string when the reply
            carries no text content.
        """
        response = openai.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
                {"role": "user", "content": self._build_prompt(question, context)}
            ],
            temperature=self.temperature,
            max_tokens=self.max_tokens
        )
        content = response.choices[0].message.content
        # `content` is optional in the API response; keep the declared
        # `str` return type instead of passing None on to the caller.
        return content if content is not None else ""
| --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py --- | |
| import openai | |
| import numpy as np | |
| import faiss | |
| from typing import List | |
class EmbeddingsManager:
    """Embed text chunks with the OpenAI API and search them through FAISS."""

    def __init__(self, api_key: str, model: str = "text-embedding-ada-002",
                 batch_size: int = 10):
        """Store settings and start with an empty index.

        Args:
            api_key: OpenAI API key.
            model: Embedding model used for both chunks and queries.
            batch_size: Number of chunks sent per embedding request.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.api_key = api_key
        # The module-level `openai.*` calls below are not given the key
        # explicitly, so register it globally here.
        openai.api_key = api_key
        self.model = model
        self.batch_size = batch_size
        self.index = None  # FAISS index; None until embeddings exist
        self.chunks = []   # chunk texts, aligned row-for-row with the index

    def generate_embeddings(self, text_chunks: List[str]):
        """Embed ``text_chunks`` and rebuild the search index from scratch.

        An empty input clears the index instead of failing. The index and the
        stored chunks are replaced only after every batch has been embedded.
        """
        # Snapshot the input: callers may keep extending their own list, and
        # the stored chunks must stay aligned with the index rows.
        chunks = list(text_chunks)
        if not chunks:
            self.index = None
            self.chunks = []
            return

        embeddings = []
        for start in range(0, len(chunks), self.batch_size):
            response = openai.embeddings.create(
                input=chunks[start:start + self.batch_size],
                model=self.model
            )
            embeddings.extend(item.embedding for item in response.data)

        vectors = np.asarray(embeddings, dtype='float32')
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)
        self.index = index
        self.chunks = chunks

    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Return up to ``k`` stored chunks closest to ``query``.

        Returns an empty list when nothing has been indexed yet or ``k`` is
        not positive.
        """
        if self.index is None or not self.chunks or k < 1:
            return []
        response = openai.embeddings.create(
            input=[query],
            model=self.model
        )
        query_vector = np.asarray([response.data[0].embedding], dtype='float32')
        # Never ask for more neighbours than there are stored chunks.
        _distances, neighbours = self.index.search(
            query_vector,
            min(k, len(self.chunks))
        )
        # -1 marks an unfilled result slot; skip it defensively.
        return [self.chunks[i] for i in neighbours[0] if i != -1]
| --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py --- | |
| import PyPDF2 | |
| from typing import List, Dict | |
class PDFProcessor:
    """Extract per-page text from PDFs and split text into word-aligned chunks."""

    def __init__(self):
        # Page index -> text of the most recently extracted PDF.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from a PDF, keyed by zero-based page index.

        A fresh mapping is built on every call, so pages of a previously
        processed (longer) PDF never leak into the result. Pages without
        extractable text map to an empty string.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = {
            page_num: page.extract_text() or ""
            for page_num, page in enumerate(pdf_reader.pages)
        }
        self.pages = pages
        return pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split ``text`` into space-joined chunks of at most ``chunk_size`` chars.

        Words are never split: a single word longer than ``chunk_size``
        becomes a chunk of its own. Empty or whitespace-only text yields an
        empty list, and no empty chunk is ever produced.
        """
        chunks = []
        current_words = []
        current_len = 0  # exact length of ' '.join(current_words)
        for word in text.split():
            # Length of the chunk if this word were appended to it.
            candidate_len = current_len + 1 + len(word) if current_words else len(word)
            if current_words and candidate_len > chunk_size:
                chunks.append(' '.join(current_words))
                current_words = [word]
                current_len = len(word)
            else:
                current_words.append(word)
                current_len = candidate_len
        if current_words:
            chunks.append(' '.join(current_words))
        return chunks
| --- File: /home/sk/Desktop/chat-with-pdf/utils/qa_utils.py --- | |
| import openai | |
| from typing import List | |
class QASystem:
    """Answer questions about supplied context via the OpenAI chat API."""

    def __init__(self, api_key: str, model: str = "gpt-4",
                 temperature: float = 0, max_tokens: int = 500):
        """Configure the OpenAI key and the completion settings.

        Args:
            api_key: OpenAI API key; stored on the ``openai`` module because
                the request below goes through the module-level client.
            model: Chat model name sent with each request.
            temperature: Sampling temperature sent with each request.
            max_tokens: Upper bound on the length of the generated answer.
        """
        openai.api_key = api_key
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

    @staticmethod
    def _build_prompt(question: str, context: List[str]) -> str:
        """Assemble the user prompt from the question and context chunks."""
        joined_context = ' '.join(context)
        # Built line by line so source-code indentation never leaks into
        # the text sent to the model.
        return (
            "Based on the context provided below, answer the question.\n"
            "If the answer is not in the context, respond with "
            "\"The answer is not in the provided context.\"\n"
            "Context:\n"
            f"{joined_context}\n"
            f"Question: {question}\n"
        )

    def generate_answer(self, question: str, context: List[str]) -> str:
        """Ask the chat model to answer ``question`` using only ``context``.

        Args:
            question: The user's question.
            context: Text chunks to ground the answer; joined with spaces.

        Returns:
            The model's reply text, or an empty string when the reply
            carries no text content.
        """
        response = openai.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
                {"role": "user", "content": self._build_prompt(question, context)}
            ],
            temperature=self.temperature,
            max_tokens=self.max_tokens
        )
        content = response.choices[0].message.content
        # `content` is optional in the API response; keep the declared
        # `str` return type instead of passing None on to the caller.
        return content if content is not None else ""
| --- File: /home/sk/Desktop/chat-with-pdf/utils/embeddings_utils.py --- | |
| import openai | |
| import numpy as np | |
| import faiss | |
| from typing import List | |
class EmbeddingsManager:
    """Embed text chunks with the OpenAI API and search them through FAISS."""

    def __init__(self, api_key: str, model: str = "text-embedding-ada-002",
                 batch_size: int = 10):
        """Store settings and start with an empty index.

        Args:
            api_key: OpenAI API key.
            model: Embedding model used for both chunks and queries.
            batch_size: Number of chunks sent per embedding request.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.api_key = api_key
        # The module-level `openai.*` calls below are not given the key
        # explicitly, so register it globally here.
        openai.api_key = api_key
        self.model = model
        self.batch_size = batch_size
        self.index = None  # FAISS index; None until embeddings exist
        self.chunks = []   # chunk texts, aligned row-for-row with the index

    def generate_embeddings(self, text_chunks: List[str]):
        """Embed ``text_chunks`` and rebuild the search index from scratch.

        An empty input clears the index instead of failing. The index and the
        stored chunks are replaced only after every batch has been embedded.
        """
        # Snapshot the input: callers may keep extending their own list, and
        # the stored chunks must stay aligned with the index rows.
        chunks = list(text_chunks)
        if not chunks:
            self.index = None
            self.chunks = []
            return

        embeddings = []
        for start in range(0, len(chunks), self.batch_size):
            response = openai.embeddings.create(
                input=chunks[start:start + self.batch_size],
                model=self.model
            )
            embeddings.extend(item.embedding for item in response.data)

        vectors = np.asarray(embeddings, dtype='float32')
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)
        self.index = index
        self.chunks = chunks

    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Return up to ``k`` stored chunks closest to ``query``.

        Returns an empty list when nothing has been indexed yet or ``k`` is
        not positive.
        """
        if self.index is None or not self.chunks or k < 1:
            return []
        response = openai.embeddings.create(
            input=[query],
            model=self.model
        )
        query_vector = np.asarray([response.data[0].embedding], dtype='float32')
        # Never ask for more neighbours than there are stored chunks.
        _distances, neighbours = self.index.search(
            query_vector,
            min(k, len(self.chunks))
        )
        # -1 marks an unfilled result slot; skip it defensively.
        return [self.chunks[i] for i in neighbours[0] if i != -1]
| --- File: /home/sk/Desktop/chat-with-pdf/utils/pdf_utils.py --- | |
| import PyPDF2 | |
| from typing import List, Dict | |
class PDFProcessor:
    """Extract per-page text from PDFs and split text into word-aligned chunks."""

    def __init__(self):
        # Page index -> text of the most recently extracted PDF.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from a PDF, keyed by zero-based page index.

        A fresh mapping is built on every call, so pages of a previously
        processed (longer) PDF never leak into the result. Pages without
        extractable text map to an empty string.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = {
            page_num: page.extract_text() or ""
            for page_num, page in enumerate(pdf_reader.pages)
        }
        self.pages = pages
        return pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split ``text`` into space-joined chunks of at most ``chunk_size`` chars.

        Words are never split: a single word longer than ``chunk_size``
        becomes a chunk of its own. Empty or whitespace-only text yields an
        empty list, and no empty chunk is ever produced.
        """
        chunks = []
        current_words = []
        current_len = 0  # exact length of ' '.join(current_words)
        for word in text.split():
            # Length of the chunk if this word were appended to it.
            candidate_len = current_len + 1 + len(word) if current_words else len(word)
            if current_words and candidate_len > chunk_size:
                chunks.append(' '.join(current_words))
                current_words = [word]
                current_len = len(word)
            else:
                current_words.append(word)
                current_len = candidate_len
        if current_words:
            chunks.append(' '.join(current_words))
        return chunks