Spaces:

Inferno-721
/

Sutra_AI

Sleeping

App Files Files Community

Sutra_AI / extracted_text.txt

Inferno-721

Initial

0753d2e 11 months ago

raw

history blame contribute delete

19.8 kB

	--- File: /home/sk/Desktop/chat-with-pdf/app.py ---

	import streamlit as st
	import os
	from utils.pdf_utils import PDFProcessor
	from utils.embeddings_utils import EmbeddingsManager
	from utils.qa_utils import QASystem
	from dotenv import load_dotenv
	import openai
	import time

	def initialize_session_state():
	    """Seed ``st.session_state`` with every key the app reads later.

	    Keys that already exist are left untouched, so values survive Streamlit
	    reruns. The three service objects start as ``None`` placeholders, the
	    processed-file registry starts as an empty set, and the chunk store
	    starts as an empty list.
	    """
	    # (key, factory) pairs; a factory is only invoked when its key is missing,
	    # so each session gets its own fresh set/list.
	    default_factories = (
	        ('pdf_processor', lambda: None),
	        ('embeddings_manager', lambda: None),
	        ('qa_system', lambda: None),
	        ('processed_pdfs', set),
	        ('all_text_chunks', list),
	    )
	    state = st.session_state
	    for key, make_default in default_factories:
	        if key not in state:
	            state[key] = make_default()

	def main():
	    """Run the "Chat with PDF" Streamlit page.

	    Flow on every Streamlit rerun:

	    1. Load ``.env``, configure the page and seed the session state.
	    2. Render the sidebar help (plus token statistics when the
	       ``total_tokens_used`` key is present in the session state).
	    3. Read ``OPENAI_API_KEY`` from the environment; stop with an error
	       message when it is missing.
	    4. Lazily create the PDF processor, embeddings manager and QA system.
	    5. Process newly uploaded PDFs and rebuild the embeddings index.
	    6. When chunks exist, answer the user's question and show the context.
	    """
	    load_dotenv()
	    st.set_page_config(page_title="Chat with PDF", layout="wide")
	    st.title("📄💬 Chat with PDF")

	    initialize_session_state()

	    with st.sidebar:
	        st.header("🔍 How to Use")
	        st.markdown("""
	1. Upload PDF document(s)
	2. Ask questions about the content
	3. View answers and relevant context
	""")
	        if 'total_tokens_used' in st.session_state:
	            st.markdown("---")
	            st.markdown("### 📊 Usage Statistics")
	            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")

	    api_key = os.getenv("OPENAI_API_KEY")
	    if not api_key:
	        st.error("OpenAI API key not found in .env file!")
	        return

	    openai.api_key = api_key

	    if not st.session_state['pdf_processor']:
	        st.session_state['pdf_processor'] = PDFProcessor()
	    if not st.session_state['embeddings_manager']:
	        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
	    if not st.session_state['qa_system']:
	        st.session_state['qa_system'] = QASystem(api_key)

	    st.subheader("📤 Upload PDFs")
	    uploaded_files = st.file_uploader(
	        "Upload PDF documents",
	        type=['pdf'],
	        accept_multiple_files=True
	    )

	    if uploaded_files:
	        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
	        if new_files:
	            processor = st.session_state['pdf_processor']
	            # Stage results locally and commit them to the session state only
	            # after the embeddings were rebuilt. Otherwise a failure would
	            # leave chunks without a matching index, and the files would be
	            # marked as processed and never retried.
	            new_chunks = []
	            processed_names = []
	            with st.spinner("Processing PDFs..."):
	                for pdf_file in new_files:
	                    try:
	                        pages = processor.extract_text(pdf_file)
	                        file_chunks = []
	                        for page_text in pages.values():
	                            file_chunks.extend(processor.chunk_text(page_text))
	                    except Exception as e:
	                        # Drop the partial chunks of a file that failed midway
	                        # so a retry does not duplicate them.
	                        st.error(f"Error processing {pdf_file.name}: {str(e)}")
	                        continue
	                    new_chunks.extend(file_chunks)
	                    processed_names.append(pdf_file.name)

	            if new_chunks:
	                candidate_chunks = st.session_state['all_text_chunks'] + new_chunks
	                with st.spinner("Generating embeddings..."):
	                    try:
	                        st.session_state['embeddings_manager'].generate_embeddings(
	                            candidate_chunks
	                        )
	                    except Exception as e:
	                        st.error(f"Error generating embeddings: {str(e)}")
	                        return
	                st.session_state['all_text_chunks'] = candidate_chunks
	                st.session_state['processed_pdfs'].update(processed_names)
	                st.success("✅ Documents processed!")
	            elif processed_names:
	                # Files were read but produced no text: remember them so they
	                # are not re-read on every rerun, and skip the embedding call.
	                st.session_state['processed_pdfs'].update(processed_names)
	                st.warning("No extractable text was found in the uploaded PDF(s).")

	    if st.session_state['all_text_chunks']:
	        st.write("---")
	        st.subheader("❓ Ask Questions About Your Documents")
	        question = st.text_input("Enter your question:")
	        if question:
	            try:
	                with st.spinner("Searching for relevant information..."):
	                    relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
	                        question,
	                        k=3
	                    )
	                    answer = st.session_state['qa_system'].generate_answer(
	                        question,
	                        relevant_chunks
	                    )
	                st.markdown("### 🤖 Answer:")
	                st.write(answer)
	                with st.expander("🔍 View Source Context"):
	                    for i, chunk in enumerate(relevant_chunks, 1):
	                        st.markdown(f"Context {i}:")
	                        st.write(chunk)
	                        st.markdown("---")
	            # The project calls the openai>=1.0 client API
	            # (openai.chat.completions / openai.embeddings), where the
	            # exception is exposed as openai.RateLimitError; the older
	            # openai.error module does not exist there.
	            except openai.RateLimitError:
	                st.error("Rate limit exceeded. Please try again later.")
	            except Exception as e:
	                st.error(f"Error: {str(e)}")


	if __name__ == "__main__":
	    main()


	--- File: /home/sk/Desktop/chat-with-pdf/requirements.txt ---

	streamlit
	PyPDF2
	openai
	python-dotenv
	faiss-cpu
	numpy
	pdf2image
	Pillow

	--- File: /home/sk/Desktop/chat-with-pdf/.env ---

	# SECURITY: a real API key was committed here in plaintext. Revoke/rotate it and keep .env out of version control.
	OPENAI_API_KEY=<your-openai-api-key>

	--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/app.py ---

	import streamlit as st
	import os
	from utils.pdf_utils import PDFProcessor
	from utils.embeddings_utils import EmbeddingsManager
	from utils.qa_utils import QASystem
	from dotenv import load_dotenv
	import openai
	import time

	def initialize_session_state():
	    """Seed ``st.session_state`` with every key the app reads later.

	    Keys that already exist are left untouched, so values survive Streamlit
	    reruns. The three service objects start as ``None`` placeholders, the
	    processed-file registry starts as an empty set, and the chunk store
	    starts as an empty list.
	    """
	    # (key, factory) pairs; a factory is only invoked when its key is missing,
	    # so each session gets its own fresh set/list.
	    default_factories = (
	        ('pdf_processor', lambda: None),
	        ('embeddings_manager', lambda: None),
	        ('qa_system', lambda: None),
	        ('processed_pdfs', set),
	        ('all_text_chunks', list),
	    )
	    state = st.session_state
	    for key, make_default in default_factories:
	        if key not in state:
	            state[key] = make_default()

	def main():
	    """Run the "Chat with PDF" Streamlit page.

	    Flow on every Streamlit rerun:

	    1. Load ``.env``, configure the page and seed the session state.
	    2. Render the sidebar help (plus token statistics when the
	       ``total_tokens_used`` key is present in the session state).
	    3. Read ``OPENAI_API_KEY`` from the environment; stop with an error
	       message when it is missing.
	    4. Lazily create the PDF processor, embeddings manager and QA system.
	    5. Process newly uploaded PDFs and rebuild the embeddings index.
	    6. When chunks exist, answer the user's question and show the context.
	    """
	    load_dotenv()
	    st.set_page_config(page_title="Chat with PDF", layout="wide")
	    st.title("📄💬 Chat with PDF")

	    initialize_session_state()

	    with st.sidebar:
	        st.header("🔍 How to Use")
	        st.markdown("""
	1. Upload PDF document(s)
	2. Ask questions about the content
	3. View answers and relevant context
	""")
	        if 'total_tokens_used' in st.session_state:
	            st.markdown("---")
	            st.markdown("### 📊 Usage Statistics")
	            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")

	    api_key = os.getenv("OPENAI_API_KEY")
	    if not api_key:
	        st.error("OpenAI API key not found in .env file!")
	        return

	    openai.api_key = api_key

	    if not st.session_state['pdf_processor']:
	        st.session_state['pdf_processor'] = PDFProcessor()
	    if not st.session_state['embeddings_manager']:
	        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
	    if not st.session_state['qa_system']:
	        st.session_state['qa_system'] = QASystem(api_key)

	    st.subheader("📤 Upload PDFs")
	    uploaded_files = st.file_uploader(
	        "Upload PDF documents",
	        type=['pdf'],
	        accept_multiple_files=True
	    )

	    if uploaded_files:
	        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
	        if new_files:
	            processor = st.session_state['pdf_processor']
	            # Stage results locally and commit them to the session state only
	            # after the embeddings were rebuilt. Otherwise a failure would
	            # leave chunks without a matching index, and the files would be
	            # marked as processed and never retried.
	            new_chunks = []
	            processed_names = []
	            with st.spinner("Processing PDFs..."):
	                for pdf_file in new_files:
	                    try:
	                        pages = processor.extract_text(pdf_file)
	                        file_chunks = []
	                        for page_text in pages.values():
	                            file_chunks.extend(processor.chunk_text(page_text))
	                    except Exception as e:
	                        # Drop the partial chunks of a file that failed midway
	                        # so a retry does not duplicate them.
	                        st.error(f"Error processing {pdf_file.name}: {str(e)}")
	                        continue
	                    new_chunks.extend(file_chunks)
	                    processed_names.append(pdf_file.name)

	            if new_chunks:
	                candidate_chunks = st.session_state['all_text_chunks'] + new_chunks
	                with st.spinner("Generating embeddings..."):
	                    try:
	                        st.session_state['embeddings_manager'].generate_embeddings(
	                            candidate_chunks
	                        )
	                    except Exception as e:
	                        st.error(f"Error generating embeddings: {str(e)}")
	                        return
	                st.session_state['all_text_chunks'] = candidate_chunks
	                st.session_state['processed_pdfs'].update(processed_names)
	                st.success("✅ Documents processed!")
	            elif processed_names:
	                # Files were read but produced no text: remember them so they
	                # are not re-read on every rerun, and skip the embedding call.
	                st.session_state['processed_pdfs'].update(processed_names)
	                st.warning("No extractable text was found in the uploaded PDF(s).")

	    if st.session_state['all_text_chunks']:
	        st.write("---")
	        st.subheader("❓ Ask Questions About Your Documents")
	        question = st.text_input("Enter your question:")
	        if question:
	            try:
	                with st.spinner("Searching for relevant information..."):
	                    relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
	                        question,
	                        k=3
	                    )
	                    answer = st.session_state['qa_system'].generate_answer(
	                        question,
	                        relevant_chunks
	                    )
	                st.markdown("### 🤖 Answer:")
	                st.write(answer)
	                with st.expander("🔍 View Source Context"):
	                    for i, chunk in enumerate(relevant_chunks, 1):
	                        st.markdown(f"Context {i}:")
	                        st.write(chunk)
	                        st.markdown("---")
	            # The project calls the openai>=1.0 client API
	            # (openai.chat.completions / openai.embeddings), where the
	            # exception is exposed as openai.RateLimitError; the older
	            # openai.error module does not exist there.
	            except openai.RateLimitError:
	                st.error("Rate limit exceeded. Please try again later.")
	            except Exception as e:
	                st.error(f"Error: {str(e)}")


	if __name__ == "__main__":
	    main()


	--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/requirements.txt ---

	streamlit
	PyPDF2
	openai
	python-dotenv
	faiss-cpu
	numpy
	pdf2image
	Pillow

	--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.gitattributes ---

	*.7z filter=lfs diff=lfs merge=lfs -text
	*.arrow filter=lfs diff=lfs merge=lfs -text
	*.bin filter=lfs diff=lfs merge=lfs -text
	*.bz2 filter=lfs diff=lfs merge=lfs -text
	*.ckpt filter=lfs diff=lfs merge=lfs -text
	*.ftz filter=lfs diff=lfs merge=lfs -text
	*.gz filter=lfs diff=lfs merge=lfs -text
	*.h5 filter=lfs diff=lfs merge=lfs -text
	*.joblib filter=lfs diff=lfs merge=lfs -text
	*.lfs.* filter=lfs diff=lfs merge=lfs -text
	*.mlmodel filter=lfs diff=lfs merge=lfs -text
	*.model filter=lfs diff=lfs merge=lfs -text
	*.msgpack filter=lfs diff=lfs merge=lfs -text
	*.npy filter=lfs diff=lfs merge=lfs -text
	*.npz filter=lfs diff=lfs merge=lfs -text
	*.onnx filter=lfs diff=lfs merge=lfs -text
	*.ot filter=lfs diff=lfs merge=lfs -text
	*.parquet filter=lfs diff=lfs merge=lfs -text
	*.pb filter=lfs diff=lfs merge=lfs -text
	*.pickle filter=lfs diff=lfs merge=lfs -text
	*.pkl filter=lfs diff=lfs merge=lfs -text
	*.pt filter=lfs diff=lfs merge=lfs -text
	*.pth filter=lfs diff=lfs merge=lfs -text
	*.rar filter=lfs diff=lfs merge=lfs -text
	*.safetensors filter=lfs diff=lfs merge=lfs -text
	saved_model/**/* filter=lfs diff=lfs merge=lfs -text
	*.tar.* filter=lfs diff=lfs merge=lfs -text
	*.tar filter=lfs diff=lfs merge=lfs -text
	*.tflite filter=lfs diff=lfs merge=lfs -text
	*.tgz filter=lfs diff=lfs merge=lfs -text
	*.wasm filter=lfs diff=lfs merge=lfs -text
	*.xz filter=lfs diff=lfs merge=lfs -text
	*.zip filter=lfs diff=lfs merge=lfs -text
	*.zst filter=lfs diff=lfs merge=lfs -text
	*tfevents* filter=lfs diff=lfs merge=lfs -text


	--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.env ---

	# SECURITY: a real API key was committed here in plaintext. Revoke/rotate it and keep .env out of version control.
	OPENAI_API_KEY=<your-openai-api-key>

	--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py ---

	import openai
	from typing import List

	class QASystem:
	    """Answer questions with an OpenAI chat model, using supplied context."""

	    def __init__(self, api_key: str):
	        """Install ``api_key`` on the module-level ``openai`` configuration."""
	        openai.api_key = api_key

	    def generate_answer(self, question: str, context: List[str]) -> str:
	        """Ask the chat model to answer ``question`` from ``context``.

	        Args:
	            question: The user's question.
	            context: Text passages; they are joined with single spaces and
	                embedded in the prompt.

	        Returns:
	            The message content of the first completion choice.
	        """
	        joined_context = ' '.join(context)
	        prompt = f"""Based on the context provided below, answer the question.
	If the answer is not in the context, respond with "The answer is not in the provided context."

	Context:
	{joined_context}

	Question: {question}
	"""

	        system_message = {
	            "role": "system",
	            "content": "You are an assistant answering questions based on the provided context.",
	        }
	        user_message = {"role": "user", "content": prompt}

	        # temperature=0 and max_tokens=500 are the original request settings.
	        completion = openai.chat.completions.create(
	            model="gpt-4",
	            messages=[system_message, user_message],
	            temperature=0,
	            max_tokens=500,
	        )
	        first_choice = completion.choices[0]
	        return first_choice.message.content


	--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py ---

	import openai
	import numpy as np
	import faiss
	from typing import List

	class EmbeddingsManager:
	    """Build and query a FAISS L2 index over OpenAI text embeddings.

	    ``api_key`` is stored on the instance; the OpenAI calls in this class go
	    through the module-level ``openai`` client and do not pass the key
	    explicitly.
	    """

	    def __init__(self, api_key: str):
	        self.api_key = api_key
	        self.index = None  # FAISS index; None until embeddings were generated
	        self.chunks = []   # chunk texts, position-aligned with the index rows

	    def generate_embeddings(self, text_chunks: List[str]):
	        """Generate embeddings for text chunks using OpenAI API.

	        Replaces any previously built index. The new index and chunk list are
	        only stored once every batch was embedded successfully, so a failing
	        API call leaves the previous state intact.

	        Args:
	            text_chunks: Texts to embed. An empty list clears the index
	                instead of failing on a missing first embedding.
	        """
	        # Copy so later mutation of the caller's list cannot desynchronise
	        # the stored chunks from the index rows.
	        chunks = list(text_chunks)
	        if not chunks:
	            self.index = None
	            self.chunks = []
	            return

	        batch_size = 10
	        embeddings = []
	        for start in range(0, len(chunks), batch_size):
	            response = openai.embeddings.create(
	                input=chunks[start:start + batch_size],
	                model="text-embedding-ada-002"
	            )
	            embeddings.extend(item.embedding for item in response.data)

	        # Create FAISS index
	        embeddings_array = np.array(embeddings).astype('float32')
	        index = faiss.IndexFlatL2(embeddings_array.shape[1])
	        index.add(embeddings_array)
	        self.index = index
	        self.chunks = chunks

	    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
	        """Find most relevant text chunks for a given query.

	        Args:
	            query: Text to search for.
	            k: Maximum number of chunks to return.

	        Returns:
	            Up to ``k`` chunk texts in the order returned by the index
	            search. An empty list when no index has been built yet, there
	            are no chunks, or ``k`` is not positive.
	        """
	        if self.index is None or not self.chunks or k <= 0:
	            return []

	        response = openai.embeddings.create(
	            input=[query],
	            model="text-embedding-ada-002"
	        )
	        query_embedding = response.data[0].embedding

	        _, neighbor_ids = self.index.search(
	            np.array([query_embedding]).astype('float32'),
	            min(k, len(self.chunks))
	        )

	        # The original code filtered -1 ids; the range check covers that and
	        # also guards against any id outside the stored chunk list.
	        return [self.chunks[i] for i in neighbor_ids[0] if 0 <= i < len(self.chunks)]


	--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py ---

	import PyPDF2
	from typing import List, Dict

	class PDFProcessor:
	    """Extract per-page text from PDFs and split text into word-based chunks."""

	    def __init__(self):
	        self.pages = {}  # page number -> text of the most recently read PDF

	    def extract_text(self, pdf_file) -> Dict[int, str]:
	        """Extract text from PDF and return a dictionary of page numbers and text.

	        A fresh mapping is built on every call. Reusing one dictionary across
	        files would leave pages of an earlier, longer PDF in the result for a
	        later, shorter one.

	        Args:
	            pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

	        Returns:
	            Mapping of 0-based page number to that page's text. A falsy
	            extraction result is stored as an empty string. The same mapping
	            is kept on ``self.pages``.
	        """
	        pdf_reader = PyPDF2.PdfReader(pdf_file)
	        pages = {
	            page_num: page.extract_text() or ""
	            for page_num, page in enumerate(pdf_reader.pages)
	        }
	        self.pages = pages
	        return pages

	    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
	        """Split text into chunks of specified size.

	        Words (whitespace-separated) are never split. Each chunk is the words
	        joined by single spaces and is at most ``chunk_size`` characters long,
	        except that a single word longer than ``chunk_size`` becomes a chunk
	        of its own.

	        Args:
	            text: Text to split; empty or ``None`` yields no chunks.
	            chunk_size: Maximum chunk length in characters.

	        Returns:
	            The list of non-empty chunks in document order.
	        """
	        if not text:
	            return []

	        chunks = []
	        current_chunk = []
	        current_size = 0  # length of ' '.join(current_chunk)

	        for word in text.split():
	            separator = 1 if current_chunk else 0  # space before a non-first word
	            # Flush only a non-empty chunk, so an oversized first word does
	            # not produce an empty '' chunk.
	            if current_chunk and current_size + separator + len(word) > chunk_size:
	                chunks.append(' '.join(current_chunk))
	                current_chunk = [word]
	                current_size = len(word)
	            else:
	                current_chunk.append(word)
	                current_size += separator + len(word)

	        if current_chunk:
	            chunks.append(' '.join(current_chunk))

	        return chunks


	--- File: /home/sk/Desktop/chat-with-pdf/utils/qa_utils.py ---

	import openai
	from typing import List

	class QASystem:
	    """Answer questions with an OpenAI chat model, using supplied context."""

	    def __init__(self, api_key: str):
	        """Install ``api_key`` on the module-level ``openai`` configuration."""
	        openai.api_key = api_key

	    def generate_answer(self, question: str, context: List[str]) -> str:
	        """Ask the chat model to answer ``question`` from ``context``.

	        Args:
	            question: The user's question.
	            context: Text passages; they are joined with single spaces and
	                embedded in the prompt.

	        Returns:
	            The message content of the first completion choice.
	        """
	        joined_context = ' '.join(context)
	        prompt = f"""Based on the context provided below, answer the question.
	If the answer is not in the context, respond with "The answer is not in the provided context."

	Context:
	{joined_context}

	Question: {question}
	"""

	        system_message = {
	            "role": "system",
	            "content": "You are an assistant answering questions based on the provided context.",
	        }
	        user_message = {"role": "user", "content": prompt}

	        # temperature=0 and max_tokens=500 are the original request settings.
	        completion = openai.chat.completions.create(
	            model="gpt-4",
	            messages=[system_message, user_message],
	            temperature=0,
	            max_tokens=500,
	        )
	        first_choice = completion.choices[0]
	        return first_choice.message.content


	--- File: /home/sk/Desktop/chat-with-pdf/utils/embeddings_utils.py ---

	import openai
	import numpy as np
	import faiss
	from typing import List

	class EmbeddingsManager:
	    """Build and query a FAISS L2 index over OpenAI text embeddings.

	    ``api_key`` is stored on the instance; the OpenAI calls in this class go
	    through the module-level ``openai`` client and do not pass the key
	    explicitly.
	    """

	    def __init__(self, api_key: str):
	        self.api_key = api_key
	        self.index = None  # FAISS index; None until embeddings were generated
	        self.chunks = []   # chunk texts, position-aligned with the index rows

	    def generate_embeddings(self, text_chunks: List[str]):
	        """Generate embeddings for text chunks using OpenAI API.

	        Replaces any previously built index. The new index and chunk list are
	        only stored once every batch was embedded successfully, so a failing
	        API call leaves the previous state intact.

	        Args:
	            text_chunks: Texts to embed. An empty list clears the index
	                instead of failing on a missing first embedding.
	        """
	        # Copy so later mutation of the caller's list cannot desynchronise
	        # the stored chunks from the index rows.
	        chunks = list(text_chunks)
	        if not chunks:
	            self.index = None
	            self.chunks = []
	            return

	        batch_size = 10
	        embeddings = []
	        for start in range(0, len(chunks), batch_size):
	            response = openai.embeddings.create(
	                input=chunks[start:start + batch_size],
	                model="text-embedding-ada-002"
	            )
	            embeddings.extend(item.embedding for item in response.data)

	        # Create FAISS index
	        embeddings_array = np.array(embeddings).astype('float32')
	        index = faiss.IndexFlatL2(embeddings_array.shape[1])
	        index.add(embeddings_array)
	        self.index = index
	        self.chunks = chunks

	    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
	        """Find most relevant text chunks for a given query.

	        Args:
	            query: Text to search for.
	            k: Maximum number of chunks to return.

	        Returns:
	            Up to ``k`` chunk texts in the order returned by the index
	            search. An empty list when no index has been built yet, there
	            are no chunks, or ``k`` is not positive.
	        """
	        if self.index is None or not self.chunks or k <= 0:
	            return []

	        response = openai.embeddings.create(
	            input=[query],
	            model="text-embedding-ada-002"
	        )
	        query_embedding = response.data[0].embedding

	        _, neighbor_ids = self.index.search(
	            np.array([query_embedding]).astype('float32'),
	            min(k, len(self.chunks))
	        )

	        # The original code filtered -1 ids; the range check covers that and
	        # also guards against any id outside the stored chunk list.
	        return [self.chunks[i] for i in neighbor_ids[0] if 0 <= i < len(self.chunks)]


	--- File: /home/sk/Desktop/chat-with-pdf/utils/pdf_utils.py ---

	import PyPDF2
	from typing import List, Dict

	class PDFProcessor:
	    """Extract per-page text from PDFs and split text into word-based chunks."""

	    def __init__(self):
	        self.pages = {}  # page number -> text of the most recently read PDF

	    def extract_text(self, pdf_file) -> Dict[int, str]:
	        """Extract text from PDF and return a dictionary of page numbers and text.

	        A fresh mapping is built on every call. Reusing one dictionary across
	        files would leave pages of an earlier, longer PDF in the result for a
	        later, shorter one.

	        Args:
	            pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

	        Returns:
	            Mapping of 0-based page number to that page's text. A falsy
	            extraction result is stored as an empty string. The same mapping
	            is kept on ``self.pages``.
	        """
	        pdf_reader = PyPDF2.PdfReader(pdf_file)
	        pages = {
	            page_num: page.extract_text() or ""
	            for page_num, page in enumerate(pdf_reader.pages)
	        }
	        self.pages = pages
	        return pages

	    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
	        """Split text into chunks of specified size.

	        Words (whitespace-separated) are never split. Each chunk is the words
	        joined by single spaces and is at most ``chunk_size`` characters long,
	        except that a single word longer than ``chunk_size`` becomes a chunk
	        of its own.

	        Args:
	            text: Text to split; empty or ``None`` yields no chunks.
	            chunk_size: Maximum chunk length in characters.

	        Returns:
	            The list of non-empty chunks in document order.
	        """
	        if not text:
	            return []

	        chunks = []
	        current_chunk = []
	        current_size = 0  # length of ' '.join(current_chunk)

	        for word in text.split():
	            separator = 1 if current_chunk else 0  # space before a non-first word
	            # Flush only a non-empty chunk, so an oversized first word does
	            # not produce an empty '' chunk.
	            if current_chunk and current_size + separator + len(word) > chunk_size:
	                chunks.append(' '.join(current_chunk))
	                current_chunk = [word]
	                current_size = len(word)
	            else:
	                current_chunk.append(word)
	                current_size += separator + len(word)

	        if current_chunk:
	            chunks.append(' '.join(current_chunk))

	        return chunks