Update app.py
app.py
CHANGED
@@ -188,51 +188,33 @@ def load_vector_store(embeddings):
         if len(matches) > 100:
             logger.info(f"  Found {len(matches)} potential document fragments")
 
-            #
-
-
-
-            # Create documents matching the number of vectors
-            new_docstore_dict = {}
-            index_to_docstore_id = {}
-
-            # Use the actual number of vectors, not extracted matches
-            for idx in range(min(num_vectors, len(matches))):
+            # Create documents from extracted text
+            documents = []
+            for idx, match in enumerate(matches[:5000]):  # Use first 5000 quality matches
                 try:
-
-                    if
-
-
-
-
-
-
-
-                    # Create document with string ID
-                    doc_id = str(idx)
-                    new_doc = Document(
-                        page_content=content,
-                        metadata={"source": "reconstructed"}
-                    )
-                    new_docstore_dict[doc_id] = new_doc
-                    # CRITICAL: Use string keys for index_to_docstore_id
-                    index_to_docstore_id[str(idx)] = doc_id
-                except Exception as e:
-                    logger.warning(f"  Error creating doc {idx}: {e}")
+                    content = match.decode('utf-8', errors='ignore').strip()
+                    if len(content) >= 100:  # Only high-quality, substantial content
+                        doc = Document(
+                            page_content=content,
+                            metadata={"source": "reconstructed", "id": idx}
+                        )
+                        documents.append(doc)
+                except:
                     continue
 
-
+            if len(documents) < 100:
+                raise Exception(f"Only extracted {len(documents)} documents, need at least 100")
 
-
+            logger.info(f"  ✅ Extracted {len(documents)} high-quality documents")
+            logger.info(f"  🔄 Rebuilding FAISS index from scratch...")
 
-
-
-
-
-                index_to_docstore_id=index_to_docstore_id
+            # Create NEW FAISS index from documents (ignore old corrupted index)
+            vectorstore = FAISS.from_documents(
+                documents=documents,
+                embedding=embeddings
             )
 
-            logger.info(f"✅ FAISS vector store
+            logger.info(f"✅ FAISS vector store rebuilt from {len(documents)} documents")
             return vectorstore
         else:
             raise Exception("Could not extract enough document content from pickle")
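
For context, a minimal sketch of the rebuild path this commit switches to, assuming LangChain's FAISS wrapper. The imports, the embedding model name, and the sample documents below are illustrative assumptions; the diff itself only shows that an `embeddings` object is passed into load_vector_store.

    # Sketch only: assumes the langchain_community FAISS wrapper and an
    # illustrative HuggingFace embedding model, neither confirmed by the diff.
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS
    from langchain_core.documents import Document

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Documents shaped like the ones the diff reconstructs from the pickle.
    documents = [
        Document(page_content="First recovered fragment of text...",
                 metadata={"source": "reconstructed", "id": 0}),
        Document(page_content="Second recovered fragment of text...",
                 metadata={"source": "reconstructed", "id": 1}),
    ]

    # from_documents() embeds every document and builds a brand-new index,
    # so the old index and its index_to_docstore_id mapping are never reused.
    vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)
    print(vectorstore.similarity_search("recovered", k=1))

Rebuilding trades re-embedding cost for correctness: the removed code had to keep the vector count, the docstore dict, and the string-keyed index_to_docstore_id mapping mutually consistent by hand, while FAISS.from_documents derives all three from the document list.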