hamxaameer commited on
Commit
ed0b266
·
verified ·
1 Parent(s): 1604786

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -38
app.py CHANGED
@@ -188,51 +188,33 @@ def load_vector_store(embeddings):
188
  if len(matches) > 100:
189
  logger.info(f" Found {len(matches)} potential document fragments")
190
 
191
- # Get total vectors in index
192
- num_vectors = index.ntotal
193
- logger.info(f" FAISS index has {num_vectors} vectors")
194
-
195
- # Create documents matching the number of vectors
196
- new_docstore_dict = {}
197
- index_to_docstore_id = {}
198
-
199
- # Use the actual number of vectors, not extracted matches
200
- for idx in range(min(num_vectors, len(matches))):
201
  try:
202
- # Get content from matches
203
- if idx < len(matches):
204
- content = matches[idx].decode('utf-8', errors='ignore').strip()
205
- else:
206
- content = f"Fashion document {idx}"
207
-
208
- if len(content) < 50:
209
- content = f"Fashion advice and style guide entry {idx}"
210
-
211
- # Create document with string ID
212
- doc_id = str(idx)
213
- new_doc = Document(
214
- page_content=content,
215
- metadata={"source": "reconstructed"}
216
- )
217
- new_docstore_dict[doc_id] = new_doc
218
- # CRITICAL: Use string keys for index_to_docstore_id
219
- index_to_docstore_id[str(idx)] = doc_id
220
- except Exception as e:
221
- logger.warning(f" Error creating doc {idx}: {e}")
222
  continue
223
 
224
- logger.info(f" ✅ Reconstructed {len(new_docstore_dict)} documents from raw data")
 
225
 
226
- docstore = InMemoryDocstore(new_docstore_dict)
 
227
 
228
- vectorstore = FAISS(
229
- embedding_function=embeddings,
230
- index=index,
231
- docstore=docstore,
232
- index_to_docstore_id=index_to_docstore_id
233
  )
234
 
235
- logger.info(f"✅ FAISS vector store reconstructed from raw data")
236
  return vectorstore
237
  else:
238
  raise Exception("Could not extract enough document content from pickle")
 
188
  if len(matches) > 100:
189
  logger.info(f" Found {len(matches)} potential document fragments")
190
 
191
+ # Create documents from extracted text
192
+ documents = []
193
+ for idx, match in enumerate(matches[:5000]): # Use first 5000 quality matches
 
 
 
 
 
 
 
194
  try:
195
+ content = match.decode('utf-8', errors='ignore').strip()
196
+ if len(content) >= 100: # Only high-quality, substantial content
197
+ doc = Document(
198
+ page_content=content,
199
+ metadata={"source": "reconstructed", "id": idx}
200
+ )
201
+ documents.append(doc)
202
+ except:
 
 
 
 
 
 
 
 
 
 
 
 
203
  continue
204
 
205
+ if len(documents) < 100:
206
+ raise Exception(f"Only extracted {len(documents)} documents, need at least 100")
207
 
208
+ logger.info(f" ✅ Extracted {len(documents)} high-quality documents")
209
+ logger.info(f" 🔄 Rebuilding FAISS index from scratch...")
210
 
211
+ # Create NEW FAISS index from documents (ignore old corrupted index)
212
+ vectorstore = FAISS.from_documents(
213
+ documents=documents,
214
+ embedding=embeddings
 
215
  )
216
 
217
+ logger.info(f"✅ FAISS vector store rebuilt from {len(documents)} documents")
218
  return vectorstore
219
  else:
220
  raise Exception("Could not extract enough document content from pickle")