hamxaameer committed
Commit 89410ee · verified · 1 Parent(s): eceb5f0

Update app.py

Files changed (1):
  1. app.py +116 -26
app.py CHANGED
@@ -44,8 +44,9 @@ def initialize_llm():
     logger.info("🔄 Initializing FREE local language model...")
 
     BACKUP_MODELS = [
-        "google/flan-t5-base",   # Primary - 250M, very fast on CPU
-        "google/flan-t5-large",  # Backup - 780M, slower but better
+        "microsoft/phi-2",                     # Primary - 2.7B, excellent quality, fast
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Backup - 1.1B, very fast
+        "google/flan-t5-large",                # Fallback - 780M
     ]
 
     for model_name in BACKUP_MODELS:
@@ -53,19 +54,37 @@ def initialize_llm():
             logger.info(f" Trying {model_name}...")
             device = 0 if torch.cuda.is_available() else -1
 
-            # Use text2text-generation for T5 models (not text-generation)
-            task = "text2text-generation" if "t5" in model_name.lower() else "text-generation"
+            # Determine task and model type
+            if "t5" in model_name.lower():
+                task = "text2text-generation"
+                model_type = "t5"
+            elif "phi" in model_name.lower():
+                task = "text-generation"
+                model_type = "phi"
+            elif "tinyllama" in model_name.lower():
+                task = "text-generation"
+                model_type = "tinyllama"
+            else:
+                task = "text-generation"
+                model_type = "instruct"
+
+            # Model-specific kwargs for optimization
+            model_kwargs = {
+                "low_cpu_mem_usage": True,
+                "trust_remote_code": True  # Required for Phi-2
+            }
 
             llm_client = pipeline(
                 task,
                 model=model_name,
                 device=device,
-                max_length=300,
+                max_length=400,  # Good length for detailed answers
                 truncation=True,
+                model_kwargs=model_kwargs
             )
 
             CONFIG["llm_model"] = model_name
-            CONFIG["model_type"] = "t5" if "t5" in model_name.lower() else "instruct"
+            CONFIG["model_type"] = model_type
             logger.info(f"✅ FREE LLM initialized: {model_name}")
             logger.info(f" Device: {'GPU' if device == 0 else 'CPU'}")
             return llm_client
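The sketch below (not part of app.py; the test prompt and print are illustrative) loads the new primary model with the same pipeline() arguments the loop above passes, so the download size and CPU memory cost can be checked in isolation before deploying the Space.

```python
# Standalone smoke test for the new primary backup model (sketch, not from app.py).
# phi-2 has ~2.7B parameters, so expect a multi-gigabyte download and several GB of RAM on CPU.
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    "text-generation",
    model="microsoft/phi-2",
    device=device,
    model_kwargs={"low_cpu_mem_usage": True, "trust_remote_code": True},
)

# Illustrative prompt in the same "Instruct:/Output:" style the app now uses for Phi-2.
out = pipe("Instruct: Name one breathable summer fabric.\nOutput:", max_new_tokens=20)
print(out[0]["generated_text"])
```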
@@ -356,33 +375,68 @@ def generate_llm_answer(
         repetition_penalty = 1.25
 
     # Create prompt based on model type
-    if CONFIG.get("model_type") == "t5":
-        # T5 needs simple input-output format
-        user_prompt = f"Question: {query}\n\nContext: {context_text[:800]}\n\nProvide a helpful fashion answer:"
+    model_type = CONFIG.get("model_type", "instruct")
+
+    if model_type == "t5":
+        # T5 needs simple format
+        user_prompt = f"Question: {query}\n\nContext: {context_text[:800]}\n\nProvide helpful fashion advice:"
+    elif model_type == "phi":
+        # Phi-2 format (no special tokens needed)
+        user_prompt = f"""Instruct: You are a fashion advisor. Use the following knowledge to answer the question.
+
+Fashion Knowledge:
+{context_text}
+
+Question: {query}
+
+Output: Provide specific, helpful fashion advice in 150-200 words."""
+    elif model_type == "tinyllama":
+        # TinyLlama chat format
+        user_prompt = f"""<|system|>
+You are a helpful fashion advisor.</s>
+<|user|>
+Use this fashion knowledge to answer: {context_text[:1000]}
+
+Question: {query}</s>
+<|assistant|>"""
     else:
-        # Instruct models use INST format
+        # Generic instruct format
         user_prompt = f"""[INST] Question: {query}
 
 Fashion Knowledge:
 {context_text}
 
-Answer the question using the knowledge above. Be specific and helpful (100-250 words). [/INST]"""
+Answer the question using the knowledge above. Be specific and helpful (150-200 words). [/INST]"""
 
     try:
         logger.info(f" → Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_tokens})...")
 
         # Call pipeline with model-specific parameters
-        if CONFIG.get("model_type") == "t5":
-            # T5 uses max_length not max_new_tokens
+        if model_type == "t5":
+            # T5 uses max_length
             output = llm_client(
                 user_prompt,
-                max_length=200,  # Shorter for speed
-                temperature=temperature,
-                top_p=top_p,
+                max_length=150,
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True,
+                num_beams=1,
+                early_stopping=True
+            )
+        elif model_type in ["phi", "tinyllama"]:
+            # Phi-2 and TinyLlama - optimized for quality and speed
+            output = llm_client(
+                user_prompt,
+                max_new_tokens=min(max_tokens, 300),  # Cap at 300 for speed
+                temperature=0.75,  # Balanced creativity
+                top_p=0.92,
+                repetition_penalty=1.15,
                 do_sample=True,
+                return_full_text=False,
+                pad_token_id=llm_client.tokenizer.eos_token_id if hasattr(llm_client.tokenizer, 'eos_token_id') else None
             )
         else:
-            # Other models use max_new_tokens
+            # Other models
            output = llm_client(
                 user_prompt,
                 max_new_tokens=max_tokens,
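The TinyLlama branch above hard-codes Zephyr-style chat markers. An alternative sketch (not part of this commit, shown for comparison) builds the same kind of prompt from the tokenizer's bundled chat template, which avoids drift if the template ever changes; it assumes it runs inside generate_llm_answer, where query, context_text and llm_client are already in scope.

```python
# Alternative sketch: derive the TinyLlama prompt from the model's own chat template
# instead of hand-writing <|system|>/<|user|>/<|assistant|> markers (assumed equivalent).
messages = [
    {"role": "system", "content": "You are a helpful fashion advisor."},
    {"role": "user", "content": f"Use this fashion knowledge to answer: {context_text[:1000]}\n\nQuestion: {query}"},
]
user_prompt = llm_client.tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return a string, not token ids
    add_generation_prompt=True,  # appends the assistant turn marker
)
```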
@@ -391,7 +445,7 @@ Answer the question using the knowledge above. Be specific and helpful (100-250
                 repetition_penalty=repetition_penalty,
                 do_sample=True,
                 return_full_text=False,
-                pad_token_id=llm_client.tokenizer.eos_token_id
+                pad_token_id=llm_client.tokenizer.eos_token_id if hasattr(llm_client.tokenizer, 'eos_token_id') else None
             )
 
         # Extract generated text
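One note on the pad_token_id guard added above: transformers tokenizers expose eos_token_id as a property that returns None when no EOS token is defined, so the hasattr() check is effectively always true and the expression still evaluates to the EOS id or None. If that is the intent, the shorter form in this sketch (not part of the commit) behaves the same.

```python
# Equivalent sketch: eos_token_id is always present as an attribute; it may simply be None.
pad_token_id = getattr(llm_client.tokenizer, "eos_token_id", None)
```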
@@ -488,26 +542,62 @@ def generate_answer_langchain(
 # GRADIO INTERFACE
 # ============================================================================
 
-def fashion_chatbot(message: str, history: List[List[str]]) -> str:
+def fashion_chatbot(message: str, history: List[List[str]]):
     """
-    Chatbot function for Gradio interface
+    Chatbot function for Gradio interface with streaming
     """
     try:
         if not message or not message.strip():
-            return "Please ask a fashion-related question!"
+            yield "Please ask a fashion-related question!"
+            return
 
-        # Generate answer using RAG pipeline
-        answer = generate_answer_langchain(
+        # Show searching indicator
+        yield "🔍 Searching fashion knowledge..."
+
+        # Retrieve documents
+        retrieved_docs, confidence = retrieve_knowledge_langchain(
             message.strip(),
             vectorstore,
-            llm_client
+            top_k=CONFIG["top_k"]
         )
 
-        return answer
+        if not retrieved_docs:
+            yield "I couldn't find relevant information to answer your question."
+            return
+
+        # Show generating indicator
+        yield f"💭 Generating answer ({len(retrieved_docs)} sources found)..."
+
+        # Generate answer with multiple attempts
+        llm_answer = None
+        for attempt in range(1, 5):
+            logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/4")
+            llm_answer = generate_llm_answer(message.strip(), retrieved_docs, llm_client, attempt)
+
+            if llm_answer:
+                break
+
+        # Fallback if needed
+        if not llm_answer:
+            logger.error(f" ✗ All LLM attempts failed - using fallback")
+            llm_answer = synthesize_direct_answer(message.strip(), retrieved_docs)
+
+        # Stream the answer word by word for natural flow
+        import time
+        words = llm_answer.split()
+        displayed_text = ""
+
+        for i, word in enumerate(words):
+            displayed_text += word + " "
+
+            # Yield every 3 words for smooth streaming
+            if i % 3 == 0 or i == len(words) - 1:
+                yield displayed_text.strip()
+                time.sleep(0.05)  # Small delay for natural flow
 
     except Exception as e:
         logger.error(f"Error in chatbot: {e}")
-        return f"Sorry, I encountered an error: {str(e)}"
+        yield f"Sorry, I encountered an error: {str(e)}"
 
 # ============================================================================
 # INITIALIZE AND LAUNCH
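The launch code sits outside this diff, but a generator chatbot like fashion_chatbot above is typically wired into Gradio as in the sketch below (the title string is illustrative, not from app.py); gr.ChatInterface re-renders the pending bot message on each yield, which is what produces the streaming effect.

```python
# Sketch of the wiring assumed by the streaming generator above (not shown in this diff).
import gradio as gr

demo = gr.ChatInterface(
    fn=fashion_chatbot,       # generator function: each yield replaces the displayed reply
    title="Fashion Advisor",  # illustrative title, not taken from app.py
)
demo.queue().launch()         # queueing lets intermediate yields stream to the browser
```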
 