Spaces:

mgbam
/

Medapp

Sleeping

App Files Files Community

mgbam committed on Jan 27, 2025

Commit

8225d31

verified ·

1 Parent(s): 325b480

Update app.py

Browse files

Files changed (1) hide show

app.py +353 -0

app.py CHANGED Viewed

	@@ -0,0 +1,353 @@

+import os
+import re
+import json
+import math
+import requests
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import streamlit as st
+import pandas as pd
+# NLP
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
+# Hugging Face Transformers
+from transformers import pipeline
+# Optional: OpenAI and Google Generative AI
+import openai
+import google.generativeai as genai
+###############################################################################
+#                              CONFIG & ENV                                   #
+###############################################################################
+"""
+In your Hugging Face Space:
+1. Add environment secrets:
+   - OPENAI_API_KEY       (if using OpenAI)
+   - GEMINI_API_KEY       (if using Google PaLM/Gemini)
+   - MY_PUBMED_EMAIL      (to identify yourself to NCBI)
+2. In requirements.txt, install:
+   - streamlit
+   - requests
+   - nltk
+   - transformers
+   - torch
+   - openai (if using OpenAI)
+   - google-generativeai (if using Gemini)
+   - pandas
+"""
+# Credentials come from the environment; an unset secret resolves to "" and
+# the matching SDK is simply left unconfigured.
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
+GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
+# Contact address sent to NCBI E-utilities with every PubMed request.
+MY_PUBMED_EMAIL = os.environ.get("MY_PUBMED_EMAIL", "my_email@example.com")
+# Hand each key to its SDK only when one was supplied.
+if OPENAI_API_KEY:
+    openai.api_key = OPENAI_API_KEY
+if GEMINI_API_KEY:
+    genai.configure(api_key=GEMINI_API_KEY)
+###############################################################################
+#                           SUMMARIZATION PIPELINE                            #
+###############################################################################
+@st.cache_resource
+def load_summarizer():
+    """
+    Build the Hugging Face summarization pipeline used by this app.
+    The checkpoint is 'facebook/bart-large-cnn' (a balanced, popular choice);
+    the same name is used for both the model and its tokenizer.
+    For more concise summaries, 'google/pegasus-xsum' is an alternative.
+    Decorated with st.cache_resource (Streamlit's resource cache).
+    """
+    checkpoint = "facebook/bart-large-cnn"
+    return pipeline("summarization", model=checkpoint, tokenizer=checkpoint)
+# Module-level pipeline instance used by chunk_and_summarize().
+summarizer = load_summarizer()
+###############################################################################
+#                      PUBMED RETRIEVAL (NCBI E-utilities)                    #
+###############################################################################
+def search_pubmed(query, max_results=3, timeout=10):
+    """
+    Searches PubMed (NCBI ESearch) for PMIDs matching the query.
+    Includes recommended 'tool' and 'email' in the request.
+
+    Args:
+        query: Search text sent as the ESearch 'term' parameter.
+        max_results: Maximum number of PMIDs to request ('retmax').
+        timeout: Seconds to wait for NCBI before giving up. Without a timeout
+            a stalled connection would block the app indefinitely.
+
+    Returns:
+        The list of PMIDs from the response (empty if none are present).
+
+    Raises:
+        requests.RequestException: on network failure, timeout, or an HTTP
+            error status.
+    """
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+    params = {
+        "db": "pubmed",
+        "term": query,
+        "retmax": max_results,
+        "retmode": "json",
+        "tool": "ElysiumRAG",
+        "email": MY_PUBMED_EMAIL
+    }
+    resp = requests.get(base_url, params=params, timeout=timeout)
+    resp.raise_for_status()
+    data = resp.json()
+    # Missing keys fall back to an empty list instead of raising.
+    id_list = data.get("esearchresult", {}).get("idlist", [])
+    return id_list
+def fetch_one_abstract(pmid, timeout=10):
+    """
+    Fetches a single abstract for a given PMID using EFetch.
+
+    Args:
+        pmid: PubMed identifier sent as the EFetch 'id' parameter.
+        timeout: Seconds to wait for NCBI before giving up. Without a timeout
+            a stalled connection would block its worker thread indefinitely.
+
+    Returns:
+        (pmid, text), where text is the stripped plain-text response, or
+        "No abstract text found." when the response body is empty.
+
+    Raises:
+        requests.RequestException: on network failure, timeout, or an HTTP
+            error status.
+    """
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    params = {
+        "db": "pubmed",
+        "retmode": "text",
+        "rettype": "abstract",
+        "id": pmid,
+        "tool": "ElysiumRAG",
+        "email": MY_PUBMED_EMAIL
+    }
+    resp = requests.get(base_url, params=params, timeout=timeout)
+    resp.raise_for_status()
+    raw_text = resp.text.strip()
+    # If there's no clear text returned, mark as empty
+    if not raw_text:
+        return (pmid, "No abstract text found.")
+    return (pmid, raw_text)
+def fetch_pubmed_abstracts(pmids):
+    """
+    Parallel fetching of multiple PMIDs to reduce overall latency.
+
+    Args:
+        pmids: Iterable of PubMed identifiers.
+
+    Returns:
+        {pmid: abstract_text}. A PMID whose fetch raises is mapped to an
+        "Error fetching abstract: ..." message rather than aborting the
+        whole batch. An empty input yields an empty dict.
+    """
+    pmids = list(pmids)
+    # ThreadPoolExecutor rejects max_workers=0, so handle "nothing to fetch"
+    # before creating the pool.
+    if not pmids:
+        return {}
+    abstracts_map = {}
+    # At most 5 worker threads, and never more than there are PMIDs.
+    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
+        future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
+        for future in as_completed(future_to_pmid):
+            pmid = future_to_pmid[future]
+            try:
+                pmid_result, text = future.result()
+                abstracts_map[pmid_result] = text
+            except Exception as e:
+                # Deliberate best-effort: record the failure for this PMID
+                # and keep collecting the others.
+                abstracts_map[pmid] = f"Error fetching abstract: {str(e)}"
+    return abstracts_map
+###############################################################################
+#                  ABSTRACT CHUNKING + SUMMARIZATION LOGIC                    #
+###############################################################################
+def chunk_and_summarize(abstract_text, chunk_size=512):
+    """
+    Splits a large abstract into manageable chunks (by sentences),
+    then summarizes each chunk with the Hugging Face pipeline.
+
+    Args:
+        abstract_text: Text to summarize.
+        chunk_size: Approximate upper bound on chunk length, counted in
+            whitespace-separated words (not model tokens). A single sentence
+            longer than this still becomes its own chunk.
+
+    Returns:
+        The per-chunk summaries joined with spaces and stripped; "" when the
+        text contains no sentences.
+    """
+    # We first split by sentences
+    sentences = sent_tokenize(abstract_text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+    for sent in sentences:
+        words_in_sent = len(sent.split())
+        # If adding this sentence exceeds the chunk_size limit, finalize the chunk.
+        # The `current_chunk` guard stops an oversized first sentence from
+        # emitting an empty chunk that would be summarized for nothing.
+        if current_chunk and current_length + words_in_sent > chunk_size:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = []
+            current_length = 0
+        current_chunk.append(sent)
+        current_length += words_in_sent
+    # Final chunk if it exists
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+    # Summarize each chunk to avoid hitting token or length constraints
+    summarized_pieces = []
+    for c in chunks:
+        summary_out = summarizer(
+            c,
+            max_length=100,   # tweak for desired summary length
+            min_length=30,
+            do_sample=False,
+            # Word counts underestimate token counts, so a chunk can still be
+            # longer than the model accepts; ask the pipeline to truncate it
+            # instead of failing.
+            truncation=True
+        )
+        summarized_pieces.append(summary_out[0]['summary_text'])
+    # Combine partial summaries into one final text
+    final_summary = " ".join(summarized_pieces)
+    return final_summary.strip()
+###############################################################################
+#                      LLM CALLS (OpenAI / Gemini)                            #
+###############################################################################
+def openai_chat(system_prompt, user_message, model="gpt-3.5-turbo", temperature=0.3):
+    """
+    Basic chat completion with a system + user role for OpenAI.
+
+    Works with both SDK generations: the client interface of openai>=1.0
+    (where `openai.ChatCompletion` was removed) and the legacy module-level
+    `ChatCompletion` call of older releases.
+
+    Returns:
+        The stripped reply text, or a string starting with "Error" when the
+        API key is missing or the call fails (exceptions are not propagated).
+    """
+    if not OPENAI_API_KEY:
+        return "Error: OpenAI API key not provided."
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_message}
+    ]
+    try:
+        if hasattr(openai, "OpenAI"):
+            # openai>=1.0 exposes a client class; the legacy call raises there.
+            client = openai.OpenAI(api_key=OPENAI_API_KEY)
+            response = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=temperature
+            )
+            content = response.choices[0].message.content
+        else:
+            response = openai.ChatCompletion.create(
+                model=model,
+                messages=messages,
+                temperature=temperature
+            )
+            content = response.choices[0].message["content"]
+        # The content field can be None; treat that as an empty answer.
+        return (content or "").strip()
+    except Exception as e:
+        return f"Error calling OpenAI: {str(e)}"
+def gemini_chat(system_prompt, user_message, model_name="models/chat-bison-001", temperature=0.3):
+    """
+    Basic Gemini chat call using google.generativeai.
+
+    The system prompt is supplied as the model's `system_instruction` and the
+    temperature through `generation_config`; chat history entries cannot carry
+    a "system" role and `send_message` has no `temperature` keyword.
+
+    Returns:
+        The reply text, or a string starting with "Error" when the API key is
+        missing or the call fails (exceptions are not propagated).
+    """
+    # NOTE(review): the default model_name looks like a PaLM-era chat model id;
+    # callers probably need to pass a Gemini model name -- confirm which models
+    # the configured API key can access.
+    if not GEMINI_API_KEY:
+        return "Error: Gemini API key not provided."
+    try:
+        model = genai.GenerativeModel(
+            model_name=model_name,
+            system_instruction=system_prompt
+        )
+        chat_session = model.start_chat()
+        reply = chat_session.send_message(
+            user_message,
+            generation_config={"temperature": temperature}
+        )
+        return reply.text
+    except Exception as e:
+        return f"Error calling Gemini: {str(e)}"
+###############################################################################
+#                         BUILD REFERENCES FOR ANSWER                         #
+###############################################################################
+def build_system_prompt_with_refs(pmids, summarized_map):
+    """
+    Assemble the system prompt handed to the LLM.
+    The prompt is an instruction header, then one labeled entry per PMID
+    ([Ref1], [Ref2], ... following the order of `pmids`) carrying that
+    article's summary from `summarized_map`, then a closing instruction.
+    Raises KeyError if a PMID has no entry in `summarized_map`.
+    """
+    header = (
+        "You have access to the following summarized PubMed articles. "
+        "When relevant, cite them in your final answer using their reference label.\n\n"
+    )
+    footer = "Use this contextual info to provide a concise, evidence-based answer."
+    # One "[RefN] (PMID x): summary" paragraph per article, numbered from 1.
+    ref_entries = [
+        f"[Ref{position}] (PMID {article_id}): {summarized_map[article_id]}\n\n"
+        for position, article_id in enumerate(pmids, start=1)
+    ]
+    return header + "".join(ref_entries) + footer
+###############################################################################
+#                                STREAMLIT APP                                #
+###############################################################################
+def main():
+    """
+    Streamlit entry point: collects the query and parameters, then on button
+    press runs PubMed search -> parallel abstract fetch -> per-abstract
+    summarization -> LLM answer with labeled references, rendering each stage.
+    """
+    st.set_page_config(page_title="Enhanced RAG + PubMed", layout="wide")
+    st.title("Enhanced RAG + PubMed: Production-Ready Medical Insights")
+    st.markdown("""
+    **Welcome** to an advanced demonstration of **Retrieval-Augmented Generation (RAG)**
+    using PubMed E-utilities, Hugging Face Summarization, and optional LLM calls (OpenAI or Gemini).
+    This version includes:
+    - **Parallel** fetching for multiple PMIDs
+    - Advanced **chunking & summarization** of large abstracts
+    - **Reference labeling** in the final answer
+    - Clear disclaimers & best-practice structures
+    ---
+    **Disclaimer**: This is a demonstration prototype for educational or research purposes.
+    It is *not* a substitute for professional medical advice. Always consult a qualified
+    healthcare provider for personal health decisions.
+    """)
+    user_query = st.text_area(
+        "Enter your medical question or topic:",
+        placeholder="e.g., 'What are the latest treatments for type 2 diabetes complications?'",
+        height=120
+    )
+    # Sidebar or columns for parameters
+    col1, col2 = st.columns(2)
+    with col1:
+        max_papers = st.slider(
+            "Number of PubMed Articles to Retrieve",
+            min_value=1,
+            max_value=10,
+            value=3,
+            help="Number of articles to fetch & summarize."
+        )
+    with col2:
+        selected_llm = st.selectbox(
+            "Select LLM for Final Generation",
+            ["OpenAI: GPT-3.5", "Gemini: PaLM2"],
+            help="Choose which large language model to finalize the answer."
+        )
+    # Additional advanced parameter: chunk size
+    chunk_size = st.slider(
+        "Summarization Chunk Size (words)",
+        min_value=256,
+        max_value=1024,
+        value=512,
+        help="Larger chunks might produce fewer summaries, but risk token limits. Smaller chunks produce more robust summaries."
+    )
+    if st.button("Run Enhanced RAG Pipeline"):
+        if not user_query.strip():
+            st.warning("Please enter a query before running RAG.")
+            return
+        # 1. PubMed Search
+        with st.spinner("Searching PubMed..."):
+            try:
+                pmids = search_pubmed(query=user_query, max_results=max_papers)
+            except (requests.RequestException, ValueError) as e:
+                # A network/HTTP failure or an unparsable response should be
+                # reported in the page, not surface as a raw traceback.
+                st.error(f"PubMed search failed: {e}")
+                return
+        if not pmids:
+            st.error("No matching PubMed results. Try a different query.")
+            return
+        # 2. Fetch abstracts in parallel
+        with st.spinner("Fetching and summarizing abstracts..."):
+            abstracts_map = fetch_pubmed_abstracts(pmids)
+            summarized_map = {}
+            for pmid, abstract_text in abstracts_map.items():
+                # Failed fetches are recorded with this exact prefix; match the
+                # prefix (not a substring) so an abstract that merely contains
+                # the phrase is still summarized.
+                if abstract_text.startswith("Error fetching abstract:"):
+                    summarized_map[pmid] = abstract_text
+                else:
+                    summarized_map[pmid] = chunk_and_summarize(abstract_text, chunk_size=chunk_size)
+        # 3. Display Summaries
+        st.subheader("Retrieved & Summarized PubMed Articles")
+        for idx, pmid in enumerate(pmids, start=1):
+            ref_label = f"[Ref{idx}]"
+            st.markdown(f"**{ref_label} PMID {pmid}**")
+            st.write(summarized_map[pmid])
+            st.write("---")
+        # 4. Build System Prompt
+        st.subheader("Final Answer")
+        system_prompt = build_system_prompt_with_refs(pmids, summarized_map)
+        with st.spinner("Generating final answer..."):
+            if selected_llm == "OpenAI: GPT-3.5":
+                answer = openai_chat(system_prompt=system_prompt, user_message=user_query)
+            else:
+                answer = gemini_chat(system_prompt=system_prompt, user_message=user_query)
+        st.write(answer)
+        st.success("RAG Pipeline Complete.")
+    # Production Considerations & Next Steps
+    st.markdown("---")
+    st.markdown("""
+    ### Production-Ready Enhancements:
+    1. **Vector Databases & Advanced Retrieval**
+       - For large-scale usage, index PubMed articles in a vector DB (e.g. Pinecone, Weaviate) to quickly retrieve relevant passages.
+    2. **Citation Parsing**
+       - Automatically detect which abstract chunks contributed to each sentence.
+    3. **Multi-Lingual**
+       - Integrate translation pipelines for non-English queries or abstracts.
+    4. **Rate Limiting**
+       - Respect NCBI's ~3 requests/sec guideline if you're scaling out.
+    5. **Robust Logging & Error Handling**
+       - Build out logs, handle exceptions gracefully, and provide fallback prompts if an LLM fails or an abstract is missing.
+    6. **Privacy & Security**
+       - This demo only fetches public info. For patient data, ensure HIPAA/GDPR compliance and encrypted data pipelines.
+    """)
+if __name__ == "__main__":
+    main()