Csplk committed
Commit 9414970 · verified · 1 Parent(s): 0aec354

Update app.py: replace the Moondream batch-prompt demo with an Interior Design Assistant pipeline (Moondream3 image analysis plus DeepSeek-V2 report and mood-board generation)

Files changed (1)
  1. app.py +231 -83
app.py CHANGED
@@ -1,100 +1,248 @@
- import spaces
  import torch
- import re
  import gradio as gr
- from threading import Thread
- from transformers import AutoModelForCausalLM

- moondream = AutoModelForCausalLM.from_pretrained(
-     "moondream/moondream3-preview",
      trust_remote_code=True,
-     dtype=torch.bfloat16,
-     device_map={"": "cuda"},
  )
- moondream.compile()
-
- def answer_questions(image_tuples, prompt_text):
-     # Encode image once
-     image = [img[0] for img in image_tuples if img[0] is not none]
-     encoded = moondream.encode_image(image)
-     questions = prompt_text
-
-     for q in questions:
-         result1 = moondream.query(image=encoded, question=q, reasoning=False)
-         print(f"Q: {q}")
-         print(f"A: {result1['answer']}\n")
-
-     # Also works with other skills
-     caption = moondream.caption(encoded, length="normal")
-     objects = moondream.detect(encoded, "poop")
-     pointe = moondream.point(encoded, "grass")
-     print(f"caption: {caption}, objects:{objects}, point:{pointe}")
-
-     # Segment an object
-     result2 = moondream.segment(image, "cat")
-     svg_path = result2["path"]
-     bbox = result2["bbox"]
-
-     print(f"SVG Path: {svg_path[:100]}...")
-     print(f"Bounding box: {bbox}")
-
-     # With spatial hint (point) to guide segmentation
-     result3 = model.segment(image, "cat", spatial_refs=[[0.5, 0.3]])
-     print(result1)
-     # With spatial hint (bounding box)
-     result3 = model.segment(image, "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])
-     print(result3)
-
-     result = ""
-     Q_and_A = ""
-     prompts = [p.strip() for p in prompt_text.split('?')]
      image_embeds = [img[0] for img in image_tuples if img[0] is not None]
-     answers = []

      for prompt in prompts:
-         answers.append(moondream.query(
              images=[img.convert("RGB") for img in image_embeds],
              prompts=[prompt] * len(image_embeds),
-         ))

-     for i, prompt in enumerate(prompts):
-         Q_and_A += f"### Q: {prompt}\n"
-         for j, image_tuple in enumerate(image_tuples):
-             image_name = f"image{j+1}"
-             answer_text = answers[i][j]
-             Q_and_A += f"**{image_name} A:** \n {answer_text} \n"

-     result = {'headers': prompts, 'data': answers}
-     print("result\n{}\n\nQ_and_A\n{}\n\n".format(result, Q_and_A))
-     return Q_and_A, result

  """
- Load Moondream model and tokenizer.
- moondream = AutoModelForCausalLM.from_pretrained(
-     "vikhyatk/moondream2",
-     revision="2025-01-09",
-     trust_remote_code=True,
-     device_map={"": "cuda"},
- )
- tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
  """

- with gr.Blocks() as demo:
-     gr.Markdown("# moondream2 unofficial batch processing demo")
-     gr.Markdown("1. Select images\n2. Enter one or more prompts separated by commas. Ex: Describe this image, What is in this image?\n\n")
-     gr.Markdown("**Currently each image will be sent as a batch with the prompts thus asking each prompt on each image**")
-     gr.Markdown("A tiny vision language model. [moondream2](https://huggingface.co/vikhyatk/moondream2)")
-     with gr.Row():
-         img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
-     with gr.Row():
-         prompt = gr.Textbox(label="Input Prompts", placeholder="Enter prompts (one prompt for each image provided) separated by question marks. Ex: Describe this image? What is in this image?", lines=8)
-     with gr.Row():
-         submit = gr.Button("Submit")
-     with gr.Row():
-         output = gr.Markdown(label="Questions and Answers", line_breaks=True)
-     with gr.Row():
-         output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
-     submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])
-
- demo.queue().launch()
+ import os
  import torch
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from PIL import Image
+ import requests
+ import json
+ import base64
+ from io import BytesIO

+ # Check for CUDA availability for PyTorch
+ if torch.cuda.is_available():
+     device, dtype = "cuda", torch.bfloat16
+ else:
+     device, dtype = "cpu", torch.float32
+
+ # Load Moondream3 Preview for image analysis
+ moondream3_model_id = "moondream/moondream3-preview"
+ tokenizer_moondream3 = AutoTokenizer.from_pretrained(moondream3_model_id)
+ moondream3 = AutoModelForCausalLM.from_pretrained(
+     moondream3_model_id,
      trust_remote_code=True,
+     torch_dtype=dtype,
+     device_map={"": device}
+ ).eval()
+ moondream3.compile()  # Optional: speeds up inference
+
+ # Initialize DeepSeek-V2 for chat completion
+ deepseek_model_name = "deepseek-ai/DeepSeek-V2"
+ tokenizer_deepseek = AutoTokenizer.from_pretrained(deepseek_model_name)
+ deepseek_model = AutoModelForCausalLM.from_pretrained(
+     deepseek_model_name,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
+ )
+ chat_pipe = pipeline(
+     "text-generation",
+     model=deepseek_model,
+     tokenizer=tokenizer_deepseek,
+     max_new_tokens=512,
+     temperature=0.7,
+     top_p=0.9,
+     repetition_penalty=1.1,
+     do_sample=True,
  )
+
+ def deepseek_chat(user_message, is_json=False):
+     """Chat completion using DeepSeek-V2. is_json is currently unused."""
+     prompt = f"<|BeginOfUtterance|>User: {user_message}<|EndOfUtterance|><|BeginOfUtterance|>Assistant:"
+     response = chat_pipe(prompt, return_full_text=False)[0]["generated_text"]
+     assistant_response = response.split("<|BeginOfUtterance|>Assistant:")[-1].strip()
+     return assistant_response
+
+ # Extract features from images using Moondream3
+ def extract_features(image_tuples):
+     headers = ["Image", "Layout", "Decor", "Atmosphere", "Lighting", "Color scheme", "Furniture style"]
+     data = []
+
      image_embeds = [img[0] for img in image_tuples if img[0] is not None]
+     prompts = [
+         "Describe the spatial arrangement of furniture, walls, and other elements in this image.",
+         "What type, style, and arrangement of decorative elements are present in this image?",
+         "What mood, ambiance, and overall feeling does this image evoke?",
+         "What type, intensity, placement, and direction of lighting is present in this image?",
+         "What are the dominant colors, color palette, and color harmony in this image?",
+         "What type, shape, material, and arrangement of furniture is present in this image?"
+     ]

+     answers = []
      for prompt in prompts:
+         image_answers = moondream3.batch_answer(
              images=[img.convert("RGB") for img in image_embeds],
              prompts=[prompt] * len(image_embeds),
+             tokenizer=tokenizer_moondream3,
+         )
+         answers.append(image_answers)
+
+     for i in range(len(image_tuples)):
+         image_name = f"image{i+1}"
+         image_answers = [answer[i] for answer in answers]
+         print(f"image{i+1}_answers \n {image_answers} \n")
+         data.append([image_name] + image_answers)

+     result = {'headers': headers, 'data': data}
+     return result

+ # Describe room from image using Moondream3
+ def describe_room(image):
+     headers = ["Image", "Layout", "Decor", "Atmosphere", "Lighting", "Color scheme", "Furniture style"]
+     data = []

+     image_embeds = [image.convert("RGB")] * 6
+     prompts = [
+         "Describe the spatial arrangement of furniture, walls, and other elements in this image.",
+         "What type, style, and arrangement of decorative elements are present in this image?",
+         "What mood, ambiance, and overall feeling does this image evoke?",
+         "What type, intensity, placement, and direction of lighting is present in this image?",
+         "What are the dominant colors, color palette, and color harmony in this image?",
+         "What type, shape, material, and arrangement of furniture is present in this image?"
+     ]
+
+     answers = moondream3.batch_answer(
+         images=image_embeds,
+         prompts=prompts,
+         tokenizer=tokenizer_moondream3,
+     )
+
+     image_name = "ClientRoom"
+     print(f"ClientRoom_answers \n {answers} \n")
+     data.append([image_name] + answers)
+
+     result = {'headers': headers, 'data': data}
+     return result
+
+ def merge_features(inspiration_features):
+     preference_map_extraction = f"""
+     You are one of the world's most knowledgeable minds in the field of both theoretical and applied interior design.
+     - You are detailed
+     - You are meticulous
+     - You can distil a large, potentially unstructured, potentially multimodal range of input data sources into a highly accurate, all-encompassing representation of the interior design concept preferences of the input source, by mapping the input data with a model of fundamental interior design component definitions
+     - You can come up with professionally structured, fully detailed, well-thought-out, and all-encompassing applied interior design proposals, from initial conceptualization and planning to a complete and finished interior design of a real-world space
+     - Generally, you can help answer any question or assist in any task asked of you relating to anything in the realm of applied and theoretical design and interior design
+
+     Your task is to analyze the interior design style information given after <<<>>> and merge the analysis results together to generate a comprehensive design style preference map for the user who uploaded the images. Return the result as JSON.
+
+     <<<
+     {inspiration_features}
+     >>>
      """
+     print(f"\npreference_map_extraction prompt\n{preference_map_extraction}\n")
+     prefmap = deepseek_chat(preference_map_extraction, is_json=True)
+     print(f"\nmerge_features chat_response\n{prefmap}\n")
+     return prefmap
+
+ def create_design_concept_report(room_description, inspiration_features):
+     design_report_prompt = f"""
+     Generate a detailed interior design plan proposal report, structured as markdown.
+     - The report should include three design plan concepts for the client's space, based on the photo of the client's room that is the target of the project and the design preference map generated from the inspirational design images they uploaded
+     - The report should have an introduction; sections on Style Preference, Color Scheme, Furniture Style, Lighting, Atmosphere, Decor, and Layout for each concept; and a placeholder for a mood board image at the start of each concept section
+     - Finally, the report should have a summary to conclude the design plan.
+
+     Very detailed information about the client's room, based on the photo they uploaded:
+     {room_description}
+
+     Design preference map generated from the inspirational design images they uploaded:
+     {inspiration_features}
      """
+     print(f"\ndesign_report_prompt\n{design_report_prompt}\n")
+     designreport = deepseek_chat(design_report_prompt)
+     print(f"\ndesign concept chat_response\n{designreport}\n")
+     return designreport
+
+ def queryllm(payload):
+     # Assumes a module-level textgen_API_URL and headers (undefined in this revision; see the sketch below the diff).
+     response = requests.post(textgen_API_URL, headers=headers, json=payload)
+     print(response)
+     return response.json()
+
+ def generate_mood_board_image(prompt):
+     # Assumes a module-level texttoimage_API_URL and headers (undefined in this revision; see the sketch below the diff).
+     payload = {"inputs": prompt}
+     response = requests.post(texttoimage_API_URL, headers=headers, json=payload)
+     return response.content
+
+ def getmoodboardprompts(designreport):
+     mood_board_descriptions_prompt = f"""
+     ### interior design report plan
+     {designreport}
+     ###
+
+     Generate a text prompt for each of the interior design concepts described in the interior design report plan that can be sent to a text-to-image model to produce a design project mood board.
+     Each prompt should clearly describe what should go onto the mood board for its design concept, and the output should be structured as JSON. For example:
+     {{
+         "Concept1": "Create a mood board for a modern cozy retreat bedroom with a warm and inviting atmosphere. Include a white and brown color palette, modern and contemporary furniture with clean lines, a cozy and functional bed, nightstands with elegant designs, a bench at the foot of the bed with storage, sheer curtains on the window, floor lamps and table lamps with layered lighting effects, potted plants, a vase with branches and twigs, a bowl, a clock, and books on the nightstands.",
+         "Concept2": "Create a mood board for another concept..."
+     }}
+     Only output the JSON, nothing else, no explanations or commentary.
+     """
+     print(f"\nmood_board_descriptions_prompt:\n{mood_board_descriptions_prompt}\n")
+     mood_board_descriptions = deepseek_chat(mood_board_descriptions_prompt)
+     print(f"\nmood_board_descriptions_prompt chat_response\n{mood_board_descriptions}\n")
+     return json.loads(mood_board_descriptions)
+
+ def generate_moodboards(mb_prompts):
+     moodboard_images = {}
+     for concept, prompt in mb_prompts.items():
+         image_data = generate_mood_board_image(prompt)
+         file_path = f"moodboard_{concept}.jpg"
+         with open(file_path, "wb") as f:
+             f.write(image_data)
+         moodboard_images[concept] = file_path
+     return moodboard_images
+
+ def add_moodboards_to_report(moodboard_images, report):
+     add_moodboards_prompt = f"""
+     mood board images
+     <<<
+     {moodboard_images}
+     >>>
+
+     report
+     <<<
+     {report}
+     >>>
+
+     Insert the path of each mood board image into its respective placeholder in the report, and respond only with the revised report with the mood board images inserted, no explanations or commentary.
+     """
+     print(f"\nadd_moodboards_prompt\n{add_moodboards_prompt}\n")
+     revised_report = deepseek_chat(add_moodboards_prompt)
+     print(f"\nrevised_report\n{revised_report}\n")
+     return revised_report

+ # Gradio Interface
+ def process_images(design_images, room_image):
+     design_descriptions = extract_features(design_images)
+     room_description = describe_room(room_image)
+
+     preference_map = merge_features(design_descriptions)
+     print(f"\npreference_map\n{preference_map}\n")
+
+     design_report = create_design_concept_report(room_description, preference_map)
+     print(f"\ndesign_report\n{design_report}\n")
+
+     mb_prompts = getmoodboardprompts(design_report)
+     print(f"\nmb_prompts\n{mb_prompts}\n")
+
+     moodboard_images = generate_moodboards(mb_prompts)
+     print(f"\nmoodboard_images\n{moodboard_images}\n")
+
+     revised_report = add_moodboards_to_report(moodboard_images, design_report)
+     print("revised_report")
+     print(revised_report)
+     print("preference map")
+     print(preference_map)
+     return revised_report, preference_map
+
+ gallery = gr.components.Gallery(label="Upload Images of Preferred Design Styles", type="pil")
+ image_input = gr.components.Image(label="Upload Image of Your Room", type="pil")
+ report_output = gr.components.Markdown(label="Design Concept Report with Mood Boards")
+ json_output = gr.components.JSON(label="Design Preference Map")
+
+ interface = gr.Interface(
+     fn=process_images,
+     inputs=[gallery, image_input],
+     outputs=[report_output, json_output],
+     title="Interior Design Assistant",
+     description="Upload images of your preferred interior design styles and a photo of your room to receive a custom design concept report and preference map."
+ )

+ interface.launch()
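
Note on a gap in this revision: `queryllm` and `generate_mood_board_image` post to `textgen_API_URL` and `texttoimage_API_URL` with a shared `headers` dict, but none of those globals are defined anywhere in the committed file, so both helpers raise `NameError` as written. A minimal sketch of the missing configuration, assuming the Hugging Face Inference API and a token stored in the `HF_TOKEN` environment variable; the endpoint model IDs below are placeholders, not values taken from this commit:

```python
import os

# Hypothetical endpoints -- substitute real model IDs. Any HF Inference API
# text-generation / text-to-image model matches the request shape used above
# ({"inputs": ...} in, JSON or raw image bytes out).
textgen_API_URL = "https://api-inference.huggingface.co/models/<text-generation-model>"
texttoimage_API_URL = "https://api-inference.huggingface.co/models/<text-to-image-model>"

# Bearer-token auth, read from the environment as HF Spaces commonly do.
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
```

Note that `extract_features` and `describe_room` each shadow `headers` with a local list of table headers; that shadowing is harmless because the request helpers read the module-level dict, but giving the dict a distinct name such as `auth_headers` would make the code easier to follow if this gap is filled in a follow-up commit.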
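A second fragile spot: `getmoodboardprompts` feeds the raw chat completion straight into `json.loads`, and chat models frequently wrap JSON in Markdown fences or add a preamble despite the "only output the JSON" instruction. A small, hypothetical `parse_llm_json` helper (not part of this commit) that tolerates fenced or prose-wrapped output:

```python
import json
import re

def parse_llm_json(text: str) -> dict:
    """Parse JSON from an LLM reply, tolerating ```json fences and stray prose."""
    # Prefer the contents of a fenced code block if one is present.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1)
    # Fall back to the outermost brace-delimited span.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    return json.loads(text)
```

Swapping `return json.loads(mood_board_descriptions)` for `return parse_llm_json(mood_board_descriptions)` would keep the rest of the pipeline unchanged while surviving the most common formatting deviations.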