Csplk committed
Commit 9414970 · verified · 1 Parent(s): 0aec354

Update app.py: replace the Moondream batch-prompt demo with an Interior Design Assistant pipeline (Moondream3 image analysis plus DeepSeek-V2 report and mood-board generation)

Files changed (1)
  1. app.py +231 -83
app.py CHANGED
@@ -1,100 +1,248 @@
- import spaces
  import torch
- import re
  import gradio as gr
- from threading import Thread
- from transformers import AutoModelForCausalLM

- moondream = AutoModelForCausalLM.from_pretrained(
-     "moondream/moondream3-preview",
      trust_remote_code=True,
-     dtype=torch.bfloat16,
-     device_map={"": "cuda"},
  )
- moondream.compile()
-
- def answer_questions(image_tuples, prompt_text):
-     # Encode image once
-     image = [img[0] for img in image_tuples if img[0] is not none]
-     encoded = moondream.encode_image(image)
-     questions = prompt_text
-
-     for q in questions:
-         result1 = moondream.query(image=encoded, question=q, reasoning=False)
-         print(f"Q: {q}")
-         print(f"A: {result1['answer']}\n")
-
-     # Also works with other skills
-     caption = moondream.caption(encoded, length="normal")
-     objects = moondream.detect(encoded, "poop")
-     pointe = moondream.point(encoded, "grass")
-     print(f"caption: {caption}, objects:{objects}, point:{pointe}")
-
-     # Segment an object
-     result2 = moondream.segment(image, "cat")
-     svg_path = result2["path"]
-     bbox = result2["bbox"]
-
-     print(f"SVG Path: {svg_path[:100]}...")
-     print(f"Bounding box: {bbox}")
-
-     # With spatial hint (point) to guide segmentation
-     result3 = model.segment(image, "cat", spatial_refs=[[0.5, 0.3]])
-     print(result1)
-     # With spatial hint (bounding box)
-     result3 = model.segment(image, "cat", spatial_refs=[[0.2, 0.1, 0.8, 0.9]])
-     print(result3)
-
-     result = ""
-     Q_and_A = ""
-     prompts = [p.strip() for p in prompt_text.split('?')]
      image_embeds = [img[0] for img in image_tuples if img[0] is not None]
-     answers = []

      for prompt in prompts:
-         answers.append(moondream.query(
              images=[img.convert("RGB") for img in image_embeds],
              prompts=[prompt] * len(image_embeds),
-         ))

-     for i, prompt in enumerate(prompts):
-         Q_and_A += f"### Q: {prompt}\n"
-         for j, image_tuple in enumerate(image_tuples):
-             image_name = f"image{j+1}"
-             answer_text = answers[i][j]
-             Q_and_A += f"**{image_name} A:** \n {answer_text} \n"

-     result = {'headers': prompts, 'data': answers}
-     print("result\n{}\n\nQ_and_A\n{}\n\n".format(result, Q_and_A))
-     return Q_and_A, result

  """
- Load Moondream model and tokenizer.
- moondream = AutoModelForCausalLM.from_pretrained(
-     "vikhyatk/moondream2",
-     revision="2025-01-09",
-     trust_remote_code=True,
-     device_map={"": "cuda"},
- )
- tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
  """

- with gr.Blocks() as demo:
-     gr.Markdown("# moondream2 unofficial batch processing demo")
-     gr.Markdown("1. Select images\n2. Enter one or more prompts separated by commas. Ex: Describe this image, What is in this image?\n\n")
-     gr.Markdown("**Currently each image will be sent as a batch with the prompts thus asking each prompt on each image**")
-     gr.Markdown("A tiny vision language model. [moondream2](https://huggingface.co/vikhyatk/moondream2)")
-     with gr.Row():
-         img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
-     with gr.Row():
-         prompt = gr.Textbox(label="Input Prompts", placeholder="Enter prompts (one prompt for each image provided) separated by question marks. Ex: Describe this image? What is in this image?", lines=8)
-     with gr.Row():
-         submit = gr.Button("Submit")
-     with gr.Row():
-         output = gr.Markdown(label="Questions and Answers", line_breaks=True)
-     with gr.Row():
-         output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
-     submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])
-
- demo.queue().launch()
+ import os
  import torch
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from PIL import Image
+ import requests
+ import json
+ import base64
+ from io import BytesIO

+ # Check for CUDA availability for PyTorch
+ if torch.cuda.is_available():
+     device, dtype = "cuda", torch.bfloat16
+ else:
+     device, dtype = "cpu", torch.float32
+
+ # Load Moondream3 Preview for image analysis
+ moondream3_model_id = "moondream/moondream3-preview"
+ tokenizer_moondream3 = AutoTokenizer.from_pretrained(moondream3_model_id)
+ moondream3 = AutoModelForCausalLM.from_pretrained(
+     moondream3_model_id,
      trust_remote_code=True,
+     torch_dtype=dtype,
+     device_map={"": device}
+ ).eval()
+ moondream3.compile()  # Optional: speeds up inference
+
+ # Initialize DeepSeek-V2 for chat completion
+ deepseek_model_name = "deepseek-ai/DeepSeek-V2"
+ tokenizer_deepseek = AutoTokenizer.from_pretrained(deepseek_model_name)
+ deepseek_model = AutoModelForCausalLM.from_pretrained(
+     deepseek_model_name,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
+ )
+ chat_pipe = pipeline(
+     "text-generation",
+     model=deepseek_model,
+     tokenizer=tokenizer_deepseek,
+     max_new_tokens=512,
+     temperature=0.7,
+     top_p=0.9,
+     repetition_penalty=1.1,
+     do_sample=True,
  )
+
+ def deepseek_chat(user_message, is_json=False):
+     """Chat completion using DeepSeek-V2. is_json is currently unused."""
+     prompt = f"<|BeginOfUtterance|>User: {user_message}<|EndOfUtterance|><|BeginOfUtterance|>Assistant:"
+     response = chat_pipe(prompt, return_full_text=False)[0]["generated_text"]
+     assistant_response = response.split("<|BeginOfUtterance|>Assistant:")[-1].strip()
+     return assistant_response
+
+ # Extract features from images using Moondream3
+ def extract_features(image_tuples):
+     headers = ["Image", "Layout", "Decor", "Atmosphere", "Lighting", "Color scheme", "Furniture style"]
+     data = []
+
      image_embeds = [img[0] for img in image_tuples if img[0] is not None]
+     prompts = [
+         "Describe the spatial arrangement of furniture, walls, and other elements in this image.",
+         "What type, style, and arrangement of decorative elements are present in this image?",
+         "What mood, ambiance, and overall feeling does this image evoke?",
+         "What type, intensity, placement, and direction of lighting is present in this image?",
+         "What are the dominant colors, color palette, and color harmony in this image?",
+         "What type, shape, material, and arrangement of furniture is present in this image?"
+     ]

+     answers = []
      for prompt in prompts:
+         image_answers = moondream3.batch_answer(
              images=[img.convert("RGB") for img in image_embeds],
              prompts=[prompt] * len(image_embeds),
+             tokenizer=tokenizer_moondream3,
+         )
+         answers.append(image_answers)
+
+     for i in range(len(image_tuples)):
+         image_name = f"image{i+1}"
+         image_answers = [answer[i] for answer in answers]
+         print(f"image{i+1}_answers \n {image_answers} \n")
+         data.append([image_name] + image_answers)

+     result = {'headers': headers, 'data': data}
+     return result

+ # Describe room from image using Moondream3
+ def describe_room(image):
+     headers = ["Image", "Layout", "Decor", "Atmosphere", "Lighting", "Color scheme", "Furniture style"]
+     data = []

+     image_embeds = [image.convert("RGB")] * 6
+     prompts = [
+         "Describe the spatial arrangement of furniture, walls, and other elements in this image.",
+         "What type, style, and arrangement of decorative elements are present in this image?",
+         "What mood, ambiance, and overall feeling does this image evoke?",
+         "What type, intensity, placement, and direction of lighting is present in this image?",
+         "What are the dominant colors, color palette, and color harmony in this image?",
+         "What type, shape, material, and arrangement of furniture is present in this image?"
+     ]
+
+     answers = moondream3.batch_answer(
+         images=image_embeds,
+         prompts=prompts,
+         tokenizer=tokenizer_moondream3,
+     )
+
+     image_name = "ClientRoom"
+     print(f"ClientRoom_answers \n {answers} \n")
+     data.append([image_name] + answers)
+
+     result = {'headers': headers, 'data': data}
+     return result
+
+ def merge_features(inspiration_features):
+     preference_map_extraction = f"""
+     You are one of the world's most knowledgeable minds in the field of both theoretical and applied interior design.
+     - You are detailed
+     - You are meticulous
+     - You can distil a large, potentially unstructured, potentially multimodal range of input data sources into a highly accurate, all-encompassing representation of the interior design concept preferences of the input source, by mapping the input data with a model of fundamental interior design component definitions
+     - You can come up with professionally structured, fully detailed, well-thought-out, and all-encompassing applied interior design proposals, from initial conceptualization and planning to a complete and finished interior design of a real-world space
+     - Generally, you can help answer any question or assist in any task asked of you relating to anything in the realm of applied and theoretical design and interior design
+
+     Your task is to analyze the interior design style information given after <<<>>> and merge the analysis results together to generate a comprehensive design style preference map for the user who uploaded the images. Return the result as JSON.
+
+     <<<
+     {inspiration_features}
+     >>>
      """
+     print(f"\npreference_map_extraction prompt\n{preference_map_extraction}\n")
+     prefmap = deepseek_chat(preference_map_extraction, is_json=True)
+     print(f"\nmerge_features chat_response\n{prefmap}\n")
+     return prefmap
+
+ def create_design_concept_report(room_description, inspiration_features):
+     design_report_prompt = f"""
+     Generate a detailed interior design plan proposal report, structured as markdown.
+     - The report should include three design plan concepts for the client's space, based on the photo of the client's room that is the target of the project and the design preference map generated from the inspirational design images they uploaded
+     - The report should have an introduction; sections on Style Preference, Color Scheme, Furniture Style, Lighting, Atmosphere, Decor, and Layout for each concept; and a placeholder for a mood board image at the start of each concept section
+     - Finally, the report should have a summary to conclude the design plan.
+
+     Very detailed information about the client's room, based on the photo they uploaded:
+     {room_description}
+
+     Design preference map generated from the inspirational design images they uploaded:
+     {inspiration_features}
      """
+     print(f"\ndesign_report_prompt\n{design_report_prompt}\n")
+     designreport = deepseek_chat(design_report_prompt)
+     print(f"\ndesign concept chat_response\n{designreport}\n")
+     return designreport
+
+ def queryllm(payload):
+     # Assumes a module-level textgen_API_URL and headers (undefined in this revision; see the sketch below the diff).
+     response = requests.post(textgen_API_URL, headers=headers, json=payload)
+     print(response)
+     return response.json()
+
+ def generate_mood_board_image(prompt):
+     # Assumes a module-level texttoimage_API_URL and headers (undefined in this revision; see the sketch below the diff).
+     payload = {"inputs": prompt}
+     response = requests.post(texttoimage_API_URL, headers=headers, json=payload)
+     return response.content
+
+ def getmoodboardprompts(designreport):
+     mood_board_descriptions_prompt = f"""
+     ### interior design report plan
+     {designreport}
+     ###
+
+     Generate a text prompt for each of the interior design concepts described in the interior design report plan that can be sent to a text-to-image model to produce a design project mood board.
+     Each prompt should clearly describe what should go onto the mood board for its design concept, and the output should be structured as JSON. For example:
+     {{
+         "Concept1": "Create a mood board for a modern cozy retreat bedroom with a warm and inviting atmosphere. Include a white and brown color palette, modern and contemporary furniture with clean lines, a cozy and functional bed, nightstands with elegant designs, a bench at the foot of the bed with storage, sheer curtains on the window, floor lamps and table lamps with layered lighting effects, potted plants, a vase with branches and twigs, a bowl, a clock, and books on the nightstands.",
+         "Concept2": "Create a mood board for another concept..."
+     }}
+     Only output the JSON, nothing else, no explanations or commentary.
+     """
+     print(f"\nmood_board_descriptions_prompt:\n{mood_board_descriptions_prompt}\n")
+     mood_board_descriptions = deepseek_chat(mood_board_descriptions_prompt)
+     print(f"\nmood_board_descriptions_prompt chat_response\n{mood_board_descriptions}\n")
+     return json.loads(mood_board_descriptions)
+
+ def generate_moodboards(mb_prompts):
+     moodboard_images = {}
+     for concept, prompt in mb_prompts.items():
+         image_data = generate_mood_board_image(prompt)
+         file_path = f"moodboard_{concept}.jpg"
+         with open(file_path, "wb") as f:
+             f.write(image_data)
+         moodboard_images[concept] = file_path
+     return moodboard_images
+
+ def add_moodboards_to_report(moodboard_images, report):
+     add_moodboards_prompt = f"""
+     mood board images
+     <<<
+     {moodboard_images}
+     >>>
+
+     report
+     <<<
+     {report}
+     >>>
+
+     Insert the path of each mood board image into its respective placeholder in the report, and respond only with the revised report with the mood board images inserted, no explanations or commentary.
+     """
+     print(f"\nadd_moodboards_prompt\n{add_moodboards_prompt}\n")
+     revised_report = deepseek_chat(add_moodboards_prompt)
+     print(f"\nrevised_report\n{revised_report}\n")
+     return revised_report

+ # Gradio Interface
+ def process_images(design_images, room_image):
+     design_descriptions = extract_features(design_images)
+     room_description = describe_room(room_image)
+
+     preference_map = merge_features(design_descriptions)
+     print(f"\npreference_map\n{preference_map}\n")
+
+     design_report = create_design_concept_report(room_description, preference_map)
+     print(f"\ndesign_report\n{design_report}\n")
+
+     mb_prompts = getmoodboardprompts(design_report)
+     print(f"\nmb_prompts\n{mb_prompts}\n")
+
+     moodboard_images = generate_moodboards(mb_prompts)
+     print(f"\nmoodboard_images\n{moodboard_images}\n")
+
+     revised_report = add_moodboards_to_report(moodboard_images, design_report)
+     print("revised_report")
+     print(revised_report)
+     print("preference map")
+     print(preference_map)
+     return revised_report, preference_map
+
+ gallery = gr.components.Gallery(label="Upload Images of Preferred Design Styles", type="pil")
+ image_input = gr.components.Image(label="Upload Image of Your Room", type="pil")
+ report_output = gr.components.Markdown(label="Design Concept Report with Mood Boards")
+ json_output = gr.components.JSON(label="Design Preference Map")
+
+ interface = gr.Interface(
+     fn=process_images,
+     inputs=[gallery, image_input],
+     outputs=[report_output, json_output],
+     title="Interior Design Assistant",
+     description="Upload images of your preferred interior design styles and a photo of your room to receive a custom design concept report and preference map."
+ )

+ interface.launch()
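
Note on a gap in this revision: `queryllm` and `generate_mood_board_image` post to `textgen_API_URL` and `texttoimage_API_URL` with a shared `headers` dict, but none of those globals are defined anywhere in the committed file, so both helpers raise `NameError` as written. A minimal sketch of the missing configuration, assuming the Hugging Face Inference API and a token stored in the `HF_TOKEN` environment variable; the endpoint model IDs below are placeholders, not values taken from this commit:

```python
import os

# Hypothetical endpoints -- substitute real model IDs. Any HF Inference API
# text-generation / text-to-image model matches the request shape used above
# ({"inputs": ...} in, JSON or raw image bytes out).
textgen_API_URL = "https://api-inference.huggingface.co/models/<text-generation-model>"
texttoimage_API_URL = "https://api-inference.huggingface.co/models/<text-to-image-model>"

# Bearer-token auth, read from the environment as HF Spaces commonly do.
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
```

Note that `extract_features` and `describe_room` each shadow `headers` with a local list of table headers; that shadowing is harmless because the request helpers read the module-level dict, but giving the dict a distinct name such as `auth_headers` would make the code easier to follow if this gap is filled in a follow-up commit.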
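A second fragile spot: `getmoodboardprompts` feeds the raw chat completion straight into `json.loads`, and chat models frequently wrap JSON in Markdown fences or add a preamble despite the "only output the JSON" instruction. A small, hypothetical `parse_llm_json` helper (not part of this commit) that tolerates fenced or prose-wrapped output:

```python
import json
import re

def parse_llm_json(text: str) -> dict:
    """Parse JSON from an LLM reply, tolerating ```json fences and stray prose."""
    # Prefer the contents of a fenced code block if one is present.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1)
    # Fall back to the outermost brace-delimited span.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    return json.loads(text)
```

Swapping `return json.loads(mood_board_descriptions)` for `return parse_llm_json(mood_board_descriptions)` would keep the rest of the pipeline unchanged while surviving the most common formatting deviations.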