kazuhina committed on
Commit
23573b0
·
1 Parent(s): a0f3eb8

Add JoyCaption - Advanced Image Captioning with LLaVA

Browse files
Files changed (3) hide show
  1. README.md +6 -6
  2. joycaption_app.py +269 -0
  3. requirements.txt +15 -0
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: Joycaption
3
- emoji: 🔥
4
- colorFrom: indigo
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
- app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: JoyCaption
3
+ emoji: 🎨
4
+ colorFrom: purple
5
+ colorTo: slate
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
+ app_file: joycaption_app.py
9
  pinned: false
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
joycaption_app.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ JoyCaption - Advanced Image Captioning with LLaVA
4
+ Uses fancyfeast/llama-joycaption-alpha-two-hf-llava model for high-quality image descriptions
5
+ Free, open, and uncensored model for training Diffusion models
6
+ """
7
+
8
+ import gradio as gr
9
+ import torch
10
+ import spaces
11
+ from transformers import AutoProcessor, LlavaForConditionalGeneration
12
+ from PIL import Image
13
+ import tempfile
14
+ import os
15
+ from pathlib import Path
16
+
17
# Initialize the JoyCaption model.
#
# `processor` and `llava_model` are module-level globals that
# generate_image_caption() reads. They are bound to None up front (and reset
# to None on failure) so that a failed load can be detected explicitly instead
# of surfacing later as a confusing NameError.
print("Loading JoyCaption model...")
processor = None
llava_model = None
try:
    # Model configuration for optimal performance
    model_name = "fancyfeast/llama-joycaption-alpha-two-hf-llava"

    # Load processor and model with correct configuration
    processor = AutoProcessor.from_pretrained(model_name)

    # Load model with bfloat16 (native dtype of Llama 3.1).
    # NOTE(review): transformers needs the `accelerate` package whenever
    # `device_map` is passed -- confirm it is installed in the deployment.
    llava_model = LlavaForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype="bfloat16",
        device_map="auto" if torch.cuda.is_available() else None,
    )
    llava_model.eval()

    print("JoyCaption model loaded successfully!")

except Exception as e:
    print(f"Error loading model: {e}")

    # A partially completed load (e.g. processor OK, model failed) must not
    # look usable to the caption function.
    processor = None
    llava_model = None

    # Create a fallback function for when model loading fails
    def process_image_with_caption(*args, **kwargs):
        """Return the standard "model not loaded" message for any input."""
        return "Error: Model not loaded. Please check the model availability."
41
+
42
@spaces.GPU
def generate_image_caption(image_file, prompt_type="formal_detailed", custom_prompt=""):
    """
    Generate an image caption with the JoyCaption model.

    Args:
        image_file: Path string of the image, or an object exposing the path
            through a ``name`` attribute (e.g. an uploaded file object).
        prompt_type: Caption style key: "formal_detailed", "creative",
            "simple", "technical" or "custom". Unknown keys fall back to
            the "formal_detailed" prompt.
        custom_prompt: Prompt text used when ``prompt_type`` is "custom".
            A generic prompt is used when it is empty or whitespace-only.

    Returns:
        str: The generated caption, or a human-readable error message.
            Exceptions raised while captioning are caught and reported as
            text, because the return value is shown in the output textbox.
    """
    try:
        if not image_file:
            return "Please upload an image file."

        # Handle different types of image inputs
        if hasattr(image_file, 'name'):
            # File-like object carrying its path in `.name`
            image_path = image_file.name
        elif isinstance(image_file, str):
            # File path string
            image_path = image_file
        else:
            return "Invalid image file format."

        # Check if file exists
        if not os.path.exists(image_path):
            return "Image file not found."

        # The module-level loader can fail; look the globals up defensively so
        # a missing model yields a clear message rather than a NameError.
        model = globals().get("llava_model")
        caption_processor = globals().get("processor")
        if model is None or caption_processor is None:
            return "Error: Model not loaded. Please check the model availability."

        print(f"Processing image: {image_path}")

        # Load and preprocess image. The context manager closes the file
        # handle; convert() returns an independent in-memory copy.
        try:
            with Image.open(image_path) as raw_image:
                image = raw_image.convert('RGB')
        except Exception as e:
            return f"Error loading image: {str(e)}"

        # Define prompt templates based on type
        prompt_templates = {
            "formal_detailed": "Write a long descriptive caption for this image in a formal tone.",
            "creative": "Write a creative and artistic caption for this image, capturing its essence and mood.",
            "simple": "Write a simple, concise caption describing what you see in this image.",
            "technical": "Provide a detailed technical description of this image including composition, lighting, and visual elements.",
            # Treat None / whitespace-only input as "no custom prompt given".
            "custom": (custom_prompt or "").strip() or "Write a descriptive caption for this image.",
        }

        # Select appropriate prompt
        prompt = prompt_templates.get(prompt_type, prompt_templates["formal_detailed"])

        # Build conversation following JoyCaption's recommended format
        convo = [
            {
                "role": "system",
                "content": "You are a helpful image captioner.",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ]

        # Format the conversation using JoyCaption's specific method
        # WARNING: HF's handling of chats on Llava models is very fragile
        convo_string = caption_processor.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=True,
        )
        # Explicit check instead of `assert`, which is stripped under -O.
        if not isinstance(convo_string, str):
            raise TypeError("apply_chat_template did not return a string")

        # Move the inputs to wherever the model actually lives rather than
        # guessing from torch.cuda.is_available().
        inputs = caption_processor(
            text=[convo_string],
            images=[image],
            return_tensors="pt",
        ).to(model.device)

        # Ensure pixel_values match the model's dtype (bfloat16 as loaded)
        if 'pixel_values' in inputs:
            inputs['pixel_values'] = inputs['pixel_values'].to(model.dtype)

        # Generate captions with JoyCaption's recommended parameters
        with torch.no_grad():
            generate_ids = model.generate(
                **inputs,
                max_new_tokens=300,
                do_sample=True,
                suppress_tokens=None,
                use_cache=True,
                temperature=0.6,
                top_k=None,
                top_p=0.9,
                repetition_penalty=1.1,
            )[0]

        # Trim off the prompt tokens so only the newly generated text remains
        generate_ids = generate_ids[inputs['input_ids'].shape[1]:]

        # Decode the caption
        caption = caption_processor.tokenizer.decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        ).strip()

        print(f"Caption generated successfully: {caption[:100]}...")
        return caption

    except Exception as e:
        error_msg = f"Error during caption generation: {str(e)}"
        print(error_msg)
        return error_msg
157
+
158
def create_demo_image():
    """Draw a simple tiled test pattern and save it as a PNG file.

    Returns:
        The file name "demo_image.png" after the image has been written to
        the current working directory, or None if anything fails (the error
        is printed).
    """
    try:
        from PIL import Image, ImageDraw

        # White 512x512 canvas
        side_x, side_y = 512, 512
        canvas = Image.new('RGB', (side_x, side_y), color='white')
        pen = ImageDraw.Draw(canvas)

        # One filled rectangle per cell of a 50px grid; the fill colour is
        # derived from the cell's top-left coordinates.
        for left in range(0, side_x, 50):
            for top in range(0, side_y, 50):
                pen.rectangle(
                    [left, top, left + 25, top + 25],
                    fill=(left % 255, top % 255, (left + top) % 255),
                )

        out_name = "demo_image.png"
        canvas.save(out_name)
    except Exception as e:
        print(f"Error creating demo image: {e}")
        return None
    return out_name
183
+
184
# Create Gradio interface.
#
# The three inputs map positionally onto generate_image_caption's parameters
# (image_file, prompt_type, custom_prompt).
demo = gr.Interface(
    fn=generate_image_caption,
    inputs=[
        gr.Image(
            label="Upload Image for Captioning",
            type="filepath",
            format="png",
        ),
        gr.Dropdown(
            choices=["formal_detailed", "creative", "simple", "technical", "custom"],
            value="formal_detailed",
            label="Caption Style",
            info="Choose the style of caption generation",
        ),
        gr.Textbox(
            label="Custom Prompt (Optional)",
            placeholder="Enter custom prompt for specialized captioning...",
            info="Only used when Caption Style is set to 'custom'",
            lines=3,
            # Nothing in this Interface toggles visibility, so a hidden box
            # would make the "custom" style unusable; keep it visible.
            visible=True,
        ),
    ],
    outputs=[
        gr.Textbox(
            label="Generated Caption",
            lines=8,
            placeholder="The generated caption will appear here...",
        ),
    ],
    title="🎨 JoyCaption - Advanced Image Captioning",
    # The description is Markdown; lines are kept at column 0 so they are not
    # rendered as an indented code block.
    description="""
This application uses the **JoyCaption** model to generate high-quality, detailed captions for images.

**Key Features:**
- 🆓 **Free & Open**: No restrictions, open weights, training scripts included
- 🔓 **Uncensored**: Equal coverage of SFW and NSFW concepts
- 🌈 **Diversity**: Supports digital art, photoreal, anime, furry, and all styles
- 🎯 **High Performance**: Near GPT4o-level captioning quality
- 🔧 **Minimal Filtering**: Trained on diverse images for broad understanding

**Supported image formats:** PNG, JPG, JPEG, WEBP

**Caption Styles:**
- **Formal Detailed**: Long descriptive captions in formal tone
- **Creative**: Artistic and expressive descriptions
- **Simple**: Concise, straightforward descriptions
- **Technical**: Detailed technical analysis of composition and elements
- **Custom**: User-defined prompts for specialized captioning

**Model**: fancyfeast/llama-joycaption-alpha-two-hf-llava
**Architecture**: LLaVA with Llama 3.1 base
""",
    # No `examples`: each example row must supply real input values (an image
    # file path first). The previous placeholder sentences would have been
    # interpreted as image paths that do not exist.
    theme=gr.themes.Soft(
        primary_hue="purple",
        secondary_hue="slate",
        neutral_hue="slate",
    ),
    css="""
    .gradio-container {max-width: 900px !important; margin: auto !important;}
    .title {text-align: center; color: #7c3aed;}
    .description {text-align: center; font-size: 1.1em;}
    """,
    flagging_mode="never",
    submit_btn="🎨 Generate Caption",
    stop_btn="⏹️ Stop",
)
255
+
256
if __name__ == "__main__":
    # Startup banner (emoji restored from mis-decoded bytes).
    print("🚀 Starting JoyCaption App...")
    print("📱 Interface will be available at: http://localhost:7860")
    print("🎨 Using JoyCaption model by fancyfeast")
    print("🔓 Free, Open, and Uncensored Image Captioning")

    # Launch the interface. Binding to 0.0.0.0 makes the server reachable
    # from outside the container/host, not just via localhost.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True,
    )
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # JoyCaption - Required Dependencies for Spaces
2
+ # Core ML/AI libraries (not included in Spaces base)
3
+ transformers>=4.40.0
4
+ torch>=2.0.0
5
+ torchvision>=0.15.0
6
+
7
+ # Image processing
8
+ Pillow>=10.0.0
9
+
10
+ # Gradio and UI
11
+ gradio>=5.0.0
12
+ spaces>=0.19.0
13
+
14
+ # Optional: Hugging Face Hub enhancements
15
+ huggingface_hub>=0.15.0