Spaces: Running on Zero
update app
app.py CHANGED
@@ -9,6 +9,11 @@ from transformers import (
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+import json
+import ast
+import re
+from PIL import Image
+from spaces import GPU
 
 colors.steel_blue = colors.Color(
     name="steel_blue",
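
The added "from spaces import GPU" import is the ZeroGPU hook: on a Space "Running on Zero", GPU-bound functions are wrapped with the spaces.GPU decorator so a device is attached only for the duration of each call. A minimal sketch of how the inference entry point is typically wrapped (process_inputs appears later in this diff; the exact placement and any duration hint are assumptions, not shown in the commit):

from spaces import GPU

@GPU  # ZeroGPU allocates a GPU for this call, then releases it; @GPU(duration=120) also works
def process_inputs(image, category, prompt):
    # GPU-backed Qwen3-VL inference happens here (assumed placement)
    ...
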
@@ -78,15 +83,17 @@ class SteelBlueTheme(Soft):
 
 steel_blue_theme = SteelBlueTheme()
 
-import json
-import ast
-import re
-from PIL import Image
-from spaces import GPU
-
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = "auto"
 
+CATEGORIES = ["Query", "Caption", "Point", "Detect"]
+PLACEHOLDERS = {
+    "Query": "What's in this image?",
+    "Caption": "Enter caption length: short, normal, or long",
+    "Point": "Enter the object for keypoint detection (e.g., 'the person's face')",
+    "Detect": "Enter the object to detect (e.g., 'the person')",
+}
+
 qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen3-VL-4B-Instruct",
     dtype=DTYPE,
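
The hunk shows the model loaded with dtype=DTYPE, where DTYPE = "auto" lets transformers pick the checkpoint's native precision. The matching processor load falls outside the hunk; a minimal sketch of the usual pairing, assuming the standard AutoProcessor API:

import torch
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-4B-Instruct",
    dtype="auto",  # use the precision stored in the checkpoint
).to(DEVICE)
qwen_processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-4B-Instruct")
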
@@ -250,57 +257,53 @@ def process_inputs(image, category, prompt):
 
     return qwen_annotated_image, qwen_text
 
-CATEGORIES = ["Query", "Caption", "Point", "Detect"]
-PLACEHOLDERS = {
-    "Query": "What's in this image?",
-    "Caption": "Enter caption length: short, normal, or long",
-    "Point": "Enter the object for keypoint detection (e.g., 'the person's face')",
-    "Detect": "Enter the object to detect (e.g., 'the person')",
-}
-
 css="""
 #col-container {
     margin: 0 auto;
     max-width: 960px;
 }
-#main-title
+#main-title {
+    text-align: center;
+    max-width: 100%;
+}
 """
 
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.
-
-    with gr.
-
-        [
-
+    with gr.Column(elem_id="col-container"):
+        gr.Markdown("# **Qwen-3VL: Multimodal Understanding**", elem_id="main-title")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                image_input = gr.Image(type="pil", label="Upload Image")
+                category_select = gr.Radio(
+                    choices=CATEGORIES,
+                    value=CATEGORIES[0],
+                    label="Select Task Category",
+                    interactive=True,
+                )
+                prompt_input = gr.Textbox(
+                    placeholder=PLACEHOLDERS[CATEGORIES[0]],
+                    label="Prompt",
+                    lines=2,
+                )
+                submit_btn = gr.Button("Process Image", variant="primary")
+
+            with gr.Column(scale=2):
+                qwen_img_output = gr.Image(label="Output Image")
+                qwen_text_output = gr.Textbox(
+                    label="Text Output", lines=10, interactive=False
+                )
+
+        gr.Examples(
+            examples=[
+                ["examples/4.jpg", "Detect", "Headlight"],
+                ["examples/3.jpg", "Point", "Gun"],
+                ["examples/1.jpg", "Query", "Count the total number of boats and describe the environment."],
+                ["examples/2.jpg", "Caption", "normal"], # <-- FIX: Changed prompt to a valid length
+            ],
+            inputs=[image_input, category_select, prompt_input],
+        )
 
     category_select.change(
         fn=on_category_change,
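
The context line category_select.change(fn=on_category_change, ...) wires a callback that is not shown in this diff. Given the PLACEHOLDERS mapping moved to module scope in this commit, a plausible minimal implementation (clearing the stale textbox value is an assumption):

def on_category_change(category):
    # Swap the prompt placeholder to match the selected task category
    return gr.update(placeholder=PLACEHOLDERS[category], value="")
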
@@ -315,4 +318,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch(
+    demo.launch(show_error=True)
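
Passing show_error=True to demo.launch() surfaces exceptions raised inside event handlers as an error modal in the browser instead of failing silently, which makes the Space far easier to debug than digging through container logs.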