prithivMLmods committed
Commit 89734ae · verified · 1 Parent(s): 9676f43

update app

Files changed (1): app.py (+53 −50)
app.py CHANGED
@@ -9,6 +9,11 @@ from transformers import (
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+import json
+import ast
+import re
+from PIL import Image
+from spaces import GPU
 
 colors.steel_blue = colors.Color(
     name="steel_blue",
@@ -78,15 +83,17 @@ class SteelBlueTheme(Soft):
 
 steel_blue_theme = SteelBlueTheme()
 
-import json
-import ast
-import re
-from PIL import Image
-from spaces import GPU
-
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = "auto"
 
+CATEGORIES = ["Query", "Caption", "Point", "Detect"]
+PLACEHOLDERS = {
+    "Query": "What's in this image?",
+    "Caption": "Enter caption length: short, normal, or long",
+    "Point": "Enter the object for keypoint detection (e.g., 'the person's face')",
+    "Detect": "Enter the object to detect (e.g., 'the person')",
+}
+
 qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen3-VL-4B-Instruct",
     dtype=DTYPE,
@@ -250,57 +257,53 @@ def process_inputs(image, category, prompt):
 
     return qwen_annotated_image, qwen_text
 
-CATEGORIES = ["Query", "Caption", "Point", "Detect"]
-PLACEHOLDERS = {
-    "Query": "What's in this image?",
-    "Caption": "Enter caption length: short, normal, or long",
-    "Point": "Enter the object for keypoint detection (e.g., 'the person's face')",
-    "Detect": "Enter the object to detect (e.g., 'the person')",
-}
-
 css="""
 #col-container {
     margin: 0 auto;
    max-width: 960px;
 }
-#main-title h1 {font-size: 2.1em !important;}
+#main-title {
+    text-align: center;
+    max-width: 100%;
+}
 """
 
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Qwen-3VL: Multimodal Understanding**", elem_id="main-title")
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="Upload Image")
-            category_select = gr.Radio(
-                choices=CATEGORIES,
-                value=CATEGORIES[0],
-                label="Select Task Category",
-                interactive=True,
-            )
-            prompt_input = gr.Textbox(
-                placeholder=PLACEHOLDERS[CATEGORIES[0]],
-                label="Prompt",
-                lines=2,
-            )
-            submit_btn = gr.Button("Process Image", variant="primary")
-
-        with gr.Column(scale=2):
-            qwen_img_output = gr.Image(label="Output Image")
-            qwen_text_output = gr.Textbox(
-                label="Text Output", lines=10, interactive=False
-            )
-
-    gr.Examples(
-        examples=[
-            ["examples/4.jpg", "Detect", "Headlight"],
-            ["examples/3.jpg", "Point", "Gun"],
-            ["examples/1.jpg", "Query", "Count the total number of boats and describe the environment."],
-            ["examples/2.jpg", "Caption", "Caption the image."],
-
-        ],
-        inputs=[image_input, category_select, prompt_input],
-    )
+    with gr.Column(elem_id="col-container"):
+        gr.Markdown("# **Qwen-3VL: Multimodal Understanding**", elem_id="main-title")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                image_input = gr.Image(type="pil", label="Upload Image")
+                category_select = gr.Radio(
+                    choices=CATEGORIES,
+                    value=CATEGORIES[0],
+                    label="Select Task Category",
+                    interactive=True,
+                )
+                prompt_input = gr.Textbox(
+                    placeholder=PLACEHOLDERS[CATEGORIES[0]],
+                    label="Prompt",
+                    lines=2,
+                )
+                submit_btn = gr.Button("Process Image", variant="primary")
+
+            with gr.Column(scale=2):
+                qwen_img_output = gr.Image(label="Output Image")
+                qwen_text_output = gr.Textbox(
+                    label="Text Output", lines=10, interactive=False
+                )
+
+        gr.Examples(
+            examples=[
+                ["examples/4.jpg", "Detect", "Headlight"],
+                ["examples/3.jpg", "Point", "Gun"],
+                ["examples/1.jpg", "Query", "Count the total number of boats and describe the environment."],
+                ["examples/2.jpg", "Caption", "normal"],  # <-- FIX: Changed prompt to a valid length
+
+            ],
+            inputs=[image_input, category_select, prompt_input],
+        )
 
     category_select.change(
         fn=on_category_change,
@@ -315,4 +318,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch(mcp_server=True, ssr_mode=False, show_error=True)
+    demo.launch(show_error=True)
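
Note: the unchanged context lines wire `category_select.change(fn=on_category_change, ...)`, but `on_category_change` itself is untouched by this commit and so does not appear in the diff. A minimal sketch of what such a handler could look like, assuming it only swaps the `prompt_input` placeholder to match the selected task (`PLACEHOLDERS` is copied from the diff above; the function body is illustrative, not the Space's actual code):

import gradio as gr

# PLACEHOLDERS as defined in the committed app.py (see hunk @@ -78,15 +83,17 @@).
PLACEHOLDERS = {
    "Query": "What's in this image?",
    "Caption": "Enter caption length: short, normal, or long",
    "Point": "Enter the object for keypoint detection (e.g., 'the person's face')",
    "Detect": "Enter the object to detect (e.g., 'the person')",
}

def on_category_change(category):
    # Illustrative body: return a component update so the prompt textbox's
    # placeholder tracks the radio selection; gr.update() is the standard
    # Gradio way to modify a property of an existing component from a callback.
    return gr.update(placeholder=PLACEHOLDERS.get(category, ""))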