prithivMLmods committed
Commit 89734ae · verified · 1 Parent(s): 9676f43

update app

Files changed (1): app.py (+53 −50)
app.py CHANGED
@@ -9,6 +9,11 @@ from transformers import (
 )
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+import json
+import ast
+import re
+from PIL import Image
+from spaces import GPU
 
 colors.steel_blue = colors.Color(
     name="steel_blue",
@@ -78,15 +83,17 @@ class SteelBlueTheme(Soft):
 
 steel_blue_theme = SteelBlueTheme()
 
-import json
-import ast
-import re
-from PIL import Image
-from spaces import GPU
-
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = "auto"
 
+CATEGORIES = ["Query", "Caption", "Point", "Detect"]
+PLACEHOLDERS = {
+    "Query": "What's in this image?",
+    "Caption": "Enter caption length: short, normal, or long",
+    "Point": "Enter the object for keypoint detection (e.g., 'the person's face')",
+    "Detect": "Enter the object to detect (e.g., 'the person')",
+}
+
 qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen3-VL-4B-Instruct",
     dtype=DTYPE,
@@ -250,57 +257,53 @@ def process_inputs(image, category, prompt):
 
     return qwen_annotated_image, qwen_text
 
-CATEGORIES = ["Query", "Caption", "Point", "Detect"]
-PLACEHOLDERS = {
-    "Query": "What's in this image?",
-    "Caption": "Enter caption length: short, normal, or long",
-    "Point": "Enter the object for keypoint detection (e.g., 'the person's face')",
-    "Detect": "Enter the object to detect (e.g., 'the person')",
-}
-
 css="""
 #col-container {
     margin: 0 auto;
    max-width: 960px;
 }
-#main-title h1 {font-size: 2.1em !important;}
+#main-title {
+    text-align: center;
+    max-width: 100%;
+}
 """
 
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Qwen-3VL: Multimodal Understanding**", elem_id="main-title")
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="Upload Image")
-            category_select = gr.Radio(
-                choices=CATEGORIES,
-                value=CATEGORIES[0],
-                label="Select Task Category",
-                interactive=True,
-            )
-            prompt_input = gr.Textbox(
-                placeholder=PLACEHOLDERS[CATEGORIES[0]],
-                label="Prompt",
-                lines=2,
-            )
-            submit_btn = gr.Button("Process Image", variant="primary")
-
-        with gr.Column(scale=2):
-            qwen_img_output = gr.Image(label="Output Image")
-            qwen_text_output = gr.Textbox(
-                label="Text Output", lines=10, interactive=False
-            )
-
-    gr.Examples(
-        examples=[
-            ["examples/4.jpg", "Detect", "Headlight"],
-            ["examples/3.jpg", "Point", "Gun"],
-            ["examples/1.jpg", "Query", "Count the total number of boats and describe the environment."],
-            ["examples/2.jpg", "Caption", "Caption the image."],
-
-        ],
-        inputs=[image_input, category_select, prompt_input],
-    )
+    with gr.Column(elem_id="col-container"):
+        gr.Markdown("# **Qwen-3VL: Multimodal Understanding**", elem_id="main-title")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                image_input = gr.Image(type="pil", label="Upload Image")
+                category_select = gr.Radio(
+                    choices=CATEGORIES,
+                    value=CATEGORIES[0],
+                    label="Select Task Category",
+                    interactive=True,
+                )
+                prompt_input = gr.Textbox(
+                    placeholder=PLACEHOLDERS[CATEGORIES[0]],
+                    label="Prompt",
+                    lines=2,
+                )
+                submit_btn = gr.Button("Process Image", variant="primary")
+
+            with gr.Column(scale=2):
+                qwen_img_output = gr.Image(label="Output Image")
+                qwen_text_output = gr.Textbox(
+                    label="Text Output", lines=10, interactive=False
+                )
+
+        gr.Examples(
+            examples=[
+                ["examples/4.jpg", "Detect", "Headlight"],
+                ["examples/3.jpg", "Point", "Gun"],
+                ["examples/1.jpg", "Query", "Count the total number of boats and describe the environment."],
+                ["examples/2.jpg", "Caption", "normal"],  # <-- FIX: Changed prompt to a valid length
+
+            ],
+            inputs=[image_input, category_select, prompt_input],
+        )
 
     category_select.change(
         fn=on_category_change,
@@ -315,4 +318,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch(mcp_server=True, ssr_mode=False, show_error=True)
+    demo.launch(show_error=True)
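
Note: the unchanged context lines wire `category_select.change(fn=on_category_change, ...)`, but `on_category_change` itself is untouched by this commit and so does not appear in the diff. A minimal sketch of what such a handler could look like, assuming it only swaps the `prompt_input` placeholder to match the selected task (`PLACEHOLDERS` is copied from the diff above; the function body is illustrative, not the Space's actual code):

import gradio as gr

# PLACEHOLDERS as defined in the committed app.py (see hunk @@ -78,15 +83,17 @@).
PLACEHOLDERS = {
    "Query": "What's in this image?",
    "Caption": "Enter caption length: short, normal, or long",
    "Point": "Enter the object for keypoint detection (e.g., 'the person's face')",
    "Detect": "Enter the object to detect (e.g., 'the person')",
}

def on_category_change(category):
    # Illustrative body: return a component update so the prompt textbox's
    # placeholder tracks the radio selection; gr.update() is the standard
    # Gradio way to modify a property of an existing component from a callback.
    return gr.update(placeholder=PLACEHOLDERS.get(category, ""))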