prithivMLmods committed
Commit d1a91c5 · verified · 1 parent: c7a140a

update app

Files changed (1): app.py (+118, -112)
app.py CHANGED
@@ -20,7 +20,7 @@ DTYPE = "auto"
 CATEGORIES = ["Query", "Caption", "Point", "Detect"]
 PLACEHOLDERS = {
     "Query": "What's in this image?",
-    "Caption": "Select caption length: short, normal, or long",
+    "Caption": "Enter caption length: short, normal, or long",
     "Point": "Select an object from suggestions or enter manually",
     "Detect": "Select an object from suggestions or enter manually",
 }
@@ -39,9 +39,7 @@ qwen_processor = Qwen3VLProcessor.from_pretrained(
 
 # --- Utility Functions ---
 def safe_parse_json(text: str):
-    """Safely parse a string that may be JSON or a Python literal."""
     text = text.strip()
-    # Remove markdown code blocks
     text = re.sub(r"^```(json)?", "", text)
     text = re.sub(r"```$", "", text)
     text = text.strip()
@@ -50,127 +48,142 @@ def safe_parse_json(text: str):
     except json.JSONDecodeError:
         pass
     try:
-        # Fallback to literal_eval for Python-like dictionary/list strings
         return ast.literal_eval(text)
     except Exception:
         return {}
 
-# --- Inference Functions ---
-def run_qwen_inference(image: Image.Image, prompt: str):
-    """Core function to run inference with the Qwen model."""
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": prompt},
-            ],
-        }
-    ]
-    inputs = qwen_processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt",
-    ).to(DEVICE)
-
-    with torch.inference_mode():
-        generated_ids = qwen_model.generate(
-            **inputs,
-            max_new_tokens=512,
-        )
-
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :]
-        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = qwen_processor.batch_decode(
-        generated_ids_trimmed,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False,
-    )[0]
-    return output_text
-
 
 @GPU
 def get_suggested_objects(image: Image.Image):
-    """Get suggested objects in the image using Qwen."""
+    """Get suggested objects in the image using Qwen"""
     if image is None:
         return []
+
     try:
-        # Resize image for faster suggestion generation
-        suggest_image = image.copy()
-        suggest_image.thumbnail((512, 512))
-
-        prompt = "List the main objects in the image in a Python list format. For example: ['cat', 'dog', 'table']"
-        result_text = run_qwen_inference(suggest_image, prompt)
-
-        # Clean up the output to find the list
-        match = re.search(r'\[.*?\]', result_text)
-        if match:
-            suggested_objects = ast.literal_eval(match.group())
-            if isinstance(suggested_objects, list):
-                # Return up to 3 suggestions
-                return suggested_objects[:3]
+        prompt = "List the objects in the image in python list format."
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+        inputs = qwen_processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(DEVICE)
+
+        with torch.inference_mode():
+            generated_ids = qwen_model.generate(
+                **inputs,
+                max_new_tokens=128,
+            )
+
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = qwen_processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+
+        suggested_objects = ast.literal_eval(output_text)
+        if isinstance(suggested_objects, list):
+            return suggested_objects[:3] if len(suggested_objects) > 3 else suggested_objects
         return []
     except Exception as e:
-        print(f"Error getting suggestions with Qwen: {e}")
+        print(f"Error getting suggestions: {e}")
         return []
 
 
 def annotate_image(image: Image.Image, result: dict):
-    """Annotates the image with points or bounding boxes based on model output."""
     if not isinstance(image, Image.Image) or not isinstance(result, dict):
         return image
 
     original_width, original_height = image.size
-    scene_np = np.array(image.copy())
 
     # Handle Point annotations
     if "points" in result and result["points"]:
-        points_list = []
-        for point in result.get("points", []):
-            x = int(point["x"] * original_width)
-            y = int(point["y"] * original_height)
-            points_list.append([x, y])
-
+        points_list = [
+            [int(p["x"] * original_width), int(p["y"] * original_height)]
+            for p in result.get("points", [])
+        ]
         if not points_list:
             return image
 
-        points_array = np.array(points_list).reshape(-1, 2)
+        points_array = np.array(points_list).reshape(1, -1, 2)
         key_points = sv.KeyPoints(xy=points_array)
         vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
-        annotated_image_np = vertex_annotator.annotate(
-            scene=scene_np, key_points=key_points
-        )
-        return Image.fromarray(annotated_image_np)
+        return vertex_annotator.annotate(scene=image.copy(), key_points=key_points)
 
     # Handle Detection annotations
     if "objects" in result and result["objects"]:
+        # Manually create detections from the Qwen output format
        boxes = []
         for obj in result["objects"]:
-            x_min = obj["x_min"] * original_width
-            y_min = obj["y_min"] * original_height
-            x_max = obj["x_max"] * original_width
-            y_max = obj["y_max"] * original_height
+            x_min = obj.get("x_min", 0.0) * original_width
+            y_min = obj.get("y_min", 0.0) * original_height
+            x_max = obj.get("x_max", 0.0) * original_width
+            y_max = obj.get("y_max", 0.0) * original_height
             boxes.append([x_min, y_min, x_max, y_max])
 
         if not boxes:
             return image
 
         detections = sv.Detections(xyxy=np.array(boxes))
-        box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=4)
-        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
-
-        annotated_image_np = box_annotator.annotate(
-            scene=scene_np, detections=detections
-        )
-        return Image.fromarray(annotated_image_np)
+
+        if len(detections) == 0:
+            return image
+
+        box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=5)
+        return box_annotator.annotate(scene=image.copy(), detections=detections)
 
     return image
 
 
+# --- Inference Functions ---
+def run_qwen_inference(image: Image.Image, prompt: str):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+    inputs = qwen_processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(DEVICE)
+
+    with torch.inference_mode():
+        generated_ids = qwen_model.generate(
+            **inputs,
+            max_new_tokens=512,
+        )
+
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :]
+        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    return qwen_processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )[0]
+
+
 @GPU
 def process_qwen(image: Image.Image, category: str, prompt: str):
     if category == "Query":
@@ -216,59 +229,56 @@ def process_qwen(image: Image.Image, category: str, prompt: str):
 
 # --- Gradio Interface Logic ---
 def on_category_and_image_change(image, category):
-    """Generate suggestions when category or image changes."""
+    """Generate suggestions when category changes"""
     text_box = gr.Textbox(value="", placeholder=PLACEHOLDERS.get(category, ""), interactive=True)
 
     if category == "Caption":
-        return gr.Radio(choices=["short", "normal", "long"], label="Caption Length", value="normal", visible=True), text_box
-
+        return gr.Radio(choices=["short", "normal", "long"], visible=True, label="Caption Length"), text_box
+
     if image is None or category not in ["Point", "Detect"]:
         return gr.Radio(choices=[], visible=False), text_box
 
     suggestions = get_suggested_objects(image)
     if suggestions:
-        return gr.Radio(choices=suggestions, label="Suggestions", visible=True, interactive=True), text_box
+        return gr.Radio(choices=suggestions, visible=True, interactive=True, label="Suggestions"), text_box
     else:
         return gr.Radio(choices=[], visible=False), text_box
 
 
 def update_prompt_from_radio(selected_object):
-    """Update prompt textbox when a radio option is selected."""
-    if selected_object:
-        return gr.Textbox(value=selected_object)
-    return gr.Textbox(value="")
+    """Update prompt textbox when a radio option is selected"""
+    return gr.Textbox(value=selected_object) if selected_object else gr.Textbox(value="")
 
 
 def process_inputs(image, category, prompt):
-    """Main function to handle the user's request."""
     if image is None:
         raise gr.Error("Please upload an image.")
-    if not prompt and category not in ["Caption"]:
-        # Caption can have an empty prompt if a length is selected
-        if category == "Caption" and not prompt:
-            prompt = "normal" # default
-        else:
-            raise gr.Error("Please provide a prompt or select a suggestion.")
+    if not prompt:
+        raise gr.Error("Please provide a prompt.")
 
-    # Resize the image to make inference quicker
-    image.thumbnail((1024, 1024))
+    image.thumbnail((512, 512))
 
-    # Process with Qwen
     qwen_text, qwen_data = process_qwen(image, category, prompt)
-    qwen_annotated_image = annotate_image(image, qwen_data)
+    qwen_annotated_image = annotate_image(image.copy(), qwen_data)
 
     return qwen_annotated_image, qwen_text
 
 
+css_hide_share = """
+button#gradio-share-link-button-0 {
+    display: none !important;
+}
+"""
+
 # --- Gradio UI Layout ---
-with gr.Blocks(theme=Ocean()) as demo:
+with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
     gr.Markdown("# 👓 Object Understanding with Qwen3-VL")
     gr.Markdown(
         "### Explore object detection, visual grounding, and keypoint detection through natural language prompts."
     )
-    gr.Markdown("""
-    *Powered by [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct). Inspired by the tutorial [Object Detection and Visual Grounding with Qwen 2.5](https://pyimagesearch.com/2025/06/09/object-detection-and-visual-grounding-with-qwen-2-5/) on PyImageSearch.*
-    """)
+    gr.Markdown(
+        "*Powered by [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).*"
+    )
 
     with gr.Row():
         with gr.Column(scale=1):
@@ -293,7 +303,6 @@ with gr.Blocks(theme=Ocean()) as demo:
             submit_btn = gr.Button("Process Image", variant="primary")
 
         with gr.Column(scale=2):
-            gr.Markdown("### Qwen/Qwen3-VL-4B-Instruct Output")
             qwen_img_output = gr.Image(label="Annotated Image")
             qwen_text_output = gr.Textbox(
                 label="Text Output", lines=10, interactive=False
@@ -302,15 +311,14 @@ with gr.Blocks(theme=Ocean()) as demo:
     gr.Examples(
         examples=[
             ["examples/example_1.jpg", "Query", "How many cars are in the image?"],
-            ["examples/example_1.jpg", "Detect", "car"],
+            ["examples/example_1.jpg", "Caption", "short"],
             ["examples/example_2.JPG", "Point", "the person's face"],
-            ["examples/example_2.JPG", "Caption", "short"],
+            ["examples/example_2.JPG", "Detect", "the person"],
         ],
         inputs=[image_input, category_select, prompt_input],
     )
 
     # --- Event Listeners ---
-    # When image or category changes, update suggestions
     category_select.change(
         fn=on_category_and_image_change,
         inputs=[image_input, category_select],
@@ -322,14 +330,12 @@ with gr.Blocks(theme=Ocean()) as demo:
         outputs=[suggestions_radio, prompt_input],
     )
 
-    # When a suggestion is clicked, update the prompt box
     suggestions_radio.change(
         fn=update_prompt_from_radio,
         inputs=[suggestions_radio],
         outputs=[prompt_input],
     )
 
-    # Main submission action
     submit_btn.click(
         fn=process_inputs,
         inputs=[image_input, category_select, prompt_input],
@@ -337,4 +343,4 @@ with gr.Blocks(theme=Ocean()) as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch(debug=True)
+    demo.launch()
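
For reference, a minimal standalone sketch (not part of the commit) that exercises the two supervision code paths the updated annotate_image relies on, using a blank canvas and hand-written normalized results in place of real Qwen3-VL output; it assumes supervision, numpy, and Pillow are installed:

import numpy as np
import supervision as sv
from PIL import Image

# Stand-in for an uploaded photo
image = Image.new("RGB", (640, 480), "white")
w, h = image.size

# "Point"-style result: normalized x/y in [0, 1], scaled to pixels and
# reshaped to (1, -1, 2) so all points form one keypoint set
points = [{"x": 0.25, "y": 0.5}, {"x": 0.75, "y": 0.5}]
xy = np.array([[int(p["x"] * w), int(p["y"] * h)] for p in points]).reshape(1, -1, 2)
pointed = sv.VertexAnnotator(radius=8, color=sv.Color.RED).annotate(
    scene=image.copy(), key_points=sv.KeyPoints(xy=xy)
)

# "Detect"-style result: normalized x_min/y_min/x_max/y_max scaled to pixel xyxy
obj = {"x_min": 0.1, "y_min": 0.2, "x_max": 0.6, "y_max": 0.9}
boxes = np.array([[obj["x_min"] * w, obj["y_min"] * h, obj["x_max"] * w, obj["y_max"] * h]])
boxed = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=5).annotate(
    scene=image.copy(), detections=sv.Detections(xyxy=boxes)
)

The reshape to (1, -1, 2), which replaces the earlier reshape(-1, 2), wraps the flat point list as a single object's keypoints, matching how sv.KeyPoints groups keypoints per object along the leading axis.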