ISSAAYMAN4 committed
Commit eb3c23e · verified · Parent: 44547c7

Update app.py

Files changed (1):
  1. app.py +150 -94
app.py CHANGED
@@ -1,3 +1,5 @@
 import base64
 import json
 import ast
@@ -5,27 +7,68 @@ import os
 import re
 import io
 import math
-import gradio as gr
-import oss2
-from oss2.Credentials import EnvironmentVariableCredentialsProvider
-...
-auth = oss2.ProviderAuthV4(EnvironmentVariableCredentialsProvider())
-endpoint = 'oss-us-east-1.aliyuncs.com'
-region = "us-east-1"
-bucket = os.environ.get("BUCKET")
-bucket = oss2.Bucket(auth, endpoint, bucket, region=region)


 def draw_point_area(image, point):
     radius = min(image.width, image.height) // 15
-    x, y = round(point[0]/1000 * image.width), round(point[1]/1000 * image.height)
-    ImageDraw.Draw(image).ellipse((x - radius, y - radius, x + radius, y + radius), outline='red', width=2)
-    ImageDraw.Draw(image).ellipse((x - 2, y - 2, x + 2, y + 2), fill='red')
     return image


 def resize_image(image):
     max_pixels = 6000 * 28 * 28
     if image.width * image.height > max_pixels:
         max_pixels = 2700 * 28 * 28
@@ -33,40 +76,49 @@ def resize_image(image):
         max_pixels = 1340 * 28 * 28
     resize_factor = math.sqrt(max_pixels / (image.width * image.height))
     width, height = int(image.width * resize_factor), int(image.height * resize_factor)
-    image = image.resize((width, height))
-    return image


 def upload_images(session_id, image, result_image, query):
     img_path = f"{session_id}.png"
     result_img_path = f"{session_id}-draw.png"
     metadata = dict(
         query=query,
         resize_image=img_path,
         result_image=result_img_path,
-        session_id=session_id
     )
     img_bytes = io.BytesIO()
     image.save(img_bytes, format="png")
-    img_bytes = img_bytes.getvalue()
-    bucket.put_object(img_path, img_bytes)

     rst_img_bytes = io.BytesIO()
     result_image.save(rst_img_bytes, format="png")
-    rst_img_bytes = rst_img_bytes.getvalue()
-    bucket.put_object(result_img_path, rst_img_bytes)
-    bucket.put_object(f"{session_id}.json", json.dumps(metadata))
-    print("end upload images")


 def run_ui(image, query, session_id, is_example_image):
     click_xy = None
-    images_during_iterations = []  # List to store images at each step
     width, height = image.width, image.height
     image = resize_image(image)
-    bytes = io.BytesIO()
-    image.save(bytes, format="png")
-    base64_image = base64.standard_b64encode(bytes.getvalue()).decode("utf-8")
     messages = [
         {
             "role": "user",
@@ -76,36 +128,62 @@ def run_ui(image, query, session_id, is_example_image):
             ],
         }
     ]
-    response = client.chat.completions.create(model="tgi", messages=messages, temperature=1.0, top_p=0.7, max_tokens=128, frequency_penalty=1, stream=False)
-    output_text = response.choices[0].message.content
-    pattern = r"\((\d+,\d+)\)"
     match = re.search(pattern, output_text)
     if match:
         coordinates = match.group(1)
-        click_xy = ast.literal_eval(coordinates)
-        result_image = draw_point_area(image, click_xy)
     images_during_iterations.append(result_image)
-    click_xy = round(click_xy[0]/1000 * width), round(click_xy[1]/1000 * height)
-    # TODO: async
-    if is_example_image == "False":
         upload_images(session_id, image, result_image, query)

-    return images_during_iterations, str(click_xy)


-def update_vote(vote_type, image, click_image, prompt, is_example):
-    """upload bad cases to somewhere"""
     if vote_type == "upvote":
         return "Everything good"
-
     if is_example == "True":
         return "Do nothing for example"
-    click_img_path = click_image[0]  # webp format
-    image.size
-    # TODO: upload to some where
-    return f"Thank you for your feedback!"


  examples = [
     ["./examples/solitaire.png", "Play the solitaire collection", True],
     ["./examples/weather_ui.png", "Open map", True],
@@ -121,43 +199,34 @@ examples = [
     ["./examples/ios_setting.png", "Turn off Do not disturb.", True],
 ]

-
-
-title_markdown = ("""
 # UI-TARS Pioneering Automated GUI Interaction with Native Agents
 [[🤗Model](https://huggingface.co/bytedance-research/UI-TARS-7B-SFT)] [[⌨️Code](https://github.com/bytedance/UI-TARS)] [[📑Paper](https://github.com/bytedance/UI-TARS/blob/main/UI_TARS_paper.pdf)] [🏄[Midscene (Browser Automation)](https://github.com/web-infra-dev/Midscene)] [🫨[Discord](https://discord.gg/txAE43ps)]
-""")
-

-tos_markdown = ("""
 ### Terms of use
 This demo is governed by the original license of UI-TARS. We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, including hate speech, violence, pornography, deception, etc. (注:本演示受UI-TARS的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)
-""")
-

-learn_more_markdown = ("""
 ### License
 Apache License 2.0
-""")
-

-code_adapt_markdown = ("""
 ### Acknowledgments
 The app code is modified from [ShowUI](https://huggingface.co/spaces/showlab/ShowUI)
-""")
-

 block_css = """
-#buttons button {
-    min-width: min(120px,100%);
-}
-
 #chatbot img {
-    max-width: 80%;
-    max-height: 80vh;
-    width: auto;
-    height: auto;
-    object-fit: contain;
 }
 """

@@ -167,11 +236,9 @@ def build_demo():
         state_session_id = gr.State(value=None)
         gr.Markdown(title_markdown)

-
         with gr.Row():
             with gr.Column(scale=3):
                 imagebox = gr.Image(type="pil", label="Input Screenshot")
-
                 textbox = gr.Textbox(
                     show_label=True,
                     placeholder="Enter an instruction and press Submit",
@@ -181,7 +248,6 @@ def build_demo():

             with gr.Column(scale=6):
                 output_gallery = gr.Gallery(label="Output with click", object_fit="contain", preview=True)
-                # output_gallery = gr.Gallery(label="Iterative Refinement")
                 gr.HTML(
                     """
                     <p><strong>Notice:</strong> The <span style="color: red;">red point</span> with a circle on the output image represents the predicted coordinates for a click.</p>
@@ -191,47 +257,34 @@ def build_demo():
                 output_coords = gr.Textbox(label="Final Coordinates")
                 image_size = gr.Textbox(label="Image Size")

-                gr.HTML(
-                    """
-                    <p><strong>Expected result or not? help us improve! ⬇️</strong></p>
-                    """
-                )
                 with gr.Row(elem_id="action-buttons", equal_height=True):
-                    upvote_btn = gr.Button(value="👍 Looks good!", variant="secondary")
                     downvote_btn = gr.Button(value="👎 Wrong coordinates!", variant="secondary")
-                    clear_btn = gr.Button(value="🗑️ Clear", interactive=True)
-
             with gr.Column(scale=3):
                 gr.Examples(
                     examples=[[e[0], e[1]] for e in examples],
                     inputs=[imagebox, textbox],
-                    outputs=[textbox],  # Only update the query textbox
                     examples_per_page=3,
                 )
-
                 is_example_dropdown = gr.Dropdown(
-                    choices=["True", "False"],
-                    value="False",
-                    visible=False,
-                    label="Is Example Image",
                 )

         def set_is_example(query):
             for _, example_query, is_example in examples:
                 if query.strip() == example_query.strip():
-                    return str(is_example)  # Return as string for Dropdown compatibility
             return "False"

-        textbox.change(
-            set_is_example,
-            inputs=[textbox],
-            outputs=[is_example_dropdown],
-        )

         def on_submit(image, query, is_example_image):
             if image is None:
                 raise ValueError("No image provided. Please upload an image before submitting.")
-
             session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
             images_during_iterations, click_coords = run_ui(image, query, session_id, is_example_image)
             return images_during_iterations, click_coords, session_id, f"{image.width}x{image.height}"
@@ -246,21 +299,23 @@ def build_demo():
             lambda: (None, None, None, None, None, None),
             inputs=None,
             outputs=[imagebox, textbox, output_gallery, output_coords, state_session_id, image_size],
-            queue=False
         )

         upvote_btn.click(
-            lambda image, click_image, prompt, is_example: update_vote("upvote", image, click_image, prompt, is_example),
             inputs=[imagebox, output_gallery, textbox, is_example_dropdown],
             outputs=[],
-            queue=False
         )

         downvote_btn.click(
-            lambda image, click_image, prompt, is_example: update_vote("downvote", image, click_image, prompt, is_example),
             inputs=[imagebox, output_gallery, textbox, is_example_dropdown],
             outputs=[],
-            queue=False
         )

         gr.Markdown(tos_markdown)
@@ -269,10 +324,11 @@ def build_demo():

     return demo


 if __name__ == "__main__":
     demo = build_demo()
     demo.queue(api_open=False).launch(
         server_name="0.0.0.0",
         server_port=7860,
         debug=True,
-    )
 
+# app.py — UI-TARS demo (OSS disabled)
+
 import base64
 import json
 import ast
 import os
 import re
 import io
 import math
+from datetime import datetime

+import gradio as gr
+from PIL import ImageDraw
+
+# =========================
+# OpenAI client (optional)
+# =========================
+# If OPENAI_API_KEY is set, we use OpenAI to produce the model's output text.
+# If ENDPOINT_URL is set, we point the OpenAI client at that base URL (advanced use).
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ENDPOINT_URL = os.getenv("ENDPOINT_URL")  # optional
+MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")  # safe default instead of "tgi"
+
+client = None
+if OPENAI_API_KEY:
+    try:
+        from openai import OpenAI
+        if ENDPOINT_URL:
+            client = OpenAI(api_key=OPENAI_API_KEY, base_url=ENDPOINT_URL)
+        else:
+            client = OpenAI(api_key=OPENAI_API_KEY)
+        print("✅ OpenAI client initialized.")
+    except Exception as e:
+        print(f"⚠️ OpenAI client not available: {e}")
+else:
+    print("ℹ️ OPENAI_API_KEY not set. Running without OpenAI parsing.")
+
+# =========================
+# UI-TARS prompt
+# =========================
+DESCRIPTION = "[UI-TARS](https://github.com/bytedance/UI-TARS)"
+prompt = (
+    "Output only the coordinate of one box in your response. "
+    "Return a tuple like (x,y) with values in 0..1000 for x and y. "
+    "Do not include any extra text."
+)
+
+# =========================
+# OSS (Aliyun) — DISABLED
+# =========================
+# The original demo used Aliyun OSS (oss2) to upload images/metadata.
+# It is disabled entirely, so no env vars such as BUCKET / ENDPOINT are required.
+bucket = None
+print("⚠️ OSS integration disabled: skipping Aliyun storage.")


 def draw_point_area(image, point):
+    """Draw a red point+circle at a (0..1000, 0..1000) coordinate on the given PIL image."""
+    if not point:
+        return image
     radius = min(image.width, image.height) // 15
+    x = round(point[0] / 1000 * image.width)
+    y = round(point[1] / 1000 * image.height)
+    drawer = ImageDraw.Draw(image)
+    drawer.ellipse((x - radius, y - radius, x + radius, y + radius), outline="red", width=2)
+    drawer.ellipse((x - 2, y - 2, x + 2, y + 2), fill="red")
     return image
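The model predicts on a fixed 0..1000 grid regardless of screenshot size, so the same output scales to any resolution: on a 1280×800 image, point (500, 250) lands at pixel (640, 200). A quick sanity check, with illustrative sizes:

```python
# Sanity check of the 0..1000 -> pixel mapping used by draw_point_area.
from PIL import Image

img = Image.new("RGB", (1280, 800), "white")
marked = draw_point_area(img, (500, 250))  # circle centered at pixel (640, 200)
assert (round(500 / 1000 * img.width), round(250 / 1000 * img.height)) == (640, 200)
```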


 def resize_image(image):
+    """Resize extremely large screenshots to keep compute stable."""
     max_pixels = 6000 * 28 * 28
     if image.width * image.height > max_pixels:
         max_pixels = 2700 * 28 * 28

         max_pixels = 1340 * 28 * 28
     resize_factor = math.sqrt(max_pixels / (image.width * image.height))
     width, height = int(image.width * resize_factor), int(image.height * resize_factor)
+    return image.resize((width, height))
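The budgets are multiples of 28×28 (presumably the vision encoder's patch unit), giving roughly 4.7 MP, 2.1 MP, and 1.05 MP tiers; since the factor is sqrt(budget/area), a 3840×2160 capture drops to about 1939×1091. A worked example with assumed sizes:

```python
# Worked example of the pixel-budget math in resize_image (sizes are assumptions).
import math

w, h = 3840, 2160                        # 8.3 MP screenshot, above the 6000*28*28 (~4.7 MP) cap
budget = 2700 * 28 * 28                  # ~2.1 MP tier selected by the first branch
factor = math.sqrt(budget / (w * h))     # ~0.505
print(int(w * factor), int(h * factor))  # -> 1939 1091
```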
 

 def upload_images(session_id, image, result_image, query):
+    """No-op when OSS is disabled. Keeps API stable."""
+    if bucket is None:
+        print("↪️ Skipped OSS upload (no bucket configured).")
+        return
+
     img_path = f"{session_id}.png"
     result_img_path = f"{session_id}-draw.png"
     metadata = dict(
         query=query,
         resize_image=img_path,
         result_image=result_img_path,
+        session_id=session_id,
     )
+
     img_bytes = io.BytesIO()
     image.save(img_bytes, format="png")
+    bucket.put_object(img_path, img_bytes.getvalue())

     rst_img_bytes = io.BytesIO()
     result_image.save(rst_img_bytes, format="png")
+    bucket.put_object(result_img_path, rst_img_bytes.getvalue())
+
+    bucket.put_object(f"{session_id}.json", json.dumps(metadata).encode("utf-8"))
+    print("✅ Uploaded images and metadata to OSS.")


 def run_ui(image, query, session_id, is_example_image):
+    """Main inference path: builds the message, asks the model for (x,y), draws, returns results."""
     click_xy = None
+    images_during_iterations = []
     width, height = image.width, image.height
+
+    # Resize for throughput, then encode for the API
     image = resize_image(image)
+    buf = io.BytesIO()
+    image.save(buf, format="png")
+    base64_image = base64.standard_b64encode(buf.getvalue()).decode("utf-8")
+
+    # Prepare the prompt for an LLM that returns '(x,y)'
     messages = [
         {
             "role": "user",

             ],
         }
     ]
+
+    # If the OpenAI client is present, ask it for coordinates; otherwise fall back to a safe default below.
+    output_text = ""
+    if client is not None:
+        try:
+            resp = client.chat.completions.create(
+                model=MODEL_NAME,
+                messages=messages,
+                temperature=1.0,
+                top_p=0.7,
+                max_tokens=128,
+                frequency_penalty=1,
+                stream=False,
+            )
+            output_text = resp.choices[0].message.content or ""
+        except Exception as e:
+            output_text = ""
+            print(f"⚠️ OpenAI call failed: {e}")
+
+    # Extract "(x,y)" from the text with a regex
+    pattern = r"\((\d+,\s*\d+)\)"
     match = re.search(pattern, output_text)
     if match:
         coordinates = match.group(1)
+        try:
+            click_xy = ast.literal_eval(coordinates)  # (x, y) on the 0..1000 scale
+        except Exception:
+            click_xy = None
+
+    # If we still don't have coordinates, fall back to the center
+    if click_xy is None:
+        click_xy = (500, 500)
+
+    # Draw the result and convert to absolute pixel coords for display
+    result_image = draw_point_area(image.copy(), click_xy)
     images_during_iterations.append(result_image)
+    abs_xy = (round(click_xy[0] / 1000 * width), round(click_xy[1] / 1000 * height))
+
+    # Upload artifacts only for real (non-example) inputs
+    if str(is_example_image) == "False":
         upload_images(session_id, image, result_image, query)

+    return images_during_iterations, str(abs_xy)
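The parsing contract is deliberately narrow: the prompt asks for a bare (x,y), the regex captures the digits, and ast.literal_eval turns "512, 284" into a real tuple, with any malformed reply falling through to the (500, 500) center default. That path in isolation (the sample reply string is illustrative):

```python
# Coordinate parsing in isolation, mirroring the run_ui logic above.
import ast
import re

sample = "The click target is at (512, 284)."
match = re.search(r"\((\d+,\s*\d+)\)", sample)
click_xy = ast.literal_eval(match.group(1)) if match else (500, 500)
assert click_xy == (512, 284)
```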


+def update_vote(vote_type, image, click_image, prompt_text, is_example):
+    """Simple feedback hook (no external upload when OSS disabled)."""
     if vote_type == "upvote":
         return "Everything good"
     if is_example == "True":
         return "Do nothing for example"
+    # Example gallery returns file paths; we do nothing here
+    return "Thank you for your feedback!"


+# Demo examples
 examples = [
     ["./examples/solitaire.png", "Play the solitaire collection", True],
     ["./examples/weather_ui.png", "Open map", True],

     ["./examples/ios_setting.png", "Turn off Do not disturb.", True],
 ]

+title_markdown = """
 # UI-TARS Pioneering Automated GUI Interaction with Native Agents
 [[🤗Model](https://huggingface.co/bytedance-research/UI-TARS-7B-SFT)] [[⌨️Code](https://github.com/bytedance/UI-TARS)] [[📑Paper](https://github.com/bytedance/UI-TARS/blob/main/UI_TARS_paper.pdf)] [🏄[Midscene (Browser Automation)](https://github.com/web-infra-dev/Midscene)] [🫨[Discord](https://discord.gg/txAE43ps)]
+"""

+tos_markdown = """
 ### Terms of use
 This demo is governed by the original license of UI-TARS. We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, including hate speech, violence, pornography, deception, etc. (注:本演示受UI-TARS的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)
+"""

+learn_more_markdown = """
 ### License
 Apache License 2.0
+"""

+code_adapt_markdown = """
 ### Acknowledgments
 The app code is modified from [ShowUI](https://huggingface.co/spaces/showlab/ShowUI)
+"""

 block_css = """
+#buttons button { min-width: min(120px,100%); }

 #chatbot img {
+    max-width: 80%;
+    max-height: 80vh;
+    width: auto;
+    height: auto;
+    object-fit: contain;
 }
 """
         state_session_id = gr.State(value=None)
         gr.Markdown(title_markdown)

         with gr.Row():
             with gr.Column(scale=3):
                 imagebox = gr.Image(type="pil", label="Input Screenshot")
                 textbox = gr.Textbox(
                     show_label=True,
                     placeholder="Enter an instruction and press Submit",

             with gr.Column(scale=6):
                 output_gallery = gr.Gallery(label="Output with click", object_fit="contain", preview=True)
                 gr.HTML(
                     """
                     <p><strong>Notice:</strong> The <span style="color: red;">red point</span> with a circle on the output image represents the predicted coordinates for a click.</p>

                 output_coords = gr.Textbox(label="Final Coordinates")
                 image_size = gr.Textbox(label="Image Size")

+                gr.HTML("<p><strong>Expected result or not? help us improve! ⬇️</strong></p>")
                 with gr.Row(elem_id="action-buttons", equal_height=True):
+                    upvote_btn = gr.Button(value="👍 Looks good!", variant="secondary")
                     downvote_btn = gr.Button(value="👎 Wrong coordinates!", variant="secondary")
+                    clear_btn = gr.Button(value="🗑️ Clear", interactive=True)
+
             with gr.Column(scale=3):
                 gr.Examples(
                     examples=[[e[0], e[1]] for e in examples],
                     inputs=[imagebox, textbox],
+                    outputs=[textbox],
                     examples_per_page=3,
                 )
                 is_example_dropdown = gr.Dropdown(
+                    choices=["True", "False"], value="False", visible=False, label="Is Example Image",
                 )

         def set_is_example(query):
             for _, example_query, is_example in examples:
                 if query.strip() == example_query.strip():
+                    return str(is_example)
             return "False"

+        textbox.change(set_is_example, inputs=[textbox], outputs=[is_example_dropdown])

         def on_submit(image, query, is_example_image):
             if image is None:
                 raise ValueError("No image provided. Please upload an image before submitting.")
             session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
             images_during_iterations, click_coords = run_ui(image, query, session_id, is_example_image)
             return images_during_iterations, click_coords, session_id, f"{image.width}x{image.height}"
 
             lambda: (None, None, None, None, None, None),
             inputs=None,
             outputs=[imagebox, textbox, output_gallery, output_coords, state_session_id, image_size],
+            queue=False,
         )

         upvote_btn.click(
+            lambda image, click_image, prompt_text, is_example:
+                update_vote("upvote", image, click_image, prompt_text, is_example),
             inputs=[imagebox, output_gallery, textbox, is_example_dropdown],
             outputs=[],
+            queue=False,
         )

         downvote_btn.click(
+            lambda image, click_image, prompt_text, is_example:
+                update_vote("downvote", image, click_image, prompt_text, is_example),
             inputs=[imagebox, output_gallery, textbox, is_example_dropdown],
             outputs=[],
+            queue=False,
         )

         gr.Markdown(tos_markdown)
 

     return demo


 if __name__ == "__main__":
     demo = build_demo()
     demo.queue(api_open=False).launch(
         server_name="0.0.0.0",
         server_port=7860,
         debug=True,
+    )
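
Because every failure mode degrades to the center-click default, the app now runs end to end with no API key at all, which allows a quick offline smoke test. A minimal sketch, assuming the file is importable as app and OPENAI_API_KEY is unset:

```python
# Offline smoke test (assumption: this file is saved as app.py next to this script).
from PIL import Image

import app  # module-level setup prints its status lines; the demo is not launched

img = Image.new("RGB", (1280, 800), "white")
frames, coords = app.run_ui(img, "click the start button", "smoke-test", "True")
print(coords)  # -> "(640, 400)": the (500, 500) grid fallback scaled to 1280x800
```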