HeshamHaroon Claude committed on
Commit
7b9de45
·
1 Parent(s): 751def7

Auto-run leaderboard evaluation and improve UI colors

Browse files

- Leaderboard now runs automatically on page load
- Options moved to collapsible accordion for cleaner UX
- Redesigned tables with professional color scheme:
  - Neutral dark headers (#2c3e50, #495057)
  - Subtle row backgrounds for top 3
  - Medals for rank display (🥇🥈🥉)
  - Clean Bootstrap-style metric colors
- Improved typography and spacing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

Files changed (2) hide show
  1. app.py +41 -39
  2. leaderboard.py +87 -68
app.py CHANGED
@@ -132,48 +132,43 @@ def create_interface():
132
  with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
133
  gr.Markdown("""
134
  ## 🏆 Arabic Tokenizer Leaderboard
135
-
136
- Evaluate and rank tokenizers using **real Arabic datasets from HuggingFace**.
137
- Select datasets and tokenizers below, then click "Run Evaluation" to generate the leaderboard.
138
-
139
- ⚠️ **Note:** First run will download datasets from HuggingFace (may take a few minutes).
140
  """)
141
-
142
- with gr.Row():
143
- with gr.Column(scale=1):
144
- gr.Markdown("### 📚 Select Datasets")
145
- dataset_choices = gr.CheckboxGroup(
146
- choices=[(f"{v['name']} ({v['category']})", k) for k, v in LEADERBOARD_DATASETS.items()],
147
- value=["arabic_mmlu", "arsentd_lev", "athar", "arcd"],
148
- label="HuggingFace Datasets",
149
- info="Datasets will be downloaded from HuggingFace"
150
- )
151
-
152
- with gr.Column(scale=1):
153
- gr.Markdown("### 🔧 Select Tokenizers")
154
- leaderboard_tokenizer_choices = gr.CheckboxGroup(
155
- choices=available_tokenizers,
156
- value=available_tokenizers[:8] if len(available_tokenizers) >= 8 else available_tokenizers,
157
- label="Tokenizers to Evaluate"
158
- )
159
-
160
- run_leaderboard_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")
161
-
162
- status_output = gr.Markdown("Click 'Run Evaluation' to start...")
163
-
164
- gr.Markdown("---")
165
  gr.Markdown("### 📊 Leaderboard Results")
166
-
167
  leaderboard_output = gr.HTML()
168
-
169
  gr.Markdown("### 📈 Per-Dataset Breakdown")
170
  per_dataset_output = gr.HTML()
171
-
172
- run_leaderboard_btn.click(
173
- fn=run_leaderboard_evaluation,
174
- inputs=[dataset_choices, leaderboard_tokenizer_choices],
175
- outputs=[leaderboard_output, per_dataset_output, status_output]
176
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  gr.Markdown("""
179
  ---
@@ -242,11 +237,18 @@ def create_interface():
242
  # ===== TAB 5: About =====
243
  with gr.TabItem("ℹ️ About", id="about"):
244
  about_html = generate_about_html(
245
- tokenizers_by_type,
246
  len(available_tokenizers)
247
  )
248
  gr.HTML(about_html)
249
-
 
 
 
 
 
 
 
250
  return demo
251
 
252
 
 
132
  with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
133
  gr.Markdown("""
134
  ## 🏆 Arabic Tokenizer Leaderboard
135
+
136
+ Tokenizers ranked by performance on **real Arabic datasets from HuggingFace**.
 
 
 
137
  """)
138
+
139
+ status_output = gr.Markdown("⏳ Loading evaluation...")
140
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  gr.Markdown("### 📊 Leaderboard Results")
 
142
  leaderboard_output = gr.HTML()
143
+
144
  gr.Markdown("### 📈 Per-Dataset Breakdown")
145
  per_dataset_output = gr.HTML()
146
+
147
+ with gr.Accordion("⚙️ Customize Evaluation", open=False):
148
+ with gr.Row():
149
+ with gr.Column(scale=1):
150
+ gr.Markdown("### 📚 Datasets")
151
+ dataset_choices = gr.CheckboxGroup(
152
+ choices=[(f"{v['name']} ({v['category']})", k) for k, v in LEADERBOARD_DATASETS.items()],
153
+ value=["arabic_mmlu", "arsentd_lev", "athar", "arcd"],
154
+ label="HuggingFace Datasets"
155
+ )
156
+
157
+ with gr.Column(scale=1):
158
+ gr.Markdown("### 🔧 Tokenizers")
159
+ leaderboard_tokenizer_choices = gr.CheckboxGroup(
160
+ choices=available_tokenizers,
161
+ value=available_tokenizers[:8] if len(available_tokenizers) >= 8 else available_tokenizers,
162
+ label="Tokenizers to Evaluate"
163
+ )
164
+
165
+ run_leaderboard_btn = gr.Button("🔄 Re-run Evaluation", variant="primary", size="lg")
166
+
167
+ run_leaderboard_btn.click(
168
+ fn=run_leaderboard_evaluation,
169
+ inputs=[dataset_choices, leaderboard_tokenizer_choices],
170
+ outputs=[leaderboard_output, per_dataset_output, status_output]
171
+ )
172
 
173
  gr.Markdown("""
174
  ---
 
237
  # ===== TAB 5: About =====
238
  with gr.TabItem("ℹ️ About", id="about"):
239
  about_html = generate_about_html(
240
+ tokenizers_by_type,
241
  len(available_tokenizers)
242
  )
243
  gr.HTML(about_html)
244
+
245
+ # Auto-run leaderboard evaluation on load
246
+ demo.load(
247
+ fn=run_leaderboard_evaluation,
248
+ inputs=[dataset_choices, leaderboard_tokenizer_choices],
249
+ outputs=[leaderboard_output, per_dataset_output, status_output]
250
+ )
251
+
252
  return demo
253
 
254
 
leaderboard.py CHANGED
@@ -260,57 +260,62 @@ def run_leaderboard_evaluation(
260
 
261
 
262
  def generate_leaderboard_html(data: List[Dict]) -> str:
263
- """Generate HTML for main leaderboard"""
264
-
265
  if not data:
266
  return "<p>No results to display</p>"
267
-
268
  html = """
269
  <style>
270
  .leaderboard-table {
271
  width: 100%;
272
  border-collapse: collapse;
273
- font-family: system-ui, -apple-system, sans-serif;
274
- margin: 20px 0;
 
275
  }
276
  .leaderboard-table th {
277
- background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4e 100%);
278
- color: white;
279
- padding: 12px 8px;
280
  text-align: left;
281
- font-weight: 600;
 
282
  }
283
  .leaderboard-table td {
284
- padding: 10px 8px;
285
- border-bottom: 1px solid #e0e0e0;
 
286
  }
287
  .leaderboard-table tr:nth-child(even) {
288
  background-color: #f8f9fa;
289
  }
290
  .leaderboard-table tr:hover {
291
- background-color: #e8f5e9;
292
  }
293
- .rank-1 { background: linear-gradient(90deg, #ffd700 0%, #fff8dc 100%) !important; }
294
- .rank-2 { background: linear-gradient(90deg, #c0c0c0 0%, #f5f5f5 100%) !important; }
295
- .rank-3 { background: linear-gradient(90deg, #cd7f32 0%, #ffe4c4 100%) !important; }
296
  .score-badge {
297
- background: #2d8f4e;
298
- color: white;
299
- padding: 4px 8px;
300
- border-radius: 12px;
301
- font-weight: bold;
 
302
  }
303
  .type-badge {
304
- background: #e3f2fd;
305
- color: #1565c0;
306
- padding: 2px 6px;
307
- border-radius: 4px;
308
- font-size: 0.85em;
309
  }
310
- .metric-good { color: #2e7d32; font-weight: 600; }
311
- .metric-bad { color: #c62828; }
 
312
  </style>
313
-
314
  <table class="leaderboard-table">
315
  <thead>
316
  <tr>
@@ -318,27 +323,37 @@ def generate_leaderboard_html(data: List[Dict]) -> str:
318
  <th>Tokenizer</th>
319
  <th>Type</th>
320
  <th>Organization</th>
321
- <th>Score ↑</th>
322
- <th>Fertility ↓</th>
323
- <th>Compression ↑</th>
324
- <th>UNK Rate ↓</th>
325
  <th>Datasets</th>
326
  </tr>
327
  </thead>
328
  <tbody>
329
  """
330
-
331
  for i, entry in enumerate(data):
332
  rank = i + 1
333
  rank_class = f"rank-{rank}" if rank <= 3 else ""
334
-
 
 
 
 
 
 
 
 
 
 
335
  fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
336
  comp_class = "metric-good" if entry["compression"] > 3.5 else ""
337
  unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
338
-
339
  html += f"""
340
  <tr class="{rank_class}">
341
- <td><strong>#{rank}</strong></td>
342
  <td><strong>{entry["name"]}</strong></td>
343
  <td><span class="type-badge">{entry["type"]}</span></td>
344
  <td>{entry["org"]}</td>
@@ -349,82 +364,86 @@ def generate_leaderboard_html(data: List[Dict]) -> str:
349
  <td>{entry["num_datasets"]}</td>
350
  </tr>
351
  """
352
-
353
  html += """
354
  </tbody>
355
  </table>
356
-
357
- <div style="margin-top: 15px; padding: 10px; background: #f5f5f5; border-radius: 8px; font-size: 0.9em;">
358
- <strong>📊 Metric Guide:</strong><br>
359
- • <strong>Score:</strong> Overall ranking (0-100, higher = better)<br>
360
- • <strong>Fertility:</strong> Tokens per word (lower = better, 1.0 ideal for Arabic)<br>
361
- • <strong>Compression:</strong> Bytes per token (higher = more efficient)<br>
362
- • <strong>UNK Rate:</strong> Unknown token percentage (lower = better)
363
  </div>
364
  """
365
-
366
  return html
367
 
368
 
369
  def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
370
- """Generate HTML for per-dataset fertility table"""
371
-
372
  if not data:
373
  return "<p>No per-dataset results</p>"
374
-
375
  ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
376
-
377
  html = """
378
  <style>
379
  .dataset-table {
380
  width: 100%;
381
  border-collapse: collapse;
382
- font-family: system-ui, -apple-system, sans-serif;
383
- margin: 20px 0;
384
- font-size: 0.9em;
385
  }
386
  .dataset-table th {
387
- background: #37474f;
388
- color: white;
389
- padding: 10px 6px;
390
  text-align: center;
 
391
  }
392
  .dataset-table th:first-child {
393
  text-align: left;
394
  }
395
  .dataset-table td {
396
- padding: 8px 6px;
397
  text-align: center;
398
- border-bottom: 1px solid #e0e0e0;
 
399
  }
400
  .dataset-table td:first-child {
401
  text-align: left;
402
  font-weight: 500;
403
  }
404
  .dataset-table tr:nth-child(even) {
405
- background-color: #fafafa;
 
 
 
406
  }
407
- .fert-excellent { background: #c8e6c9; color: #1b5e20; font-weight: 600; }
408
- .fert-good { background: #fff9c4; color: #f57f17; }
409
- .fert-poor { background: #ffcdd2; color: #b71c1c; }
410
  </style>
411
-
412
- <h4>📈 Fertility per Dataset (tokens/word - lower is better)</h4>
413
  <table class="dataset-table">
414
  <thead>
415
  <tr>
416
  <th>Tokenizer</th>
417
  """
418
-
419
  for ds_name in ds_names:
420
  html += f"<th>{ds_name}</th>"
421
-
422
  html += """
423
  </tr>
424
  </thead>
425
  <tbody>
426
  """
427
-
428
  for row in data:
429
  html += f"<tr><td>{row['Tokenizer']}</td>"
430
  for ds_name in ds_names:
@@ -440,10 +459,10 @@ def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
440
  else:
441
  html += '<td>-</td>'
442
  html += "</tr>"
443
-
444
  html += """
445
  </tbody>
446
  </table>
447
  """
448
-
449
  return html
 
260
 
261
 
262
  def generate_leaderboard_html(data: List[Dict]) -> str:
263
+ """Generate HTML for main leaderboard - clean professional design"""
264
+
265
  if not data:
266
  return "<p>No results to display</p>"
267
+
268
  html = """
269
  <style>
270
  .leaderboard-table {
271
  width: 100%;
272
  border-collapse: collapse;
273
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
274
+ font-size: 14px;
275
+ margin: 16px 0;
276
  }
277
  .leaderboard-table th {
278
+ background: #2c3e50;
279
+ color: #fff;
280
+ padding: 12px 10px;
281
  text-align: left;
282
+ font-weight: 500;
283
+ border-bottom: 2px solid #1a252f;
284
  }
285
  .leaderboard-table td {
286
+ padding: 10px;
287
+ border-bottom: 1px solid #e9ecef;
288
+ color: #333;
289
  }
290
  .leaderboard-table tr:nth-child(even) {
291
  background-color: #f8f9fa;
292
  }
293
  .leaderboard-table tr:hover {
294
+ background-color: #eef2f7;
295
  }
296
+ .leaderboard-table .rank-1 td { background: #f0f7ff; }
297
+ .leaderboard-table .rank-2 td { background: #f5f5f5; }
298
+ .leaderboard-table .rank-3 td { background: #fdf8f3; }
299
  .score-badge {
300
+ background: #2c3e50;
301
+ color: #fff;
302
+ padding: 4px 10px;
303
+ border-radius: 4px;
304
+ font-weight: 600;
305
+ font-size: 13px;
306
  }
307
  .type-badge {
308
+ background: #e9ecef;
309
+ color: #495057;
310
+ padding: 3px 8px;
311
+ border-radius: 3px;
312
+ font-size: 12px;
313
  }
314
+ .metric-good { color: #198754; font-weight: 500; }
315
+ .metric-bad { color: #dc3545; font-weight: 500; }
316
+ .rank-medal { font-size: 16px; margin-right: 4px; }
317
  </style>
318
+
319
  <table class="leaderboard-table">
320
  <thead>
321
  <tr>
 
323
  <th>Tokenizer</th>
324
  <th>Type</th>
325
  <th>Organization</th>
326
+ <th>Score</th>
327
+ <th>Fertility</th>
328
+ <th>Compression</th>
329
+ <th>UNK Rate</th>
330
  <th>Datasets</th>
331
  </tr>
332
  </thead>
333
  <tbody>
334
  """
335
+
336
  for i, entry in enumerate(data):
337
  rank = i + 1
338
  rank_class = f"rank-{rank}" if rank <= 3 else ""
339
+
340
+ # Medal for top 3
341
+ if rank == 1:
342
+ rank_display = '<span class="rank-medal">🥇</span> 1'
343
+ elif rank == 2:
344
+ rank_display = '<span class="rank-medal">🥈</span> 2'
345
+ elif rank == 3:
346
+ rank_display = '<span class="rank-medal">🥉</span> 3'
347
+ else:
348
+ rank_display = f"#{rank}"
349
+
350
  fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
351
  comp_class = "metric-good" if entry["compression"] > 3.5 else ""
352
  unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
353
+
354
  html += f"""
355
  <tr class="{rank_class}">
356
+ <td><strong>{rank_display}</strong></td>
357
  <td><strong>{entry["name"]}</strong></td>
358
  <td><span class="type-badge">{entry["type"]}</span></td>
359
  <td>{entry["org"]}</td>
 
364
  <td>{entry["num_datasets"]}</td>
365
  </tr>
366
  """
367
+
368
  html += """
369
  </tbody>
370
  </table>
371
+
372
+ <div style="margin-top: 12px; padding: 12px 16px; background: #f8f9fa; border-left: 3px solid #2c3e50; font-size: 13px; color: #495057;">
373
+ <strong>Metrics:</strong>
374
+ Score (0-100, higher=better)
375
+ Fertility (tokens/word, lower=better)
376
+ Compression (bytes/token, higher=better)
377
+ UNK Rate (lower=better)
378
  </div>
379
  """
380
+
381
  return html
382
 
383
 
384
  def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
385
+ """Generate HTML for per-dataset fertility table - clean professional design"""
386
+
387
  if not data:
388
  return "<p>No per-dataset results</p>"
389
+
390
  ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
391
+
392
  html = """
393
  <style>
394
  .dataset-table {
395
  width: 100%;
396
  border-collapse: collapse;
397
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
398
+ font-size: 13px;
399
+ margin: 16px 0;
400
  }
401
  .dataset-table th {
402
+ background: #495057;
403
+ color: #fff;
404
+ padding: 10px 8px;
405
  text-align: center;
406
+ font-weight: 500;
407
  }
408
  .dataset-table th:first-child {
409
  text-align: left;
410
  }
411
  .dataset-table td {
412
+ padding: 8px;
413
  text-align: center;
414
+ border-bottom: 1px solid #e9ecef;
415
+ color: #333;
416
  }
417
  .dataset-table td:first-child {
418
  text-align: left;
419
  font-weight: 500;
420
  }
421
  .dataset-table tr:nth-child(even) {
422
+ background-color: #f8f9fa;
423
+ }
424
+ .dataset-table tr:hover {
425
+ background-color: #eef2f7;
426
  }
427
+ .fert-excellent { background: #d4edda; color: #155724; font-weight: 500; }
428
+ .fert-good { background: #fff3cd; color: #856404; font-weight: 500; }
429
+ .fert-poor { background: #f8d7da; color: #721c24; font-weight: 500; }
430
  </style>
431
+
 
432
  <table class="dataset-table">
433
  <thead>
434
  <tr>
435
  <th>Tokenizer</th>
436
  """
437
+
438
  for ds_name in ds_names:
439
  html += f"<th>{ds_name}</th>"
440
+
441
  html += """
442
  </tr>
443
  </thead>
444
  <tbody>
445
  """
446
+
447
  for row in data:
448
  html += f"<tr><td>{row['Tokenizer']}</td>"
449
  for ds_name in ds_names:
 
459
  else:
460
  html += '<td>-</td>'
461
  html += "</tr>"
462
+
463
  html += """
464
  </tbody>
465
  </table>
466
  """
467
+
468
  return html