Spaces:

HeshamHaroon
/

Arabic_Tokenizer

Running

HeshamHaroon Claude committed 15 days ago

Commit

7b9de45

1 Parent(s): 751def7

Auto-run leaderboard evaluation and improve UI colors

- Leaderboard now runs automatically on page load
- Options moved to collapsible accordion for cleaner UX
- Redesigned tables with professional color scheme:
  - Neutral dark headers (#2c3e50, #495057)
  - Subtle row backgrounds for top 3
  - Medals for rank display (🥇🥈🥉)
  - Clean Bootstrap-style metric colors
- Improved typography and spacing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

app.py +41 -39
leaderboard.py +87 -68

app.py CHANGED Viewed

@@ -132,48 +132,43 @@ def create_interface():
             with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
                 gr.Markdown("""
                 ## 🏆 Arabic Tokenizer Leaderboard
-                Evaluate and rank tokenizers using **real Arabic datasets from HuggingFace**.
-                Select datasets and tokenizers below, then click "Run Evaluation" to generate the leaderboard.
-                ⚠️ **Note:** First run will download datasets from HuggingFace (may take a few minutes).
                 """)
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        gr.Markdown("### 📚 Select Datasets")
-                        dataset_choices = gr.CheckboxGroup(
-                            choices=[(f"{v['name']} ({v['category']})", k) for k, v in LEADERBOARD_DATASETS.items()],
-                            value=["arabic_mmlu", "arsentd_lev", "athar", "arcd"],
-                            label="HuggingFace Datasets",
-                            info="Datasets will be downloaded from HuggingFace"
-                        )
-                    with gr.Column(scale=1):
-                        gr.Markdown("### 🔧 Select Tokenizers")
-                        leaderboard_tokenizer_choices = gr.CheckboxGroup(
-                            choices=available_tokenizers,
-                            value=available_tokenizers[:8] if len(available_tokenizers) >= 8 else available_tokenizers,
-                            label="Tokenizers to Evaluate"
-                        )
-                run_leaderboard_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")
-                status_output = gr.Markdown("Click 'Run Evaluation' to start...")
-                gr.Markdown("---")
                 gr.Markdown("### 📊 Leaderboard Results")
                 leaderboard_output = gr.HTML()
                 gr.Markdown("### 📈 Per-Dataset Breakdown")
                 per_dataset_output = gr.HTML()
-                run_leaderboard_btn.click(
-                    fn=run_leaderboard_evaluation,
-                    inputs=[dataset_choices, leaderboard_tokenizer_choices],
-                    outputs=[leaderboard_output, per_dataset_output, status_output]
-                )
                 gr.Markdown("""
                 ---
@@ -242,11 +237,18 @@ def create_interface():
             # ===== TAB 5: About =====
             with gr.TabItem("ℹ️ About", id="about"):
                 about_html = generate_about_html(
-                    tokenizers_by_type,
                     len(available_tokenizers)
                 )
                 gr.HTML(about_html)
         return demo

             with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
                 gr.Markdown("""
                 ## 🏆 Arabic Tokenizer Leaderboard
+                Tokenizers ranked by performance on **real Arabic datasets from HuggingFace**.
                 """)
+                status_output = gr.Markdown("⏳ Loading evaluation...")
                 gr.Markdown("### 📊 Leaderboard Results")
                 leaderboard_output = gr.HTML()
                 gr.Markdown("### 📈 Per-Dataset Breakdown")
                 per_dataset_output = gr.HTML()
+                with gr.Accordion("⚙️ Customize Evaluation", open=False):
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            gr.Markdown("### 📚 Datasets")
+                            dataset_choices = gr.CheckboxGroup(
+                                choices=[(f"{v['name']} ({v['category']})", k) for k, v in LEADERBOARD_DATASETS.items()],
+                                value=["arabic_mmlu", "arsentd_lev", "athar", "arcd"],
+                                label="HuggingFace Datasets"
+                            )
+                        with gr.Column(scale=1):
+                            gr.Markdown("### 🔧 Tokenizers")
+                            leaderboard_tokenizer_choices = gr.CheckboxGroup(
+                                choices=available_tokenizers,
+                                value=available_tokenizers[:8] if len(available_tokenizers) >= 8 else available_tokenizers,
+                                label="Tokenizers to Evaluate"
+                            )
+                    run_leaderboard_btn = gr.Button("🔄 Re-run Evaluation", variant="primary", size="lg")
+                    run_leaderboard_btn.click(
+                        fn=run_leaderboard_evaluation,
+                        inputs=[dataset_choices, leaderboard_tokenizer_choices],
+                        outputs=[leaderboard_output, per_dataset_output, status_output]
+                    )
                 gr.Markdown("""
                 ---
             # ===== TAB 5: About =====
             with gr.TabItem("ℹ️ About", id="about"):
                 about_html = generate_about_html(
+                    tokenizers_by_type,
                     len(available_tokenizers)
                 )
                 gr.HTML(about_html)
+        # Auto-run leaderboard evaluation on load
+        demo.load(
+            fn=run_leaderboard_evaluation,
+            inputs=[dataset_choices, leaderboard_tokenizer_choices],
+            outputs=[leaderboard_output, per_dataset_output, status_output]
+        )
         return demo

leaderboard.py CHANGED Viewed

@@ -260,57 +260,62 @@ def run_leaderboard_evaluation(
 def generate_leaderboard_html(data: List[Dict]) -> str:
-    """Generate HTML for main leaderboard"""
     if not data:
         return "<p>No results to display</p>"
     html = """
     <style>
         .leaderboard-table {
             width: 100%;
             border-collapse: collapse;
-            font-family: system-ui, -apple-system, sans-serif;
-            margin: 20px 0;
         }
         .leaderboard-table th {
-            background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4e 100%);
-            color: white;
-            padding: 12px 8px;
             text-align: left;
-            font-weight: 600;
         }
         .leaderboard-table td {
-            padding: 10px 8px;
-            border-bottom: 1px solid #e0e0e0;
         }
         .leaderboard-table tr:nth-child(even) {
             background-color: #f8f9fa;
         }
         .leaderboard-table tr:hover {
-            background-color: #e8f5e9;
         }
-        .rank-1 { background: linear-gradient(90deg, #ffd700 0%, #fff8dc 100%) !important; }
-        .rank-2 { background: linear-gradient(90deg, #c0c0c0 0%, #f5f5f5 100%) !important; }
-        .rank-3 { background: linear-gradient(90deg, #cd7f32 0%, #ffe4c4 100%) !important; }
         .score-badge {
-            background: #2d8f4e;
-            color: white;
-            padding: 4px 8px;
-            border-radius: 12px;
-            font-weight: bold;
         }
         .type-badge {
-            background: #e3f2fd;
-            color: #1565c0;
-            padding: 2px 6px;
-            border-radius: 4px;
-            font-size: 0.85em;
         }
-        .metric-good { color: #2e7d32; font-weight: 600; }
-        .metric-bad { color: #c62828; }
     </style>
     <table class="leaderboard-table">
         <thead>
             <tr>
@@ -318,27 +323,37 @@ def generate_leaderboard_html(data: List[Dict]) -> str:
                 <th>Tokenizer</th>
                 <th>Type</th>
                 <th>Organization</th>
-                <th>Score ↑</th>
-                <th>Fertility ↓</th>
-                <th>Compression ↑</th>
-                <th>UNK Rate ↓</th>
                 <th>Datasets</th>
             </tr>
         </thead>
         <tbody>
     """
     for i, entry in enumerate(data):
         rank = i + 1
         rank_class = f"rank-{rank}" if rank <= 3 else ""
         fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
         comp_class = "metric-good" if entry["compression"] > 3.5 else ""
         unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
         html += f"""
             <tr class="{rank_class}">
-                <td><strong>#{rank}</strong></td>
                 <td><strong>{entry["name"]}</strong></td>
                 <td><span class="type-badge">{entry["type"]}</span></td>
                 <td>{entry["org"]}</td>
@@ -349,82 +364,86 @@ def generate_leaderboard_html(data: List[Dict]) -> str:
                 <td>{entry["num_datasets"]}</td>
             </tr>
         """
     html += """
         </tbody>
     </table>
-    <div style="margin-top: 15px; padding: 10px; background: #f5f5f5; border-radius: 8px; font-size: 0.9em;">
-        <strong>📊 Metric Guide:</strong><br>
-        • <strong>Score:</strong> Overall ranking (0-100, higher = better)<br>
-        • <strong>Fertility:</strong> Tokens per word (lower = better, 1.0 ideal for Arabic)<br>
-        • <strong>Compression:</strong> Bytes per token (higher = more efficient)<br>
-        • <strong>UNK Rate:</strong> Unknown token percentage (lower = better)
     </div>
     """
     return html
 def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
-    """Generate HTML for per-dataset fertility table"""
     if not data:
         return "<p>No per-dataset results</p>"
     ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
     html = """
     <style>
         .dataset-table {
             width: 100%;
             border-collapse: collapse;
-            font-family: system-ui, -apple-system, sans-serif;
-            margin: 20px 0;
-            font-size: 0.9em;
         }
         .dataset-table th {
-            background: #37474f;
-            color: white;
-            padding: 10px 6px;
             text-align: center;
         }
         .dataset-table th:first-child {
             text-align: left;
         }
         .dataset-table td {
-            padding: 8px 6px;
             text-align: center;
-            border-bottom: 1px solid #e0e0e0;
         }
         .dataset-table td:first-child {
             text-align: left;
             font-weight: 500;
         }
         .dataset-table tr:nth-child(even) {
-            background-color: #fafafa;
         }
-        .fert-excellent { background: #c8e6c9; color: #1b5e20; font-weight: 600; }
-        .fert-good { background: #fff9c4; color: #f57f17; }
-        .fert-poor { background: #ffcdd2; color: #b71c1c; }
     </style>
-    <h4>📈 Fertility per Dataset (tokens/word - lower is better)</h4>
     <table class="dataset-table">
         <thead>
             <tr>
                 <th>Tokenizer</th>
     """
     for ds_name in ds_names:
         html += f"<th>{ds_name}</th>"
     html += """
             </tr>
         </thead>
         <tbody>
     """
     for row in data:
         html += f"<tr><td>{row['Tokenizer']}</td>"
         for ds_name in ds_names:
@@ -440,10 +459,10 @@ def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
             else:
                 html += '<td>-</td>'
         html += "</tr>"
     html += """
         </tbody>
     </table>
     """
     return html

 def generate_leaderboard_html(data: List[Dict]) -> str:
+    """Generate HTML for main leaderboard - clean professional design"""
     if not data:
         return "<p>No results to display</p>"
     html = """
     <style>
         .leaderboard-table {
             width: 100%;
             border-collapse: collapse;
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+            font-size: 14px;
+            margin: 16px 0;
         }
         .leaderboard-table th {
+            background: #2c3e50;
+            color: #fff;
+            padding: 12px 10px;
             text-align: left;
+            font-weight: 500;
+            border-bottom: 2px solid #1a252f;
         }
         .leaderboard-table td {
+            padding: 10px;
+            border-bottom: 1px solid #e9ecef;
+            color: #333;
         }
         .leaderboard-table tr:nth-child(even) {
             background-color: #f8f9fa;
         }
         .leaderboard-table tr:hover {
+            background-color: #eef2f7;
         }
+        .leaderboard-table .rank-1 td { background: #f0f7ff; }
+        .leaderboard-table .rank-2 td { background: #f5f5f5; }
+        .leaderboard-table .rank-3 td { background: #fdf8f3; }
         .score-badge {
+            background: #2c3e50;
+            color: #fff;
+            padding: 4px 10px;
+            border-radius: 4px;
+            font-weight: 600;
+            font-size: 13px;
         }
         .type-badge {
+            background: #e9ecef;
+            color: #495057;
+            padding: 3px 8px;
+            border-radius: 3px;
+            font-size: 12px;
         }
+        .metric-good { color: #198754; font-weight: 500; }
+        .metric-bad { color: #dc3545; font-weight: 500; }
+        .rank-medal { font-size: 16px; margin-right: 4px; }
     </style>
     <table class="leaderboard-table">
         <thead>
             <tr>
                 <th>Tokenizer</th>
                 <th>Type</th>
                 <th>Organization</th>
+                <th>Score</th>
+                <th>Fertility</th>
+                <th>Compression</th>
+                <th>UNK Rate</th>
                 <th>Datasets</th>
             </tr>
         </thead>
         <tbody>
     """
     for i, entry in enumerate(data):
         rank = i + 1
         rank_class = f"rank-{rank}" if rank <= 3 else ""
+        # Medal for top 3
+        if rank == 1:
+            rank_display = '<span class="rank-medal">🥇</span> 1'
+        elif rank == 2:
+            rank_display = '<span class="rank-medal">🥈</span> 2'
+        elif rank == 3:
+            rank_display = '<span class="rank-medal">🥉</span> 3'
+        else:
+            rank_display = f"#{rank}"
         fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
         comp_class = "metric-good" if entry["compression"] > 3.5 else ""
         unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
         html += f"""
             <tr class="{rank_class}">
+                <td><strong>{rank_display}</strong></td>
                 <td><strong>{entry["name"]}</strong></td>
                 <td><span class="type-badge">{entry["type"]}</span></td>
                 <td>{entry["org"]}</td>
                 <td>{entry["num_datasets"]}</td>
             </tr>
         """
     html += """
         </tbody>
     </table>
+    <div style="margin-top: 12px; padding: 12px 16px; background: #f8f9fa; border-left: 3px solid #2c3e50; font-size: 13px; color: #495057;">
+        <strong>Metrics:</strong>
+        Score (0-100, higher=better) •
+        Fertility (tokens/word, lower=better) •
+        Compression (bytes/token, higher=better) •
+        UNK Rate (lower=better)
     </div>
     """
     return html
 def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
+    """Generate HTML for per-dataset fertility table - clean professional design"""
     if not data:
         return "<p>No per-dataset results</p>"
     ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
     html = """
     <style>
         .dataset-table {
             width: 100%;
             border-collapse: collapse;
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+            font-size: 13px;
+            margin: 16px 0;
         }
         .dataset-table th {
+            background: #495057;
+            color: #fff;
+            padding: 10px 8px;
             text-align: center;
+            font-weight: 500;
         }
         .dataset-table th:first-child {
             text-align: left;
         }
         .dataset-table td {
+            padding: 8px;
             text-align: center;
+            border-bottom: 1px solid #e9ecef;
+            color: #333;
         }
         .dataset-table td:first-child {
             text-align: left;
             font-weight: 500;
         }
         .dataset-table tr:nth-child(even) {
+            background-color: #f8f9fa;
+        }
+        .dataset-table tr:hover {
+            background-color: #eef2f7;
         }
+        .fert-excellent { background: #d4edda; color: #155724; font-weight: 500; }
+        .fert-good { background: #fff3cd; color: #856404; font-weight: 500; }
+        .fert-poor { background: #f8d7da; color: #721c24; font-weight: 500; }
     </style>
     <table class="dataset-table">
         <thead>
             <tr>
                 <th>Tokenizer</th>
     """
     for ds_name in ds_names:
         html += f"<th>{ds_name}</th>"
     html += """
             </tr>
         </thead>
         <tbody>
     """
     for row in data:
         html += f"<tr><td>{row['Tokenizer']}</td>"
         for ds_name in ds_names:
             else:
                 html += '<td>-</td>'
         html += "</tr>"
     html += """
         </tbody>
     </table>
     """
     return html