"""
Arabic Tokenizer Arena Pro - Main Application
==============================================
Advanced research & production platform for Arabic tokenization analysis
Run with: python app.py
"""
import gradio as gr
# Import modules
from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS
from styles import CUSTOM_CSS
from tokenizer_manager import tokenizer_manager
from analysis import analyze_single_tokenizer, compare_tokenizers
from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer, get_cached_leaderboard
from ui_components import generate_about_html


def create_interface():
    """Create the Gradio interface."""
    available_tokenizers = tokenizer_manager.get_tokenizer_choices()
    tokenizers_by_type = tokenizer_manager.get_tokenizers_by_type()

    with gr.Blocks(
        css=CUSTOM_CSS,
        title="Arabic Tokenizer Arena Pro",
        theme=gr.themes.Base(
            primary_hue="green",
            secondary_hue="blue",
            neutral_hue="slate",
            font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
        )
    ) as demo:
        # Header
        gr.HTML("""
        <div class="header-section">
            <h1>🏟️ Arabic Tokenizer Arena Pro</h1>
            <p>Advanced research & production platform for Arabic tokenization analysis</p>
        </div>
        """)

        with gr.Tabs():
            # ===== TAB 1: Single Tokenizer Analysis =====
            with gr.TabItem("🔬 Single Analysis", id="single"):
                with gr.Row():
                    with gr.Column(scale=1):
                        tokenizer_dropdown = gr.Dropdown(
                            choices=available_tokenizers,
                            value=available_tokenizers[0] if available_tokenizers else None,
                            label="Select Tokenizer",
                            info="Choose a tokenizer to analyze"
                        )
                        sample_dropdown = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts",
                            info="Select a sample or enter custom text"
                        )
                        input_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...\nEnter Arabic text here...",
                            label="Input Text",
                            rtl=True
                        )
                        analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
                    with gr.Column(scale=2):
                        info_output = gr.HTML(label="Tokenizer Information")
                        metrics_output = gr.HTML(label="Evaluation Metrics")
                        tokens_output = gr.HTML(label="Token Visualization")
                        decoded_output = gr.HTML(label="Decoded Output")

                sample_dropdown.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[sample_dropdown],
                    outputs=[input_text]
                )
                analyze_btn.click(
                    analyze_single_tokenizer,
                    inputs=[tokenizer_dropdown, input_text],
                    outputs=[info_output, metrics_output, tokens_output, decoded_output]
                )

            # ===== TAB 2: Comparison Mode =====
            with gr.TabItem("⚖️ Compare Tokenizers", id="compare"):
                with gr.Row():
                    with gr.Column(scale=1):
                        compare_tokenizers_select = gr.CheckboxGroup(
                            choices=available_tokenizers,
                            value=available_tokenizers[:5],  # slicing already copes with fewer than 5 choices
                            label="Select Tokenizers to Compare",
                            info="Choose 2 or more tokenizers"
                        )
                        compare_sample = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts"
                        )
                        compare_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...",
                            label="Input Text",
                            rtl=True
                        )
                        compare_btn = gr.Button("⚖️ Compare", variant="primary", size="lg")
                    with gr.Column(scale=2):
                        comparison_output = gr.HTML(label="Comparison Results")

                compare_sample.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[compare_sample],
                    outputs=[compare_text]
                )
                compare_btn.click(
                    compare_tokenizers,
                    inputs=[compare_tokenizers_select, compare_text],
                    outputs=[comparison_output]
                )

            # ===== TAB 3: Leaderboard =====
            with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
                gr.Markdown("""
                ## 🏆 Arabic Tokenizer Leaderboard

                All tokenizers are evaluated on **all 8 Arabic datasets** from the HuggingFace Hub (~36,000 samples in total).
                """)
                with gr.Row():
                    status_output = gr.Markdown("⏳ Loading cached results...")
                    re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")

                gr.Markdown("### 📊 Leaderboard Results")
                leaderboard_output = gr.HTML()
                gr.Markdown("### 📈 Per-Dataset Breakdown")
                per_dataset_output = gr.HTML()

                re_evaluate_btn.click(
                    fn=run_leaderboard_evaluation,
                    inputs=[],
                    outputs=[leaderboard_output, per_dataset_output, status_output]
                )

                gr.Markdown("""
                ---
                ### 📖 Evaluation Datasets

                | Dataset | Category | Samples |
                |---------|----------|---------|
                | ArabicMMLU | MSA Benchmark | 5,000 |
                | ASTD | Egyptian Dialect | 5,000 |
                | ATHAR | Classical Arabic | 5,000 |
                | ARCD | QA Dataset | 1,395 |
                | Ashaar | Poetry | 5,000 |
                | Hadith | Religious | 5,000 |
                | Arabic Sentiment | Social Media | 5,000 |
                | SANAD | News | 5,000 |
                """)

            # ===== TAB 4: Metrics Reference =====
            with gr.TabItem("📖 Metrics Guide", id="guide"):
                gr.Markdown("""
                ## Tokenization Evaluation Metrics Guide

                ### Efficiency Metrics

                | Metric | Description | Ideal Value | Why It Matters |
                |--------|-------------|-------------|----------------|
                | **Fertility** | Tokens per word | 1.0 | Lower fertility = fewer tokens = faster inference & lower cost |
                | **Compression Ratio** | Bytes per token | Higher is better | Better compression = more efficient encoding |
                | **Chars/Token** | Characters per token | Higher is better | More characters per token = better vocabulary utilization |
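
                For intuition, the efficiency metrics are simple ratios over a text. A minimal sketch, assuming a HuggingFace-style tokenizer and a plain whitespace word split (the app's real metric code lives in `analysis.py` and may differ):

                ```python
                def efficiency_metrics(tokenizer, text: str) -> dict:
                    # Illustrative only: real pipelines may normalize the text first.
                    words = text.split()
                    token_ids = tokenizer.encode(text, add_special_tokens=False)
                    n_tokens = max(len(token_ids), 1)
                    return {
                        "fertility": len(token_ids) / max(len(words), 1),           # tokens per word
                        "compression_ratio": len(text.encode("utf-8")) / n_tokens,  # bytes per token
                        "chars_per_token": len(text) / n_tokens,
                    }
                ```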

                ### Coverage Metrics

                | Metric | Description | Ideal Value | Why It Matters |
                |--------|-------------|-------------|----------------|
                | **OOV Rate** | Out-of-vocabulary percentage | 0% | Lower OOV = better vocabulary coverage |
                | **STRR** | Single Token Retention Rate | Higher is better | More words preserved as single tokens = better semantic boundaries |
                | **Continued Words Ratio** | Words split into multiple tokens | Lower is better | Fewer splits = better word boundary preservation |
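
                STRR is the fraction of words that survive tokenization as exactly one token; the Continued Words Ratio is its complement. A minimal sketch under the same assumptions as above (encoding words in isolation is an approximation, since BPE tokenizers can split a word differently mid-sentence):

                ```python
                def strr(tokenizer, text: str) -> float:
                    # Share of whitespace-separated words kept as a single token.
                    words = text.split()
                    if not words:
                        return 0.0
                    single = sum(
                        1 for w in words
                        if len(tokenizer.encode(w, add_special_tokens=False)) == 1
                    )
                    return single / len(words)
                ```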

                ### Arabic-Specific Metrics

                | Metric | Description | Why It Matters |
                |--------|-------------|----------------|
                | **Arabic Fertility** | Tokens per Arabic word | Arabic-specific efficiency measure |
                | **Diacritic Preservation** | Whether tashkeel is preserved | Important for religious & educational texts |

                ### Scoring Formula (Leaderboard)

                ```
                Score = [(Fertility Score × 0.45) + (Compression Score × 0.35) + (UNK Score × 0.20)] × 100
                ```

                Where each component score is clamped to the range 0-1:
                - **Fertility Score** = 2.0 / fertility (lower fertility means a higher score)
                - **Compression Score** = compression / 6
                - **UNK Score** = 1 - (unk_ratio × 20) (fewer unknown tokens mean a higher score)
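
                As a worked example with hypothetical numbers, a tokenizer with fertility 1.6, compression 4.2 bytes/token, and unk_ratio 0.001 would score:

                ```python
                fertility_score = min(2.0 / 1.6, 1.0)           # 1.25, capped to 1.0
                compression_score = min(4.2 / 6, 1.0)           # 0.70
                unk_score = min(max(1 - 0.001 * 20, 0.0), 1.0)  # 0.98
                score = (fertility_score * 0.45
                         + compression_score * 0.35
                         + unk_score * 0.20) * 100              # 89.1
                ```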

                ### Research Background

                These metrics are based on recent research, including:

                - *"A Comprehensive Analysis of Various Tokenizers for Arabic LLMs"* (2024)
                - *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
                - *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
                - *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
                """)

            # ===== TAB 5: Submit Tokenizer =====
            with gr.TabItem("🚀 Submit", id="submit"):
                gr.Markdown("""
                ## 🚀 Submit Your Tokenizer

                Evaluate any HuggingFace tokenizer on **all 8 Arabic datasets** and see how it compares.
                """)
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Model Information")
                        submit_model_id = gr.Textbox(
                            label="HuggingFace Model ID *",
                            placeholder="e.g., google/gemma-2-9b",
                            info="The model ID from HuggingFace Hub"
                        )
                        submit_model_name = gr.Textbox(
                            label="Display Name (optional)",
                            placeholder="e.g., My Custom Tokenizer",
                            info="Leave empty to use model name"
                        )
                        submit_organization = gr.Textbox(
                            label="Organization (optional)",
                            placeholder="e.g., My Organization",
                            info="Leave empty to auto-detect"
                        )
                        submit_model_type = gr.Dropdown(
                            choices=[
                                "Arabic LLM",
                                "Arabic BERT",
                                "Arabic Tokenizer",
                                "Multilingual LLM",
                                "Custom"
                            ],
                            value="Custom",
                            label="Model Type"
                        )
                        submit_btn = gr.Button("🚀 Evaluate Tokenizer", variant="primary", size="lg")
                        submit_status = gr.Markdown("")
                    with gr.Column(scale=2):
                        gr.Markdown("### Evaluation Results")
                        submit_results = gr.HTML()

                submit_btn.click(
                    fn=evaluate_submitted_tokenizer,
                    inputs=[submit_model_id, submit_model_name, submit_organization, submit_model_type],
                    outputs=[submit_results, submit_status]
                )

                gr.Markdown("""
                ---
                ### 📋 Submission Guidelines

                - **Model ID**: Must be a valid HuggingFace model ID (e.g., `organization/model-name`)
                - **Tokenizer**: The model must have a tokenizer that can be loaded with `AutoTokenizer` (see the sketch below)
                - **Public Models**: Only public models on the HuggingFace Hub are supported
                - **Evaluation**: Your tokenizer will be evaluated on all 8 Arabic datasets (~36,000 samples)
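
                Before submitting, you can sanity-check that your tokenizer loads; a minimal sketch (replace the placeholder model ID with your own):

                ```python
                from transformers import AutoTokenizer

                # If this raises, the Arena will not be able to load the tokenizer either.
                tok = AutoTokenizer.from_pretrained("organization/model-name")
                print(tok.tokenize("اللغة العربية"))  # "the Arabic language"
                ```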

                ### 💡 Tips

                - Lower fertility scores indicate better Arabic tokenization efficiency
                - Compare your results with the leaderboard to see how your tokenizer ranks
                """)

            # ===== TAB 6: About =====
            with gr.TabItem("ℹ️ About", id="about"):
                about_html = generate_about_html(
                    tokenizers_by_type,
                    len(available_tokenizers)
                )
                gr.HTML(about_html)

        # Load cached leaderboard results on page load (fast)
        demo.load(
            fn=get_cached_leaderboard,
            inputs=[],
            outputs=[leaderboard_output, per_dataset_output, status_output]
        )

    return demo


# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()