Spaces:
Running
Running
| """ | |
| Arabic Tokenizer Arena Pro - Main Application | |
| ============================================== | |
| Advanced research & production platform for Arabic tokenization analysis | |
| Run with: python app.py | |
| """ | |
| import gradio as gr | |
| # Import modules | |
| from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS | |
| from styles import CUSTOM_CSS | |
| from tokenizer_manager import tokenizer_manager | |
| from analysis import analyze_single_tokenizer, compare_tokenizers | |
| from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer, get_cached_leaderboard | |
| from ui_components import generate_about_html | |
# ---------------------------------------------------------------------------
# Static UI text
#
# NOTE(review): the non-ASCII characters below were restored from mis-encoded
# text. The Arabic placeholder, the multiplication sign and the microscope /
# scales / hourglass / info / light-bulb icons are byte-exact recoveries; the
# remaining icons are best-fit choices - confirm against the intended design.
# ---------------------------------------------------------------------------

_HEADER_HTML = """
<div class="header-section">
<h1>🏟️ Arabic Tokenizer Arena Pro</h1>
<p>Advanced research & production platform for Arabic tokenization analysis</p>
</div>
"""

_LEADERBOARD_INTRO_MD = """
## 🏆 Arabic Tokenizer Leaderboard

All tokenizers evaluated on **all 8 Arabic datasets** from HuggingFace (~36,000+ samples total).
"""

_LEADERBOARD_DATASETS_MD = """
---

### 📚 Evaluation Datasets

| Dataset | Category | Samples |
|---------|----------|---------|
| ArabicMMLU | MSA Benchmark | 5,000 |
| ASTD | Egyptian Dialect | 5,000 |
| ATHAR | Classical Arabic | 5,000 |
| ARCD | QA Dataset | 1,395 |
| Ashaar | Poetry | 5,000 |
| Hadith | Religious | 5,000 |
| Arabic Sentiment | Social Media | 5,000 |
| SANAD | News | 5,000 |
"""

# The weighted sum is parenthesised as a whole so that "× 100" visibly scales
# the full score rather than only the last term.
_METRICS_GUIDE_MD = """
## Tokenization Evaluation Metrics Guide

### Efficiency Metrics

| Metric | Description | Ideal Value | Why It Matters |
|--------|-------------|-------------|----------------|
| **Fertility** | Tokens per word | 1.0 | Lower fertility = fewer tokens = faster inference & lower cost |
| **Compression Ratio** | Bytes per token | Higher is better | Better compression = more efficient encoding |
| **Chars/Token** | Characters per token | Higher is better | More characters per token = better vocabulary utilization |

### Coverage Metrics

| Metric | Description | Ideal Value | Why It Matters |
|--------|-------------|-------------|----------------|
| **OOV Rate** | Out-of-vocabulary percentage | 0% | Lower OOV = better vocabulary coverage |
| **STRR** | Single Token Retention Rate | Higher is better | More words preserved as single tokens = better semantic boundaries |
| **Continued Words Ratio** | Words split into multiple tokens | Lower is better | Fewer splits = better word boundary preservation |

### Arabic-Specific Metrics

| Metric | Description | Why It Matters |
|--------|-------------|----------------|
| **Arabic Fertility** | Tokens per Arabic word | Arabic-specific efficiency measure |
| **Diacritic Preservation** | Whether tashkeel is preserved | Important for religious & educational texts |

### Scoring Formula (Leaderboard)

```
Score = ((Fertility Score × 0.45) + (Compression Score × 0.35) + (UNK Score × 0.20)) × 100
```

Where:

- **Fertility Score** = 2.0 / fertility (capped 0-1, inverted - lower fertility = higher score)
- **Compression Score** = compression / 6 (capped 0-1)
- **UNK Score** = 1 - (unk_ratio × 20) (capped 0-1, inverted)

### Research Background

These metrics are based on recent research including:

- *"A Comprehensive Analysis of Various Tokenizers for Arabic LLMs"* (2024)
- *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
- *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
- *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
"""

_SUBMIT_INTRO_MD = """
## 🚀 Submit Your Tokenizer

Evaluate any HuggingFace tokenizer on **all 8 Arabic datasets** and see how it compares.
"""

_SUBMIT_GUIDELINES_MD = """
---

### 📋 Submission Guidelines

- **Model ID**: Must be a valid HuggingFace model ID (e.g., `organization/model-name`)
- **Tokenizer**: The model must have a tokenizer that can be loaded with `AutoTokenizer`
- **Public Models**: Only public models on HuggingFace Hub are supported
- **Evaluation**: Your tokenizer will be evaluated on all 8 Arabic datasets (~36,000+ samples)

### 💡 Tips

- Lower fertility scores indicate better Arabic tokenization efficiency
- Compare your results with the leaderboard to see how your tokenizer ranks
"""


def _sample_text(sample_name):
    """Return the text stored in SAMPLE_TEXTS for ``sample_name``, or "" if absent."""
    return SAMPLE_TEXTS.get(sample_name, "")


def _build_single_analysis_tab(available_tokenizers):
    """Build tab 1: analyse one tokenizer on a sample or custom text."""
    with gr.TabItem("🔬 Single Analysis", id="single"):
        with gr.Row():
            with gr.Column(scale=1):
                tokenizer_dropdown = gr.Dropdown(
                    choices=available_tokenizers,
                    # Guard against an empty choice list.
                    value=available_tokenizers[0] if available_tokenizers else None,
                    label="Select Tokenizer",
                    info="Choose a tokenizer to analyze",
                )
                sample_dropdown = gr.Dropdown(
                    choices=list(SAMPLE_TEXTS.keys()),
                    label="Sample Texts",
                    info="Select a sample or enter custom text",
                )
                input_text = gr.Textbox(
                    lines=4,
                    placeholder="اكتب النص العربي هنا...\nEnter Arabic text here...",
                    label="Input Text",
                    rtl=True,
                )
                analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
            with gr.Column(scale=2):
                info_output = gr.HTML(label="Tokenizer Information")
                metrics_output = gr.HTML(label="Evaluation Metrics")
                tokens_output = gr.HTML(label="Token Visualization")
                decoded_output = gr.HTML(label="Decoded Output")

        # Picking a sample copies its text into the input box.
        sample_dropdown.change(
            _sample_text,
            inputs=[sample_dropdown],
            outputs=[input_text],
        )
        analyze_btn.click(
            analyze_single_tokenizer,
            inputs=[tokenizer_dropdown, input_text],
            outputs=[info_output, metrics_output, tokens_output, decoded_output],
        )


def _build_compare_tab(available_tokenizers):
    """Build tab 2: compare several tokenizers on the same text."""
    with gr.TabItem("⚖️ Compare Tokenizers", id="compare"):
        with gr.Row():
            with gr.Column(scale=1):
                compare_tokenizers_select = gr.CheckboxGroup(
                    choices=available_tokenizers,
                    # Pre-select at most five; slicing already copes with
                    # shorter lists, so no explicit length check is needed.
                    value=available_tokenizers[:5],
                    label="Select Tokenizers to Compare",
                    info="Choose 2 or more tokenizers",
                )
                compare_sample = gr.Dropdown(
                    choices=list(SAMPLE_TEXTS.keys()),
                    label="Sample Texts",
                )
                compare_text = gr.Textbox(
                    lines=4,
                    placeholder="اكتب النص العربي هنا...",
                    label="Input Text",
                    rtl=True,
                )
                compare_btn = gr.Button("⚖️ Compare", variant="primary", size="lg")
            with gr.Column(scale=2):
                comparison_output = gr.HTML(label="Comparison Results")

        compare_sample.change(
            _sample_text,
            inputs=[compare_sample],
            outputs=[compare_text],
        )
        compare_btn.click(
            compare_tokenizers,
            inputs=[compare_tokenizers_select, compare_text],
            outputs=[comparison_output],
        )


def _build_leaderboard_tab():
    """Build tab 3 and return its output components.

    Returns:
        The list ``[leaderboard_output, per_dataset_output, status_output]``,
        in the order used as ``outputs`` for both the re-evaluate button and
        the page-load handler registered by the caller.
    """
    with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
        gr.Markdown(_LEADERBOARD_INTRO_MD)
        with gr.Row():
            status_output = gr.Markdown("⏳ Loading cached results...")
            re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")
        gr.Markdown("### 📊 Leaderboard Results")
        leaderboard_output = gr.HTML()
        gr.Markdown("### 📈 Per-Dataset Breakdown")
        per_dataset_output = gr.HTML()

        outputs = [leaderboard_output, per_dataset_output, status_output]
        re_evaluate_btn.click(
            fn=run_leaderboard_evaluation,
            inputs=[],
            outputs=outputs,
        )
        gr.Markdown(_LEADERBOARD_DATASETS_MD)
    return outputs


def _build_metrics_guide_tab():
    """Build tab 4: static reference text describing the metrics."""
    with gr.TabItem("📖 Metrics Guide", id="guide"):
        gr.Markdown(_METRICS_GUIDE_MD)


def _build_submit_tab():
    """Build tab 5: form that evaluates a user-supplied HuggingFace tokenizer."""
    with gr.TabItem("🚀 Submit", id="submit"):
        gr.Markdown(_SUBMIT_INTRO_MD)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Model Information")
                submit_model_id = gr.Textbox(
                    label="HuggingFace Model ID *",
                    placeholder="e.g., google/gemma-2-9b",
                    info="The model ID from HuggingFace Hub",
                )
                submit_model_name = gr.Textbox(
                    label="Display Name (optional)",
                    placeholder="e.g., My Custom Tokenizer",
                    info="Leave empty to use model name",
                )
                submit_organization = gr.Textbox(
                    label="Organization (optional)",
                    placeholder="e.g., My Organization",
                    info="Leave empty to auto-detect",
                )
                submit_model_type = gr.Dropdown(
                    choices=[
                        "Arabic LLM",
                        "Arabic BERT",
                        "Arabic Tokenizer",
                        "Multilingual LLM",
                        "Custom",
                    ],
                    value="Custom",
                    label="Model Type",
                )
                submit_btn = gr.Button("🚀 Evaluate Tokenizer", variant="primary", size="lg")
                submit_status = gr.Markdown("")
            with gr.Column(scale=2):
                gr.Markdown("### Evaluation Results")
                submit_results = gr.HTML()

        submit_btn.click(
            fn=evaluate_submitted_tokenizer,
            inputs=[submit_model_id, submit_model_name, submit_organization, submit_model_type],
            outputs=[submit_results, submit_status],
        )
        gr.Markdown(_SUBMIT_GUIDELINES_MD)


def _build_about_tab(tokenizers_by_type, tokenizer_count):
    """Build tab 6: the "About" page rendered by ``generate_about_html``."""
    with gr.TabItem("ℹ️ About", id="about"):
        gr.HTML(generate_about_html(tokenizers_by_type, tokenizer_count))


def create_interface():
    """Create the Gradio interface.

    Builds the six application tabs (single analysis, comparison, leaderboard,
    metrics guide, submission, about) inside one ``gr.Blocks`` app and
    registers a page-load handler that fills the leaderboard from
    ``get_cached_leaderboard``.

    Returns:
        The assembled ``gr.Blocks`` instance; the caller is responsible for
        launching it.
    """
    available_tokenizers = tokenizer_manager.get_tokenizer_choices()
    tokenizers_by_type = tokenizer_manager.get_tokenizers_by_type()

    with gr.Blocks(
        css=CUSTOM_CSS,
        title="Arabic Tokenizer Arena Pro",
        theme=gr.themes.Base(
            primary_hue="green",
            secondary_hue="blue",
            neutral_hue="slate",
            font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"],
        ),
    ) as demo:
        # Header
        gr.HTML(_HEADER_HTML)

        # Tabs are created in display order; each builder attaches its
        # components to the currently open Gradio context.
        with gr.Tabs():
            _build_single_analysis_tab(available_tokenizers)
            _build_compare_tab(available_tokenizers)
            leaderboard_outputs = _build_leaderboard_tab()
            _build_metrics_guide_tab()
            _build_submit_tab()
            _build_about_tab(tokenizers_by_type, len(available_tokenizers))

        # Load cached leaderboard results on page load (fast)
        demo.load(
            fn=get_cached_leaderboard,
            inputs=[],
            outputs=leaderboard_outputs,
        )

    return demo
# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    # Script entry point: build the Blocks app, keep it bound to the
    # module-level name ``demo``, then start the server with default settings.
    demo = create_interface()
    demo.launch()