# Source: Hugging Face Space "HeshamHaroon/Arabic_Tokenizer" (status: Running)
# File size: 13,833 Bytes

"""
Arabic Tokenizer Arena Pro - Main Application
==============================================
Advanced research & production platform for Arabic tokenization analysis

Run with: python app.py
"""

import gradio as gr

# Import modules
from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS
from styles import CUSTOM_CSS
from tokenizer_manager import tokenizer_manager
from analysis import analyze_single_tokenizer, compare_tokenizers
from leaderboard import run_leaderboard_evaluation, evaluate_submitted_tokenizer, get_cached_leaderboard
from ui_components import generate_about_html


def create_interface() -> gr.Blocks:
    """Build the Gradio UI for the Arabic Tokenizer Arena.

    The layout is a header followed by six tabs:

    1. Single Analysis   - run ``analyze_single_tokenizer`` on one tokenizer.
    2. Compare Tokenizers - run ``compare_tokenizers`` on a selection.
    3. Leaderboard       - show results and allow a full re-evaluation.
    4. Metrics Guide     - static reference text for the reported metrics.
    5. Submit            - evaluate a user-supplied HuggingFace model ID.
    6. About             - HTML produced by ``generate_about_html``.

    A page-load event fills the leaderboard tab from
    ``get_cached_leaderboard``.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """

    available_tokenizers = tokenizer_manager.get_tokenizer_choices()
    tokenizers_by_type = tokenizer_manager.get_tokenizers_by_type()

    with gr.Blocks(
        css=CUSTOM_CSS,
        title="Arabic Tokenizer Arena Pro",
        theme=gr.themes.Base(
            primary_hue="green",
            secondary_hue="blue",
            neutral_hue="slate",
            font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
        )
    ) as demo:

        # Header
        gr.HTML("""
        <div class="header-section">
            <h1>🏟️ Arabic Tokenizer Arena Pro</h1>
            <p>Advanced research & production platform for Arabic tokenization analysis</p>
        </div>
        """)

        with gr.Tabs():
            # ===== TAB 1: Single Tokenizer Analysis =====
            with gr.TabItem("🔬 Single Analysis", id="single"):
                with gr.Row():
                    with gr.Column(scale=1):
                        tokenizer_dropdown = gr.Dropdown(
                            choices=available_tokenizers,
                            # Guard against an empty registry: no default then.
                            value=available_tokenizers[0] if available_tokenizers else None,
                            label="Select Tokenizer",
                            info="Choose a tokenizer to analyze"
                        )

                        sample_dropdown = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts",
                            info="Select a sample or enter custom text"
                        )

                        input_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...\nEnter Arabic text here...",
                            label="Input Text",
                            rtl=True
                        )

                        analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")

                    with gr.Column(scale=2):
                        info_output = gr.HTML(label="Tokenizer Information")

                metrics_output = gr.HTML(label="Evaluation Metrics")
                tokens_output = gr.HTML(label="Token Visualization")
                decoded_output = gr.HTML(label="Decoded Output")

                # Picking a sample copies its text into the input box; an
                # unknown or cleared selection yields an empty string.
                sample_dropdown.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[sample_dropdown],
                    outputs=[input_text]
                )

                analyze_btn.click(
                    analyze_single_tokenizer,
                    inputs=[tokenizer_dropdown, input_text],
                    outputs=[info_output, metrics_output, tokens_output, decoded_output]
                )

            # ===== TAB 2: Comparison Mode =====
            with gr.TabItem("⚖️ Compare Tokenizers", id="compare"):
                with gr.Row():
                    with gr.Column(scale=1):
                        compare_tokenizers_select = gr.CheckboxGroup(
                            choices=available_tokenizers,
                            # Pre-select at most the first five; slicing is
                            # already safe when fewer are available.
                            value=available_tokenizers[:5],
                            label="Select Tokenizers to Compare",
                            info="Choose 2 or more tokenizers"
                        )

                        compare_sample = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts"
                        )

                        compare_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...",
                            label="Input Text",
                            rtl=True
                        )

                        compare_btn = gr.Button("⚖️ Compare", variant="primary", size="lg")

                    with gr.Column(scale=2):
                        comparison_output = gr.HTML(label="Comparison Results")

                # Same sample-to-textbox wiring as on the single-analysis tab.
                compare_sample.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[compare_sample],
                    outputs=[compare_text]
                )

                compare_btn.click(
                    compare_tokenizers,
                    inputs=[compare_tokenizers_select, compare_text],
                    outputs=[comparison_output]
                )

            # ===== TAB 3: LEADERBOARD =====
            with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
                gr.Markdown("""
                ## 🏆 Arabic Tokenizer Leaderboard

                All tokenizers evaluated on **all 8 Arabic datasets** from HuggingFace (~36,000+ samples total).
                """)

                with gr.Row():
                    status_output = gr.Markdown("⏳ Loading cached results...")
                    re_evaluate_btn = gr.Button("🔄 Re-evaluate All", variant="secondary", size="sm")

                gr.Markdown("### 📊 Leaderboard Results")
                leaderboard_output = gr.HTML()

                gr.Markdown("### 📈 Per-Dataset Breakdown")
                per_dataset_output = gr.HTML()

                # Writes to the same three outputs as the page-load event
                # registered at the bottom of this function.
                re_evaluate_btn.click(
                    fn=run_leaderboard_evaluation,
                    inputs=[],
                    outputs=[leaderboard_output, per_dataset_output, status_output]
                )

                gr.Markdown("""
                ---
                ### 📖 Evaluation Datasets

                | Dataset | Category | Samples |
                |---------|----------|---------|
                | ArabicMMLU | MSA Benchmark | 5,000 |
                | ASTD | Egyptian Dialect | 5,000 |
                | ATHAR | Classical Arabic | 5,000 |
                | ARCD | QA Dataset | 1,395 |
                | Ashaar | Poetry | 5,000 |
                | Hadith | Religious | 5,000 |
                | Arabic Sentiment | Social Media | 5,000 |
                | SANAD | News | 5,000 |
                """)

            # ===== TAB 4: Metrics Reference =====
            with gr.TabItem("📖 Metrics Guide", id="guide"):
                # NOTE: the score formula below wraps the weighted sum in
                # parentheses so that "× 100" scales the whole sum, not just
                # the UNK term.
                gr.Markdown("""
                ## Tokenization Evaluation Metrics Guide

                ### Efficiency Metrics

                | Metric | Description | Ideal Value | Why It Matters |
                |--------|-------------|-------------|----------------|
                | **Fertility** | Tokens per word | 1.0 | Lower fertility = fewer tokens = faster inference & lower cost |
                | **Compression Ratio** | Bytes per token | Higher is better | Better compression = more efficient encoding |
                | **Chars/Token** | Characters per token | Higher is better | More characters per token = better vocabulary utilization |

                ### Coverage Metrics

                | Metric | Description | Ideal Value | Why It Matters |
                |--------|-------------|-------------|----------------|
                | **OOV Rate** | Out-of-vocabulary percentage | 0% | Lower OOV = better vocabulary coverage |
                | **STRR** | Single Token Retention Rate | Higher is better | More words preserved as single tokens = better semantic boundaries |
                | **Continued Words Ratio** | Words split into multiple tokens | Lower is better | Fewer splits = better word boundary preservation |

                ### Arabic-Specific Metrics

                | Metric | Description | Why It Matters |
                |--------|-------------|----------------|
                | **Arabic Fertility** | Tokens per Arabic word | Arabic-specific efficiency measure |
                | **Diacritic Preservation** | Whether tashkeel is preserved | Important for religious & educational texts |

                ### Scoring Formula (Leaderboard)

                ```
                Score = ((Fertility Score × 0.45) + (Compression Score × 0.35) + (UNK Score × 0.20)) × 100
                ```

                Where:
                - **Fertility Score** = 2.0 / fertility (capped 0-1, inverted - lower fertility = higher score)
                - **Compression Score** = compression / 6 (capped 0-1)
                - **UNK Score** = 1 - (unk_ratio × 20) (capped 0-1, inverted)

                ### Research Background

                These metrics are based on recent research including:
                - *"A Comprehensive Analysis of Various Tokenizers for Arabic LLMs"* (2024)
                - *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
                - *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
                - *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
                """)

            # ===== TAB 5: Submit Tokenizer =====
            with gr.TabItem("🚀 Submit", id="submit"):
                gr.Markdown("""
                ## 🚀 Submit Your Tokenizer

                Evaluate any HuggingFace tokenizer on **all 8 Arabic datasets** and see how it compares.
                """)

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Model Information")

                        submit_model_id = gr.Textbox(
                            label="HuggingFace Model ID *",
                            placeholder="e.g., google/gemma-2-9b",
                            info="The model ID from HuggingFace Hub"
                        )

                        submit_model_name = gr.Textbox(
                            label="Display Name (optional)",
                            placeholder="e.g., My Custom Tokenizer",
                            info="Leave empty to use model name"
                        )

                        submit_organization = gr.Textbox(
                            label="Organization (optional)",
                            placeholder="e.g., My Organization",
                            info="Leave empty to auto-detect"
                        )

                        submit_model_type = gr.Dropdown(
                            choices=[
                                "Arabic LLM",
                                "Arabic BERT",
                                "Arabic Tokenizer",
                                "Multilingual LLM",
                                "Custom"
                            ],
                            value="Custom",
                            label="Model Type"
                        )

                        submit_btn = gr.Button("🚀 Evaluate Tokenizer", variant="primary", size="lg")

                        submit_status = gr.Markdown("")

                    with gr.Column(scale=2):
                        gr.Markdown("### Evaluation Results")
                        submit_results = gr.HTML()

                submit_btn.click(
                    fn=evaluate_submitted_tokenizer,
                    inputs=[submit_model_id, submit_model_name, submit_organization, submit_model_type],
                    outputs=[submit_results, submit_status]
                )

                gr.Markdown("""
                ---
                ### 📋 Submission Guidelines

                - **Model ID**: Must be a valid HuggingFace model ID (e.g., `organization/model-name`)
                - **Tokenizer**: The model must have a tokenizer that can be loaded with `AutoTokenizer`
                - **Public Models**: Only public models on HuggingFace Hub are supported
                - **Evaluation**: Your tokenizer will be evaluated on all 8 Arabic datasets (~36,000+ samples)

                ### 💡 Tips

                - Lower fertility scores indicate better Arabic tokenization efficiency
                - Compare your results with the leaderboard to see how your tokenizer ranks
                """)

            # ===== TAB 6: About =====
            with gr.TabItem("ℹ️ About", id="about"):
                about_html = generate_about_html(
                    tokenizers_by_type,
                    len(available_tokenizers)
                )
                gr.HTML(about_html)

        # Load cached leaderboard results on page load (fast)
        demo.load(
            fn=get_cached_leaderboard,
            inputs=[],
            outputs=[leaderboard_output, per_dataset_output, status_output]
        )

        return demo


# ============================================================================
# MAIN
# ============================================================================

# Script entry point: build the interface and start the Gradio server
# (the module docstring documents running this file as `python app.py`).
if __name__ == "__main__":
    # NOTE(review): the name `demo` is kept as-is; hosting/reload tooling may
    # look the app up by this name - confirm before renaming.
    demo = create_interface()
    demo.launch()