HeshamHaroon committed on
Commit
59baef6
·
verified ·
1 Parent(s): 06d6710

Update app.py

Files changed (1)
  1. app.py +923 -727
app.py CHANGED
@@ -4,10 +4,13 @@ Arabic Tokenizer Arena Pro - Advanced Arabic Tokenization Analysis Platform
4
  A comprehensive research and production-grade tool for evaluating Arabic tokenizers
5
  across multiple dimensions: efficiency, coverage, morphological awareness, and more.
6
 
 
 
7
  Supports:
8
  - Arabic-specific tokenizers (Aranizer, AraBERT, CAMeLBERT, MARBERT, etc.)
9
  - Major LLM tokenizers (Jais, AceGPT, Falcon-Arabic, ALLaM, Qwen, Llama, Mistral, GPT)
10
  - Comprehensive evaluation metrics based on latest research
 
11
  """
12
 
13
  import gradio as gr
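For orientation, everything the app does downstream reduces to one round trip through a Hugging Face tokenizer; a minimal sketch (the model ID is one of the ungated registry entries below, the sample sentence is illustrative):

```python
# Minimal sketch of the per-analysis round trip; xlm-roberta-base appears in
# the registry below and needs no authentication.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
text = "تعلم الآلة يغير العالم"  # illustrative Arabic sentence
ids = tok.encode(text, add_special_tokens=False)
tokens = tok.convert_ids_to_tokens(ids)
print(len(ids), tokens)  # the token count feeds every efficiency metric
print(tok.decode(ids))   # the round trip backs the diacritics-preservation check
```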
@@ -18,6 +21,8 @@ import unicodedata
18
  from typing import Dict, List, Tuple, Optional, Any
19
  from dataclasses import dataclass, field
20
  from enum import Enum
 
 
21
  import os
22
 
23
  # Hugging Face authentication
@@ -30,6 +35,9 @@ if HF_TOKEN:
30
  from transformers import AutoTokenizer, logging
31
  logging.set_verbosity_error()
32
 
 
 
 
33
  # ============================================================================
34
  # DATA CLASSES AND ENUMS
35
  # ============================================================================
@@ -204,30 +212,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
204
  dialect_support=["MSA"],
205
  special_features=["100K vocabulary", "MSA focused"]
206
  ),
207
- "asafaya/bert-base-arabic": TokenizerInfo(
208
- name="Arabic BERT (Safaya)",
209
- model_id="asafaya/bert-base-arabic",
210
- type=TokenizerType.ENCODER_ONLY,
211
- algorithm=TokenizerAlgorithm.WORDPIECE,
212
- vocab_size=32000,
213
- description="Arabic BERT trained on MSA and dialectal Arabic",
214
- organization="Safaya",
215
- arabic_support="Native",
216
- dialect_support=["MSA", "DA"],
217
- special_features=["TPU trained", "Dialect support"]
218
- ),
219
- "UBC-NLP/AraT5-base": TokenizerInfo(
220
- name="AraT5 Base",
221
- model_id="UBC-NLP/AraT5-base",
222
- type=TokenizerType.ENCODER_ONLY,
223
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
224
- vocab_size=110000,
225
- description="Arabic text-to-text transformer for generation tasks",
226
- organization="UBC NLP",
227
- arabic_support="Native",
228
- dialect_support=["MSA", "Tweet"],
229
- special_features=["Text-to-Text", "Generation optimized"]
230
- ),
231
 
232
  # ========== ARABIC-SPECIFIC TOKENIZERS ==========
233
  "riotu-lab/Aranizer-PBE-86k": TokenizerInfo(
@@ -254,30 +238,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
254
  dialect_support=["MSA"],
255
  special_features=["Low fertility", "SentencePiece", "86K vocab"]
256
  ),
257
- "riotu-lab/Aranizer-PBE-32k": TokenizerInfo(
258
- name="Aranizer PBE 32K",
259
- model_id="riotu-lab/Aranizer-PBE-32k",
260
- type=TokenizerType.ARABIC_SPECIFIC,
261
- algorithm=TokenizerAlgorithm.BPE,
262
- vocab_size=32000,
263
- description="Compact PBE tokenizer for Arabic",
264
- organization="RIOTU Lab",
265
- arabic_support="Native",
266
- dialect_support=["MSA"],
267
- special_features=["Compact", "LLM compatible"]
268
- ),
269
- "riotu-lab/Aranizer-SP-32k": TokenizerInfo(
270
- name="Aranizer SP 32K",
271
- model_id="riotu-lab/Aranizer-SP-32k",
272
- type=TokenizerType.ARABIC_SPECIFIC,
273
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
274
- vocab_size=32000,
275
- description="Compact SentencePiece tokenizer for Arabic",
276
- organization="RIOTU Lab",
277
- arabic_support="Native",
278
- dialect_support=["MSA"],
279
- special_features=["Compact", "Efficient"]
280
- ),
281
 
282
  # ========== ARABIC-SPECIFIC LLMs ==========
283
  "ALLaM-AI/ALLaM-7B-Instruct-preview": TokenizerInfo(
@@ -328,18 +288,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
328
  dialect_support=["MSA"],
329
  special_features=["LLaMA-based", "Cultural alignment", "RLHF", "Chat"]
330
  ),
331
- "FreedomIntelligence/AceGPT-7B-chat": TokenizerInfo(
332
- name="AceGPT 7B Chat",
333
- model_id="FreedomIntelligence/AceGPT-7B-chat",
334
- type=TokenizerType.ARABIC_LLM,
335
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
336
- vocab_size=32000,
337
- description="Smaller Arabic-enhanced LLaMA variant with chat",
338
- organization="Freedom Intelligence",
339
- arabic_support="Adapted",
340
- dialect_support=["MSA"],
341
- special_features=["LLaMA-based", "Efficient", "Chat"]
342
- ),
343
  "silma-ai/SILMA-9B-Instruct-v1.0": TokenizerInfo(
344
  name="SILMA 9B Instruct",
345
  model_id="silma-ai/SILMA-9B-Instruct-v1.0",
@@ -352,18 +300,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
352
  dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
353
  special_features=["Gemma-based", "SOTA 9B class", "Efficient"]
354
  ),
355
- "silma-ai/SILMA-Kashif-2B-Instruct-v1.0": TokenizerInfo(
356
- name="SILMA Kashif 2B (RAG)",
357
- model_id="silma-ai/SILMA-Kashif-2B-Instruct-v1.0",
358
- type=TokenizerType.ARABIC_LLM,
359
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
360
- vocab_size=256000,
361
- description="RAG-optimized Arabic model, excellent for context-based QA",
362
- organization="SILMA AI",
363
- arabic_support="Native",
364
- dialect_support=["MSA"],
365
- special_features=["RAG optimized", "12K context", "Compact"]
366
- ),
367
  "QCRI/Fanar-1-9B-Instruct": TokenizerInfo(
368
  name="Fanar 9B Instruct",
369
  model_id="QCRI/Fanar-1-9B-Instruct",
@@ -376,54 +312,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
376
  dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
377
  special_features=["Islamic RAG", "Cultural alignment", "Gemma-based"]
378
  ),
379
- "tiiuae/Falcon-Arabic-7B-Instruct": TokenizerInfo(
380
- name="Falcon Arabic 7B Instruct",
381
- model_id="tiiuae/Falcon-Arabic-7B-Instruct",
382
- type=TokenizerType.ARABIC_LLM,
383
- algorithm=TokenizerAlgorithm.BPE,
384
- vocab_size=97024,
385
- description="SOTA Arabic LLM from TII, outperforms models 4x its size",
386
- organization="Technology Innovation Institute",
387
- arabic_support="Native",
388
- dialect_support=["MSA", "Gulf", "Egyptian", "Levantine", "Maghrebi"],
389
- special_features=["Falcon3-based", "32K context", "DPO aligned"]
390
- ),
391
- "tiiuae/Falcon-Arabic-7B-Base": TokenizerInfo(
392
- name="Falcon Arabic 7B Base",
393
- model_id="tiiuae/Falcon-Arabic-7B-Base",
394
- type=TokenizerType.ARABIC_LLM,
395
- algorithm=TokenizerAlgorithm.BPE,
396
- vocab_size=97024,
397
- description="Base model of Falcon Arabic for fine-tuning",
398
- organization="Technology Innovation Institute",
399
- arabic_support="Native",
400
- dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
401
- special_features=["Falcon3-based", "Fine-tuning ready"]
402
- ),
403
- "CohereForAI/c4ai-command-r7b-arabic-02-2025": TokenizerInfo(
404
- name="Cohere Command R7B Arabic",
405
- model_id="CohereForAI/c4ai-command-r7b-arabic-02-2025",
406
- type=TokenizerType.ARABIC_LLM,
407
- algorithm=TokenizerAlgorithm.BPE,
408
- vocab_size=256000,
409
- description="Cohere's Arabic-optimized model for RAG and enterprise use",
410
- organization="Cohere",
411
- arabic_support="Native",
412
- dialect_support=["MSA"],
413
- special_features=["RAG optimized", "128K context", "Enterprise ready"]
414
- ),
415
- "stabilityai/ar-stablelm-2-chat": TokenizerInfo(
416
- name="Arabic StableLM 2 Chat",
417
- model_id="stabilityai/ar-stablelm-2-chat",
418
- type=TokenizerType.ARABIC_LLM,
419
- algorithm=TokenizerAlgorithm.BPE,
420
- vocab_size=100289,
421
- description="Stability AI's Arabic instruction-tuned 1.6B model",
422
- organization="Stability AI",
423
- arabic_support="Native",
424
- dialect_support=["MSA"],
425
- special_features=["Compact 1.6B", "Chat optimized", "Efficient"]
426
- ),
427
  "Navid-AI/Yehia-7B-preview": TokenizerInfo(
428
  name="Yehia 7B Preview",
429
  model_id="Navid-AI/Yehia-7B-preview",
@@ -450,30 +338,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
450
  dialect_support=["Darija", "MSA"],
451
  special_features=["Moroccan dialect", "Transliteration", "Cultural"]
452
  ),
453
- "MBZUAI-Paris/Atlas-Chat-2B": TokenizerInfo(
454
- name="Atlas-Chat 2B (Darija)",
455
- model_id="MBZUAI-Paris/Atlas-Chat-2B",
456
- type=TokenizerType.ARABIC_LLM,
457
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
458
- vocab_size=256000,
459
- description="Compact Moroccan Arabic model for edge deployment",
460
- organization="MBZUAI Paris",
461
- arabic_support="Native",
462
- dialect_support=["Darija", "MSA"],
463
- special_features=["Compact", "Moroccan dialect", "Edge-ready"]
464
- ),
465
- "MBZUAI-Paris/Atlas-Chat-27B": TokenizerInfo(
466
- name="Atlas-Chat 27B (Darija)",
467
- model_id="MBZUAI-Paris/Atlas-Chat-27B",
468
- type=TokenizerType.ARABIC_LLM,
469
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
470
- vocab_size=256000,
471
- description="Largest Moroccan Arabic model with best performance",
472
- organization="MBZUAI Paris",
473
- arabic_support="Native",
474
- dialect_support=["Darija", "MSA"],
475
- special_features=["27B params", "Moroccan dialect", "SOTA Darija"]
476
- ),
477
 
478
  # ========== MULTILINGUAL LLMs WITH ARABIC SUPPORT ==========
479
  "Qwen/Qwen2.5-7B": TokenizerInfo(
@@ -488,18 +352,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
488
  dialect_support=["MSA"],
489
  special_features=["152K vocab", "128K context", "30+ languages"]
490
  ),
491
- "Qwen/Qwen2.5-14B-Instruct": TokenizerInfo(
492
- name="Qwen 2.5 14B Instruct",
493
- model_id="Qwen/Qwen2.5-14B-Instruct",
494
- type=TokenizerType.MULTILINGUAL_LLM,
495
- algorithm=TokenizerAlgorithm.BPE,
496
- vocab_size=151936,
497
- description="Larger Qwen with enhanced Arabic capabilities",
498
- organization="Alibaba Qwen",
499
- arabic_support="Supported",
500
- dialect_support=["MSA"],
501
- special_features=["14B params", "Strong Arabic", "Instruct tuned"]
502
- ),
503
  "google/gemma-2-9b": TokenizerInfo(
504
  name="Gemma 2 9B",
505
  model_id="google/gemma-2-9b",
@@ -512,42 +364,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
512
  dialect_support=["MSA"],
513
  special_features=["256K vocab", "Efficient architecture"]
514
  ),
515
- "google/gemma-2-9b-it": TokenizerInfo(
516
- name="Gemma 2 9B Instruct",
517
- model_id="google/gemma-2-9b-it",
518
- type=TokenizerType.MULTILINGUAL_LLM,
519
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
520
- vocab_size=256000,
521
- description="Instruction-tuned Gemma with Arabic support",
522
- organization="Google",
523
- arabic_support="Supported",
524
- dialect_support=["MSA"],
525
- special_features=["Instruct tuned", "256K vocab"]
526
- ),
527
- "CohereForAI/aya-expanse-8b": TokenizerInfo(
528
- name="Aya Expanse 8B",
529
- model_id="CohereForAI/aya-expanse-8b",
530
- type=TokenizerType.MULTILINGUAL_LLM,
531
- algorithm=TokenizerAlgorithm.BPE,
532
- vocab_size=256000,
533
- description="Cohere's multilingual model with strong Arabic support",
534
- organization="Cohere",
535
- arabic_support="Supported",
536
- dialect_support=["MSA"],
537
- special_features=["23 languages", "Arabic optimized"]
538
- ),
539
- "CohereForAI/aya-expanse-32b": TokenizerInfo(
540
- name="Aya Expanse 32B",
541
- model_id="CohereForAI/aya-expanse-32b",
542
- type=TokenizerType.MULTILINGUAL_LLM,
543
- algorithm=TokenizerAlgorithm.BPE,
544
- vocab_size=256000,
545
- description="Large Aya model with enhanced multilingual capabilities",
546
- organization="Cohere",
547
- arabic_support="Supported",
548
- dialect_support=["MSA"],
549
- special_features=["32B params", "23 languages"]
550
- ),
551
  "mistralai/Mistral-7B-v0.3": TokenizerInfo(
552
  name="Mistral 7B v0.3",
553
  model_id="mistralai/Mistral-7B-v0.3",
@@ -572,30 +388,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
572
  dialect_support=["MSA"],
573
  special_features=["Tekken tokenizer", "131K vocab", "Multilingual optimized"]
574
  ),
575
- "microsoft/Phi-3.5-mini-instruct": TokenizerInfo(
576
- name="Phi-3.5 Mini Instruct",
577
- model_id="microsoft/Phi-3.5-mini-instruct",
578
- type=TokenizerType.MULTILINGUAL_LLM,
579
- algorithm=TokenizerAlgorithm.BPE,
580
- vocab_size=32064,
581
- description="Microsoft's compact multilingual model",
582
- organization="Microsoft",
583
- arabic_support="Limited",
584
- dialect_support=["MSA"],
585
- special_features=["Compact", "3.8B params"]
586
- ),
587
- "google/mt5-base": TokenizerInfo(
588
- name="mT5 Base",
589
- model_id="google/mt5-base",
590
- type=TokenizerType.MULTILINGUAL_LLM,
591
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
592
- vocab_size=250112,
593
- description="Multilingual T5 covering 101 languages",
594
- organization="Google",
595
- arabic_support="Supported",
596
- dialect_support=["MSA"],
597
- special_features=["250K vocab", "101 languages", "Seq2Seq"]
598
- ),
599
  "xlm-roberta-base": TokenizerInfo(
600
  name="XLM-RoBERTa Base",
601
  model_id="xlm-roberta-base",
@@ -620,8 +412,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
620
  dialect_support=["MSA"],
621
  special_features=["Baseline model", "104 languages"]
622
  ),
623
-
624
- # ========== FALCON FAMILY ==========
625
  "tiiuae/falcon-7b": TokenizerInfo(
626
  name="Falcon 7B",
627
  model_id="tiiuae/falcon-7b",
@@ -634,96 +424,109 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
634
  dialect_support=["MSA"],
635
  special_features=["65K vocab", "RefinedWeb trained"]
636
  ),
637
- "tiiuae/falcon-7b-instruct": TokenizerInfo(
638
- name="Falcon 7B Instruct",
639
- model_id="tiiuae/falcon-7b-instruct",
640
- type=TokenizerType.MULTILINGUAL_LLM,
641
- algorithm=TokenizerAlgorithm.BPE,
642
- vocab_size=65024,
643
- description="Instruction-tuned Falcon 7B",
644
- organization="Technology Innovation Institute",
645
- arabic_support="Limited",
646
- dialect_support=["MSA"],
647
- special_features=["Instruct tuned", "Chat ready"]
648
- ),
649
  }
650
 
651
- # Try to load gated/authenticated models
652
- GATED_MODELS = [
653
- ("meta-llama/Meta-Llama-3-8B", TokenizerInfo(
654
- name="Llama 3 8B",
655
- model_id="meta-llama/Meta-Llama-3-8B",
656
- type=TokenizerType.MULTILINGUAL_LLM,
657
- algorithm=TokenizerAlgorithm.BPE,
658
- vocab_size=128256,
659
- description="Meta's latest LLM with improved multilingual",
660
- organization="Meta AI",
661
- arabic_support="Limited",
662
- dialect_support=["MSA"],
663
- special_features=["128K vocab", "Improved tokenizer"]
664
- )),
665
- ("meta-llama/Llama-3.1-8B-Instruct", TokenizerInfo(
666
- name="Llama 3.1 8B Instruct",
667
- model_id="meta-llama/Llama-3.1-8B-Instruct",
668
- type=TokenizerType.MULTILINGUAL_LLM,
669
- algorithm=TokenizerAlgorithm.BPE,
670
- vocab_size=128256,
671
- description="Latest Llama with instruction tuning",
672
- organization="Meta AI",
673
- arabic_support="Limited",
674
- dialect_support=["MSA"],
675
- special_features=["128K context", "Tool use"]
676
- )),
677
- ("meta-llama/Llama-3.2-1B-Instruct", TokenizerInfo(
678
- name="Llama 3.2 1B Instruct",
679
- model_id="meta-llama/Llama-3.2-1B-Instruct",
680
- type=TokenizerType.MULTILINGUAL_LLM,
681
- algorithm=TokenizerAlgorithm.BPE,
682
- vocab_size=128256,
683
- description="Compact Llama for edge deployment",
684
- organization="Meta AI",
685
- arabic_support="Limited",
686
- dialect_support=["MSA"],
687
- special_features=["Compact 1B", "Edge ready"]
688
- )),
689
- ("meta-llama/Llama-3.3-70B-Instruct", TokenizerInfo(
690
- name="Llama 3.3 70B Instruct",
691
- model_id="meta-llama/Llama-3.3-70B-Instruct",
692
- type=TokenizerType.MULTILINGUAL_LLM,
693
- algorithm=TokenizerAlgorithm.BPE,
694
- vocab_size=128256,
695
- description="Large Llama with best Arabic among Llama family",
696
- organization="Meta AI",
697
- arabic_support="Supported",
698
- dialect_support=["MSA"],
699
- special_features=["70B params", "Best Llama Arabic"]
700
- )),
701
- ("meta-llama/Llama-2-7b-hf", TokenizerInfo(
702
- name="Llama 2 7B",
703
- model_id="meta-llama/Llama-2-7b-hf",
704
- type=TokenizerType.MULTILINGUAL_LLM,
705
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
706
- vocab_size=32000,
707
- description="Meta's Llama 2 base model",
708
- organization="Meta AI",
709
- arabic_support="Limited",
710
- dialect_support=["MSA"],
711
- special_features=["32K vocab", "Foundation model"]
712
- )),
713
- # Additional gated Arabic models
714
- ("CohereLabs/c4ai-command-a-03-2025", TokenizerInfo(
715
- name="Cohere Command A 111B",
716
- model_id="CohereLabs/c4ai-command-a-03-2025",
717
- type=TokenizerType.MULTILINGUAL_LLM,
718
- algorithm=TokenizerAlgorithm.BPE,
719
- vocab_size=256000,
720
- description="Cohere's flagship 111B model with Arabic support",
721
- organization="Cohere",
722
- arabic_support="Supported",
723
- dialect_support=["MSA"],
724
- special_features=["111B params", "256K context", "23 languages"]
725
- )),
726
- ]
727
 
728
  # ============================================================================
729
  # TOKENIZER LOADER AND CACHE
@@ -751,15 +554,6 @@ class TokenizerManager:
751
  except Exception as e:
752
  print(f" βœ— {info.name}: {str(e)[:50]}")
753
 
754
- # Try gated models
755
- for model_id, info in GATED_MODELS:
756
- try:
757
- _ = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
758
- self._available[model_id] = info
759
- print(f" βœ“ {info.name} (gated)")
760
- except Exception as e:
761
- print(f" βœ— {info.name} (gated): {str(e)[:50]}")
762
-
763
  print(f"\nTotal available tokenizers: {len(self._available)}")
764
 
765
  def get_tokenizer(self, model_id: str):
@@ -814,19 +608,8 @@ def has_diacritics(text: str) -> bool:
814
  diacritics = set('ًٌٍَُِّْٰ')
815
  return any(c in diacritics for c in text)
816
 
817
- def normalize_arabic(text: str) -> str:
818
- """Basic Arabic normalization"""
819
- # Normalize alef variants
820
- text = re.sub('[إأآا]', 'ا', text)
821
- # Normalize yeh
822
- text = re.sub('ى', 'ي', text)
823
- # Normalize teh marbuta
824
- text = re.sub('ة', 'ه', text)
825
- return text
826
-
827
  def get_arabic_words(text: str) -> List[str]:
828
  """Extract Arabic words from text"""
829
- # Split on whitespace and filter for words containing Arabic
830
  words = text.split()
831
  return [w for w in words if any(is_arabic_char(c) for c in w)]
832
 
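A quick illustration of what these helpers return on mixed-script input (assuming `is_arabic_char` tests membership in the Arabic Unicode block, which is how the callers above use it):

```python
# Hypothetical mixed-script example: only whitespace-separated "words" that
# contain at least one Arabic character pass the get_arabic_words filter.
sample = "نموذج GPT يدعم العربية منذ 2023"
print(get_arabic_words(sample))  # ['نموذج', 'يدعم', 'العربية', 'منذ']
print(has_diacritics(sample))    # False - no harakat present
print(has_diacritics("كَتَبَ"))     # True - fatha marks from the set above
```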
@@ -920,6 +703,439 @@ def analyze_tokenization(
920
  decoded_text=decoded
921
  )
922
 
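The headline numbers `analyze_tokenization` returns follow the standard definitions; a self-contained sketch using whitespace word boundaries (the app's own implementation may handle edge cases differently):

```python
# Sketch of the three headline metrics, assuming whitespace word splitting
# and STRR = share of words that survive as exactly one token.
def headline_metrics(tokenizer, text: str) -> dict:
    ids = tokenizer.encode(text, add_special_tokens=False)
    words = text.split() or [text]
    per_word = [len(tokenizer.encode(w, add_special_tokens=False)) for w in words]
    return {
        "fertility": len(ids) / len(words),  # tokens per word, lower is better
        "compression_ratio": len(text.encode("utf-8")) / max(len(ids), 1),  # bytes per token
        "strr": sum(n == 1 for n in per_word) / len(words),
    }
```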
923
  # ============================================================================
924
  # UI GENERATION FUNCTIONS
925
  # ============================================================================
@@ -927,9 +1143,8 @@ def analyze_tokenization(
927
  def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
928
  """Generate beautiful HTML visualization of tokens"""
929
 
930
- # Color palette for tokens (alternating for clarity)
931
  colors = [
932
- ('#1a1a2e', '#eaeaea'), # Dark blue bg, light text
933
  ('#16213e', '#f0f0f0'),
934
  ('#0f3460', '#ffffff'),
935
  ('#533483', '#f5f5f5'),
@@ -942,10 +1157,7 @@ def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str
942
  html_parts = []
943
  for i, (token, tid) in enumerate(zip(tokens, token_ids)):
944
  bg, fg = colors[i % len(colors)]
945
- # Escape HTML entities
946
  display_token = token.replace('<', '&lt;').replace('>', '&gt;')
947
-
948
- # Determine if token is Arabic
949
  is_arabic = any(is_arabic_char(c) for c in token)
950
  direction = 'rtl' if is_arabic else 'ltr'
951
 
@@ -969,7 +1181,6 @@ def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str
969
  def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
970
  """Generate metrics visualization card"""
971
 
972
- # Determine quality indicators
973
  fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor"
974
  strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor"
975
  compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor"
@@ -999,34 +1210,33 @@ def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) ->
999
  <div class="metric-card {strr_quality}">
1000
  <div class="metric-icon">✨</div>
1001
  <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
1002
- <div class="metric-label">Single Token Rate (STRR)</div>
1003
  <div class="metric-hint">Higher is better</div>
1004
  </div>
1005
 
1006
  <div class="metric-card">
1007
- <div class="metric-icon">πŸ“</div>
1008
  <div class="metric-value">{metrics.char_per_token:.2f}</div>
1009
  <div class="metric-label">Characters/Token</div>
1010
  </div>
1011
 
1012
- <div class="metric-card">
1013
- <div class="metric-icon">⚑</div>
1014
- <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
1015
- <div class="metric-label">Processing Time</div>
 
1016
  </div>
1017
 
1018
- <div class="metric-card arabic">
1019
- <div class="metric-icon">πŸ”€</div>
1020
  <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
1021
  <div class="metric-label">Arabic Fertility</div>
1022
- <div class="metric-hint">Arabic-specific efficiency</div>
1023
  </div>
1024
 
1025
  <div class="metric-card">
1026
- <div class="metric-icon">{"βœ…" if metrics.oov_percentage == 0 else "⚠️"}</div>
1027
- <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
1028
- <div class="metric-label">OOV Rate</div>
1029
- <div class="metric-hint">Lower is better (0% ideal)</div>
1030
  </div>
1031
  </div>
1032
  '''
@@ -1034,44 +1244,40 @@ def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) ->
1034
  def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
1035
  """Generate tokenizer information card"""
1036
 
1037
- dialect_badges = ' '.join([
1038
- f'<span class="dialect-badge">{d}</span>'
1039
- for d in info.dialect_support
1040
- ])
1041
 
1042
- feature_badges = ' '.join([
1043
- f'<span class="feature-badge">{f}</span>'
1044
- for f in info.special_features
1045
- ])
1046
-
1047
- support_class = info.arabic_support.lower().replace(' ', '-')
1048
 
1049
  return f'''
1050
- <div class="tokenizer-info">
1051
- <div class="tokenizer-header">
1052
  <h3>{info.name}</h3>
1053
  <span class="org-badge">{info.organization}</span>
1054
  </div>
1055
- <p class="tokenizer-desc">{info.description}</p>
1056
- <div class="tokenizer-meta">
1057
- <div class="meta-row">
1058
- <span class="meta-label">Type:</span>
1059
- <span class="meta-value">{info.type.value}</span>
 
 
1060
  </div>
1061
- <div class="meta-row">
1062
- <span class="meta-label">Algorithm:</span>
1063
- <span class="meta-value">{info.algorithm.value}</span>
1064
  </div>
1065
- <div class="meta-row">
1066
- <span class="meta-label">Vocab Size:</span>
1067
- <span class="meta-value">{info.vocab_size:,}</span>
1068
  </div>
1069
- <div class="meta-row">
1070
- <span class="meta-label">Arabic Support:</span>
1071
- <span class="support-badge {support_class}">{info.arabic_support}</span>
1072
  </div>
1073
  </div>
1074
- <div class="tokenizer-badges">
 
1075
  <div class="badge-group">
1076
  <span class="badge-label">Dialects:</span>
1077
  {dialect_badges}
@@ -1084,39 +1290,43 @@ def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
1084
  </div>
1085
  '''
1086
 
1087
- # ============================================================================
1088
- # MAIN ANALYSIS FUNCTION
1089
- # ============================================================================
1090
-
1091
  def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
1092
- """Analyze text with a single tokenizer"""
1093
 
1094
- if not text.strip():
1095
  return (
1096
- "<p class='warning'>Please enter some text to analyze.</p>",
1097
- "",
1098
- "",
1099
- ""
 
  )
1101
 
1102
  model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
1103
- info = tokenizer_manager.get_available_tokenizers()[model_id]
1104
 
1105
  try:
1106
- metrics = analyze_tokenization(text, model_id, info)
1107
 
1108
- # Generate all outputs
1109
- info_html = generate_tokenizer_info_card(info)
1110
- metrics_html = generate_metrics_card(metrics, info)
1111
  tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
1112
 
1113
- # Decoded text output
1114
  decoded_html = f'''
1115
  <div class="decoded-section">
1116
  <h4>Decoded Output</h4>
1117
  <div class="decoded-text" dir="auto">{metrics.decoded_text}</div>
1118
  <div class="decoded-meta">
1119
- <span>Diacritics preserved: {"✅ Yes" if metrics.diacritic_preservation else "❌ No"}</span>
1120
  </div>
1121
  </div>
1122
  '''
@@ -1124,139 +1334,146 @@ def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str
1124
  return info_html, metrics_html, tokens_html, decoded_html
1125
 
1126
  except Exception as e:
1127
- error_html = f'''
1128
- <div class="error-card">
1129
- <h4>Error analyzing with {info.name}</h4>
1130
- <p>{str(e)}</p>
1131
- </div>
1132
- '''
1133
- return error_html, "", "", ""
1134
 
1135
  def compare_tokenizers(tokenizer_choices: List[str], text: str) -> str:
1136
- """Compare multiple tokenizers side by side"""
1137
 
1138
- if not text.strip():
1139
- return "<p class='warning'>Please enter some text to analyze.</p>"
1140
 
1141
  if not tokenizer_choices or len(tokenizer_choices) < 2:
1142
- return "<p class='warning'>Please select at least 2 tokenizers to compare.</p>"
1143
 
1144
  results = []
1145
 
1146
  for choice in tokenizer_choices:
1147
  model_id = tokenizer_manager.get_model_id_from_choice(choice)
1148
- info = tokenizer_manager.get_available_tokenizers()[model_id]
1149
 
1150
- try:
1151
- metrics = analyze_tokenization(text, model_id, info)
1152
- results.append((info, metrics))
1153
- except Exception as e:
1154
- continue
1155
-
1156
- if not results:
1157
- return "<p class='error'>Failed to analyze with any selected tokenizers.</p>"
 
 
1159
- # Sort by fertility (best first)
1160
- results.sort(key=lambda x: x[1].fertility)
 
1162
  # Generate comparison table
1163
- table_rows = []
1164
- for i, (info, metrics) in enumerate(results):
1165
- rank_class = "rank-1" if i == 0 else "rank-2" if i == 1 else "rank-3" if i == 2 else ""
1166
-
1167
- table_rows.append(f'''
1168
- <tr class="{rank_class}">
1169
- <td class="rank-cell">{i + 1}</td>
1170
- <td class="name-cell">
1171
- <strong>{info.name}</strong>
1172
- <span class="org-small">{info.organization}</span>
1173
- </td>
1174
- <td class="metric-cell">{metrics.total_tokens}</td>
1175
- <td class="metric-cell highlight">{metrics.fertility:.3f}</td>
1176
- <td class="metric-cell">{metrics.compression_ratio:.2f}</td>
1177
- <td class="metric-cell">{metrics.single_token_retention_rate:.1%}</td>
1178
- <td class="metric-cell">{metrics.arabic_fertility:.3f}</td>
1179
- <td class="metric-cell">{metrics.oov_percentage:.1f}%</td>
1180
- <td class="metric-cell">{metrics.tokenization_time_ms:.2f}ms</td>
1181
- </tr>
1182
- ''')
1183
-
1184
- return f'''
1185
  <div class="comparison-container">
1186
- <h3>Tokenizer Comparison Results</h3>
1187
- <p class="comparison-subtitle">Ranked by fertility (lower is better)</p>
1188
  <table class="comparison-table">
1189
  <thead>
1190
  <tr>
1191
- <th>#</th>
1192
  <th>Tokenizer</th>
 
1193
  <th>Tokens</th>
1194
  <th>Fertility ↓</th>
1195
- <th>Compression</th>
1196
- <th>STRR</th>
1197
- <th>Arabic Fertility</th>
1198
  <th>OOV %</th>
1199
- <th>Time</th>
1200
  </tr>
1201
  </thead>
1202
  <tbody>
1203
- {''.join(table_rows)}
1204
  </tbody>
1205
  </table>
1206
- <div class="comparison-legend">
1207
- <span class="legend-item"><span class="legend-color rank-1"></span> Best</span>
1208
- <span class="legend-item"><span class="legend-color rank-2"></span> Runner-up</span>
1209
- <span class="legend-item"><span class="legend-color rank-3"></span> Third</span>
1210
- </div>
1211
  </div>
1212
  '''
 
 
1213
 
1214
  # ============================================================================
1215
- # CSS STYLES
1216
  # ============================================================================
1217
 
1218
  CUSTOM_CSS = """
1219
- /* ===== GLOBAL STYLES ===== */
1220
  :root {
1221
- --primary: #0d47a1;
1222
- --primary-light: #1976d2;
1223
- --primary-dark: #002171;
1224
- --accent: #ff6f00;
1225
- --accent-light: #ffa040;
1226
- --success: #2e7d32;
1227
  --warning: #f57c00;
1228
  --error: #c62828;
1229
- --bg-dark: #0a0a0f;
1230
- --bg-card: #12121a;
1231
- --bg-elevated: #1a1a24;
1232
- --text-primary: #f5f5f5;
1233
- --text-secondary: #b0b0b0;
1234
- --border: #2a2a3a;
1235
- --gradient-1: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
1236
- --gradient-2: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
1237
- --gradient-arabic: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
1238
- }
1239
-
1240
- .gradio-container {
1241
- background: var(--bg-dark) !important;
1242
- font-family: 'IBM Plex Sans Arabic', 'Segoe UI', system-ui, sans-serif !important;
1243
  }
1244
 
1245
- /* ===== HEADER STYLES ===== */
1246
  .header-section {
1247
  text-align: center;
1248
- padding: 2rem;
1249
- background: var(--gradient-1);
1250
  border-radius: 16px;
1251
- margin-bottom: 2rem;
1252
  }
1253
 
1254
  .header-section h1 {
1255
  font-size: 2.5rem;
1256
- font-weight: 700;
1257
  color: white;
1258
  margin-bottom: 0.5rem;
1259
- text-shadow: 0 2px 10px rgba(0,0,0,0.3);
1260
  }
1261
 
1262
  .header-section p {
@@ -1264,85 +1481,138 @@ CUSTOM_CSS = """
1264
  font-size: 1.1rem;
1265
  }
1266
 
1267
- /* ===== TOKEN VISUALIZATION ===== */
1268
- .token-container {
1269
- display: flex;
1270
- flex-wrap: wrap;
1271
- gap: 8px;
1272
- padding: 1.5rem;
1273
  background: var(--bg-card);
1274
  border-radius: 12px;
 
1275
  border: 1px solid var(--border);
1276
- direction: rtl;
1277
  }
1278
 
1279
- .token {
1280
- display: inline-flex;
1281
- flex-direction: column;
1282
  align-items: center;
1283
- padding: 8px 12px;
1284
- border-radius: 8px;
1285
- font-family: 'IBM Plex Mono', monospace;
1286
- font-size: 0.95rem;
1287
- transition: transform 0.2s, box-shadow 0.2s;
1288
- cursor: default;
1289
  }
1290
 
1291
- .token:hover {
1292
- transform: translateY(-2px);
1293
- box-shadow: 0 4px 12px rgba(0,0,0,0.3);
1294
  }
1295
 
1296
- .token-id {
1297
- font-size: 0.7rem;
1298
- opacity: 0.7;
1299
- margin-top: 4px;
1300
  }
1301
 
1302
  /* ===== METRICS GRID ===== */
1303
  .metrics-grid {
1304
  display: grid;
1305
- grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
1306
  gap: 1rem;
1307
- padding: 1rem;
1308
  }
1309
 
1310
  .metric-card {
1311
  background: var(--bg-card);
1312
- border: 1px solid var(--border);
1313
  border-radius: 12px;
1314
- padding: 1.25rem;
1315
  text-align: center;
1316
- transition: transform 0.2s, border-color 0.2s;
 
1317
  }
1318
 
1319
  .metric-card:hover {
1320
- transform: translateY(-3px);
1321
- border-color: var(--primary-light);
1322
  }
1323
 
1324
  .metric-card.excellent {
1325
  border-color: var(--success);
1326
- background: linear-gradient(to bottom, rgba(46, 125, 50, 0.1), transparent);
1327
  }
1328
 
1329
  .metric-card.good {
1330
- border-color: var(--primary-light);
1331
- background: linear-gradient(to bottom, rgba(25, 118, 210, 0.1), transparent);
1332
  }
1333
 
1334
  .metric-card.poor {
1335
- border-color: var(--warning);
1336
- background: linear-gradient(to bottom, rgba(245, 124, 0, 0.1), transparent);
1337
  }
1338
 
1339
  .metric-card.primary {
1340
- background: var(--gradient-1);
1341
- }
1342
-
1343
- .metric-card.arabic {
1344
- background: linear-gradient(to bottom, rgba(17, 153, 142, 0.2), transparent);
1345
- border-color: #11998e;
1346
  }
1347
 
1348
  .metric-icon {
@@ -1351,16 +1621,15 @@ CUSTOM_CSS = """
1351
  }
1352
 
1353
  .metric-value {
1354
- font-size: 1.75rem;
1355
  font-weight: 700;
1356
  color: var(--text-primary);
1357
- margin-bottom: 0.25rem;
1358
  }
1359
 
1360
  .metric-label {
1361
- font-size: 0.85rem;
1362
  color: var(--text-secondary);
1363
- margin-bottom: 0.25rem;
1364
  }
1365
 
1366
  .metric-hint {
@@ -1369,244 +1638,117 @@ CUSTOM_CSS = """
1369
  opacity: 0.7;
1370
  }
1371
 
1372
- /* ===== TOKENIZER INFO ===== */
1373
- .tokenizer-info {
1374
- background: var(--bg-card);
1375
- border: 1px solid var(--border);
 
 
 
1376
  border-radius: 12px;
1377
- padding: 1.5rem;
1378
  }
1379
 
1380
- .tokenizer-header {
1381
- display: flex;
 
1382
  align-items: center;
1383
- gap: 1rem;
1384
- margin-bottom: 1rem;
 
  }
1386
 
1387
- .tokenizer-header h3 {
1388
- margin: 0;
1389
- color: var(--text-primary);
1390
- font-size: 1.5rem;
1391
  }
1392
 
1393
- .org-badge {
1394
- background: var(--gradient-1);
1395
- padding: 4px 12px;
1396
- border-radius: 20px;
1397
- font-size: 0.8rem;
1398
- color: white;
1399
  }
1400
 
1401
- .tokenizer-desc {
1402
- color: var(--text-secondary);
1403
- margin-bottom: 1rem;
1404
- line-height: 1.6;
 
 
1405
  }
1406
 
1407
- .tokenizer-meta {
1408
- display: grid;
1409
- grid-template-columns: repeat(2, 1fr);
1410
- gap: 0.75rem;
1411
  margin-bottom: 1rem;
1412
  }
1413
 
1414
- .meta-row {
1415
- display: flex;
1416
- gap: 0.5rem;
1417
- }
1418
-
1419
- .meta-label {
1420
- color: var(--text-secondary);
1421
- font-size: 0.85rem;
1422
- }
1423
-
1424
- .meta-value {
1425
  color: var(--text-primary);
1426
- font-weight: 500;
1427
- }
1428
-
1429
- .support-badge {
1430
- padding: 2px 8px;
1431
- border-radius: 4px;
1432
- font-size: 0.8rem;
1433
- }
1434
-
1435
- .support-badge.native {
1436
- background: var(--success);
1437
- color: white;
1438
- }
1439
-
1440
- .support-badge.adapted {
1441
- background: var(--primary-light);
1442
- color: white;
1443
- }
1444
-
1445
- .support-badge.supported {
1446
- background: var(--warning);
1447
- color: white;
1448
  }
1449
 
1450
- .support-badge.limited {
1451
- background: var(--error);
1452
- color: white;
1453
- }
1454
-
1455
- .tokenizer-badges {
1456
- display: flex;
1457
- flex-direction: column;
1458
- gap: 0.75rem;
1459
- }
1460
-
1461
- .badge-group {
1462
- display: flex;
1463
- flex-wrap: wrap;
1464
- align-items: center;
1465
- gap: 0.5rem;
1466
- }
1467
-
1468
- .badge-label {
1469
- color: var(--text-secondary);
1470
  font-size: 0.85rem;
1471
- }
1472
-
1473
- .dialect-badge, .feature-badge {
1474
- background: var(--bg-elevated);
1475
- border: 1px solid var(--border);
1476
- padding: 4px 10px;
1477
- border-radius: 6px;
1478
- font-size: 0.75rem;
1479
- color: var(--text-primary);
1480
  }
1481
 
1482
  /* ===== COMPARISON TABLE ===== */
1483
  .comparison-container {
1484
- background: var(--bg-card);
1485
- border-radius: 12px;
1486
- padding: 1.5rem;
1487
- border: 1px solid var(--border);
1488
- }
1489
-
1490
- .comparison-container h3 {
1491
- color: var(--text-primary);
1492
- margin-bottom: 0.25rem;
1493
- }
1494
-
1495
- .comparison-subtitle {
1496
- color: var(--text-secondary);
1497
- font-size: 0.9rem;
1498
- margin-bottom: 1.5rem;
1499
  }
1500
 
1501
  .comparison-table {
1502
  width: 100%;
1503
  border-collapse: collapse;
1504
- font-size: 0.9rem;
1505
  }
1506
 
1507
  .comparison-table th {
1508
- background: var(--bg-elevated);
1509
- color: var(--text-secondary);
1510
- padding: 12px 8px;
1511
  text-align: left;
1512
- font-weight: 500;
1513
- border-bottom: 2px solid var(--border);
1514
  }
1515
 
1516
  .comparison-table td {
1517
- padding: 12px 8px;
1518
  border-bottom: 1px solid var(--border);
1519
  color: var(--text-primary);
1520
  }
1521
 
1522
- .comparison-table tr.rank-1 {
1523
- background: linear-gradient(90deg, rgba(46, 125, 50, 0.2), transparent);
1524
  }
1525
 
1526
- .comparison-table tr.rank-2 {
1527
- background: linear-gradient(90deg, rgba(25, 118, 210, 0.15), transparent);
1528
  }
1529
 
1530
- .comparison-table tr.rank-3 {
1531
- background: linear-gradient(90deg, rgba(245, 124, 0, 0.1), transparent);
1532
  }
1533
 
1534
- .rank-cell {
1535
- font-weight: 700;
1536
- text-align: center;
1537
- }
1538
-
1539
- .name-cell strong {
1540
- display: block;
1541
- }
1542
-
1543
- .org-small {
1544
- font-size: 0.75rem;
1545
- color: var(--text-secondary);
1546
- }
1547
-
1548
- .metric-cell {
1549
- text-align: center;
1550
  }
1551
 
1552
- .metric-cell.highlight {
1553
- font-weight: 700;
1554
- color: var(--accent-light);
1555
- }
1556
-
1557
- .comparison-legend {
1558
- display: flex;
1559
- gap: 1.5rem;
1560
- margin-top: 1rem;
1561
- padding-top: 1rem;
1562
- border-top: 1px solid var(--border);
1563
- }
1564
-
1565
- .legend-item {
1566
- display: flex;
1567
- align-items: center;
1568
- gap: 0.5rem;
1569
- font-size: 0.85rem;
1570
- color: var(--text-secondary);
1571
- }
1572
-
1573
- .legend-color {
1574
- width: 16px;
1575
- height: 16px;
1576
- border-radius: 4px;
1577
- }
1578
-
1579
- .legend-color.rank-1 { background: var(--success); }
1580
- .legend-color.rank-2 { background: var(--primary-light); }
1581
- .legend-color.rank-3 { background: var(--warning); }
1582
-
1583
- /* ===== DECODED SECTION ===== */
1584
- .decoded-section {
1585
- background: var(--bg-card);
1586
- border: 1px solid var(--border);
1587
- border-radius: 12px;
1588
- padding: 1.5rem;
1589
- }
1590
-
1591
- .decoded-section h4 {
1592
- color: var(--text-primary);
1593
- margin-bottom: 1rem;
1594
  }
1595
 
1596
- .decoded-text {
1597
- background: var(--bg-elevated);
1598
- padding: 1rem;
1599
- border-radius: 8px;
1600
- font-family: 'IBM Plex Sans Arabic', serif;
1601
- font-size: 1.1rem;
1602
- line-height: 1.8;
1603
- color: var(--text-primary);
1604
  }
1605
 
1606
- .decoded-meta {
1607
- margin-top: 1rem;
1608
- font-size: 0.85rem;
1609
- color: var(--text-secondary);
1610
  }
1611
 
1612
  /* ===== UTILITY CLASSES ===== */
@@ -1661,14 +1803,14 @@ def create_interface():
1661
 
1662
  available_tokenizers = tokenizer_manager.get_tokenizer_choices()
1663
 
1664
- # Group tokenizers by type for better organization
1665
- arabic_specific = [t for t in available_tokenizers if any(x in t for x in ['AraBERT', 'CAMeL', 'MARBERT', 'ARBERT'])]
1666
- arabic_llms = [t for t in available_tokenizers if any(x in t for x in ['Jais', 'AceGPT'])]
1667
  multilingual = [t for t in available_tokenizers if t not in arabic_specific and t not in arabic_llms]
1668
 
1669
  with gr.Blocks(css=CUSTOM_CSS, title="Arabic Tokenizer Arena Pro", theme=gr.themes.Base(
1670
- primary_hue="blue",
1671
- secondary_hue="purple",
1672
  neutral_hue="slate",
1673
  font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
1674
  )) as demo:
@@ -1715,7 +1857,6 @@ def create_interface():
1715
  tokens_output = gr.HTML(label="Token Visualization")
1716
  decoded_output = gr.HTML(label="Decoded Output")
1717
 
1718
- # Event handlers
1719
  sample_dropdown.change(
1720
  lambda x: SAMPLE_TEXTS.get(x, ""),
1721
  inputs=[sample_dropdown],
@@ -1768,7 +1909,70 @@ def create_interface():
1768
  outputs=[comparison_output]
1769
  )
1770
 
1771
- # ===== TAB 3: Metrics Reference =====
1772
  with gr.TabItem("πŸ“– Metrics Guide", id="guide"):
1773
  gr.Markdown("""
1774
  ## Tokenization Evaluation Metrics Guide
@@ -1803,18 +2007,9 @@ def create_interface():
1803
  - *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
1804
  - *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
1805
  - *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
1806
-
1807
- ### Tokenizer Algorithm Types
1808
-
1809
- - **BPE (Byte-Pair Encoding)**: Iteratively merges frequent character pairs
1810
- - **Byte-Level BPE**: BPE applied to UTF-8 bytes instead of characters
1811
- - **WordPiece**: Google's variant, used in BERT models
1812
- - **SentencePiece**: Language-independent, uses unigram model
1813
- - **Unigram**: Probabilistic subword model
1814
- - **Tiktoken**: OpenAI's optimized BPE implementation
1815
  """)
1816
 
1817
- # ===== TAB 4: About =====
1818
  with gr.TabItem("ℹ️ About", id="about"):
1819
  gr.Markdown(f"""
1820
  ## Arabic Tokenizer Arena Pro
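The guide's fertility comparisons can be reproduced outside the UI; a hedged sketch using two IDs from the registry above (sentence illustrative; gated models would additionally need HF_TOKEN):

```python
# Compare fertility for two registry tokenizers on a single sentence.
from transformers import AutoTokenizer

sentence = "يقيس هذا المعيار كفاءة المجزئ على النص العربي"
for model_id in ["riotu-lab/Aranizer-PBE-86k", "xlm-roberta-base"]:
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    n_tokens = len(tok.encode(sentence, add_special_tokens=False))
    print(f"{model_id}: fertility = {n_tokens / len(sentence.split()):.2f}")
```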
@@ -1824,13 +2019,13 @@ def create_interface():
1824
  ### Available Tokenizers: {len(available_tokenizers)}
1825
 
1826
  **Arabic-Specific Models:**
1827
- {chr(10).join(['- ' + t for t in arabic_specific])}
1828
 
1829
  **Arabic LLMs:**
1830
- {chr(10).join(['- ' + t for t in arabic_llms])}
1831
 
1832
  **Multilingual LLMs:**
1833
- {chr(10).join(['- ' + t for t in multilingual])}
1834
 
1835
  ### Features
1836
 
@@ -1838,6 +2033,7 @@ def create_interface():
1838
  ✅ Arabic-specific analysis (dialect support, diacritic preservation)
1839
  ✅ Side-by-side tokenizer comparison
1840
  ✅ Beautiful token visualization

1841
  ✅ Support for MSA, dialectal Arabic, and Classical Arabic
1842
  ✅ Research-backed evaluation methodology
1843
 
@@ -1861,4 +2057,4 @@ def create_interface():
1861
 
1862
  if __name__ == "__main__":
1863
  demo = create_interface()
1864
- demo.launch(share=True)
 
4
  A comprehensive research and production-grade tool for evaluating Arabic tokenizers
5
  across multiple dimensions: efficiency, coverage, morphological awareness, and more.
6
 
7
+ Now with LEADERBOARD - imports real Arabic datasets from HuggingFace!
8
+
9
  Supports:
10
  - Arabic-specific tokenizers (Aranizer, AraBERT, CAMeLBERT, MARBERT, etc.)
11
  - Major LLM tokenizers (Jais, AceGPT, Falcon-Arabic, ALLaM, Qwen, Llama, Mistral, GPT)
12
  - Comprehensive evaluation metrics based on latest research
13
+ - Real dataset benchmarking from HuggingFace
14
  """
15
 
16
  import gradio as gr
 
21
  from typing import Dict, List, Tuple, Optional, Any
22
  from dataclasses import dataclass, field
23
  from enum import Enum
24
+ from collections import defaultdict
25
+ import statistics
26
  import os
27
 
28
  # Hugging Face authentication
 
35
  from transformers import AutoTokenizer, logging
36
  logging.set_verbosity_error()
37
 
38
+ # Import datasets library for leaderboard
39
+ from datasets import load_dataset
40
+
41
  # ============================================================================
42
  # DATA CLASSES AND ENUMS
43
  # ============================================================================
 
212
  dialect_support=["MSA"],
213
  special_features=["100K vocabulary", "MSA focused"]
214
  ),
215
 
216
  # ========== ARABIC-SPECIFIC TOKENIZERS ==========
217
  "riotu-lab/Aranizer-PBE-86k": TokenizerInfo(
 
238
  dialect_support=["MSA"],
239
  special_features=["Low fertility", "SentencePiece", "86K vocab"]
240
  ),
241
 
242
  # ========== ARABIC-SPECIFIC LLMs ==========
243
  "ALLaM-AI/ALLaM-7B-Instruct-preview": TokenizerInfo(
 
288
  dialect_support=["MSA"],
289
  special_features=["LLaMA-based", "Cultural alignment", "RLHF", "Chat"]
290
  ),
291
  "silma-ai/SILMA-9B-Instruct-v1.0": TokenizerInfo(
292
  name="SILMA 9B Instruct",
293
  model_id="silma-ai/SILMA-9B-Instruct-v1.0",
 
300
  dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
301
  special_features=["Gemma-based", "SOTA 9B class", "Efficient"]
302
  ),
303
  "QCRI/Fanar-1-9B-Instruct": TokenizerInfo(
304
  name="Fanar 9B Instruct",
305
  model_id="QCRI/Fanar-1-9B-Instruct",
 
312
  dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
313
  special_features=["Islamic RAG", "Cultural alignment", "Gemma-based"]
314
  ),
315
  "Navid-AI/Yehia-7B-preview": TokenizerInfo(
316
  name="Yehia 7B Preview",
317
  model_id="Navid-AI/Yehia-7B-preview",
 
338
  dialect_support=["Darija", "MSA"],
339
  special_features=["Moroccan dialect", "Transliteration", "Cultural"]
340
  ),
341
 
342
  # ========== MULTILINGUAL LLMs WITH ARABIC SUPPORT ==========
343
  "Qwen/Qwen2.5-7B": TokenizerInfo(
 
352
  dialect_support=["MSA"],
353
  special_features=["152K vocab", "128K context", "30+ languages"]
354
  ),
355
  "google/gemma-2-9b": TokenizerInfo(
356
  name="Gemma 2 9B",
357
  model_id="google/gemma-2-9b",
 
364
  dialect_support=["MSA"],
365
  special_features=["256K vocab", "Efficient architecture"]
366
  ),
367
  "mistralai/Mistral-7B-v0.3": TokenizerInfo(
368
  name="Mistral 7B v0.3",
369
  model_id="mistralai/Mistral-7B-v0.3",
 
388
  dialect_support=["MSA"],
389
  special_features=["Tekken tokenizer", "131K vocab", "Multilingual optimized"]
390
  ),
391
  "xlm-roberta-base": TokenizerInfo(
392
  name="XLM-RoBERTa Base",
393
  model_id="xlm-roberta-base",
 
412
  dialect_support=["MSA"],
413
  special_features=["Baseline model", "104 languages"]
414
  ),
415
  "tiiuae/falcon-7b": TokenizerInfo(
416
  name="Falcon 7B",
417
  model_id="tiiuae/falcon-7b",
 
424
  dialect_support=["MSA"],
425
  special_features=["65K vocab", "RefinedWeb trained"]
426
  ),
427
  }
428
 
429
+ # ============================================================================
430
+ # LEADERBOARD DATASETS CONFIGURATION - Real HuggingFace Datasets
431
+ # ============================================================================
432
+
433
+ LEADERBOARD_DATASETS = {
434
+ # MSA Benchmarks
435
+ "arabic_mmlu": {
436
+ "hf_id": "MBZUAI/ArabicMMLU",
437
+ "name": "ArabicMMLU",
438
+ "category": "MSA Benchmark",
439
+ "text_column": "Question",
440
+ "split": "test",
441
+ "subset": None,
442
+ "samples": 500,
443
+ "description": "Multi-task benchmark from Arab school exams (14,575 MCQs)"
444
+ },
445
+
446
+ # Dialectal Arabic
447
+ "arsentd_lev": {
448
+ "hf_id": "ramybaly/arsentd_lev",
449
+ "name": "ArSenTD-LEV",
450
+ "category": "Levantine Dialect",
451
+ "text_column": "Tweet",
452
+ "split": "train",
453
+ "subset": None,
454
+ "samples": 500,
455
+ "description": "Levantine Arabic tweets (Jordan, Lebanon, Syria, Palestine)"
456
+ },
457
+
458
+ # Classical Arabic
459
+ "athar": {
460
+ "hf_id": "mohamed-khalil/ATHAR",
461
+ "name": "ATHAR Classical",
462
+ "category": "Classical Arabic",
463
+ "text_column": "arabic",
464
+ "split": "train",
465
+ "subset": None,
466
+ "samples": 500,
467
+ "description": "66K classical Arabic sentences with translations"
468
+ },
469
+
470
+ # Question Answering
471
+ "arcd": {
472
+ "hf_id": "arcd",
473
+ "name": "ARCD",
474
+ "category": "QA Dataset",
475
+ "text_column": "context",
476
+ "split": "train",
477
+ "subset": None,
478
+ "samples": 300,
479
+ "description": "Arabic Reading Comprehension Dataset (1,395 questions)"
480
+ },
481
+
482
+ # Poetry
483
+ "ashaar": {
484
+ "hf_id": "arbml/Ashaar_dataset",
485
+ "name": "Ashaar Poetry",
486
+ "category": "Poetry",
487
+ "text_column": "poem_text",
488
+ "split": "train",
489
+ "subset": None,
490
+ "samples": 500,
491
+ "description": "2M+ Arabic poetry verses with meter and theme labels"
492
+ },
493
+
494
+ # Religious - Hadith
495
+ "hadith": {
496
+ "hf_id": "gurgutan/sunnah_ar_en_dataset",
497
+ "name": "Hadith Collection",
498
+ "category": "Religious",
499
+ "text_column": "hadith_text_ar",
500
+ "split": "train",
501
+ "subset": None,
502
+ "samples": 400,
503
+ "description": "50,762 hadiths from 14 authentic books"
504
+ },
505
+
506
+ # Social Media
507
+ "arabic_sentiment": {
508
+ "hf_id": "arbml/Arabic_Sentiment_Twitter_Corpus",
509
+ "name": "Arabic Sentiment",
510
+ "category": "Social Media",
511
+ "text_column": "text",
512
+ "split": "train",
513
+ "subset": None,
514
+ "samples": 500,
515
+ "description": "Arabic Twitter sentiment corpus"
516
+ },
517
+
518
+ # News
519
+ "sanad": {
520
+ "hf_id": "arbml/SANAD",
521
+ "name": "SANAD News",
522
+ "category": "News",
523
+ "text_column": "text",
524
+ "split": "train",
525
+ "subset": "alarabiya",
526
+ "samples": 400,
527
+ "description": "Arabic news articles from Al Arabiya"
528
+ },
529
+ }
530
 
531
  # ============================================================================
532
  # TOKENIZER LOADER AND CACHE
 
554
  except Exception as e:
555
  print(f" βœ— {info.name}: {str(e)[:50]}")
556
 
557
  print(f"\nTotal available tokenizers: {len(self._available)}")
558
 
559
  def get_tokenizer(self, model_id: str):
 
608
  diacritics = set('Ω‹ΩŒΩΩŽΩΩΩ‘Ω’Ω°')
609
  return any(c in diacritics for c in text)
610
 
 
611
  def get_arabic_words(text: str) -> List[str]:
612
  """Extract Arabic words from text"""
 
613
  words = text.split()
614
  return [w for w in words if any(is_arabic_char(c) for c in w)]
615
 
 
703
  decoded_text=decoded
704
  )
705
 
706
+ # ============================================================================
707
+ # LEADERBOARD FUNCTIONS - Import Real Datasets from HuggingFace
708
+ # ============================================================================
709
+
710
+ class HFDatasetLoader:
711
+ """Load Arabic datasets from HuggingFace"""
712
+
713
+ def __init__(self):
714
+ self.cache = {}
715
+
716
+ def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
717
+ """Load texts from a HuggingFace dataset"""
718
+
719
+ if dataset_key in self.cache:
720
+ return self.cache[dataset_key], f"βœ… Loaded {len(self.cache[dataset_key])} samples (cached)"
721
+
722
+ config = LEADERBOARD_DATASETS.get(dataset_key)
723
+ if not config:
724
+ return [], f"❌ Unknown dataset: {dataset_key}"
725
+
726
+ try:
727
+ # Load dataset from HuggingFace
728
+ if config.get("subset"):
729
+ ds = load_dataset(
730
+ config["hf_id"],
731
+ config["subset"],
732
+ split=config["split"],
733
+ trust_remote_code=True
734
+ )
735
+ else:
736
+ ds = load_dataset(
737
+ config["hf_id"],
738
+ split=config["split"],
739
+ trust_remote_code=True
740
+ )
741
+
742
+ texts = []
743
+ text_col = config["text_column"]
744
+
745
+ # Try to find text column
746
+ if text_col not in ds.column_names:
747
+ for col in ["text", "content", "sentence", "arabic", "context", "Tweet", "question", "poem_text", "hadith_text_ar"]:
748
+ if col in ds.column_names:
749
+ text_col = col
750
+ break
751
+
752
+ # Extract texts
753
+ max_samples = config.get("samples", 500)
754
+ for i, item in enumerate(ds):
755
+ if i >= max_samples:
756
+ break
757
+ text = item.get(text_col, "")
758
+ if text and isinstance(text, str) and len(text.strip()) > 10:
759
+ texts.append(text.strip())
760
+
761
+ self.cache[dataset_key] = texts
762
+ return texts, f"βœ… Loaded {len(texts)} samples from HuggingFace"
763
+
764
+ except Exception as e:
765
+ return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"
766
+
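Typical usage of the loader, showing the cache short-circuit on the second call (dataset key taken from LEADERBOARD_DATASETS above; sample counts depend on what the Hub returns):

```python
# First call hits the Hub; the second is served from the in-memory cache.
loader = HFDatasetLoader()
texts, status = loader.load_dataset_texts("athar")
print(status)                  # "✅ Loaded ... samples from HuggingFace"
texts_again, status = loader.load_dataset_texts("athar")
print(status)                  # "✅ Loaded ... samples (cached)"
assert texts_again is texts    # same cached list object
```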
767
+ def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
768
+ """Evaluate a tokenizer on a list of texts"""
769
+
770
+ fertilities = []
771
+ compressions = []
772
+ unk_counts = 0
773
+ total_tokens = 0
774
+
775
+ for text in texts:
776
+ try:
777
+ tokens = tokenizer.encode(text, add_special_tokens=False)
778
+ decoded = tokenizer.convert_ids_to_tokens(tokens)
779
+
780
+ num_tokens = len(tokens)
781
+ num_words = len(text.split()) or 1
782
+ num_bytes = len(text.encode('utf-8'))
783
+
784
+ fertility = num_tokens / num_words
785
+ compression = num_bytes / num_tokens if num_tokens > 0 else 0
786
+
787
+ # Count UNKs
788
+ unk_token = getattr(tokenizer, 'unk_token', '[UNK]')
789
+ unks = sum(1 for t in decoded if t and (t == unk_token or '<unk>' in str(t).lower() or '[unk]' in str(t).lower()))
790
+
791
+ fertilities.append(fertility)
792
+ compressions.append(compression)
793
+ unk_counts += unks
794
+ total_tokens += num_tokens
795
+
796
+ except Exception:
797
+ continue
798
+
799
+ if not fertilities:
800
+ return None
801
+
802
+ return {
803
+ "avg_fertility": statistics.mean(fertilities),
804
+ "std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
805
+ "avg_compression": statistics.mean(compressions),
806
+ "unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
807
+ "samples": len(fertilities)
808
+ }
809
+
810
+ def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
811
+ """Calculate overall score (0-100, higher is better)"""
812
+ # Lower fertility is better (ideal ~1.0 for Arabic)
813
+ fertility_score = max(0, min(1, 2.0 / fertility)) if fertility > 0 else 0
814
+ # Higher compression is better
815
+ compression_score = min(1, compression / 6)
816
+ # Lower UNK is better
817
+ unk_score = 1 - min(1, unk_ratio * 20)
818
+
819
+ # Weighted combination
820
+ score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
821
+ return round(score, 1)
822
+
823
+ def run_leaderboard_evaluation(
824
+ selected_datasets: List[str],
825
+ selected_tokenizers: List[str],
826
+ progress=gr.Progress()
827
+ ) -> Tuple[str, str, str]:
828
+ """
829
+ Run the full leaderboard evaluation with real HF datasets
830
+ Returns: (leaderboard_html, per_dataset_html, status_message)
831
+ """
832
+
833
+ if not selected_datasets:
834
+ return "", "", "⚠️ Please select at least one dataset"
835
+
836
+ if not selected_tokenizers:
837
+ return "", "", "⚠️ Please select at least one tokenizer"
838
+
839
+ loader = HFDatasetLoader()
840
+ results = defaultdict(dict)
841
+
842
+ # Status tracking
843
+ status_lines = []
844
+
845
+ # Load datasets from HuggingFace
846
+ status_lines.append("πŸ“š **Loading Datasets from HuggingFace:**\n")
847
+ loaded_datasets = {}
848
+
849
+ for i, ds_key in enumerate(selected_datasets):
850
+ progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
851
+ texts, msg = loader.load_dataset_texts(ds_key)
852
+ ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
853
+ status_lines.append(f" β€’ {ds_name}: {msg}")
854
+ if texts:
855
+ loaded_datasets[ds_key] = texts
856
+
857
+ if not loaded_datasets:
858
+ return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"
859
+
860
+ # Evaluate tokenizers
861
+ status_lines.append("\nπŸ”„ **Evaluating Tokenizers:**\n")
862
+
863
+ tokenizer_cache = {}
864
+ total_steps = len(selected_tokenizers) * len(loaded_datasets)
865
+ current_step = 0
866
+
867
+ for tok_choice in selected_tokenizers:
868
+ # Get model ID from choice
869
+ tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
870
+ tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
871
+ tok_name = tok_info.name if tok_info else tok_choice
872
+
873
+ # Load tokenizer
874
+ try:
875
+ if tok_id not in tokenizer_cache:
876
+ tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
877
+ tok_id, trust_remote_code=True
878
+ )
879
+ tokenizer = tokenizer_cache[tok_id]
880
+ status_lines.append(f" β€’ {tok_name}: βœ… Loaded")
881
+ except Exception as e:
882
+ status_lines.append(f" β€’ {tok_name}: ❌ Failed ({str(e)[:30]})")
883
+ continue
884
+
885
+ # Evaluate on each dataset
886
+ for ds_key, texts in loaded_datasets.items():
887
+ current_step += 1
888
+ progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
889
+
890
+ metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
891
+ if metrics:
892
+ results[tok_choice][ds_key] = metrics
893
+
894
+ # Generate leaderboard
895
+ progress(0.95, "Generating leaderboard...")
896
+
897
+ leaderboard_data = []
898
+ per_dataset_data = []
899
+
900
+ for tok_choice, ds_results in results.items():
901
+ if not ds_results:
902
+ continue
903
+
904
+ tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
905
+ tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
906
+
907
+ # Aggregate across datasets
908
+ all_fertility = [m["avg_fertility"] for m in ds_results.values()]
909
+ all_compression = [m["avg_compression"] for m in ds_results.values()]
910
+ all_unk = [m["unk_ratio"] for m in ds_results.values()]
911
+
912
+ avg_fertility = statistics.mean(all_fertility)
913
+ avg_compression = statistics.mean(all_compression)
914
+ avg_unk = statistics.mean(all_unk)
915
+
916
+ score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)
917
+
918
+ leaderboard_data.append({
919
+ "name": tok_info.name if tok_info else tok_choice,
920
+ "type": tok_info.type.value if tok_info else "Unknown",
921
+ "org": tok_info.organization if tok_info else "Unknown",
922
+ "score": score,
923
+ "fertility": avg_fertility,
924
+ "compression": avg_compression,
925
+ "unk_ratio": avg_unk,
926
+ "num_datasets": len(ds_results)
927
+ })
928
+
929
+ # Per-dataset row
930
+ per_ds_row = {"Tokenizer": tok_info.name if tok_info else tok_choice}
931
+ for ds_key in selected_datasets:
932
+ ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
933
+ if ds_key in ds_results:
934
+ per_ds_row[ds_name] = round(ds_results[ds_key]["avg_fertility"], 2)
935
+ else:
936
+ per_ds_row[ds_name] = "-"
937
+ per_dataset_data.append(per_ds_row)
938
+
939
+ # Sort by score
940
+ leaderboard_data.sort(key=lambda x: x["score"], reverse=True)
941
+
942
+ # Create HTML tables
943
+ leaderboard_html = generate_leaderboard_html(leaderboard_data)
944
+ per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
945
+
946
+ status_lines.append(f"\nβœ… **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
947
+
948
+ return leaderboard_html, per_dataset_html, "\n".join(status_lines)
949
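+
+ # Usage sketch (hypothetical, outside the Gradio UI; the progress callback is
+ # designed for Gradio events):
+ #   lb_html, per_ds_html, status = run_leaderboard_evaluation(
+ #       ["arabic_mmlu", "arcd"],
+ #       tokenizer_manager.get_tokenizer_choices()[:3],
+ #   )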
+
+ def generate_leaderboard_html(data: List[Dict]) -> str:
+     """Generate HTML for main leaderboard"""
+
+     if not data:
+         return "<p>No results to display</p>"
+
+     html = """
+     <style>
+     .leaderboard-table {
+         width: 100%;
+         border-collapse: collapse;
+         font-family: system-ui, -apple-system, sans-serif;
+         margin: 20px 0;
+     }
+     .leaderboard-table th {
+         background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4e 100%);
+         color: white;
+         padding: 12px 8px;
+         text-align: left;
+         font-weight: 600;
+     }
+     .leaderboard-table td {
+         padding: 10px 8px;
+         border-bottom: 1px solid #e0e0e0;
+     }
+     .leaderboard-table tr:nth-child(even) {
+         background-color: #f8f9fa;
+     }
+     .leaderboard-table tr:hover {
+         background-color: #e8f5e9;
+     }
+     .rank-1 { background: linear-gradient(90deg, #ffd700 0%, #fff8dc 100%) !important; }
+     .rank-2 { background: linear-gradient(90deg, #c0c0c0 0%, #f5f5f5 100%) !important; }
+     .rank-3 { background: linear-gradient(90deg, #cd7f32 0%, #ffe4c4 100%) !important; }
+     .score-badge {
+         background: #2d8f4e;
+         color: white;
+         padding: 4px 8px;
+         border-radius: 12px;
+         font-weight: bold;
+     }
+     .type-badge {
+         background: #e3f2fd;
+         color: #1565c0;
+         padding: 2px 6px;
+         border-radius: 4px;
+         font-size: 0.85em;
+     }
+     .metric-good { color: #2e7d32; font-weight: 600; }
+     .metric-bad { color: #c62828; }
+     </style>
+
+     <table class="leaderboard-table">
+         <thead>
+             <tr>
+                 <th>Rank</th>
+                 <th>Tokenizer</th>
+                 <th>Type</th>
+                 <th>Organization</th>
+                 <th>Score ↑</th>
+                 <th>Fertility ↓</th>
+                 <th>Compression ↑</th>
+                 <th>UNK Rate ↓</th>
+                 <th>Datasets</th>
+             </tr>
+         </thead>
+         <tbody>
+     """
+
+     for i, entry in enumerate(data):
+         rank = i + 1
+         rank_class = f"rank-{rank}" if rank <= 3 else ""
+
+         # Color coding for metrics
+         fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
+         comp_class = "metric-good" if entry["compression"] > 3.5 else ""
+         unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
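+         # Heuristic bands: fertility below 2.0 tokens/word is strong for Arabic,
+         # above 3.0 weak; compression above 3.5 bytes/token is efficient;
+         # an UNK rate under 1% is healthy.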
+
+         html += f"""
+             <tr class="{rank_class}">
+                 <td><strong>#{rank}</strong></td>
+                 <td><strong>{entry["name"]}</strong></td>
+                 <td><span class="type-badge">{entry["type"]}</span></td>
+                 <td>{entry["org"]}</td>
+                 <td><span class="score-badge">{entry["score"]}</span></td>
+                 <td class="{fert_class}">{entry["fertility"]:.3f}</td>
+                 <td class="{comp_class}">{entry["compression"]:.2f}</td>
+                 <td class="{unk_class}">{entry["unk_ratio"]:.2%}</td>
+                 <td>{entry["num_datasets"]}</td>
+             </tr>
+         """
+
+     html += """
+         </tbody>
+     </table>
+
+     <div style="margin-top: 15px; padding: 10px; background: #f5f5f5; border-radius: 8px; font-size: 0.9em;">
+         <strong>📊 Metric Guide:</strong><br>
+         • <strong>Score:</strong> Overall ranking (0-100, higher = better)<br>
+         • <strong>Fertility:</strong> Tokens per word (lower = better, 1.0 ideal for Arabic)<br>
+         • <strong>Compression:</strong> Bytes per token (higher = more efficient)<br>
+         • <strong>UNK Rate:</strong> Unknown token percentage (lower = better)
+     </div>
+     """
+
+     return html
+
+ def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
+     """Generate HTML for per-dataset fertility table"""
+
+     if not data:
+         return "<p>No per-dataset results</p>"
+
+     ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
+
+     html = """
+     <style>
+     .dataset-table {
+         width: 100%;
+         border-collapse: collapse;
+         font-family: system-ui, -apple-system, sans-serif;
+         margin: 20px 0;
+         font-size: 0.9em;
+     }
+     .dataset-table th {
+         background: #37474f;
+         color: white;
+         padding: 10px 6px;
+         text-align: center;
+     }
+     .dataset-table th:first-child {
+         text-align: left;
+     }
+     .dataset-table td {
+         padding: 8px 6px;
+         text-align: center;
+         border-bottom: 1px solid #e0e0e0;
+     }
+     .dataset-table td:first-child {
+         text-align: left;
+         font-weight: 500;
+     }
+     .dataset-table tr:nth-child(even) {
+         background-color: #fafafa;
+     }
+     .fert-excellent { background: #c8e6c9; color: #1b5e20; font-weight: 600; }
+     .fert-good { background: #fff9c4; color: #f57f17; }
+     .fert-poor { background: #ffcdd2; color: #b71c1c; }
+     </style>
+
+     <h4>📈 Fertility per Dataset (tokens/word - lower is better)</h4>
+     <table class="dataset-table">
+         <thead>
+             <tr>
+                 <th>Tokenizer</th>
+     """
+
+     for ds_name in ds_names:
+         html += f"<th>{ds_name}</th>"
+
+     html += """
+             </tr>
+         </thead>
+         <tbody>
+     """
+
+     for row in data:
+         html += f"<tr><td>{row['Tokenizer']}</td>"
+         for ds_name in ds_names:
+             val = row.get(ds_name, "-")
+             if val != "-":
+                 if val < 1.8:
+                     cls = "fert-excellent"
+                 elif val < 2.5:
+                     cls = "fert-good"
+                 else:
+                     cls = "fert-poor"
+                 html += f'<td class="{cls}">{val}</td>'
+             else:
+                 html += '<td>-</td>'
+         html += "</tr>"
+
+     html += """
+         </tbody>
+     </table>
+     """
+
+     return html
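+
+ # Note: these per-dataset bands (< 1.8 excellent, < 2.5 good) are slightly
+ # stricter than the main leaderboard's fertility color thresholds (2.0 / 3.0).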
+
  # ============================================================================
  # UI GENERATION FUNCTIONS
  # ============================================================================
 
  def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
      """Generate beautiful HTML visualization of tokens"""
 
      colors = [
+         ('#1a1a2e', '#eaeaea'),
          ('#16213e', '#f0f0f0'),
          ('#0f3460', '#ffffff'),
          ('#533483', '#f5f5f5'),
  ...
      html_parts = []
      for i, (token, tid) in enumerate(zip(tokens, token_ids)):
          bg, fg = colors[i % len(colors)]
 
          display_token = token.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
 
          is_arabic = any(is_arabic_char(c) for c in token)
          direction = 'rtl' if is_arabic else 'ltr'
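          # Per-token direction: Arabic subwords render right-to-left while
          # Latin tokens stay left-to-right inside the RTL token container.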
 
  ...
  def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
      """Generate metrics visualization card"""
 
      fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor"
      strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor"
      compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor"
  ...
          <div class="metric-card {strr_quality}">
              <div class="metric-icon">✨</div>
              <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
+             <div class="metric-label">STRR (Single Token Retention)</div>
              <div class="metric-hint">Higher is better</div>
          </div>
 
          <div class="metric-card">
+             <div class="metric-icon">🔤</div>
              <div class="metric-value">{metrics.char_per_token:.2f}</div>
              <div class="metric-label">Characters/Token</div>
          </div>
 
+         <div class="metric-card {'excellent' if metrics.oov_percentage == 0 else 'poor' if metrics.oov_percentage > 5 else 'good'}">
+             <div class="metric-icon">❓</div>
+             <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
+             <div class="metric-label">OOV Rate</div>
+             <div class="metric-hint">Lower is better (0% ideal)</div>
          </div>
 
+         <div class="metric-card">
+             <div class="metric-icon">🌍</div>
              <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
              <div class="metric-label">Arabic Fertility</div>
          </div>
 
          <div class="metric-card">
+             <div class="metric-icon">⚡</div>
+             <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
+             <div class="metric-label">Processing Time</div>
          </div>
      </div>
      '''
 
  def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
      """Generate tokenizer information card"""
 
+     dialect_badges = ''.join([f'<span class="badge dialect">{d}</span>' for d in info.dialect_support])
+     feature_badges = ''.join([f'<span class="badge feature">{f}</span>' for f in info.special_features])
 
+     support_class = "native" if info.arabic_support == "Native" else "supported" if info.arabic_support == "Supported" else "limited"
 
      return f'''
+     <div class="info-card">
+         <div class="info-header">
              <h3>{info.name}</h3>
              <span class="org-badge">{info.organization}</span>
          </div>
+
+         <p class="description">{info.description}</p>
+
+         <div class="info-grid">
+             <div class="info-item">
+                 <span class="info-label">Type:</span>
+                 <span class="info-value">{info.type.value}</span>
              </div>
+             <div class="info-item">
+                 <span class="info-label">Algorithm:</span>
+                 <span class="info-value">{info.algorithm.value}</span>
              </div>
+             <div class="info-item">
+                 <span class="info-label">Vocab Size:</span>
+                 <span class="info-value">{info.vocab_size:,}</span>
              </div>
+             <div class="info-item">
+                 <span class="info-label">Arabic Support:</span>
+                 <span class="info-value support-{support_class}">{info.arabic_support}</span>
              </div>
          </div>
+
+         <div class="badge-container">
              <div class="badge-group">
                  <span class="badge-label">Dialects:</span>
                  {dialect_badges}
  ...
      </div>
      '''
 
  def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
+     """Analyze a single tokenizer"""
 
+     if not text or not text.strip():
          return (
+             '<div class="warning">⚠️ Please enter some text to analyze</div>',
+             '', '', ''
+         )
+
+     if not tokenizer_choice:
+         return (
+             '<div class="warning">⚠️ Please select a tokenizer</div>',
+             '', '', ''
          )
 
      model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
+     tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
+
+     if not tokenizer_info:
+         return (
+             '<div class="error-card"><h4>Error</h4><p>Tokenizer not found</p></div>',
+             '', '', ''
+         )
 
      try:
+         metrics = analyze_tokenization(text, model_id, tokenizer_info)
 
+         info_html = generate_tokenizer_info_card(tokenizer_info)
+         metrics_html = generate_metrics_card(metrics, tokenizer_info)
          tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
 
          decoded_html = f'''
          <div class="decoded-section">
              <h4>Decoded Output</h4>
              <div class="decoded-text" dir="auto">{metrics.decoded_text}</div>
              <div class="decoded-meta">
+                 Diacritics preserved: {'✅ Yes' if metrics.diacritic_preservation else '❌ No'}
              </div>
          </div>
          '''
 
          return info_html, metrics_html, tokens_html, decoded_html
 
      except Exception as e:
+         return (
+             f'<div class="error-card"><h4>Error</h4><p>{str(e)}</p></div>',
+             '', '', ''
+         )
 
  def compare_tokenizers(tokenizer_choices: List[str], text: str) -> str:
+     """Compare multiple tokenizers"""
 
+     if not text or not text.strip():
+         return '<div class="warning">⚠️ Please enter some text to analyze</div>'
 
      if not tokenizer_choices or len(tokenizer_choices) < 2:
+         return '<div class="warning">⚠️ Please select at least 2 tokenizers to compare</div>'
 
      results = []
 
      for choice in tokenizer_choices:
          model_id = tokenizer_manager.get_model_id_from_choice(choice)
+         tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
 
+         if tokenizer_info:
+             try:
+                 metrics = analyze_tokenization(text, model_id, tokenizer_info)
+                 results.append({
+                     'name': tokenizer_info.name,
+                     'org': tokenizer_info.organization,
+                     'type': tokenizer_info.type.value,
+                     'metrics': metrics
+                 })
+             except Exception as e:
+                 results.append({
+                     'name': tokenizer_info.name,
+                     'org': tokenizer_info.organization,
+                     'type': tokenizer_info.type.value,
+                     'error': str(e)
+                 })
 
+     # Sort by fertility (lower is better)
+     results.sort(key=lambda x: x.get('metrics', TokenizationMetrics(
+         total_tokens=0, total_words=0, total_characters=0, total_bytes=0,
+         fertility=999, compression_ratio=0, char_per_token=0,
+         oov_count=0, oov_percentage=0, single_token_words=0,
+         single_token_retention_rate=0, avg_subwords_per_word=0,
+         max_subwords_per_word=0, continued_words_ratio=0,
+         arabic_char_count=0, arabic_token_count=0, arabic_fertility=0,
+         diacritic_preservation=False, tokenization_time_ms=0
+     )).fertility)
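+     # The placeholder TokenizationMetrics above (fertility=999) makes tokenizers
+     # that raised an error sort to the bottom of the comparison table.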
 
      # Generate comparison table
+     html = '''
      <div class="comparison-container">
          <table class="comparison-table">
              <thead>
                  <tr>
+                     <th>Rank</th>
                      <th>Tokenizer</th>
+                     <th>Type</th>
                      <th>Tokens</th>
                      <th>Fertility ↓</th>
+                     <th>Compression ↑</th>
+                     <th>STRR ↑</th>
                      <th>OOV %</th>
                  </tr>
              </thead>
              <tbody>
+     '''
+
+     for i, result in enumerate(results):
+         rank = i + 1
+         rank_class = 'rank-1' if rank == 1 else 'rank-2' if rank == 2 else 'rank-3' if rank == 3 else ''
+
+         if 'error' in result:
+             html += f'''
+                 <tr class="{rank_class}">
+                     <td>#{rank}</td>
+                     <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
+                     <td>{result['type']}</td>
+                     <td colspan="5" class="error">Error: {result['error']}</td>
+                 </tr>
+             '''
+         else:
+             m = result['metrics']
+             fertility_class = 'excellent' if m.fertility < 1.5 else 'good' if m.fertility < 2.5 else 'poor'
+
+             html += f'''
+                 <tr class="{rank_class}">
+                     <td><strong>#{rank}</strong></td>
+                     <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
+                     <td>{result['type']}</td>
+                     <td>{m.total_tokens}</td>
+                     <td class="{fertility_class}">{m.fertility:.3f}</td>
+                     <td>{m.compression_ratio:.2f}</td>
+                     <td>{m.single_token_retention_rate:.1%}</td>
+                     <td>{m.oov_percentage:.1f}%</td>
+                 </tr>
+             '''
+
+     html += '''
              </tbody>
          </table>
      </div>
      '''
+
+     return html
 
  # ============================================================================
+ # CUSTOM CSS
  # ============================================================================
 
  CUSTOM_CSS = """
+ /* ===== ROOT VARIABLES ===== */
  :root {
+     --primary: #1a5f2a;
+     --primary-light: #2d8f4e;
+     --secondary: #4a90d9;
+     --accent: #f59e0b;
+     --success: #10b981;
      --warning: #f57c00;
      --error: #c62828;
+     --bg-primary: #0f1419;
+     --bg-secondary: #1c2128;
+     --bg-card: #22272e;
+     --text-primary: #e6edf3;
+     --text-secondary: #8b949e;
+     --border: #30363d;
  }
 
+ /* ===== HEADER ===== */
  .header-section {
      text-align: center;
+     padding: 2rem 1rem;
+     background: linear-gradient(135deg, var(--primary) 0%, var(--primary-light) 100%);
      border-radius: 16px;
+     margin-bottom: 1.5rem;
  }
 
  .header-section h1 {
      font-size: 2.5rem;
      color: white;
      margin-bottom: 0.5rem;
  }
 
  .header-section p {
      font-size: 1.1rem;
  }
 
+ /* ===== INFO CARD ===== */
+ .info-card {
      background: var(--bg-card);
      border-radius: 12px;
+     padding: 1.5rem;
      border: 1px solid var(--border);
  }
 
+ .info-header {
+     display: flex;
+     justify-content: space-between;
      align-items: center;
+     margin-bottom: 1rem;
  }
 
+ .info-header h3 {
+     color: var(--text-primary);
+     margin: 0;
  }
 
+ .org-badge {
+     background: var(--primary);
+     color: white;
+     padding: 0.25rem 0.75rem;
+     border-radius: 20px;
+     font-size: 0.85rem;
+ }
+
+ .description {
+     color: var(--text-secondary);
+     line-height: 1.6;
+ }
+
+ .info-grid {
+     display: grid;
+     grid-template-columns: repeat(2, 1fr);
+     gap: 1rem;
+     margin: 1rem 0;
+ }
+
+ .info-item {
+     display: flex;
+     flex-direction: column;
+ }
+
+ .info-label {
+     color: var(--text-secondary);
+     font-size: 0.85rem;
+ }
+
+ .info-value {
+     color: var(--text-primary);
+     font-weight: 600;
+ }
+
+ .support-native { color: var(--success); }
+ .support-supported { color: var(--secondary); }
+ .support-limited { color: var(--warning); }
+
+ /* ===== BADGES ===== */
+ .badge-container {
+     margin-top: 1rem;
+ }
+
+ .badge-group {
+     margin-bottom: 0.5rem;
+ }
+
+ .badge-label {
+     color: var(--text-secondary);
+     font-size: 0.85rem;
+     margin-right: 0.5rem;
+ }
+
+ .badge {
+     display: inline-block;
+     padding: 0.2rem 0.5rem;
+     border-radius: 4px;
+     font-size: 0.75rem;
+     margin-right: 0.25rem;
+     margin-bottom: 0.25rem;
+ }
+
+ .badge.dialect {
+     background: rgba(74, 144, 217, 0.2);
+     color: var(--secondary);
+ }
+
+ .badge.feature {
+     background: rgba(245, 158, 11, 0.2);
+     color: var(--accent);
  }
 
  /* ===== METRICS GRID ===== */
  .metrics-grid {
      display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
      gap: 1rem;
+     margin: 1rem 0;
  }
 
  .metric-card {
      background: var(--bg-card);
      border-radius: 12px;
+     padding: 1rem;
      text-align: center;
+     border: 1px solid var(--border);
+     transition: transform 0.2s;
  }
 
  .metric-card:hover {
+     transform: translateY(-2px);
  }
 
  .metric-card.excellent {
      border-color: var(--success);
+     background: linear-gradient(to bottom, rgba(16, 185, 129, 0.1), transparent);
  }
 
  .metric-card.good {
+     border-color: var(--secondary);
+     background: linear-gradient(to bottom, rgba(74, 144, 217, 0.1), transparent);
  }
 
  .metric-card.poor {
+     border-color: var(--error);
+     background: linear-gradient(to bottom, rgba(198, 40, 40, 0.1), transparent);
  }
 
  .metric-card.primary {
+     border-color: var(--primary);
+     background: linear-gradient(to bottom, rgba(26, 95, 42, 0.1), transparent);
  }
 
  .metric-icon {
  ...
  }
 
  .metric-value {
+     font-size: 1.5rem;
      font-weight: 700;
      color: var(--text-primary);
  }
 
  .metric-label {
+     font-size: 0.8rem;
      color: var(--text-secondary);
+     margin-top: 0.25rem;
  }
 
  .metric-hint {
  ...
      opacity: 0.7;
  }
 
+ /* ===== TOKEN VISUALIZATION ===== */
+ .token-container {
+     display: flex;
+     flex-wrap: wrap;
+     gap: 0.5rem;
+     padding: 1rem;
+     background: var(--bg-secondary);
      border-radius: 12px;
+     direction: rtl;
  }
 
+ .token {
+     display: inline-flex;
+     flex-direction: column;
      align-items: center;
+     padding: 0.5rem 0.75rem;
+     border-radius: 8px;
+     font-family: 'IBM Plex Sans Arabic', monospace;
+     font-size: 1rem;
+     transition: transform 0.2s;
+     cursor: default;
  }
 
+ .token:hover {
+     transform: scale(1.05);
  }
 
+ .token-id {
+     font-size: 0.65rem;
+     opacity: 0.7;
+     margin-top: 0.25rem;
  }
 
+ /* ===== DECODED SECTION ===== */
+ .decoded-section {
+     background: var(--bg-card);
+     border-radius: 12px;
+     padding: 1.5rem;
+     border: 1px solid var(--border);
  }
 
+ .decoded-section h4 {
+     color: var(--text-primary);
      margin-bottom: 1rem;
  }
 
+ .decoded-text {
+     font-family: 'IBM Plex Sans Arabic', serif;
+     font-size: 1.1rem;
+     line-height: 1.8;
      color: var(--text-primary);
  }
 
+ .decoded-meta {
+     margin-top: 1rem;
      font-size: 0.85rem;
+     color: var(--text-secondary);
  }
 
  /* ===== COMPARISON TABLE ===== */
  .comparison-container {
+     overflow-x: auto;
  }
 
  .comparison-table {
      width: 100%;
      border-collapse: collapse;
+     margin: 1rem 0;
  }
 
  .comparison-table th {
+     background: var(--primary);
+     color: white;
+     padding: 0.75rem;
      text-align: left;
+     font-weight: 600;
  }
 
  .comparison-table td {
+     padding: 0.75rem;
      border-bottom: 1px solid var(--border);
      color: var(--text-primary);
  }
 
+ .comparison-table tr:hover {
+     background: rgba(74, 144, 217, 0.1);
  }
 
+ .comparison-table .rank-1 {
+     background: linear-gradient(90deg, rgba(255, 215, 0, 0.2), transparent);
  }
 
+ .comparison-table .rank-2 {
+     background: linear-gradient(90deg, rgba(192, 192, 192, 0.2), transparent);
  }
 
+ .comparison-table .rank-3 {
+     background: linear-gradient(90deg, rgba(205, 127, 50, 0.2), transparent);
  }
 
+ .comparison-table .excellent {
+     color: var(--success);
+     font-weight: 600;
  }
 
+ .comparison-table .good {
+     color: var(--secondary);
  }
 
+ .comparison-table .poor {
+     color: var(--error);
  }
 
  /* ===== UTILITY CLASSES ===== */
  ...
 
      available_tokenizers = tokenizer_manager.get_tokenizer_choices()
 
+     # Group tokenizers by type
+     arabic_specific = [t for t in available_tokenizers if any(x in t for x in ['AraBERT', 'CAMeL', 'MARBERT', 'ARBERT', 'Aranizer'])]
+     arabic_llms = [t for t in available_tokenizers if any(x in t for x in ['Jais', 'AceGPT', 'ALLaM', 'SILMA', 'Fanar', 'Yehia', 'Atlas'])]
      multilingual = [t for t in available_tokenizers if t not in arabic_specific and t not in arabic_llms]
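+     # Tokenizers matching none of the substrings above fall into multilingual.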
 
      with gr.Blocks(css=CUSTOM_CSS, title="Arabic Tokenizer Arena Pro", theme=gr.themes.Base(
+         primary_hue="green",
+         secondary_hue="blue",
          neutral_hue="slate",
          font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
      )) as demo:
  ...
                  tokens_output = gr.HTML(label="Token Visualization")
                  decoded_output = gr.HTML(label="Decoded Output")
 
                  sample_dropdown.change(
                      lambda x: SAMPLE_TEXTS.get(x, ""),
                      inputs=[sample_dropdown],
  ...
                      outputs=[comparison_output]
                  )
 
+         # ===== TAB 3: LEADERBOARD - Real HF Datasets =====
+         with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
+             gr.Markdown("""
+             ## 🏆 Arabic Tokenizer Leaderboard
+
+             Evaluate and rank tokenizers using **real Arabic datasets from HuggingFace**.
+             Select datasets and tokenizers below, then click "Run Evaluation" to generate the leaderboard.
+
+             ⚠️ **Note:** First run will download datasets from HuggingFace (may take a few minutes).
+             """)
+
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     gr.Markdown("### 📚 Select Datasets")
+                     dataset_choices = gr.CheckboxGroup(
+                         choices=[(f"{v['name']} ({v['category']})", k) for k, v in LEADERBOARD_DATASETS.items()],
+                         value=["arabic_mmlu", "arsentd_lev", "athar", "arcd"],
+                         label="HuggingFace Datasets",
+                         info="Datasets will be downloaded from HuggingFace"
+                     )
+
+                 with gr.Column(scale=1):
+                     gr.Markdown("### 🔧 Select Tokenizers")
+                     leaderboard_tokenizer_choices = gr.CheckboxGroup(
+                         choices=available_tokenizers,
+                         value=available_tokenizers[:8],
+                         label="Tokenizers to Evaluate"
+                     )
+
+             run_leaderboard_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")
+
+             status_output = gr.Markdown("Click 'Run Evaluation' to start...")
+
+             gr.Markdown("---")
+             gr.Markdown("### 📊 Leaderboard Results")
+
+             leaderboard_output = gr.HTML()
+
+             gr.Markdown("### 📈 Per-Dataset Breakdown")
+             per_dataset_output = gr.HTML()
+
+             run_leaderboard_btn.click(
+                 fn=run_leaderboard_evaluation,
+                 inputs=[dataset_choices, leaderboard_tokenizer_choices],
+                 outputs=[leaderboard_output, per_dataset_output, status_output]
+             )
+
+             gr.Markdown("""
+             ---
+             ### 📖 Dataset Sources (from HuggingFace)
+
+             | Dataset | HuggingFace ID | Category | Description |
+             |---------|----------------|----------|-------------|
+             | ArabicMMLU | `MBZUAI/ArabicMMLU` | Benchmark | Multi-task exam questions (14,575 MCQs) |
+             | ArSenTD-LEV | `ramybaly/arsentd_lev` | Dialectal | Levantine tweets |
+             | ATHAR | `mohamed-khalil/ATHAR` | Classical | 66K classical Arabic sentences |
+             | ARCD | `arcd` | QA | Arabic Reading Comprehension |
+             | Ashaar | `arbml/Ashaar_dataset` | Poetry | 2M+ Arabic poetry verses |
+             | Hadith | `gurgutan/sunnah_ar_en_dataset` | Religious | 50,762 hadiths |
+             | Arabic Sentiment | `arbml/Arabic_Sentiment_Twitter_Corpus` | Social Media | Twitter sentiment |
+             | SANAD | `arbml/SANAD` | News | Arabic news articles |
+             """)
+
+ # ===== TAB 4: Metrics Reference =====
1976
  with gr.TabItem("πŸ“– Metrics Guide", id="guide"):
1977
  gr.Markdown("""
1978
  ## Tokenization Evaluation Metrics Guide
 
2007
  - *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
2008
  - *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
2009
  - *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
 
 
 
 
 
 
 
 
 
2010
  """)
2011
 
2012
+         # ===== TAB 5: About =====
          with gr.TabItem("ℹ️ About", id="about"):
              gr.Markdown(f"""
              ## Arabic Tokenizer Arena Pro
  ...
              ### Available Tokenizers: {len(available_tokenizers)}
 
              **Arabic-Specific Models:**
+             {chr(10).join(['- ' + t for t in arabic_specific[:10]])}
 
              **Arabic LLMs:**
+             {chr(10).join(['- ' + t for t in arabic_llms[:10]])}
 
              **Multilingual LLMs:**
+             {chr(10).join(['- ' + t for t in multilingual[:10]])}
 
              ### Features
 
              ✅ Arabic-specific analysis (dialect support, diacritic preservation)
              ✅ Side-by-side tokenizer comparison
              ✅ Beautiful token visualization
+             ✅ **NEW: Leaderboard with real HuggingFace datasets**
              ✅ Support for MSA, dialectal Arabic, and Classical Arabic
              ✅ Research-backed evaluation methodology
  ...
 
  if __name__ == "__main__":
      demo = create_interface()
+     demo.launch()