HeshamHaroon and Claude committed
Commit f32d4c7 · 1 Parent(s): f2a2081

Refactor: modularize codebase into separate modules


- Split monolithic app.py into logical modules (see the import sketch below):
  - config.py: tokenizer registry, datasets, sample texts
  - tokenizer_manager.py: tokenizer loading and caching
  - analysis.py: tokenization analysis functions
  - leaderboard.py: HF dataset evaluation
  - utils.py: Arabic text utilities
  - styles.py: CSS styles
  - ui_components.py: HTML generation
- Add .gitignore for Python/Gradio
- Add __init__.py for package structure

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
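
A minimal sketch of how the refactored modules are meant to fit together. The imports from `analysis` and `tokenizer_manager` match code visible in this diff; the exact Gradio wiring of the new 52-line `app.py` and the `CSS` export name from `styles.py` are assumptions, not part of the visible diff:

```python
# Sketch of the slimmed-down app.py after the split (illustrative, not verbatim).
import gradio as gr

from analysis import analyze_single_tokenizer   # shown in analysis.py below
from tokenizer_manager import tokenizer_manager  # loading/caching singleton
from styles import CSS                           # assumed export name

with gr.Blocks(css=CSS) as demo:
    tok = gr.Dropdown(choices=tokenizer_manager.get_tokenizer_choices(), label="Tokenizer")
    txt = gr.Textbox(label="Arabic text", rtl=True)
    info, metrics, tokens, decoded = gr.HTML(), gr.HTML(), gr.HTML(), gr.HTML()
    # analyze_single_tokenizer returns four HTML strings, matching these outputs
    gr.Button("Analyze").click(
        analyze_single_tokenizer,
        inputs=[tok, txt],
        outputs=[info, metrics, tokens, decoded],
    )

if __name__ == "__main__":
    demo.launch()
```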

Files changed (12)
  1. .gitignore +38 -0
  2. README.md +99 -10
  3. __init__.py +8 -0
  4. analysis.py +244 -0
  5. app.py +52 -1853
  6. config.py +551 -0
  7. leaderboard.py +449 -0
  8. requirements.txt +7 -1
  9. styles.py +526 -0
  10. tokenizer_manager.py +86 -0
  11. ui_components.py +280 -0
  12. utils.py +56 -0
.gitignore ADDED
@@ -0,0 +1,38 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ venv/
+ .venv/
+ *.egg-info/
+ dist/
+ build/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # Environment
+ .env
+ .env.local
+
+ # Logs
+ *.log
+ logs/
+
+ # Cache
+ .cache/
+ *.cache
+ .gradio/
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # HuggingFace
+ .huggingface/
README.md CHANGED
@@ -1,12 +1,101 @@
- ---
- title: Token
- emoji: 🐠
- colorFrom: purple
- colorTo: purple
- sdk: gradio
- sdk_version: 6.0.1
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+ # 🏟️ Arabic Tokenizer Arena Pro
+
+ Advanced research & production platform for Arabic tokenization analysis.
+
+ ## Features
+
+ - 📊 **Comprehensive Metrics**: Fertility, compression, STRR, OOV rate, and more
+ - 🌍 **Arabic-Specific Analysis**: Dialect support, diacritic preservation
+ - ⚖️ **Side-by-Side Comparison**: Compare multiple tokenizers instantly
+ - 🎨 **Beautiful Visualization**: Token-by-token display with IDs
+ - 🏆 **Leaderboard**: Evaluate on real HuggingFace Arabic datasets
+ - 📖 **Multi-Variant Support**: MSA, dialectal, and Classical Arabic
+
+ ## Project Structure
+
+ ```
+ arabic_tokenizer_arena/
+ ├── app.py                 # Main Gradio application
+ ├── config.py              # Tokenizer registry & dataset configs
+ ├── tokenizer_manager.py   # Tokenizer loading & caching
+ ├── analysis.py            # Tokenization analysis functions
+ ├── leaderboard.py         # Leaderboard with HF datasets
+ ├── ui_components.py       # HTML generation
+ ├── styles.py              # CSS styles
+ ├── utils.py               # Arabic text utilities
+ ├── requirements.txt       # Dependencies
+ └── README.md              # This file
+ ```
+
+ ## Installation
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## Usage
+
+ ### Local Development
+ ```bash
+ python app.py
+ ```
+
+ ### HuggingFace Spaces
+ 1. Upload all `.py` files to your Space
+ 2. Add `HF_TOKEN` secret if using gated models
+ 3. The app will start automatically
+
+ ## Available Tokenizers
+
+ ### Arabic BERT Models
+ - AraBERT v2 (AUB MIND Lab)
+ - CAMeLBERT Mix/MSA/DA/CA (CAMeL Lab)
+ - MARBERT & ARBERT (UBC NLP)
+
+ ### Arabic LLMs
+ - Jais 13B/30B (Inception/MBZUAI)
+ - SILMA 9B (SILMA AI)
+ - Fanar 9B (QCRI)
+ - Yehia 7B (Navid AI)
+ - Atlas-Chat (MBZUAI Paris)
+
+ ### Arabic Tokenizers
+ - Aranizer PBE/SP 32K/86K (RIOTU Lab)
+
+ ### Multilingual Models
+ - Qwen 2.5 (Alibaba)
+ - Gemma 2 (Google)
+ - Mistral (Mistral AI)
+ - XLM-RoBERTa (Meta)
+
+ ## Leaderboard Datasets
+
+ | Dataset | Source | Category |
+ |---------|--------|----------|
+ | ArabicMMLU | MBZUAI | MSA Benchmark |
+ | ArSenTD-LEV | ramybaly | Levantine Dialect |
+ | ATHAR | mohamed-khalil | Classical Arabic |
+ | ARCD | arcd | QA Dataset |
+ | Ashaar | arbml | Poetry |
+ | Hadith | gurgutan | Religious |
+ | Arabic Sentiment | arbml | Social Media |
+ | SANAD | arbml | News |
+
+ ## Metrics
+
+ - **Fertility**: Tokens per word (lower = better, 1.0 ideal)
+ - **Compression**: Bytes per token (higher = better)
+ - **STRR**: Single Token Retention Rate (higher = better)
+ - **OOV Rate**: Out-of-vocabulary percentage (lower = better)
+
+ ## License
+
+ MIT License
+
+ ## Contributing
+
+ Contributions welcome! Please open an issue or PR.
+
  ---

+ Built with ❤️ for the Arabic NLP community
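
To make the metric definitions above concrete, here is a small worked example. The token split is hypothetical (real output depends on the tokenizer); only the arithmetic is the point:

```python
# Hypothetical split of a 4-word MSA sentence into 6 subword tokens.
text = "ذهب الولد إلى المدرسة"
tokens = ["ذهب", "الول", "##د", "إلى", "المدرس", "##ة"]  # illustrative, not real output

words = text.split()
fertility = len(tokens) / len(words)                   # 6 / 4 = 1.5 tokens/word
compression = len(text.encode("utf-8")) / len(tokens)  # 39 bytes / 6 tokens = 6.5
strr = 2 / len(words)                                  # "ذهب" and "إلى" stay whole: 0.5
oov_rate = 0.0                                         # no [UNK] tokens in this split
```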
__init__.py ADDED
@@ -0,0 +1,8 @@
+ """
+ Arabic Tokenizer Arena Pro
+ ==========================
+ A comprehensive platform for evaluating Arabic tokenizers
+ """
+
+ __version__ = "2.0.0"
+ __author__ = "Arabic NLP Community"
analysis.py ADDED
@@ -0,0 +1,244 @@
+ """
+ Tokenization Analysis
+ =====================
+ Core analysis functions for evaluating tokenizers
+ """
+
+ import time
+ from typing import Tuple
+ from config import TokenizerInfo, TokenizationMetrics
+ from utils import count_arabic_chars, get_arabic_words, has_diacritics, is_arabic_char
+ from tokenizer_manager import tokenizer_manager
+
+
+ def analyze_tokenization(
+     text: str,
+     model_id: str,
+     tokenizer_info: TokenizerInfo
+ ) -> TokenizationMetrics:
+     """Perform comprehensive tokenization analysis"""
+
+     tokenizer = tokenizer_manager.get_tokenizer(model_id)
+
+     # Time the tokenization
+     start_time = time.perf_counter()
+     tokens = tokenizer.tokenize(text)
+     token_ids = tokenizer.encode(text, add_special_tokens=False)
+     tokenization_time = (time.perf_counter() - start_time) * 1000
+
+     decoded = tokenizer.decode(token_ids, skip_special_tokens=True)
+
+     # Basic counts
+     words = text.split()
+     total_words = len(words)
+     total_tokens = len(tokens)
+     total_characters = len(text)
+     total_bytes = len(text.encode('utf-8'))
+
+     # Efficiency metrics
+     fertility = total_tokens / max(total_words, 1)
+     compression_ratio = total_bytes / max(total_tokens, 1)
+     char_per_token = total_characters / max(total_tokens, 1)
+
+     # OOV analysis
+     unk_token = tokenizer.unk_token if hasattr(tokenizer, 'unk_token') else '[UNK]'
+     oov_count = sum(1 for t in tokens if t == unk_token or '[UNK]' in str(t))
+     oov_percentage = (oov_count / max(total_tokens, 1)) * 100
+
+     # Single Token Retention Rate (STRR)
+     single_token_words = 0
+     subwords_per_word = []
+
+     for word in words:
+         word_tokens = tokenizer.tokenize(word)
+         subwords_per_word.append(len(word_tokens))
+         if len(word_tokens) == 1:
+             single_token_words += 1
+
+     strr = single_token_words / max(total_words, 1)
+     avg_subwords = sum(subwords_per_word) / max(len(subwords_per_word), 1)
+     max_subwords = max(subwords_per_word) if subwords_per_word else 0
+     continued_ratio = (total_words - single_token_words) / max(total_words, 1)
+
+     # Arabic-specific metrics
+     arabic_char_count = count_arabic_chars(text)
+     arabic_words = get_arabic_words(text)
+     arabic_tokens_count = 0
+
+     for token in tokens:
+         if any(is_arabic_char(c) for c in str(token)):
+             arabic_tokens_count += 1
+
+     arabic_fertility = arabic_tokens_count / max(len(arabic_words), 1) if arabic_words else 0
+     diacritic_preserved = has_diacritics(text) == has_diacritics(decoded)
+
+     return TokenizationMetrics(
+         total_tokens=total_tokens,
+         total_words=total_words,
+         total_characters=total_characters,
+         total_bytes=total_bytes,
+         fertility=fertility,
+         compression_ratio=compression_ratio,
+         char_per_token=char_per_token,
+         oov_count=oov_count,
+         oov_percentage=oov_percentage,
+         single_token_words=single_token_words,
+         single_token_retention_rate=strr,
+         avg_subwords_per_word=avg_subwords,
+         max_subwords_per_word=max_subwords,
+         continued_words_ratio=continued_ratio,
+         arabic_char_count=arabic_char_count,
+         arabic_token_count=arabic_tokens_count,
+         arabic_fertility=arabic_fertility,
+         diacritic_preservation=diacritic_preserved,
+         tokenization_time_ms=tokenization_time,
+         tokens=tokens,
+         token_ids=token_ids,
+         decoded_text=decoded
+     )
+
+
+ def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
+     """Analyze a single tokenizer - returns HTML outputs"""
+     from ui_components import (
+         generate_tokenizer_info_card,
+         generate_metrics_card,
+         generate_token_visualization,
+         generate_decoded_section
+     )
+
+     if not text or not text.strip():
+         return (
+             '<div class="warning">⚠️ Please enter some text to analyze</div>',
+             '', '', ''
+         )
+
+     if not tokenizer_choice:
+         return (
+             '<div class="warning">⚠️ Please select a tokenizer</div>',
+             '', '', ''
+         )
+
+     model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
+     tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
+
+     if not tokenizer_info:
+         return (
+             '<div class="error-card"><h4>Error</h4><p>Tokenizer not found</p></div>',
+             '', '', ''
+         )
+
+     try:
+         metrics = analyze_tokenization(text, model_id, tokenizer_info)
+
+         info_html = generate_tokenizer_info_card(tokenizer_info)
+         metrics_html = generate_metrics_card(metrics, tokenizer_info)
+         tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
+         decoded_html = generate_decoded_section(metrics)
+
+         return info_html, metrics_html, tokens_html, decoded_html
+
+     except Exception as e:
+         return (
+             f'<div class="error-card"><h4>Error</h4><p>{str(e)}</p></div>',
+             '', '', ''
+         )
+
+
+ def compare_tokenizers(tokenizer_choices: list, text: str) -> str:
+     """Compare multiple tokenizers - returns HTML table"""
+     from config import TokenizationMetrics
+
+     if not text or not text.strip():
+         return '<div class="warning">⚠️ Please enter some text to analyze</div>'
+
+     if not tokenizer_choices or len(tokenizer_choices) < 2:
+         return '<div class="warning">⚠️ Please select at least 2 tokenizers to compare</div>'
+
+     results = []
+
+     for choice in tokenizer_choices:
+         model_id = tokenizer_manager.get_model_id_from_choice(choice)
+         tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
+
+         if tokenizer_info:
+             try:
+                 metrics = analyze_tokenization(text, model_id, tokenizer_info)
+                 results.append({
+                     'name': tokenizer_info.name,
+                     'org': tokenizer_info.organization,
+                     'type': tokenizer_info.type.value,
+                     'metrics': metrics
+                 })
+             except Exception as e:
+                 results.append({
+                     'name': tokenizer_info.name,
+                     'org': tokenizer_info.organization,
+                     'type': tokenizer_info.type.value,
+                     'error': str(e)
+                 })
+
+     # Sort by fertility (lower is better)
+     def get_fertility(x):
+         if 'error' in x:
+             return 999
+         return x['metrics'].fertility
+
+     results.sort(key=get_fertility)
+
+     # Generate comparison table
+     html = '''
+     <div class="comparison-container">
+     <table class="comparison-table">
+     <thead>
+     <tr>
+     <th>Rank</th>
+     <th>Tokenizer</th>
+     <th>Type</th>
+     <th>Tokens</th>
+     <th>Fertility ↓</th>
+     <th>Compression ↑</th>
+     <th>STRR ↑</th>
+     <th>OOV %</th>
+     </tr>
+     </thead>
+     <tbody>
+     '''
+
+     for i, result in enumerate(results):
+         rank = i + 1
+         rank_class = 'rank-1' if rank == 1 else 'rank-2' if rank == 2 else 'rank-3' if rank == 3 else ''
+
+         if 'error' in result:
+             html += f'''
+             <tr class="{rank_class}">
+             <td>#{rank}</td>
+             <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
+             <td>{result['type']}</td>
+             <td colspan="5" class="error">Error: {result['error']}</td>
+             </tr>
+             '''
+         else:
+             m = result['metrics']
+             fertility_class = 'excellent' if m.fertility < 1.5 else 'good' if m.fertility < 2.5 else 'poor'
+
+             html += f'''
+             <tr class="{rank_class}">
+             <td><strong>#{rank}</strong></td>
+             <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
+             <td>{result['type']}</td>
+             <td>{m.total_tokens}</td>
+             <td class="{fertility_class}">{m.fertility:.3f}</td>
+             <td>{m.compression_ratio:.2f}</td>
+             <td>{m.single_token_retention_rate:.1%}</td>
+             <td>{m.oov_percentage:.1f}%</td>
+             </tr>
+             '''
+
+     html += '''
+     </tbody>
+     </table>
+     </div>
+     '''
+
+     return html
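
A quick usage sketch for the module above. It assumes the Space's dependencies are installed, that at least one registry tokenizer loads, and that `tokenizer_manager.py` keeps the `get_tokenizer_choices()` helper from the old `app.py`; the sample sentence is illustrative:

```python
from tokenizer_manager import tokenizer_manager
from analysis import analyze_tokenization, compare_tokenizers

text = "اللغة العربية جميلة"  # illustrative sample

# Single-tokenizer metrics: pick the first available registry entry.
model_id, info = next(iter(tokenizer_manager.get_available_tokenizers().items()))
m = analyze_tokenization(text, model_id, info)
print(f"{info.name}: {m.total_tokens} tokens, fertility={m.fertility:.2f}, "
      f"STRR={m.single_token_retention_rate:.1%}")

# Comparison takes the dropdown display strings and needs at least two of them.
html_table = compare_tokenizers(tokenizer_manager.get_tokenizer_choices()[:2], text)
```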
app.py CHANGED
@@ -1,1819 +1,38 @@
  """
- Arabic Tokenizer Arena Pro - Advanced Arabic Tokenization Analysis Platform
- ============================================================================
- A comprehensive research and production-grade tool for evaluating Arabic tokenizers
- across multiple dimensions: efficiency, coverage, morphological awareness, and more.

- Now with LEADERBOARD - imports real Arabic datasets from HuggingFace!
-
- Supports:
- - Arabic-specific tokenizers (Aranizer, AraBERT, CAMeLBERT, MARBERT, etc.)
- - Major LLM tokenizers (Jais, AceGPT, Falcon-Arabic, ALLaM, Qwen, Llama, Mistral, GPT)
- - Comprehensive evaluation metrics based on latest research
- - Real dataset benchmarking from HuggingFace
  """

  import gradio as gr
- import json
- import re
- import time
- import unicodedata
- from typing import Dict, List, Tuple, Optional, Any
- from dataclasses import dataclass, field
- from enum import Enum
- from collections import defaultdict
- import statistics
- import os
-
- # Hugging Face authentication
- HF_TOKEN = os.getenv('HF_TOKEN')
- if HF_TOKEN:
-     HF_TOKEN = HF_TOKEN.strip()
-     from huggingface_hub import login
-     login(token=HF_TOKEN)
-
- from transformers import AutoTokenizer, logging
- logging.set_verbosity_error()
-
- # Import datasets library for leaderboard
- from datasets import load_dataset
-
- # ============================================================================
- # DATA CLASSES AND ENUMS
- # ============================================================================
-
- class TokenizerType(Enum):
-     ARABIC_SPECIFIC = "Arabic-Specific"
-     MULTILINGUAL_LLM = "Multilingual LLM"
-     ARABIC_LLM = "Arabic LLM"
-     ENCODER_ONLY = "Encoder-Only (BERT)"
-     DECODER_ONLY = "Decoder-Only (GPT)"
-
- class TokenizerAlgorithm(Enum):
-     BPE = "Byte-Pair Encoding (BPE)"
-     BBPE = "Byte-Level BPE"
-     WORDPIECE = "WordPiece"
-     SENTENCEPIECE = "SentencePiece"
-     UNIGRAM = "Unigram"
-     TIKTOKEN = "Tiktoken"
-
- @dataclass
- class TokenizerInfo:
-     """Metadata about a tokenizer"""
-     name: str
-     model_id: str
-     type: TokenizerType
-     algorithm: TokenizerAlgorithm
-     vocab_size: int
-     description: str
-     organization: str
-     arabic_support: str  # Native, Adapted, Limited
-     dialect_support: List[str] = field(default_factory=list)
-     special_features: List[str] = field(default_factory=list)
-
- @dataclass
- class TokenizationMetrics:
-     """Comprehensive tokenization evaluation metrics"""
-     # Basic counts
-     total_tokens: int
-     total_words: int
-     total_characters: int
-     total_bytes: int
-
-     # Efficiency metrics
-     fertility: float  # tokens per word (lower is better, 1.0 is ideal)
-     compression_ratio: float  # bytes per token (higher is better)
-     char_per_token: float  # characters per token
-
-     # Coverage metrics
-     oov_count: int  # out-of-vocabulary tokens (UNK)
-     oov_percentage: float
-     single_token_words: int  # words tokenized as single token
-     single_token_retention_rate: float  # STRR metric
-
-     # Morphological metrics
-     avg_subwords_per_word: float
-     max_subwords_per_word: int
-     continued_words_ratio: float  # words split into multiple tokens
-
-     # Arabic-specific metrics
-     arabic_char_count: int
-     arabic_token_count: int
-     arabic_fertility: float
-     diacritic_preservation: bool
-
-     # Performance metrics
-     tokenization_time_ms: float
-
-     # Token details
-     tokens: List[str] = field(default_factory=list)
-     token_ids: List[int] = field(default_factory=list)
-     decoded_text: str = ""
-
- # ============================================================================
- # TOKENIZER REGISTRY - Comprehensive list of Arabic tokenizers
- # ============================================================================

- TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
-     # ========== ARABIC-SPECIFIC BERT MODELS ==========
-     "aubmindlab/bert-base-arabertv2": TokenizerInfo(
-         name="AraBERT v2",
-         model_id="aubmindlab/bert-base-arabertv2",
-         type=TokenizerType.ENCODER_ONLY,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=64000,
-         description="Arabic BERT with Farasa segmentation, optimized for MSA",
-         organization="AUB MIND Lab",
-         arabic_support="Native",
-         dialect_support=["MSA"],
-         special_features=["Farasa preprocessing", "Morphological segmentation"]
-     ),
-     "aubmindlab/bert-large-arabertv2": TokenizerInfo(
-         name="AraBERT v2 Large",
-         model_id="aubmindlab/bert-large-arabertv2",
-         type=TokenizerType.ENCODER_ONLY,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=64000,
-         description="Large Arabic BERT with enhanced capacity",
-         organization="AUB MIND Lab",
-         arabic_support="Native",
-         dialect_support=["MSA"],
-         special_features=["Large model", "Farasa preprocessing"]
-     ),
-     "CAMeL-Lab/bert-base-arabic-camelbert-mix": TokenizerInfo(
-         name="CAMeLBERT Mix",
-         model_id="CAMeL-Lab/bert-base-arabic-camelbert-mix",
-         type=TokenizerType.ENCODER_ONLY,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=30000,
-         description="Pre-trained on MSA, DA, and Classical Arabic mix",
-         organization="CAMeL Lab NYU Abu Dhabi",
-         arabic_support="Native",
-         dialect_support=["MSA", "DA", "CA"],
-         special_features=["Multi-variant Arabic", "Classical Arabic support"]
-     ),
-     "CAMeL-Lab/bert-base-arabic-camelbert-msa": TokenizerInfo(
-         name="CAMeLBERT MSA",
-         model_id="CAMeL-Lab/bert-base-arabic-camelbert-msa",
-         type=TokenizerType.ENCODER_ONLY,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=30000,
-         description="Specialized for Modern Standard Arabic",
-         organization="CAMeL Lab NYU Abu Dhabi",
-         arabic_support="Native",
-         dialect_support=["MSA"],
-         special_features=["MSA optimized"]
-     ),
-     "CAMeL-Lab/bert-base-arabic-camelbert-da": TokenizerInfo(
-         name="CAMeLBERT DA",
-         model_id="CAMeL-Lab/bert-base-arabic-camelbert-da",
-         type=TokenizerType.ENCODER_ONLY,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=30000,
-         description="Specialized for Dialectal Arabic",
-         organization="CAMeL Lab NYU Abu Dhabi",
-         arabic_support="Native",
-         dialect_support=["Egyptian", "Gulf", "Levantine", "Maghrebi"],
-         special_features=["Dialect optimized"]
-     ),
-     "CAMeL-Lab/bert-base-arabic-camelbert-ca": TokenizerInfo(
-         name="CAMeLBERT CA",
-         model_id="CAMeL-Lab/bert-base-arabic-camelbert-ca",
-         type=TokenizerType.ENCODER_ONLY,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=30000,
-         description="Specialized for Classical Arabic",
-         organization="CAMeL Lab NYU Abu Dhabi",
-         arabic_support="Native",
-         dialect_support=["Classical"],
-         special_features=["Classical Arabic", "Religious texts"]
-     ),
-     "UBC-NLP/MARBERT": TokenizerInfo(
-         name="MARBERT",
-         model_id="UBC-NLP/MARBERT",
-         type=TokenizerType.ENCODER_ONLY,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=100000,
-         description="Multi-dialectal Arabic BERT trained on Twitter data",
-         organization="UBC NLP",
-         arabic_support="Native",
-         dialect_support=["MSA", "Egyptian", "Gulf", "Levantine", "Maghrebi"],
-         special_features=["Twitter data", "100K vocabulary", "Multi-dialect"]
-     ),
-     "UBC-NLP/ARBERT": TokenizerInfo(
-         name="ARBERT",
-         model_id="UBC-NLP/ARBERT",
-         type=TokenizerType.ENCODER_ONLY,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=100000,
-         description="Arabic BERT focused on MSA with large vocabulary",
-         organization="UBC NLP",
-         arabic_support="Native",
-         dialect_support=["MSA"],
-         special_features=["100K vocabulary", "MSA focused"]
-     ),
-
-     # ========== ARABIC-SPECIFIC TOKENIZERS ==========
-     "riotu-lab/Aranizer-PBE-86k": TokenizerInfo(
-         name="Aranizer PBE 86K",
-         model_id="riotu-lab/Aranizer-PBE-86k",
-         type=TokenizerType.ARABIC_SPECIFIC,
-         algorithm=TokenizerAlgorithm.BPE,
-         vocab_size=86000,
-         description="Pair Byte Encoding tokenizer optimized for Arabic LLMs",
-         organization="RIOTU Lab",
-         arabic_support="Native",
-         dialect_support=["MSA"],
-         special_features=["Low fertility", "LLM optimized", "86K vocab"]
-     ),
-     "riotu-lab/Aranizer-SP-86k": TokenizerInfo(
-         name="Aranizer SP 86K",
-         model_id="riotu-lab/Aranizer-SP-86k",
-         type=TokenizerType.ARABIC_SPECIFIC,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=86000,
-         description="SentencePiece tokenizer optimized for Arabic",
-         organization="RIOTU Lab",
-         arabic_support="Native",
-         dialect_support=["MSA"],
-         special_features=["Low fertility", "SentencePiece", "86K vocab"]
-     ),
-
-     # ========== ARABIC-SPECIFIC LLMs ==========
-     "ALLaM-AI/ALLaM-7B-Instruct-preview": TokenizerInfo(
-         name="ALLaM 7B Instruct",
-         model_id="ALLaM-AI/ALLaM-7B-Instruct-preview",
-         type=TokenizerType.ARABIC_LLM,
-         algorithm=TokenizerAlgorithm.BPE,
-         vocab_size=128000,
-         description="Saudi Arabia's flagship Arabic LLM by SDAIA, SOTA on Arabic MMLU",
-         organization="SDAIA (Saudi Arabia)",
-         arabic_support="Native",
-         dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
-         special_features=["SOTA Arabic", "Islamic values aligned", "Vision 2030"]
-     ),
-     "inception-mbzuai/jais-13b": TokenizerInfo(
-         name="Jais 13B",
-         model_id="inception-mbzuai/jais-13b",
-         type=TokenizerType.ARABIC_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=84992,
-         description="World's most advanced Arabic LLM, trained from scratch",
-         organization="Inception/MBZUAI",
-         arabic_support="Native",
-         dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
-         special_features=["Arabic-first", "Lowest fertility", "UAE-native"]
-     ),
-     "inceptionai/jais-family-30b-8k-chat": TokenizerInfo(
-         name="Jais 30B Chat",
-         model_id="inceptionai/jais-family-30b-8k-chat",
-         type=TokenizerType.ARABIC_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=84992,
-         description="Enhanced 30B version with chat capabilities",
-         organization="Inception AI",
-         arabic_support="Native",
-         dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
-         special_features=["30B parameters", "Chat optimized", "8K context"]
-     ),
-     "FreedomIntelligence/AceGPT-13B-chat": TokenizerInfo(
-         name="AceGPT 13B Chat",
-         model_id="FreedomIntelligence/AceGPT-13B-chat",
-         type=TokenizerType.ARABIC_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=32000,
-         description="Arabic-enhanced LLaMA with cultural alignment and chat",
-         organization="Freedom Intelligence",
-         arabic_support="Adapted",
-         dialect_support=["MSA"],
-         special_features=["LLaMA-based", "Cultural alignment", "RLHF", "Chat"]
-     ),
-     "silma-ai/SILMA-9B-Instruct-v1.0": TokenizerInfo(
-         name="SILMA 9B Instruct",
-         model_id="silma-ai/SILMA-9B-Instruct-v1.0",
-         type=TokenizerType.ARABIC_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=256000,
-         description="Top-ranked Arabic LLM based on Gemma, outperforms larger models",
-         organization="SILMA AI",
-         arabic_support="Native",
-         dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
-         special_features=["Gemma-based", "SOTA 9B class", "Efficient"]
-     ),
-     "QCRI/Fanar-1-9B-Instruct": TokenizerInfo(
-         name="Fanar 9B Instruct",
-         model_id="QCRI/Fanar-1-9B-Instruct",
-         type=TokenizerType.ARABIC_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=256000,
-         description="Qatar's Arabic LLM aligned with Islamic values and Arab culture",
-         organization="QCRI (Qatar)",
-         arabic_support="Native",
-         dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
-         special_features=["Islamic RAG", "Cultural alignment", "Gemma-based"]
-     ),
-     "Navid-AI/Yehia-7B-preview": TokenizerInfo(
-         name="Yehia 7B Preview",
-         model_id="Navid-AI/Yehia-7B-preview",
-         type=TokenizerType.ARABIC_LLM,
-         algorithm=TokenizerAlgorithm.BPE,
-         vocab_size=128256,
-         description="Best Arabic model on AraGen-Leaderboard (0.5B-25B), GRPO trained",
-         organization="Navid AI",
-         arabic_support="Native",
-         dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
-         special_features=["GRPO trained", "3C3H aligned", "SOTA AraGen"]
-     ),
-
-     # ========== DIALECT-SPECIFIC MODELS ==========
-     "MBZUAI-Paris/Atlas-Chat-9B": TokenizerInfo(
-         name="Atlas-Chat 9B (Darija)",
-         model_id="MBZUAI-Paris/Atlas-Chat-9B",
-         type=TokenizerType.ARABIC_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=256000,
-         description="First LLM for Moroccan Arabic (Darija), Gemma-based",
-         organization="MBZUAI Paris",
-         arabic_support="Native",
-         dialect_support=["Darija", "MSA"],
-         special_features=["Moroccan dialect", "Transliteration", "Cultural"]
-     ),
-
-     # ========== MULTILINGUAL LLMs WITH ARABIC SUPPORT ==========
-     "Qwen/Qwen2.5-7B": TokenizerInfo(
-         name="Qwen 2.5 7B",
-         model_id="Qwen/Qwen2.5-7B",
-         type=TokenizerType.MULTILINGUAL_LLM,
-         algorithm=TokenizerAlgorithm.BPE,
-         vocab_size=151936,
-         description="Alibaba's multilingual LLM with 30+ language support",
-         organization="Alibaba Qwen",
-         arabic_support="Supported",
-         dialect_support=["MSA"],
-         special_features=["152K vocab", "128K context", "30+ languages"]
-     ),
-     "google/gemma-2-9b": TokenizerInfo(
-         name="Gemma 2 9B",
-         model_id="google/gemma-2-9b",
-         type=TokenizerType.MULTILINGUAL_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=256000,
-         description="Google's efficient multilingual model",
-         organization="Google",
-         arabic_support="Supported",
-         dialect_support=["MSA"],
-         special_features=["256K vocab", "Efficient architecture"]
-     ),
-     "mistralai/Mistral-7B-v0.3": TokenizerInfo(
-         name="Mistral 7B v0.3",
-         model_id="mistralai/Mistral-7B-v0.3",
-         type=TokenizerType.MULTILINGUAL_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=32768,
-         description="Efficient multilingual model with sliding window attention",
-         organization="Mistral AI",
-         arabic_support="Limited",
-         dialect_support=["MSA"],
-         special_features=["Sliding window", "Efficient"]
-     ),
-     "mistralai/Mistral-Nemo-Base-2407": TokenizerInfo(
-         name="Mistral Nemo",
-         model_id="mistralai/Mistral-Nemo-Base-2407",
-         type=TokenizerType.MULTILINGUAL_LLM,
-         algorithm=TokenizerAlgorithm.TIKTOKEN,
-         vocab_size=131072,
-         description="Uses Tekken tokenizer, optimized for multilingual",
-         organization="Mistral AI + NVIDIA",
-         arabic_support="Supported",
-         dialect_support=["MSA"],
-         special_features=["Tekken tokenizer", "131K vocab", "Multilingual optimized"]
-     ),
-     "xlm-roberta-base": TokenizerInfo(
-         name="XLM-RoBERTa Base",
-         model_id="xlm-roberta-base",
-         type=TokenizerType.MULTILINGUAL_LLM,
-         algorithm=TokenizerAlgorithm.SENTENCEPIECE,
-         vocab_size=250002,
-         description="Cross-lingual model covering 100 languages",
-         organization="Facebook AI",
-         arabic_support="Supported",
-         dialect_support=["MSA"],
-         special_features=["250K vocab", "100 languages"]
-     ),
-     "bert-base-multilingual-cased": TokenizerInfo(
-         name="mBERT",
-         model_id="bert-base-multilingual-cased",
-         type=TokenizerType.MULTILINGUAL_LLM,
-         algorithm=TokenizerAlgorithm.WORDPIECE,
-         vocab_size=119547,
-         description="Original multilingual BERT, baseline for comparison",
-         organization="Google",
-         arabic_support="Limited",
-         dialect_support=["MSA"],
-         special_features=["Baseline model", "104 languages"]
-     ),
-     "tiiuae/falcon-7b": TokenizerInfo(
-         name="Falcon 7B",
-         model_id="tiiuae/falcon-7b",
-         type=TokenizerType.MULTILINGUAL_LLM,
-         algorithm=TokenizerAlgorithm.BPE,
-         vocab_size=65024,
-         description="TII's powerful open-source LLM",
-         organization="Technology Innovation Institute",
-         arabic_support="Limited",
-         dialect_support=["MSA"],
-         special_features=["65K vocab", "RefinedWeb trained"]
-     ),
- }

- # ============================================================================
- # LEADERBOARD DATASETS CONFIGURATION - Real HuggingFace Datasets
- # ============================================================================
-
- LEADERBOARD_DATASETS = {
-     # MSA Benchmarks
-     "arabic_mmlu": {
-         "hf_id": "MBZUAI/ArabicMMLU",
-         "name": "ArabicMMLU",
-         "category": "MSA Benchmark",
-         "text_column": "Question",
-         "split": "test",
-         "subset": None,
-         "samples": 500,
-         "description": "Multi-task benchmark from Arab school exams (14,575 MCQs)"
-     },
-
-     # Dialectal Arabic
-     "arsentd_lev": {
-         "hf_id": "ramybaly/arsentd_lev",
-         "name": "ArSenTD-LEV",
-         "category": "Levantine Dialect",
-         "text_column": "Tweet",
-         "split": "train",
-         "subset": None,
-         "samples": 500,
-         "description": "Levantine Arabic tweets (Jordan, Lebanon, Syria, Palestine)"
-     },
-
-     # Classical Arabic
-     "athar": {
-         "hf_id": "mohamed-khalil/ATHAR",
-         "name": "ATHAR Classical",
-         "category": "Classical Arabic",
-         "text_column": "arabic",
-         "split": "train",
-         "subset": None,
-         "samples": 500,
-         "description": "66K classical Arabic sentences with translations"
-     },
-
-     # Question Answering
-     "arcd": {
-         "hf_id": "arcd",
-         "name": "ARCD",
-         "category": "QA Dataset",
-         "text_column": "context",
-         "split": "train",
-         "subset": None,
-         "samples": 300,
-         "description": "Arabic Reading Comprehension Dataset (1,395 questions)"
-     },
-
-     # Poetry
-     "ashaar": {
-         "hf_id": "arbml/Ashaar_dataset",
-         "name": "Ashaar Poetry",
-         "category": "Poetry",
-         "text_column": "poem_text",
-         "split": "train",
-         "subset": None,
-         "samples": 500,
-         "description": "2M+ Arabic poetry verses with meter and theme labels"
-     },
-
-     # Religious - Hadith
-     "hadith": {
-         "hf_id": "gurgutan/sunnah_ar_en_dataset",
-         "name": "Hadith Collection",
-         "category": "Religious",
-         "text_column": "hadith_text_ar",
-         "split": "train",
-         "subset": None,
-         "samples": 400,
-         "description": "50,762 hadiths from 14 authentic books"
-     },
-
-     # Social Media
-     "arabic_sentiment": {
-         "hf_id": "arbml/Arabic_Sentiment_Twitter_Corpus",
-         "name": "Arabic Sentiment",
-         "category": "Social Media",
-         "text_column": "text",
-         "split": "train",
-         "subset": None,
-         "samples": 500,
-         "description": "Arabic Twitter sentiment corpus"
-     },
-
-     # News
-     "sanad": {
-         "hf_id": "arbml/SANAD",
-         "name": "SANAD News",
-         "category": "News",
-         "text_column": "text",
-         "split": "train",
-         "subset": "alarabiya",
-         "samples": 400,
-         "description": "Arabic news articles from Al Arabiya"
-     },
- }
-
- # ============================================================================
- # TOKENIZER LOADER AND CACHE
- # ============================================================================
-
- class TokenizerManager:
-     """Manages tokenizer loading and caching"""
-
-     def __init__(self):
-         self._cache: Dict[str, Any] = {}
-         self._available: Dict[str, TokenizerInfo] = {}
-         self._initialize_available_tokenizers()
-
-     def _initialize_available_tokenizers(self):
-         """Check which tokenizers are available and can be loaded"""
-         print("Initializing tokenizer registry...")
-
-         # Add all base tokenizers
-         for model_id, info in TOKENIZER_REGISTRY.items():
-             try:
-                 # Quick check if tokenizer can be loaded
-                 _ = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-                 self._available[model_id] = info
-                 print(f"  ✓ {info.name}")
-             except Exception as e:
-                 print(f"  ✗ {info.name}: {str(e)[:50]}")
-
-         print(f"\nTotal available tokenizers: {len(self._available)}")
-
-     def get_tokenizer(self, model_id: str):
-         """Get tokenizer from cache or load it"""
-         if model_id not in self._cache:
-             self._cache[model_id] = AutoTokenizer.from_pretrained(
-                 model_id,
-                 trust_remote_code=True
-             )
-         return self._cache[model_id]
-
-     def get_available_tokenizers(self) -> Dict[str, TokenizerInfo]:
-         return self._available
-
-     def get_tokenizer_choices(self) -> List[str]:
-         """Get list of tokenizer display names for dropdown"""
-         return [f"{info.name} ({info.organization})" for info in self._available.values()]
-
-     def get_model_id_from_choice(self, choice: str) -> str:
-         """Convert display choice back to model ID"""
-         for model_id, info in self._available.items():
-             if f"{info.name} ({info.organization})" == choice:
-                 return model_id
-         return list(self._available.keys())[0]
-
- # Global tokenizer manager
- tokenizer_manager = TokenizerManager()
-
- # ============================================================================
- # ARABIC TEXT UTILITIES
- # ============================================================================
-
- def is_arabic_char(char: str) -> bool:
-     """Check if character is Arabic"""
-     if len(char) != 1:
-         return False
-     code = ord(char)
-     return (
-         (0x0600 <= code <= 0x06FF) or   # Arabic
-         (0x0750 <= code <= 0x077F) or   # Arabic Supplement
-         (0x08A0 <= code <= 0x08FF) or   # Arabic Extended-A
-         (0xFB50 <= code <= 0xFDFF) or   # Arabic Presentation Forms-A
-         (0xFE70 <= code <= 0xFEFF)      # Arabic Presentation Forms-B
-     )
-
- def count_arabic_chars(text: str) -> int:
-     """Count Arabic characters in text"""
-     return sum(1 for c in text if is_arabic_char(c))
-
- def has_diacritics(text: str) -> bool:
-     """Check if text contains Arabic diacritics (tashkeel)"""
-     diacritics = set('ًٌٍَُِّْٰ')
-     return any(c in diacritics for c in text)
-
- def get_arabic_words(text: str) -> List[str]:
-     """Extract Arabic words from text"""
-     words = text.split()
-     return [w for w in words if any(is_arabic_char(c) for c in w)]
-
- # ============================================================================
- # TOKENIZATION ANALYSIS ENGINE
- # ============================================================================
-
- def analyze_tokenization(
-     text: str,
-     model_id: str,
-     tokenizer_info: TokenizerInfo
- ) -> TokenizationMetrics:
-     """Perform comprehensive tokenization analysis"""
-
-     tokenizer = tokenizer_manager.get_tokenizer(model_id)
-
-     # Time the tokenization
-     start_time = time.perf_counter()
-     tokens = tokenizer.tokenize(text)
-     token_ids = tokenizer.encode(text, add_special_tokens=False)
-     tokenization_time = (time.perf_counter() - start_time) * 1000
-
-     decoded = tokenizer.decode(token_ids, skip_special_tokens=True)
-
-     # Basic counts
-     words = text.split()
-     total_words = len(words)
-     total_tokens = len(tokens)
-     total_characters = len(text)
-     total_bytes = len(text.encode('utf-8'))
-
-     # Efficiency metrics
-     fertility = total_tokens / max(total_words, 1)
-     compression_ratio = total_bytes / max(total_tokens, 1)
-     char_per_token = total_characters / max(total_tokens, 1)
-
-     # OOV analysis
-     unk_token = tokenizer.unk_token if hasattr(tokenizer, 'unk_token') else '[UNK]'
-     oov_count = sum(1 for t in tokens if t == unk_token or '[UNK]' in str(t))
-     oov_percentage = (oov_count / max(total_tokens, 1)) * 100
-
-     # Single Token Retention Rate (STRR)
-     single_token_words = 0
-     subwords_per_word = []
-
-     for word in words:
-         word_tokens = tokenizer.tokenize(word)
-         subwords_per_word.append(len(word_tokens))
-         if len(word_tokens) == 1:
-             single_token_words += 1
-
-     strr = single_token_words / max(total_words, 1)
-     avg_subwords = sum(subwords_per_word) / max(len(subwords_per_word), 1)
-     max_subwords = max(subwords_per_word) if subwords_per_word else 0
-     continued_ratio = (total_words - single_token_words) / max(total_words, 1)
-
-     # Arabic-specific metrics
-     arabic_char_count = count_arabic_chars(text)
-     arabic_words = get_arabic_words(text)
-     arabic_tokens_count = 0
-
-     for token in tokens:
-         if any(is_arabic_char(c) for c in str(token)):
-             arabic_tokens_count += 1
-
-     arabic_fertility = arabic_tokens_count / max(len(arabic_words), 1) if arabic_words else 0
-     diacritic_preserved = has_diacritics(text) == has_diacritics(decoded)
-
-     return TokenizationMetrics(
-         total_tokens=total_tokens,
-         total_words=total_words,
-         total_characters=total_characters,
-         total_bytes=total_bytes,
-         fertility=fertility,
-         compression_ratio=compression_ratio,
-         char_per_token=char_per_token,
-         oov_count=oov_count,
-         oov_percentage=oov_percentage,
-         single_token_words=single_token_words,
-         single_token_retention_rate=strr,
-         avg_subwords_per_word=avg_subwords,
-         max_subwords_per_word=max_subwords,
-         continued_words_ratio=continued_ratio,
-         arabic_char_count=arabic_char_count,
-         arabic_token_count=arabic_tokens_count,
-         arabic_fertility=arabic_fertility,
-         diacritic_preservation=diacritic_preserved,
-         tokenization_time_ms=tokenization_time,
-         tokens=tokens,
-         token_ids=token_ids,
-         decoded_text=decoded
-     )
-
- # ============================================================================
- # LEADERBOARD FUNCTIONS - Import Real Datasets from HuggingFace
- # ============================================================================
-
- class HFDatasetLoader:
-     """Load Arabic datasets from HuggingFace"""
-
-     def __init__(self):
-         self.cache = {}
-
-     def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
-         """Load texts from a HuggingFace dataset"""
-
-         if dataset_key in self.cache:
-             return self.cache[dataset_key], f"✅ Loaded {len(self.cache[dataset_key])} samples (cached)"
-
-         config = LEADERBOARD_DATASETS.get(dataset_key)
-         if not config:
-             return [], f"❌ Unknown dataset: {dataset_key}"
-
-         try:
-             # Load dataset from HuggingFace
-             if config.get("subset"):
-                 ds = load_dataset(
-                     config["hf_id"],
-                     config["subset"],
-                     split=config["split"],
-                     trust_remote_code=True
-                 )
-             else:
-                 ds = load_dataset(
-                     config["hf_id"],
-                     split=config["split"],
-                     trust_remote_code=True
-                 )
-
-             texts = []
-             text_col = config["text_column"]
-
-             # Try to find text column
-             if text_col not in ds.column_names:
-                 for col in ["text", "content", "sentence", "arabic", "context", "Tweet", "question", "poem_text", "hadith_text_ar"]:
-                     if col in ds.column_names:
-                         text_col = col
-                         break
-
-             # Extract texts
-             max_samples = config.get("samples", 500)
-             for i, item in enumerate(ds):
-                 if i >= max_samples:
-                     break
-                 text = item.get(text_col, "")
-                 if text and isinstance(text, str) and len(text.strip()) > 10:
-                     texts.append(text.strip())
-
-             self.cache[dataset_key] = texts
-             return texts, f"✅ Loaded {len(texts)} samples from HuggingFace"
-
-         except Exception as e:
-             return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"
-
- def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
-     """Evaluate a tokenizer on a list of texts"""
-
-     fertilities = []
-     compressions = []
-     unk_counts = 0
-     total_tokens = 0
-
-     for text in texts:
-         try:
-             tokens = tokenizer.encode(text, add_special_tokens=False)
-             decoded = tokenizer.convert_ids_to_tokens(tokens)
-
-             num_tokens = len(tokens)
-             num_words = len(text.split()) or 1
-             num_bytes = len(text.encode('utf-8'))
-
-             fertility = num_tokens / num_words
-             compression = num_bytes / num_tokens if num_tokens > 0 else 0
-
-             # Count UNKs
-             unk_token = getattr(tokenizer, 'unk_token', '[UNK]')
-             unks = sum(1 for t in decoded if t and (t == unk_token or '<unk>' in str(t).lower() or '[unk]' in str(t).lower()))
-
-             fertilities.append(fertility)
-             compressions.append(compression)
-             unk_counts += unks
-             total_tokens += num_tokens
-
-         except Exception:
-             continue
-
-     if not fertilities:
-         return None
-
-     return {
-         "avg_fertility": statistics.mean(fertilities),
-         "std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
-         "avg_compression": statistics.mean(compressions),
-         "unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
-         "samples": len(fertilities)
-     }
-
- def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
-     """Calculate overall score (0-100, higher is better)"""
-     # Lower fertility is better (ideal ~1.0 for Arabic)
-     fertility_score = max(0, min(1, 2.0 / fertility)) if fertility > 0 else 0
-     # Higher compression is better
-     compression_score = min(1, compression / 6)
-     # Lower UNK is better
-     unk_score = 1 - min(1, unk_ratio * 20)
-
-     # Weighted combination
-     score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
-     return round(score, 1)
-
- def run_leaderboard_evaluation(
-     selected_datasets: List[str],
-     selected_tokenizers: List[str],
-     progress=gr.Progress()
- ) -> Tuple[str, str, str]:
-     """
-     Run the full leaderboard evaluation with real HF datasets
-     Returns: (leaderboard_html, per_dataset_html, status_message)
-     """
-
-     if not selected_datasets:
-         return "", "", "⚠️ Please select at least one dataset"
-
-     if not selected_tokenizers:
-         return "", "", "⚠️ Please select at least one tokenizer"
-
-     loader = HFDatasetLoader()
-     results = defaultdict(dict)
-
-     # Status tracking
-     status_lines = []
-
-     # Load datasets from HuggingFace
-     status_lines.append("📚 **Loading Datasets from HuggingFace:**\n")
-     loaded_datasets = {}
-
-     for i, ds_key in enumerate(selected_datasets):
-         progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
-         texts, msg = loader.load_dataset_texts(ds_key)
-         ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
-         status_lines.append(f"  • {ds_name}: {msg}")
-         if texts:
-             loaded_datasets[ds_key] = texts
-
-     if not loaded_datasets:
-         return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"
-
-     # Evaluate tokenizers
-     status_lines.append("\n🔄 **Evaluating Tokenizers:**\n")
-
-     tokenizer_cache = {}
-     total_steps = len(selected_tokenizers) * len(loaded_datasets)
-     current_step = 0
-
-     for tok_choice in selected_tokenizers:
-         # Get model ID from choice
-         tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
-         tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
-         tok_name = tok_info.name if tok_info else tok_choice
-
-         # Load tokenizer
-         try:
-             if tok_id not in tokenizer_cache:
-                 tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
-                     tok_id, trust_remote_code=True
-                 )
-             tokenizer = tokenizer_cache[tok_id]
-             status_lines.append(f"  • {tok_name}: ✅ Loaded")
-         except Exception as e:
-             status_lines.append(f"  • {tok_name}: ❌ Failed ({str(e)[:30]})")
-             continue
-
-         # Evaluate on each dataset
-         for ds_key, texts in loaded_datasets.items():
-             current_step += 1
-             progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
-
-             metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
-             if metrics:
-                 results[tok_choice][ds_key] = metrics
-
-     # Generate leaderboard
-     progress(0.95, "Generating leaderboard...")
-
-     leaderboard_data = []
-     per_dataset_data = []
-
-     for tok_choice, ds_results in results.items():
-         if not ds_results:
-             continue
-
-         tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
-         tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
-
-         # Aggregate across datasets
-         all_fertility = [m["avg_fertility"] for m in ds_results.values()]
-         all_compression = [m["avg_compression"] for m in ds_results.values()]
-         all_unk = [m["unk_ratio"] for m in ds_results.values()]
-
-         avg_fertility = statistics.mean(all_fertility)
-         avg_compression = statistics.mean(all_compression)
-         avg_unk = statistics.mean(all_unk)
-
-         score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)
-
-         leaderboard_data.append({
-             "name": tok_info.name if tok_info else tok_choice,
-             "type": tok_info.type.value if tok_info else "Unknown",
-             "org": tok_info.organization if tok_info else "Unknown",
-             "score": score,
-             "fertility": avg_fertility,
-             "compression": avg_compression,
-             "unk_ratio": avg_unk,
-             "num_datasets": len(ds_results)
-         })
-
-         # Per-dataset row
-         per_ds_row = {"Tokenizer": tok_info.name if tok_info else tok_choice}
-         for ds_key in selected_datasets:
-             ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
-             if ds_key in ds_results:
-                 per_ds_row[ds_name] = round(ds_results[ds_key]["avg_fertility"], 2)
-             else:
-                 per_ds_row[ds_name] = "-"
-         per_dataset_data.append(per_ds_row)
-
-     # Sort by score
-     leaderboard_data.sort(key=lambda x: x["score"], reverse=True)
-
-     # Create HTML tables
-     leaderboard_html = generate_leaderboard_html(leaderboard_data)
-     per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
-
-     status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
-
-     return leaderboard_html, per_dataset_html, "\n".join(status_lines)
-
- def generate_leaderboard_html(data: List[Dict]) -> str:
-     """Generate HTML for main leaderboard"""
-
-     if not data:
-         return "<p>No results to display</p>"
-
-     html = """
-     <style>
-     .leaderboard-table {
-         width: 100%;
-         border-collapse: collapse;
-         font-family: system-ui, -apple-system, sans-serif;
-         margin: 20px 0;
-     }
-     .leaderboard-table th {
-         background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4e 100%);
-         color: white;
-         padding: 12px 8px;
-         text-align: left;
-         font-weight: 600;
-     }
-     .leaderboard-table td {
-         padding: 10px 8px;
-         border-bottom: 1px solid #e0e0e0;
-     }
-     .leaderboard-table tr:nth-child(even) {
-         background-color: #f8f9fa;
-     }
-     .leaderboard-table tr:hover {
-         background-color: #e8f5e9;
-     }
-     .rank-1 { background: linear-gradient(90deg, #ffd700 0%, #fff8dc 100%) !important; }
-     .rank-2 { background: linear-gradient(90deg, #c0c0c0 0%, #f5f5f5 100%) !important; }
-     .rank-3 { background: linear-gradient(90deg, #cd7f32 0%, #ffe4c4 100%) !important; }
-     .score-badge {
-         background: #2d8f4e;
-         color: white;
-         padding: 4px 8px;
-         border-radius: 12px;
-         font-weight: bold;
-     }
-     .type-badge {
-         background: #e3f2fd;
-         color: #1565c0;
-         padding: 2px 6px;
-         border-radius: 4px;
-         font-size: 0.85em;
-     }
-     .metric-good { color: #2e7d32; font-weight: 600; }
-     .metric-bad { color: #c62828; }
-     </style>
-
-     <table class="leaderboard-table">
-     <thead>
-     <tr>
-     <th>Rank</th>
-     <th>Tokenizer</th>
-     <th>Type</th>
-     <th>Organization</th>
-     <th>Score ↑</th>
-     <th>Fertility ↓</th>
-     <th>Compression ↑</th>
-     <th>UNK Rate ↓</th>
-     <th>Datasets</th>
-     </tr>
-     </thead>
-     <tbody>
-     """
-
-     for i, entry in enumerate(data):
-         rank = i + 1
-         rank_class = f"rank-{rank}" if rank <= 3 else ""
-
-         # Color coding for metrics
-         fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
-         comp_class = "metric-good" if entry["compression"] > 3.5 else ""
-         unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
-
-         html += f"""
-         <tr class="{rank_class}">
-         <td><strong>#{rank}</strong></td>
-         <td><strong>{entry["name"]}</strong></td>
-         <td><span class="type-badge">{entry["type"]}</span></td>
-         <td>{entry["org"]}</td>
-         <td><span class="score-badge">{entry["score"]}</span></td>
-         <td class="{fert_class}">{entry["fertility"]:.3f}</td>
-         <td class="{comp_class}">{entry["compression"]:.2f}</td>
-         <td class="{unk_class}">{entry["unk_ratio"]:.2%}</td>
-         <td>{entry["num_datasets"]}</td>
-         </tr>
-         """
-
-     html += """
-     </tbody>
-     </table>
-
-     <div style="margin-top: 15px; padding: 10px; background: #f5f5f5; border-radius: 8px; font-size: 0.9em;">
-     <strong>📊 Metric Guide:</strong><br>
-     • <strong>Score:</strong> Overall ranking (0-100, higher = better)<br>
-     • <strong>Fertility:</strong> Tokens per word (lower = better, 1.0 ideal for Arabic)<br>
-     • <strong>Compression:</strong> Bytes per token (higher = more efficient)<br>
-     • <strong>UNK Rate:</strong> Unknown token percentage (lower = better)
-     </div>
-     """
-
-     return html
-
- def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
-     """Generate HTML for per-dataset fertility table"""
-
-     if not data:
-         return "<p>No per-dataset results</p>"
-
-     ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
-
-     html = """
-     <style>
-     .dataset-table {
-         width: 100%;
-         border-collapse: collapse;
-         font-family: system-ui, -apple-system, sans-serif;
-         margin: 20px 0;
-         font-size: 0.9em;
-     }
-     .dataset-table th {
-         background: #37474f;
-         color: white;
-         padding: 10px 6px;
-         text-align: center;
-     }
-     .dataset-table th:first-child {
-         text-align: left;
-     }
-     .dataset-table td {
-         padding: 8px 6px;
-         text-align: center;
-         border-bottom: 1px solid #e0e0e0;
-     }
-     .dataset-table td:first-child {
-         text-align: left;
-         font-weight: 500;
-     }
-     .dataset-table tr:nth-child(even) {
-         background-color: #fafafa;
-     }
-     .fert-excellent { background: #c8e6c9; color: #1b5e20; font-weight: 600; }
-     .fert-good { background: #fff9c4; color: #f57f17; }
-     .fert-poor { background: #ffcdd2; color: #b71c1c; }
-     </style>
-
-     <h4>📈 Fertility per Dataset (tokens/word - lower is better)</h4>
-     <table class="dataset-table">
-     <thead>
-     <tr>
-     <th>Tokenizer</th>
-     """
-
-     for ds_name in ds_names:
-         html += f"<th>{ds_name}</th>"
-
-     html += """
-     </tr>
-     </thead>
-     <tbody>
-     """
-
-     for row in data:
-         html += f"<tr><td>{row['Tokenizer']}</td>"
-         for ds_name in ds_names:
-             val = row.get(ds_name, "-")
-             if val != "-":
-                 if val < 1.8:
-                     cls = "fert-excellent"
-                 elif val < 2.5:
-                     cls = "fert-good"
-                 else:
-                     cls = "fert-poor"
-                 html += f'<td class="{cls}">{val}</td>'
-             else:
-                 html += '<td>-</td>'
-         html += "</tr>"
-
-     html += """
-     </tbody>
-     </table>
-     """
-
-     return html
-
- # ============================================================================
- # UI GENERATION FUNCTIONS
- # ============================================================================
-
- def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
-     """Generate beautiful HTML visualization of tokens"""
-
-     colors = [
-         ('#1a1a2e', '#eaeaea'),
-         ('#16213e', '#f0f0f0'),
-         ('#0f3460', '#ffffff'),
-         ('#533483', '#f5f5f5'),
-         ('#e94560', '#ffffff'),
-         ('#0f4c75', '#f0f0f0'),
-         ('#3282b8', '#ffffff'),
-         ('#bbe1fa', '#1a1a2e'),
-     ]
-
-     html_parts = []
-     for i, (token, tid) in enumerate(zip(tokens, token_ids)):
-         bg, fg = colors[i % len(colors)]
-         display_token = token.replace('<', '&lt;').replace('>', '&gt;')
-         is_arabic = any(is_arabic_char(c) for c in token)
-         direction = 'rtl' if is_arabic else 'ltr'
-
-         html_parts.append(f'''
-         <span class="token" style="
-             background: {bg};
-             color: {fg};
-             direction: {direction};
-         " title="ID: {tid}">
-             {display_token}
-             <span class="token-id">{tid}</span>
-         </span>
-         ''')
-
-     return f'''
-     <div class="token-container">
-         {''.join(html_parts)}
-     </div>
-     '''
-
- def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
-     """Generate metrics visualization card"""
-
-     fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor"
-     strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor"
-     compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor"
-
-     return f'''
-     <div class="metrics-grid">
-         <div class="metric-card primary">
-             <div class="metric-icon">📊</div>
-             <div class="metric-value">{metrics.total_tokens}</div>
-             <div class="metric-label">Total Tokens</div>
-         </div>
-
-         <div class="metric-card {fertility_quality}">
-             <div class="metric-icon">🎯</div>
-             <div class="metric-value">{metrics.fertility:.3f}</div>
-             <div class="metric-label">Fertility (tokens/word)</div>
-             <div class="metric-hint">Lower is better (1.0 ideal)</div>
-         </div>
-
-         <div class="metric-card {compression_quality}">
-             <div class="metric-icon">📦</div>
-             <div class="metric-value">{metrics.compression_ratio:.2f}</div>
-             <div class="metric-label">Compression (bytes/token)</div>
-             <div class="metric-hint">Higher is better</div>
-         </div>
-
-         <div class="metric-card {strr_quality}">
-             <div class="metric-icon">✨</div>
-             <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
-             <div class="metric-label">STRR (Single Token Retention)</div>
-             <div class="metric-hint">Higher is better</div>
-         </div>
-
-         <div class="metric-card">
-             <div class="metric-icon">🔤</div>
-             <div class="metric-value">{metrics.char_per_token:.2f}</div>
-             <div class="metric-label">Characters/Token</div>
-         </div>
-
-         <div class="metric-card {'excellent' if metrics.oov_percentage == 0 else 'poor' if metrics.oov_percentage > 5 else 'good'}">
-             <div class="metric-icon">❓</div>
-             <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
-             <div class="metric-label">OOV Rate</div>
-             <div class="metric-hint">Lower is better (0% ideal)</div>
-         </div>
-
-         <div class="metric-card">
-             <div class="metric-icon">🌍</div>
-             <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
-             <div class="metric-label">Arabic Fertility</div>
-         </div>
-
-         <div class="metric-card">
-             <div class="metric-icon">⚡</div>
-             <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
-             <div class="metric-label">Processing Time</div>
-         </div>
-     </div>
-     '''
-
- def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
-     """Generate tokenizer information card"""
-
-     dialect_badges = ''.join([f'<span class="badge dialect">{d}</span>' for d in info.dialect_support])
-     feature_badges = ''.join([f'<span class="badge feature">{f}</span>' for f in info.special_features])
-
-     support_class = "native" if info.arabic_support == "Native" else "supported" if info.arabic_support == "Supported" else "limited"
-
-     return f'''
-     <div class="info-card">
-         <div class="info-header">
-             <h3>{info.name}</h3>
-             <span class="org-badge">{info.organization}</span>
-         </div>
-
-         <p class="description">{info.description}</p>
-
-         <div class="info-grid">
-             <div class="info-item">
-                 <span class="info-label">Type:</span>
-                 <span class="info-value">{info.type.value}</span>
-             </div>
-             <div class="info-item">
-                 <span class="info-label">Algorithm:</span>
-                 <span class="info-value">{info.algorithm.value}</span>
-             </div>
-             <div class="info-item">
-                 <span class="info-label">Vocab Size:</span>
-                 <span class="info-value">{info.vocab_size:,}</span>
-             </div>
-             <div class="info-item">
-                 <span class="info-label">Arabic Support:</span>
-                 <span class="info-value support-{support_class}">{info.arabic_support}</span>
-             </div>
-         </div>
-
-         <div class="badge-container">
-             <div class="badge-group">
-                 <span class="badge-label">Dialects:</span>
-                 {dialect_badges}
-             </div>
-             <div class="badge-group">
-                 <span class="badge-label">Features:</span>
-                 {feature_badges}
-             </div>
- </div>
1290
- </div>
1291
- '''
1292
-
1293
- def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
1294
- """Analyze a single tokenizer"""
1295
-
1296
- if not text or not text.strip():
1297
- return (
1298
- '<div class="warning">⚠️ Please enter some text to analyze</div>',
1299
- '', '', ''
1300
- )
1301
-
1302
- if not tokenizer_choice:
1303
- return (
1304
- '<div class="warning">⚠️ Please select a tokenizer</div>',
1305
- '', '', ''
1306
- )
1307
-
1308
- model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
1309
- tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
1310
-
1311
- if not tokenizer_info:
1312
- return (
1313
- '<div class="error-card"><h4>Error</h4><p>Tokenizer not found</p></div>',
1314
- '', '', ''
1315
- )
1316
-
1317
- try:
1318
- metrics = analyze_tokenization(text, model_id, tokenizer_info)
1319
-
1320
- info_html = generate_tokenizer_info_card(tokenizer_info)
1321
- metrics_html = generate_metrics_card(metrics, tokenizer_info)
1322
- tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
1323
-
1324
- decoded_html = f'''
1325
- <div class="decoded-section">
1326
- <h4>Decoded Output</h4>
1327
- <div class="decoded-text" dir="auto">{metrics.decoded_text}</div>
1328
- <div class="decoded-meta">
1329
- Diacritics preserved: {'✅ Yes' if metrics.diacritic_preservation else '❌ No'}
1330
- </div>
1331
- </div>
1332
- '''
1333
-
1334
- return info_html, metrics_html, tokens_html, decoded_html
1335
-
1336
- except Exception as e:
1337
- return (
1338
- f'<div class="error-card"><h4>Error</h4><p>{str(e)}</p></div>',
1339
- '', '', ''
1340
- )
1341
-
1342
- def compare_tokenizers(tokenizer_choices: List[str], text: str) -> str:
1343
- """Compare multiple tokenizers"""
1344
-
1345
- if not text or not text.strip():
1346
- return '<div class="warning">⚠️ Please enter some text to analyze</div>'
1347
-
1348
- if not tokenizer_choices or len(tokenizer_choices) < 2:
1349
- return '<div class="warning">⚠️ Please select at least 2 tokenizers to compare</div>'
1350
-
1351
- results = []
1352
-
1353
- for choice in tokenizer_choices:
1354
- model_id = tokenizer_manager.get_model_id_from_choice(choice)
1355
- tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
1356
-
1357
- if tokenizer_info:
1358
- try:
1359
- metrics = analyze_tokenization(text, model_id, tokenizer_info)
1360
- results.append({
1361
- 'name': tokenizer_info.name,
1362
- 'org': tokenizer_info.organization,
1363
- 'type': tokenizer_info.type.value,
1364
- 'metrics': metrics
1365
- })
1366
- except Exception as e:
1367
- results.append({
1368
- 'name': tokenizer_info.name,
1369
- 'org': tokenizer_info.organization,
1370
- 'type': tokenizer_info.type.value,
1371
- 'error': str(e)
1372
- })
1373
-
1374
- # Sort by fertility (lower is better)
1375
- results.sort(key=lambda x: x.get('metrics', TokenizationMetrics(
1376
- total_tokens=0, total_words=0, total_characters=0, total_bytes=0,
1377
- fertility=999, compression_ratio=0, char_per_token=0,
1378
- oov_count=0, oov_percentage=0, single_token_words=0,
1379
- single_token_retention_rate=0, avg_subwords_per_word=0,
1380
- max_subwords_per_word=0, continued_words_ratio=0,
1381
- arabic_char_count=0, arabic_token_count=0, arabic_fertility=0,
1382
- diacritic_preservation=False, tokenization_time_ms=0
1383
- )).fertility)
1384
-
1385
- # Generate comparison table
1386
- html = '''
1387
- <div class="comparison-container">
1388
- <table class="comparison-table">
1389
- <thead>
1390
- <tr>
1391
- <th>Rank</th>
1392
- <th>Tokenizer</th>
1393
- <th>Type</th>
1394
- <th>Tokens</th>
1395
- <th>Fertility ↓</th>
1396
- <th>Compression ↑</th>
1397
- <th>STRR ↑</th>
1398
- <th>OOV %</th>
1399
- </tr>
1400
- </thead>
1401
- <tbody>
1402
- '''
1403
-
1404
- for i, result in enumerate(results):
1405
- rank = i + 1
1406
- rank_class = 'rank-1' if rank == 1 else 'rank-2' if rank == 2 else 'rank-3' if rank == 3 else ''
1407
-
1408
- if 'error' in result:
1409
- html += f'''
1410
- <tr class="{rank_class}">
1411
- <td>#{rank}</td>
1412
- <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
1413
- <td>{result['type']}</td>
1414
- <td colspan="5" class="error">Error: {result['error']}</td>
1415
- </tr>
1416
- '''
1417
- else:
1418
- m = result['metrics']
1419
- fertility_class = 'excellent' if m.fertility < 1.5 else 'good' if m.fertility < 2.5 else 'poor'
1420
-
1421
- html += f'''
1422
- <tr class="{rank_class}">
1423
- <td><strong>#{rank}</strong></td>
1424
- <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
1425
- <td>{result['type']}</td>
1426
- <td>{m.total_tokens}</td>
1427
- <td class="{fertility_class}">{m.fertility:.3f}</td>
1428
- <td>{m.compression_ratio:.2f}</td>
1429
- <td>{m.single_token_retention_rate:.1%}</td>
1430
- <td>{m.oov_percentage:.1f}%</td>
1431
- </tr>
1432
- '''
1433
-
1434
- html += '''
1435
- </tbody>
1436
- </table>
1437
- </div>
1438
- '''
1439
-
1440
- return html
1441
-
1442
- # ============================================================================
1443
- # CUSTOM CSS
1444
- # ============================================================================
1445
-
1446
- CUSTOM_CSS = """
1447
- /* ===== ROOT VARIABLES ===== */
1448
- :root {
1449
- --primary: #1a5f2a;
1450
- --primary-light: #2d8f4e;
1451
- --secondary: #4a90d9;
1452
- --accent: #f59e0b;
1453
- --success: #10b981;
1454
- --warning: #f57c00;
1455
- --error: #c62828;
1456
- --bg-primary: #0f1419;
1457
- --bg-secondary: #1c2128;
1458
- --bg-card: #22272e;
1459
- --text-primary: #e6edf3;
1460
- --text-secondary: #8b949e;
1461
- --border: #30363d;
1462
- }
1463
-
1464
- /* ===== HEADER ===== */
1465
- .header-section {
1466
- text-align: center;
1467
- padding: 2rem 1rem;
1468
- background: linear-gradient(135deg, var(--primary) 0%, var(--primary-light) 100%);
1469
- border-radius: 16px;
1470
- margin-bottom: 1.5rem;
1471
- }
1472
-
1473
- .header-section h1 {
1474
- font-size: 2.5rem;
1475
- color: white;
1476
- margin-bottom: 0.5rem;
1477
- }
1478
-
1479
- .header-section p {
1480
- color: rgba(255,255,255,0.9);
1481
- font-size: 1.1rem;
1482
- }
1483
-
1484
- /* ===== INFO CARD ===== */
1485
- .info-card {
1486
- background: var(--bg-card);
1487
- border-radius: 12px;
1488
- padding: 1.5rem;
1489
- border: 1px solid var(--border);
1490
- }
1491
-
1492
- .info-header {
1493
- display: flex;
1494
- justify-content: space-between;
1495
- align-items: center;
1496
- margin-bottom: 1rem;
1497
- }
1498
-
1499
- .info-header h3 {
1500
- color: var(--text-primary);
1501
- margin: 0;
1502
- }
1503
-
1504
- .org-badge {
1505
- background: var(--primary);
1506
- color: white;
1507
- padding: 0.25rem 0.75rem;
1508
- border-radius: 20px;
1509
- font-size: 0.85rem;
1510
- }
1511
-
1512
- .description {
1513
- color: var(--text-secondary);
1514
- line-height: 1.6;
1515
- }
1516
-
1517
- .info-grid {
1518
- display: grid;
1519
- grid-template-columns: repeat(2, 1fr);
1520
- gap: 1rem;
1521
- margin: 1rem 0;
1522
- }
1523
-
1524
- .info-item {
1525
- display: flex;
1526
- flex-direction: column;
1527
- }
1528
-
1529
- .info-label {
1530
- color: var(--text-secondary);
1531
- font-size: 0.85rem;
1532
- }
1533
-
1534
- .info-value {
1535
- color: var(--text-primary);
1536
- font-weight: 600;
1537
- }
1538
-
1539
- .support-native { color: var(--success); }
1540
- .support-supported { color: var(--secondary); }
1541
- .support-limited { color: var(--warning); }
1542
-
1543
- /* ===== BADGES ===== */
1544
- .badge-container {
1545
- margin-top: 1rem;
1546
- }
1547
-
1548
- .badge-group {
1549
- margin-bottom: 0.5rem;
1550
- }
1551
-
1552
- .badge-label {
1553
- color: var(--text-secondary);
1554
- font-size: 0.85rem;
1555
- margin-right: 0.5rem;
1556
- }
1557
-
1558
- .badge {
1559
- display: inline-block;
1560
- padding: 0.2rem 0.5rem;
1561
- border-radius: 4px;
1562
- font-size: 0.75rem;
1563
- margin-right: 0.25rem;
1564
- margin-bottom: 0.25rem;
1565
- }
1566
-
1567
- .badge.dialect {
1568
- background: rgba(74, 144, 217, 0.2);
1569
- color: var(--secondary);
1570
- }
1571
-
1572
- .badge.feature {
1573
- background: rgba(245, 158, 11, 0.2);
1574
- color: var(--accent);
1575
- }
1576
-
1577
- /* ===== METRICS GRID ===== */
1578
- .metrics-grid {
1579
- display: grid;
1580
- grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
1581
- gap: 1rem;
1582
- margin: 1rem 0;
1583
- }
1584
-
1585
- .metric-card {
1586
- background: var(--bg-card);
1587
- border-radius: 12px;
1588
- padding: 1rem;
1589
- text-align: center;
1590
- border: 1px solid var(--border);
1591
- transition: transform 0.2s;
1592
- }
1593
-
1594
- .metric-card:hover {
1595
- transform: translateY(-2px);
1596
- }
1597
-
1598
- .metric-card.excellent {
1599
- border-color: var(--success);
1600
- background: linear-gradient(to bottom, rgba(16, 185, 129, 0.1), transparent);
1601
- }
1602
-
1603
- .metric-card.good {
1604
- border-color: var(--secondary);
1605
- background: linear-gradient(to bottom, rgba(74, 144, 217, 0.1), transparent);
1606
- }
1607
-
1608
- .metric-card.poor {
1609
- border-color: var(--error);
1610
- background: linear-gradient(to bottom, rgba(198, 40, 40, 0.1), transparent);
1611
- }
1612
-
1613
- .metric-card.primary {
1614
- border-color: var(--primary);
1615
- background: linear-gradient(to bottom, rgba(26, 95, 42, 0.1), transparent);
1616
- }
1617
-
1618
- .metric-icon {
1619
- font-size: 1.5rem;
1620
- margin-bottom: 0.5rem;
1621
- }
1622
-
1623
- .metric-value {
1624
- font-size: 1.5rem;
1625
- font-weight: 700;
1626
- color: var(--text-primary);
1627
- }
1628
-
1629
- .metric-label {
1630
- font-size: 0.8rem;
1631
- color: var(--text-secondary);
1632
- margin-top: 0.25rem;
1633
- }
1634
-
1635
- .metric-hint {
1636
- font-size: 0.7rem;
1637
- color: var(--text-secondary);
1638
- opacity: 0.7;
1639
- }
1640
-
1641
- /* ===== TOKEN VISUALIZATION ===== */
1642
- .token-container {
1643
- display: flex;
1644
- flex-wrap: wrap;
1645
- gap: 0.5rem;
1646
- padding: 1rem;
1647
- background: var(--bg-secondary);
1648
- border-radius: 12px;
1649
- direction: rtl;
1650
- }
1651
-
1652
- .token {
1653
- display: inline-flex;
1654
- flex-direction: column;
1655
- align-items: center;
1656
- padding: 0.5rem 0.75rem;
1657
- border-radius: 8px;
1658
- font-family: 'IBM Plex Sans Arabic', monospace;
1659
- font-size: 1rem;
1660
- transition: transform 0.2s;
1661
- cursor: default;
1662
- }
1663
-
1664
- .token:hover {
1665
- transform: scale(1.05);
1666
- }
1667
-
1668
- .token-id {
1669
- font-size: 0.65rem;
1670
- opacity: 0.7;
1671
- margin-top: 0.25rem;
1672
- }
1673
-
1674
- /* ===== DECODED SECTION ===== */
1675
- .decoded-section {
1676
- background: var(--bg-card);
1677
- border-radius: 12px;
1678
- padding: 1.5rem;
1679
- border: 1px solid var(--border);
1680
- }
1681
-
1682
- .decoded-section h4 {
1683
- color: var(--text-primary);
1684
- margin-bottom: 1rem;
1685
- }
1686
-
1687
- .decoded-text {
1688
- font-family: 'IBM Plex Sans Arabic', serif;
1689
- font-size: 1.1rem;
1690
- line-height: 1.8;
1691
- color: var(--text-primary);
1692
- }
1693
-
1694
- .decoded-meta {
1695
- margin-top: 1rem;
1696
- font-size: 0.85rem;
1697
- color: var(--text-secondary);
1698
- }
1699
-
1700
- /* ===== COMPARISON TABLE ===== */
1701
- .comparison-container {
1702
- overflow-x: auto;
1703
- }
1704
-
1705
- .comparison-table {
1706
- width: 100%;
1707
- border-collapse: collapse;
1708
- margin: 1rem 0;
1709
- }
1710
-
1711
- .comparison-table th {
1712
- background: var(--primary);
1713
- color: white;
1714
- padding: 0.75rem;
1715
- text-align: left;
1716
- font-weight: 600;
1717
- }
1718
-
1719
- .comparison-table td {
1720
- padding: 0.75rem;
1721
- border-bottom: 1px solid var(--border);
1722
- color: var(--text-primary);
1723
- }
1724
-
1725
- .comparison-table tr:hover {
1726
- background: rgba(74, 144, 217, 0.1);
1727
- }
1728
-
1729
- .comparison-table .rank-1 {
1730
- background: linear-gradient(90deg, rgba(255, 215, 0, 0.2), transparent);
1731
- }
1732
-
1733
- .comparison-table .rank-2 {
1734
- background: linear-gradient(90deg, rgba(192, 192, 192, 0.2), transparent);
1735
- }
1736
-
1737
- .comparison-table .rank-3 {
1738
- background: linear-gradient(90deg, rgba(205, 127, 50, 0.2), transparent);
1739
- }
1740
-
1741
- .comparison-table .excellent {
1742
- color: var(--success);
1743
- font-weight: 600;
1744
- }
1745
-
1746
- .comparison-table .good {
1747
- color: var(--secondary);
1748
- }
1749
-
1750
- .comparison-table .poor {
1751
- color: var(--error);
1752
- }
1753
-
1754
- /* ===== UTILITY CLASSES ===== */
1755
- .warning {
1756
- background: linear-gradient(to right, rgba(245, 124, 0, 0.1), transparent);
1757
- border-left: 4px solid var(--warning);
1758
- padding: 1rem;
1759
- border-radius: 0 8px 8px 0;
1760
- color: var(--text-primary);
1761
- }
1762
-
1763
- .error-card {
1764
- background: linear-gradient(to right, rgba(198, 40, 40, 0.1), transparent);
1765
- border-left: 4px solid var(--error);
1766
- padding: 1rem;
1767
- border-radius: 0 8px 8px 0;
1768
- }
1769
-
1770
- .error-card h4 {
1771
- color: var(--error);
1772
- margin-bottom: 0.5rem;
1773
- }
1774
-
1775
- .error-card p {
1776
- color: var(--text-secondary);
1777
- }
1778
- """
1779
-
1780
- # ============================================================================
1781
- # SAMPLE TEXTS FOR TESTING
1782
- # ============================================================================
1783
-
1784
- SAMPLE_TEXTS = {
1785
- "MSA News": "أعلنت وزارة التربية والتعليم عن بدء العام الدراسي الجديد في الأول من سبتمبر، حيث ستعود المدارس لاستقبال الطلاب بعد العطلة الصيفية الطويلة.",
1786
- "MSA Formal": "إن تطوير تقنيات الذكاء الاصطناعي يمثل نقلة نوعية في مجال معالجة اللغات الطبيعية، وخاصة فيما يتعلق باللغة العربية ذات الخصائص المورفولوجية الغنية.",
1787
- "Egyptian Dialect": "ازيك يا صاحبي؟ إيه أخبارك؟ عامل إيه النهارده؟ قولي هنروح فين بكره؟",
1788
- "Gulf Dialect": "شلونك؟ شخبارك؟ وش تسوي الحين؟ ودك تروح وياي للسوق؟",
1789
- "Levantine Dialect": "كيفك؟ شو أخبارك؟ شو عم تعمل هلق؟ بدك تيجي معي على السوق؟",
1790
- "Classical Arabic (Quran)": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ ۝ الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
1791
- "Poetry": "وما من كاتبٍ إلا سيفنى ويُبقي الدهرُ ما كتبت يداهُ",
1792
- "Technical": "يستخدم نموذج المحولات آلية الانتباه الذاتي لمعالجة تسلسلات النصوص بشكل متوازي.",
1793
- "Mixed Arabic-English": "The Arabic language العربية is a Semitic language with over 400 million speakers worldwide.",
1794
- "With Diacritics": "إِنَّ اللَّهَ وَمَلَائِكَتَهُ يُصَلُّونَ عَلَى النَّبِيِّ",
1795
- }
1796
-
1797
- # ============================================================================
1798
- # GRADIO INTERFACE
1799
- # ============================================================================
1800
 
1801
 """
+Arabic Tokenizer Arena Pro - Main Application
+==============================================
+Advanced research & production platform for Arabic tokenization analysis
+
+Run with: python app.py
 """
 
 import gradio as gr
 
+# Import modules
+from config import SAMPLE_TEXTS, LEADERBOARD_DATASETS
+from styles import CUSTOM_CSS
+from tokenizer_manager import tokenizer_manager
+from analysis import analyze_single_tokenizer, compare_tokenizers
+from leaderboard import run_leaderboard_evaluation
+from ui_components import generate_about_html
+
 
 def create_interface():
     """Create the Gradio interface"""
 
     available_tokenizers = tokenizer_manager.get_tokenizer_choices()
-
-    # Group tokenizers by type
-    arabic_specific = [t for t in available_tokenizers if any(x in t for x in ['AraBERT', 'CAMeL', 'MARBERT', 'ARBERT', 'Aranizer'])]
-    arabic_llms = [t for t in available_tokenizers if any(x in t for x in ['Jais', 'AceGPT', 'ALLaM', 'SILMA', 'Fanar', 'Yehia', 'Atlas'])]
-    multilingual = [t for t in available_tokenizers if t not in arabic_specific and t not in arabic_llms]
-
-    with gr.Blocks(css=CUSTOM_CSS, title="Arabic Tokenizer Arena Pro", theme=gr.themes.Base(
-        primary_hue="green",
-        secondary_hue="blue",
-        neutral_hue="slate",
-        font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
-    )) as demo:
+    tokenizers_by_type = tokenizer_manager.get_tokenizers_by_type()
+
+    with gr.Blocks(
+        css=CUSTOM_CSS,
+        title="Arabic Tokenizer Arena Pro",
+        theme=gr.themes.Base(
+            primary_hue="green",
+            secondary_hue="blue",
+            neutral_hue="slate",
+            font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
+        )
+    ) as demo:
 
         # Header
         gr.HTML("""
@@ -1909,7 +128,7 @@ def create_interface():
             outputs=[comparison_output]
         )
 
-        # ===== TAB 3: LEADERBOARD - Real HF Datasets =====
+        # ===== TAB 3: LEADERBOARD =====
         with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
             gr.Markdown("""
             ## 🏆 Arabic Tokenizer Leaderboard
@@ -1960,16 +179,16 @@
             ---
             ### 📖 Dataset Sources (from HuggingFace)
 
-            | Dataset | HuggingFace ID | Category | Description |
-            |---------|----------------|----------|-------------|
-            | ArabicMMLU | `MBZUAI/ArabicMMLU` | Benchmark | Multi-task exam questions (14,575 MCQs) |
-            | ArSenTD-LEV | `ramybaly/arsentd_lev` | Dialectal | Levantine tweets |
-            | ATHAR | `mohamed-khalil/ATHAR` | Classical | 66K classical Arabic sentences |
-            | ARCD | `arcd` | QA | Arabic Reading Comprehension |
-            | Ashaar | `arbml/Ashaar_dataset` | Poetry | 2M+ Arabic poetry verses |
-            | Hadith | `gurgutan/sunnah_ar_en_dataset` | Religious | 50,762 hadiths |
-            | Arabic Sentiment | `arbml/Arabic_Sentiment_Twitter_Corpus` | Social Media | Twitter sentiment |
-            | SANAD | `arbml/SANAD` | News | Arabic news articles |
+            | Dataset | HuggingFace ID | Category | Samples |
+            |---------|----------------|----------|---------|
+            | ArabicMMLU | `MBZUAI/ArabicMMLU` | MSA Benchmark | 500 |
+            | ArSenTD-LEV | `ramybaly/arsentd_lev` | Levantine Dialect | 500 |
+            | ATHAR | `mohamed-khalil/ATHAR` | Classical Arabic | 500 |
+            | ARCD | `arcd` | QA Dataset | 300 |
+            | Ashaar | `arbml/Ashaar_dataset` | Poetry | 500 |
+            | Hadith | `gurgutan/sunnah_ar_en_dataset` | Religious | 400 |
+            | Arabic Sentiment | `arbml/Arabic_Sentiment_Twitter_Corpus` | Social Media | 500 |
+            | SANAD | `arbml/SANAD` | News | 400 |
             """)
 
         # ===== TAB 4: Metrics Reference =====
@@ -2000,6 +219,17 @@
            | **Arabic Fertility** | Tokens per Arabic word | Arabic-specific efficiency measure |
            | **Diacritic Preservation** | Whether tashkeel is preserved | Important for religious & educational texts |
 
+            ### Scoring Formula (Leaderboard)
+
+            ```
+            Score = ((Fertility Score × 0.45) + (Compression Score × 0.35) + (UNK Score × 0.20)) × 100
+            ```
+
+            Where:
+            - **Fertility Score** = 2.0 / fertility (capped to 0-1; lower fertility gives a higher score)
+            - **Compression Score** = compression / 6 (capped to 0-1)
+            - **UNK Score** = 1 - (unk_ratio × 20) (capped to 0-1; a lower UNK ratio gives a higher score)
+
            ### Research Background
 
            These metrics are based on recent research including:
@@ -2011,50 +241,19 @@
 
        # ===== TAB 5: About =====
        with gr.TabItem("ℹ️ About", id="about"):
-            gr.Markdown(f"""
-            ## Arabic Tokenizer Arena Pro
-
-            A comprehensive platform for evaluating Arabic tokenizers across multiple dimensions.
-
-            ### Available Tokenizers: {len(available_tokenizers)}
-
-            **Arabic-Specific Models:**
-            {chr(10).join(['- ' + t for t in arabic_specific[:10]])}
-
-            **Arabic LLMs:**
-            {chr(10).join(['- ' + t for t in arabic_llms[:10]])}
-
-            **Multilingual LLMs:**
-            {chr(10).join(['- ' + t for t in multilingual[:10]])}
-
-            ### Features
-
-            ✅ Comprehensive efficiency metrics (fertility, compression, STRR)
-            ✅ Arabic-specific analysis (dialect support, diacritic preservation)
-            ✅ Side-by-side tokenizer comparison
-            ✅ Beautiful token visualization
-            ✅ **NEW: Leaderboard with real HuggingFace datasets**
-            ✅ Support for MSA, dialectal Arabic, and Classical Arabic
-            ✅ Research-backed evaluation methodology
-
-            ### Use Cases
-
-            - **Research**: Compare tokenizers for Arabic NLP experiments
-            - **Production**: Select optimal tokenizer for deployment
-            - **Education**: Understand how different algorithms handle Arabic
-            - **Optimization**: Identify cost-efficient tokenizers for API usage
-
-            ---
-
-            Built with ❤️ for the Arabic NLP community
-            """)
+            about_html = generate_about_html(
+                tokenizers_by_type,
+                len(available_tokenizers)
+            )
+            gr.HTML(about_html)
 
     return demo
 
+
 # ============================================================================
 # MAIN
 # ============================================================================
 
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()
+    demo.launch()
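
As a quick sanity check of the Scoring Formula documented in the Metrics Reference tab above, here is a worked example (a minimal sketch that mirrors `calculate_leaderboard_score` in `leaderboard.py` below; the input numbers are illustrative, not measured results):

```python
# Illustrative inputs: fertility in tokens/word, compression in bytes/token.
fertility, compression, unk_ratio = 1.6, 4.2, 0.001

fertility_score = max(0, min(1, 2.0 / fertility))   # 2.0 / 1.6 = 1.25 -> capped at 1.0
compression_score = min(1, compression / 6)         # 4.2 / 6 = 0.70
unk_score = 1 - min(1, unk_ratio * 20)              # 1 - 0.02 = 0.98

score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
print(round(score, 1))  # 89.1
```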
config.py ADDED
@@ -0,0 +1,551 @@
+"""
+Configuration for Arabic Tokenizer Arena
+=========================================
+Tokenizer registry, dataset configs, and sample texts
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Dict
+from enum import Enum
+
+
+class TokenizerType(Enum):
+    ARABIC_SPECIFIC = "Arabic-Specific"
+    MULTILINGUAL_LLM = "Multilingual LLM"
+    ARABIC_LLM = "Arabic LLM"
+    ENCODER_ONLY = "Encoder-Only (BERT)"
+    DECODER_ONLY = "Decoder-Only (GPT)"
+
+
+class TokenizerAlgorithm(Enum):
+    BPE = "Byte-Pair Encoding (BPE)"
+    BBPE = "Byte-Level BPE"
+    WORDPIECE = "WordPiece"
+    SENTENCEPIECE = "SentencePiece"
+    UNIGRAM = "Unigram"
+    TIKTOKEN = "Tiktoken"
+
+
+@dataclass
+class TokenizerInfo:
+    """Metadata about a tokenizer"""
+    name: str
+    model_id: str
+    type: TokenizerType
+    algorithm: TokenizerAlgorithm
+    vocab_size: int
+    description: str
+    organization: str
+    arabic_support: str  # Native, Adapted, Supported, or Limited
+    dialect_support: List[str] = field(default_factory=list)
+    special_features: List[str] = field(default_factory=list)
+
+
+@dataclass
+class TokenizationMetrics:
+    """Comprehensive tokenization evaluation metrics"""
+    total_tokens: int
+    total_words: int
+    total_characters: int
+    total_bytes: int
+    fertility: float
+    compression_ratio: float
+    char_per_token: float
+    oov_count: int
+    oov_percentage: float
+    single_token_words: int
+    single_token_retention_rate: float
+    avg_subwords_per_word: float
+    max_subwords_per_word: int
+    continued_words_ratio: float
+    arabic_char_count: int
+    arabic_token_count: int
+    arabic_fertility: float
+    diacritic_preservation: bool
+    tokenization_time_ms: float
+    tokens: List[str] = field(default_factory=list)
+    token_ids: List[int] = field(default_factory=list)
+    decoded_text: str = ""
+
+
+# ============================================================================
+# TOKENIZER REGISTRY
+# ============================================================================
+
+TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
+    # ========== ARABIC-SPECIFIC BERT MODELS ==========
+    "aubmindlab/bert-base-arabertv2": TokenizerInfo(
+        name="AraBERT v2",
+        model_id="aubmindlab/bert-base-arabertv2",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=64000,
+        description="Arabic BERT with Farasa segmentation, optimized for MSA",
+        organization="AUB MIND Lab",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["Farasa preprocessing", "Morphological segmentation"]
+    ),
+    "aubmindlab/bert-large-arabertv2": TokenizerInfo(
+        name="AraBERT v2 Large",
+        model_id="aubmindlab/bert-large-arabertv2",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=64000,
+        description="Large Arabic BERT with enhanced capacity",
+        organization="AUB MIND Lab",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["Large model", "Farasa preprocessing"]
+    ),
+    "CAMeL-Lab/bert-base-arabic-camelbert-mix": TokenizerInfo(
+        name="CAMeLBERT Mix",
+        model_id="CAMeL-Lab/bert-base-arabic-camelbert-mix",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=30000,
+        description="Pre-trained on MSA, DA, and Classical Arabic mix",
+        organization="CAMeL Lab NYU Abu Dhabi",
+        arabic_support="Native",
+        dialect_support=["MSA", "DA", "CA"],
+        special_features=["Multi-variant Arabic", "Classical Arabic support"]
+    ),
+    "CAMeL-Lab/bert-base-arabic-camelbert-msa": TokenizerInfo(
+        name="CAMeLBERT MSA",
+        model_id="CAMeL-Lab/bert-base-arabic-camelbert-msa",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=30000,
+        description="Specialized for Modern Standard Arabic",
+        organization="CAMeL Lab NYU Abu Dhabi",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["MSA optimized"]
+    ),
+    "CAMeL-Lab/bert-base-arabic-camelbert-da": TokenizerInfo(
+        name="CAMeLBERT DA",
+        model_id="CAMeL-Lab/bert-base-arabic-camelbert-da",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=30000,
+        description="Specialized for Dialectal Arabic",
+        organization="CAMeL Lab NYU Abu Dhabi",
+        arabic_support="Native",
+        dialect_support=["Egyptian", "Gulf", "Levantine", "Maghrebi"],
+        special_features=["Dialect optimized"]
+    ),
+    "CAMeL-Lab/bert-base-arabic-camelbert-ca": TokenizerInfo(
+        name="CAMeLBERT CA",
+        model_id="CAMeL-Lab/bert-base-arabic-camelbert-ca",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=30000,
+        description="Specialized for Classical Arabic",
+        organization="CAMeL Lab NYU Abu Dhabi",
+        arabic_support="Native",
+        dialect_support=["Classical"],
+        special_features=["Classical Arabic", "Religious texts"]
+    ),
+    "UBC-NLP/MARBERT": TokenizerInfo(
+        name="MARBERT",
+        model_id="UBC-NLP/MARBERT",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=100000,
+        description="Multi-dialectal Arabic BERT trained on Twitter data",
+        organization="UBC NLP",
+        arabic_support="Native",
+        dialect_support=["MSA", "Egyptian", "Gulf", "Levantine", "Maghrebi"],
+        special_features=["Twitter data", "100K vocabulary", "Multi-dialect"]
+    ),
+    "UBC-NLP/ARBERT": TokenizerInfo(
+        name="ARBERT",
+        model_id="UBC-NLP/ARBERT",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=100000,
+        description="Arabic BERT focused on MSA with large vocabulary",
+        organization="UBC NLP",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["100K vocabulary", "MSA focused"]
+    ),
+    "asafaya/bert-base-arabic": TokenizerInfo(
+        name="Arabic BERT (Safaya)",
+        model_id="asafaya/bert-base-arabic",
+        type=TokenizerType.ENCODER_ONLY,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=32000,
+        description="Arabic BERT trained on MSA and dialectal Arabic",
+        organization="Safaya",
+        arabic_support="Native",
+        dialect_support=["MSA", "DA"],
+        special_features=["TPU trained", "Dialect support"]
+    ),
+
+    # ========== ARABIC-SPECIFIC TOKENIZERS ==========
+    "riotu-lab/Aranizer-PBE-86k": TokenizerInfo(
+        name="Aranizer PBE 86K",
+        model_id="riotu-lab/Aranizer-PBE-86k",
+        type=TokenizerType.ARABIC_SPECIFIC,
+        algorithm=TokenizerAlgorithm.BPE,
+        vocab_size=86000,
+        description="Pair Byte Encoding tokenizer optimized for Arabic LLMs",
+        organization="RIOTU Lab",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["Low fertility", "LLM optimized", "86K vocab"]
+    ),
+    "riotu-lab/Aranizer-SP-86k": TokenizerInfo(
+        name="Aranizer SP 86K",
+        model_id="riotu-lab/Aranizer-SP-86k",
+        type=TokenizerType.ARABIC_SPECIFIC,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=86000,
+        description="SentencePiece tokenizer optimized for Arabic",
+        organization="RIOTU Lab",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["Low fertility", "SentencePiece", "86K vocab"]
+    ),
+    "riotu-lab/Aranizer-PBE-32k": TokenizerInfo(
+        name="Aranizer PBE 32K",
+        model_id="riotu-lab/Aranizer-PBE-32k",
+        type=TokenizerType.ARABIC_SPECIFIC,
+        algorithm=TokenizerAlgorithm.BPE,
+        vocab_size=32000,
+        description="Compact PBE tokenizer for Arabic",
+        organization="RIOTU Lab",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["Compact", "LLM compatible"]
+    ),
+    "riotu-lab/Aranizer-SP-32k": TokenizerInfo(
+        name="Aranizer SP 32K",
+        model_id="riotu-lab/Aranizer-SP-32k",
+        type=TokenizerType.ARABIC_SPECIFIC,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=32000,
+        description="Compact SentencePiece tokenizer for Arabic",
+        organization="RIOTU Lab",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["Compact", "Efficient"]
+    ),
+
+    # ========== ARABIC LLMs ==========
+    "inception-mbzuai/jais-13b": TokenizerInfo(
+        name="Jais 13B",
+        model_id="inception-mbzuai/jais-13b",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=84992,
+        description="World's most advanced Arabic LLM, trained from scratch",
+        organization="Inception/MBZUAI",
+        arabic_support="Native",
+        dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
+        special_features=["Arabic-first", "Lowest fertility", "UAE-native"]
+    ),
+    "inceptionai/jais-family-30b-8k-chat": TokenizerInfo(
+        name="Jais 30B Chat",
+        model_id="inceptionai/jais-family-30b-8k-chat",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=84992,
+        description="Enhanced 30B version with chat capabilities",
+        organization="Inception AI",
+        arabic_support="Native",
+        dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
+        special_features=["30B parameters", "Chat optimized", "8K context"]
+    ),
+    "FreedomIntelligence/AceGPT-13B-chat": TokenizerInfo(
+        name="AceGPT 13B Chat",
+        model_id="FreedomIntelligence/AceGPT-13B-chat",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=32000,
+        description="Arabic-enhanced LLaMA with cultural alignment and chat",
+        organization="Freedom Intelligence",
+        arabic_support="Adapted",
+        dialect_support=["MSA"],
+        special_features=["LLaMA-based", "Cultural alignment", "RLHF", "Chat"]
+    ),
+    "silma-ai/SILMA-9B-Instruct-v1.0": TokenizerInfo(
+        name="SILMA 9B Instruct",
+        model_id="silma-ai/SILMA-9B-Instruct-v1.0",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=256000,
+        description="Top-ranked Arabic LLM based on Gemma, outperforms larger models",
+        organization="SILMA AI",
+        arabic_support="Native",
+        dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
+        special_features=["Gemma-based", "SOTA 9B class", "Efficient"]
+    ),
+    "silma-ai/SILMA-Kashif-2B-Instruct-v1.0": TokenizerInfo(
+        name="SILMA Kashif 2B (RAG)",
+        model_id="silma-ai/SILMA-Kashif-2B-Instruct-v1.0",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=256000,
+        description="RAG-optimized Arabic model, excellent for context-based QA",
+        organization="SILMA AI",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["RAG optimized", "12K context", "Compact"]
+    ),
+    "QCRI/Fanar-1-9B-Instruct": TokenizerInfo(
+        name="Fanar 9B Instruct",
+        model_id="QCRI/Fanar-1-9B-Instruct",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=256000,
+        description="Qatar's Arabic LLM aligned with Islamic values and Arab culture",
+        organization="QCRI (Qatar)",
+        arabic_support="Native",
+        dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
+        special_features=["Islamic RAG", "Cultural alignment", "Gemma-based"]
+    ),
+    "stabilityai/ar-stablelm-2-chat": TokenizerInfo(
+        name="Arabic StableLM 2 Chat",
+        model_id="stabilityai/ar-stablelm-2-chat",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.BPE,
+        vocab_size=100289,
+        description="Stability AI's Arabic instruction-tuned 1.6B model",
+        organization="Stability AI",
+        arabic_support="Native",
+        dialect_support=["MSA"],
+        special_features=["Compact 1.6B", "Chat optimized", "Efficient"]
+    ),
+    "Navid-AI/Yehia-7B-preview": TokenizerInfo(
+        name="Yehia 7B Preview",
+        model_id="Navid-AI/Yehia-7B-preview",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.BPE,
+        vocab_size=128256,
+        description="Best Arabic model on AraGen-Leaderboard (0.5B-25B), GRPO trained",
+        organization="Navid AI",
+        arabic_support="Native",
+        dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
+        special_features=["GRPO trained", "3C3H aligned", "SOTA AraGen"]
+    ),
+
+    # ========== DIALECT-SPECIFIC MODELS ==========
+    "MBZUAI-Paris/Atlas-Chat-9B": TokenizerInfo(
+        name="Atlas-Chat 9B (Darija)",
+        model_id="MBZUAI-Paris/Atlas-Chat-9B",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=256000,
+        description="First LLM for Moroccan Arabic (Darija), Gemma-based",
+        organization="MBZUAI Paris",
+        arabic_support="Native",
+        dialect_support=["Darija", "MSA"],
+        special_features=["Moroccan dialect", "Transliteration", "Cultural"]
+    ),
+    "MBZUAI-Paris/Atlas-Chat-2B": TokenizerInfo(
+        name="Atlas-Chat 2B (Darija)",
+        model_id="MBZUAI-Paris/Atlas-Chat-2B",
+        type=TokenizerType.ARABIC_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=256000,
+        description="Compact Moroccan Arabic model for edge deployment",
+        organization="MBZUAI Paris",
+        arabic_support="Native",
+        dialect_support=["Darija", "MSA"],
+        special_features=["Compact", "Moroccan dialect", "Edge-ready"]
+    ),
+
+    # ========== MULTILINGUAL LLMs ==========
+    "Qwen/Qwen2.5-7B": TokenizerInfo(
+        name="Qwen 2.5 7B",
+        model_id="Qwen/Qwen2.5-7B",
+        type=TokenizerType.MULTILINGUAL_LLM,
+        algorithm=TokenizerAlgorithm.BPE,
+        vocab_size=151936,
+        description="Alibaba's multilingual LLM with 30+ language support",
+        organization="Alibaba Qwen",
+        arabic_support="Supported",
+        dialect_support=["MSA"],
+        special_features=["152K vocab", "128K context", "30+ languages"]
+    ),
+    "google/gemma-2-9b": TokenizerInfo(
+        name="Gemma 2 9B",
+        model_id="google/gemma-2-9b",
+        type=TokenizerType.MULTILINGUAL_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=256000,
+        description="Google's efficient multilingual model",
+        organization="Google",
+        arabic_support="Supported",
+        dialect_support=["MSA"],
+        special_features=["256K vocab", "Efficient architecture"]
+    ),
+    "mistralai/Mistral-7B-v0.3": TokenizerInfo(
+        name="Mistral 7B v0.3",
+        model_id="mistralai/Mistral-7B-v0.3",
+        type=TokenizerType.MULTILINGUAL_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=32768,
+        description="Efficient multilingual model with sliding window attention",
+        organization="Mistral AI",
+        arabic_support="Limited",
+        dialect_support=["MSA"],
+        special_features=["Sliding window", "Efficient"]
+    ),
+    "mistralai/Mistral-Nemo-Base-2407": TokenizerInfo(
+        name="Mistral Nemo",
+        model_id="mistralai/Mistral-Nemo-Base-2407",
+        type=TokenizerType.MULTILINGUAL_LLM,
+        algorithm=TokenizerAlgorithm.TIKTOKEN,
+        vocab_size=131072,
+        description="Uses Tekken tokenizer, optimized for multilingual",
+        organization="Mistral AI + NVIDIA",
+        arabic_support="Supported",
+        dialect_support=["MSA"],
+        special_features=["Tekken tokenizer", "131K vocab", "Multilingual optimized"]
+    ),
+    "xlm-roberta-base": TokenizerInfo(
+        name="XLM-RoBERTa Base",
+        model_id="xlm-roberta-base",
+        type=TokenizerType.MULTILINGUAL_LLM,
+        algorithm=TokenizerAlgorithm.SENTENCEPIECE,
+        vocab_size=250002,
+        description="Cross-lingual model covering 100 languages",
+        organization="Facebook AI",
+        arabic_support="Supported",
+        dialect_support=["MSA"],
+        special_features=["250K vocab", "100 languages"]
+    ),
+    "bert-base-multilingual-cased": TokenizerInfo(
+        name="mBERT",
+        model_id="bert-base-multilingual-cased",
+        type=TokenizerType.MULTILINGUAL_LLM,
+        algorithm=TokenizerAlgorithm.WORDPIECE,
+        vocab_size=119547,
+        description="Original multilingual BERT, baseline for comparison",
+        organization="Google",
+        arabic_support="Limited",
+        dialect_support=["MSA"],
+        special_features=["Baseline model", "104 languages"]
+    ),
+    "tiiuae/falcon-7b": TokenizerInfo(
+        name="Falcon 7B",
+        model_id="tiiuae/falcon-7b",
+        type=TokenizerType.MULTILINGUAL_LLM,
+        algorithm=TokenizerAlgorithm.BPE,
+        vocab_size=65024,
+        description="TII's powerful open-source LLM",
+        organization="Technology Innovation Institute",
+        arabic_support="Limited",
+        dialect_support=["MSA"],
+        special_features=["65K vocab", "RefinedWeb trained"]
+    ),
+}
+
+
+# ============================================================================
+# LEADERBOARD DATASETS - Real HuggingFace Datasets
+# ============================================================================
+
+LEADERBOARD_DATASETS = {
+    "arabic_mmlu": {
+        "hf_id": "MBZUAI/ArabicMMLU",
+        "name": "ArabicMMLU",
+        "category": "MSA Benchmark",
+        "text_column": "Question",
+        "split": "test",
+        "subset": None,
+        "samples": 500,
+        "description": "Multi-task benchmark from Arab school exams (14,575 MCQs)"
+    },
+    "arsentd_lev": {
+        "hf_id": "ramybaly/arsentd_lev",
+        "name": "ArSenTD-LEV",
+        "category": "Levantine Dialect",
+        "text_column": "Tweet",
+        "split": "train",
+        "subset": None,
+        "samples": 500,
+        "description": "Levantine Arabic tweets (Jordan, Lebanon, Syria, Palestine)"
+    },
+    "athar": {
+        "hf_id": "mohamed-khalil/ATHAR",
+        "name": "ATHAR Classical",
+        "category": "Classical Arabic",
+        "text_column": "arabic",
+        "split": "train",
+        "subset": None,
+        "samples": 500,
+        "description": "66K classical Arabic sentences with translations"
+    },
+    "arcd": {
+        "hf_id": "arcd",
+        "name": "ARCD",
+        "category": "QA Dataset",
+        "text_column": "context",
+        "split": "train",
+        "subset": None,
+        "samples": 300,
+        "description": "Arabic Reading Comprehension Dataset (1,395 questions)"
+    },
+    "ashaar": {
+        "hf_id": "arbml/Ashaar_dataset",
+        "name": "Ashaar Poetry",
+        "category": "Poetry",
+        "text_column": "poem_text",
+        "split": "train",
+        "subset": None,
+        "samples": 500,
+        "description": "2M+ Arabic poetry verses with meter and theme labels"
+    },
+    "hadith": {
+        "hf_id": "gurgutan/sunnah_ar_en_dataset",
+        "name": "Hadith Collection",
+        "category": "Religious",
+        "text_column": "hadith_text_ar",
+        "split": "train",
+        "subset": None,
+        "samples": 400,
+        "description": "50,762 hadiths from 14 authentic books"
+    },
+    "arabic_sentiment": {
+        "hf_id": "arbml/Arabic_Sentiment_Twitter_Corpus",
+        "name": "Arabic Sentiment",
+        "category": "Social Media",
+        "text_column": "text",
+        "split": "train",
+        "subset": None,
+        "samples": 500,
+        "description": "Arabic Twitter sentiment corpus"
+    },
+    "sanad": {
+        "hf_id": "arbml/SANAD",
+        "name": "SANAD News",
+        "category": "News",
+        "text_column": "text",
+        "split": "train",
+        "subset": "alarabiya",
+        "samples": 400,
+        "description": "Arabic news articles from Al Arabiya"
+    },
+}
+
+
+# ============================================================================
+# SAMPLE TEXTS
+# ============================================================================
+
+SAMPLE_TEXTS = {
+    "MSA News": "أعلنت وزارة التربية والتعليم عن بدء العام الدراسي الجديد في الأول من سبتمبر، حيث ستعود المدارس لاستقبال الطلاب بعد العطلة الصيفية الطويلة.",
+    "MSA Formal": "إن تطوير تقنيات الذكاء الاصطناعي يمثل نقلة نوعية في مجال معالجة اللغات الطبيعية، وخاصة فيما يتعلق باللغة العربية ذات الخصائص المورفولوجية الغنية.",
+    "Egyptian Dialect": "ازيك يا صاحبي؟ إيه أخبارك؟ عامل إيه النهارده؟ قولي هنروح فين بكره؟",
+    "Gulf Dialect": "شلونك؟ شخبارك؟ وش تسوي الحين؟ ودك تروح وياي للسوق؟",
+    "Levantine Dialect": "كيفك؟ شو أخبارك؟ شو عم تعمل هلق؟ بدك تيجي معي على السوق؟",
+    "Classical Arabic (Quran)": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ ۝ الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
+    "Poetry": "وما من كاتبٍ إلا سيفنى ويُبقي الدهرُ ما كتبت يداهُ",
+    "Technical": "يستخدم نموذج المحولات آلية الانتباه الذاتي لمعالجة تسلسلات النصوص بشكل متوازي.",
+    "Mixed Arabic-English": "The Arabic language العربية is a Semitic language with over 400 million speakers worldwide.",
+    "With Diacritics": "إِنَّ اللَّهَ وَمَلَائِكَتَهُ يُصَلُّونَ عَلَى النَّبِيِّ",
+}
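
The registry and sample texts above are enough to reproduce the core fertility/compression measurements by hand. A minimal sketch (assumes `transformers` is installed and the model files are downloadable; AraBERT v2 is just one example registry entry):

```python
from transformers import AutoTokenizer
from config import SAMPLE_TEXTS, TOKENIZER_REGISTRY

info = TOKENIZER_REGISTRY["aubmindlab/bert-base-arabertv2"]
tok = AutoTokenizer.from_pretrained(info.model_id)

text = SAMPLE_TEXTS["MSA News"]
ids = tok.encode(text, add_special_tokens=False)

fertility = len(ids) / len(text.split())            # tokens per word, lower is better
compression = len(text.encode("utf-8")) / len(ids)  # bytes per token, higher is better
print(f"{info.name}: fertility={fertility:.3f}, compression={compression:.2f}")
```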
leaderboard.py ADDED
@@ -0,0 +1,449 @@
+"""
+Leaderboard Module
+==================
+Evaluate tokenizers on real HuggingFace Arabic datasets
+"""
+
+import statistics
+from typing import Dict, List, Tuple, Optional
+from collections import defaultdict
+import gradio as gr
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from config import LEADERBOARD_DATASETS
+from tokenizer_manager import tokenizer_manager
+
+
+class HFDatasetLoader:
+    """Load Arabic datasets from HuggingFace"""
+
+    def __init__(self):
+        self.cache = {}
+
+    def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
+        """Load texts from a HuggingFace dataset"""
+
+        if dataset_key in self.cache:
+            return self.cache[dataset_key], f"✅ Loaded {len(self.cache[dataset_key])} samples (cached)"
+
+        config = LEADERBOARD_DATASETS.get(dataset_key)
+        if not config:
+            return [], f"❌ Unknown dataset: {dataset_key}"
+
+        try:
+            # Load dataset from HuggingFace
+            if config.get("subset"):
+                ds = load_dataset(
+                    config["hf_id"],
+                    config["subset"],
+                    split=config["split"],
+                    trust_remote_code=True
+                )
+            else:
+                ds = load_dataset(
+                    config["hf_id"],
+                    split=config["split"],
+                    trust_remote_code=True
+                )
+
+            texts = []
+            text_col = config["text_column"]
+
+            # Try to find text column
+            if text_col not in ds.column_names:
+                for col in ["text", "content", "sentence", "arabic", "context", "Tweet", "question", "poem_text", "hadith_text_ar"]:
+                    if col in ds.column_names:
+                        text_col = col
+                        break
+
+            # Extract texts
+            max_samples = config.get("samples", 500)
+            for i, item in enumerate(ds):
+                if i >= max_samples:
+                    break
+                text = item.get(text_col, "")
+                if text and isinstance(text, str) and len(text.strip()) > 10:
+                    texts.append(text.strip())
+
+            self.cache[dataset_key] = texts
+            return texts, f"✅ Loaded {len(texts)} samples from HuggingFace"
+
+        except Exception as e:
+            return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"
+
+
+def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
+    """Evaluate a tokenizer on a list of texts"""
+
+    fertilities = []
+    compressions = []
+    unk_counts = 0
+    total_tokens = 0
+
+    for text in texts:
+        try:
+            tokens = tokenizer.encode(text, add_special_tokens=False)
+            decoded = tokenizer.convert_ids_to_tokens(tokens)
+
+            num_tokens = len(tokens)
+            num_words = len(text.split()) or 1
+            num_bytes = len(text.encode('utf-8'))
+
+            fertility = num_tokens / num_words
+            compression = num_bytes / num_tokens if num_tokens > 0 else 0
+
+            # Count UNKs
+            unk_token = getattr(tokenizer, 'unk_token', '[UNK]')
+            unks = sum(1 for t in decoded if t and (t == unk_token or '<unk>' in str(t).lower() or '[unk]' in str(t).lower()))
+
+            fertilities.append(fertility)
+            compressions.append(compression)
+            unk_counts += unks
+            total_tokens += num_tokens
+
+        except Exception:
+            continue
+
+    if not fertilities:
+        return None
+
+    return {
+        "avg_fertility": statistics.mean(fertilities),
+        "std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
+        "avg_compression": statistics.mean(compressions),
+        "unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
+        "samples": len(fertilities)
+    }
+
+
+def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
+    """Calculate overall score (0-100, higher is better)"""
+    # Lower fertility is better (ideal ~1.0 for Arabic)
+    fertility_score = max(0, min(1, 2.0 / fertility)) if fertility > 0 else 0
+    # Higher compression is better
+    compression_score = min(1, compression / 6)
+    # Lower UNK is better
+    unk_score = 1 - min(1, unk_ratio * 20)
+
+    # Weighted combination
+    score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
+    return round(score, 1)
+
+
+def run_leaderboard_evaluation(
+    selected_datasets: List[str],
+    selected_tokenizers: List[str],
+    progress=gr.Progress()
+) -> Tuple[str, str, str]:
+    """
+    Run the full leaderboard evaluation with real HF datasets
+    Returns: (leaderboard_html, per_dataset_html, status_message)
+    """
+
+    if not selected_datasets:
+        return "", "", "⚠️ Please select at least one dataset"
+
+    if not selected_tokenizers:
+        return "", "", "⚠️ Please select at least one tokenizer"
+
+    loader = HFDatasetLoader()
+    results = defaultdict(dict)
+
+    # Status tracking
+    status_lines = []
+
+    # Load datasets from HuggingFace
+    status_lines.append("📚 **Loading Datasets from HuggingFace:**\n")
+    loaded_datasets = {}
+
+    for i, ds_key in enumerate(selected_datasets):
+        progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
+        texts, msg = loader.load_dataset_texts(ds_key)
+        ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
+        status_lines.append(f"  • {ds_name}: {msg}")
+        if texts:
+            loaded_datasets[ds_key] = texts
+
+    if not loaded_datasets:
+        return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"
+
+    # Evaluate tokenizers
+    status_lines.append("\n🔄 **Evaluating Tokenizers:**\n")
+
+    tokenizer_cache = {}
+    total_steps = len(selected_tokenizers) * len(loaded_datasets)
+    current_step = 0
+
+    for tok_choice in selected_tokenizers:
+        # Get model ID from choice
+        tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
+        tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
+        tok_name = tok_info.name if tok_info else tok_choice
+
+        # Load tokenizer
+        try:
+            if tok_id not in tokenizer_cache:
+                tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
+                    tok_id, trust_remote_code=True
+                )
+            tokenizer = tokenizer_cache[tok_id]
+            status_lines.append(f"  • {tok_name}: ✅ Loaded")
+        except Exception as e:
+            status_lines.append(f"  • {tok_name}: ❌ Failed ({str(e)[:30]})")
+            continue
+
+        # Evaluate on each dataset
+        for ds_key, texts in loaded_datasets.items():
+            current_step += 1
+            progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
+
+            metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
+            if metrics:
+                results[tok_choice][ds_key] = metrics
+
+    # Generate leaderboard
+    progress(0.95, "Generating leaderboard...")
+
+    leaderboard_data = []
+    per_dataset_data = []
+
+    for tok_choice, ds_results in results.items():
+        if not ds_results:
+            continue
+
+        tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
+        tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
+
+        # Aggregate across datasets
+        all_fertility = [m["avg_fertility"] for m in ds_results.values()]
+        all_compression = [m["avg_compression"] for m in ds_results.values()]
+        all_unk = [m["unk_ratio"] for m in ds_results.values()]
+
+        avg_fertility = statistics.mean(all_fertility)
+        avg_compression = statistics.mean(all_compression)
+ return self.cache[dataset_key], f"✅ Loaded {len(self.cache[dataset_key])} samples (cached)"
29
+
30
+ config = LEADERBOARD_DATASETS.get(dataset_key)
31
+ if not config:
32
+ return [], f"❌ Unknown dataset: {dataset_key}"
33
+
34
+ try:
35
+ # Load dataset from HuggingFace
36
+ if config.get("subset"):
37
+ ds = load_dataset(
38
+ config["hf_id"],
39
+ config["subset"],
40
+ split=config["split"],
41
+ trust_remote_code=True
42
+ )
43
+ else:
44
+ ds = load_dataset(
45
+ config["hf_id"],
46
+ split=config["split"],
47
+ trust_remote_code=True
48
+ )
49
+
50
+ texts = []
51
+ text_col = config["text_column"]
52
+
53
+ # Try to find text column
54
+ if text_col not in ds.column_names:
55
+ for col in ["text", "content", "sentence", "arabic", "context", "Tweet", "question", "poem_text", "hadith_text_ar"]:
56
+ if col in ds.column_names:
57
+ text_col = col
58
+ break
59
+
60
+ # Extract texts
61
+ max_samples = config.get("samples", 500)
62
+ for i, item in enumerate(ds):
63
+ if i >= max_samples:
64
+ break
65
+ text = item.get(text_col, "")
66
+ if text and isinstance(text, str) and len(text.strip()) > 10:
67
+ texts.append(text.strip())
68
+
69
+ self.cache[dataset_key] = texts
70
+ return texts, f"✅ Loaded {len(texts)} samples from HuggingFace"
71
+
72
+ except Exception as e:
73
+ return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"
74
+
75
+
76
+ def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
77
+ """Evaluate a tokenizer on a list of texts"""
78
+
79
+ fertilities = []
80
+ compressions = []
81
+ unk_counts = 0
82
+ total_tokens = 0
83
+
84
+ for text in texts:
85
+ try:
86
+ tokens = tokenizer.encode(text, add_special_tokens=False)
87
+ decoded = tokenizer.convert_ids_to_tokens(tokens)
88
+
89
+ num_tokens = len(tokens)
90
+ num_words = len(text.split()) or 1
91
+ num_bytes = len(text.encode('utf-8'))
92
+
93
+ fertility = num_tokens / num_words
94
+ compression = num_bytes / num_tokens if num_tokens > 0 else 0
95
+
96
+ # Count UNKs
97
+ unk_token = getattr(tokenizer, 'unk_token', '[UNK]')
98
+ unks = sum(1 for t in decoded if t and (t == unk_token or '<unk>' in str(t).lower() or '[unk]' in str(t).lower()))
99
+
100
+ fertilities.append(fertility)
101
+ compressions.append(compression)
102
+ unk_counts += unks
103
+ total_tokens += num_tokens
104
+
105
+ except Exception:
106
+ continue
107
+
108
+ if not fertilities:
109
+ return None
110
+
111
+ return {
112
+ "avg_fertility": statistics.mean(fertilities),
113
+ "std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
114
+ "avg_compression": statistics.mean(compressions),
115
+ "unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
116
+ "samples": len(fertilities)
117
+ }
118
+
119
+
120
+ def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
121
+ """Calculate overall score (0-100, higher is better)"""
122
+ # Lower fertility is better (ideal ~1.0 for Arabic)
123
+ fertility_score = max(0, min(1, 2.0 / fertility)) if fertility > 0 else 0
124
+ # Higher compression is better
125
+ compression_score = min(1, compression / 6)
126
+ # Lower UNK is better
127
+ unk_score = 1 - min(1, unk_ratio * 20)
128
+
129
+ # Weighted combination
130
+ score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
131
+ return round(score, 1)
132
+
133
+
134
+ def run_leaderboard_evaluation(
135
+ selected_datasets: List[str],
136
+ selected_tokenizers: List[str],
137
+ progress=gr.Progress()
138
+ ) -> Tuple[str, str, str]:
139
+ """
140
+ Run the full leaderboard evaluation with real HF datasets
141
+ Returns: (leaderboard_html, per_dataset_html, status_message)
142
+ """
143
+
144
+ if not selected_datasets:
145
+ return "", "", "⚠️ Please select at least one dataset"
146
+
147
+ if not selected_tokenizers:
148
+ return "", "", "⚠️ Please select at least one tokenizer"
149
+
150
+ loader = HFDatasetLoader()
151
+ results = defaultdict(dict)
152
+
153
+ # Status tracking
154
+ status_lines = []
155
+
156
+ # Load datasets from HuggingFace
157
+ status_lines.append("📚 **Loading Datasets from HuggingFace:**\n")
158
+ loaded_datasets = {}
159
+
160
+ for i, ds_key in enumerate(selected_datasets):
161
+ progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
162
+ texts, msg = loader.load_dataset_texts(ds_key)
163
+ ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
164
+ status_lines.append(f" • {ds_name}: {msg}")
165
+ if texts:
166
+ loaded_datasets[ds_key] = texts
167
+
168
+ if not loaded_datasets:
169
+ return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"
170
+
171
+ # Evaluate tokenizers
172
+ status_lines.append("\n🔄 **Evaluating Tokenizers:**\n")
173
+
174
+ tokenizer_cache = {}
175
+ total_steps = len(selected_tokenizers) * len(loaded_datasets)
176
+ current_step = 0
177
+
178
+ for tok_choice in selected_tokenizers:
179
+ # Get model ID from choice
180
+ tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
181
+ tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
182
+ tok_name = tok_info.name if tok_info else tok_choice
183
+
184
+ # Load tokenizer
185
+ try:
186
+ if tok_id not in tokenizer_cache:
187
+ tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
188
+ tok_id, trust_remote_code=True
189
+ )
190
+ tokenizer = tokenizer_cache[tok_id]
191
+ status_lines.append(f" • {tok_name}: ✅ Loaded")
192
+ except Exception as e:
193
+ status_lines.append(f" • {tok_name}: ❌ Failed ({str(e)[:30]})")
194
+ continue
195
+
196
+ # Evaluate on each dataset
197
+ for ds_key, texts in loaded_datasets.items():
198
+ current_step += 1
199
+ progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
200
+
201
+ metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
202
+ if metrics:
203
+ results[tok_choice][ds_key] = metrics
204
+
205
+ # Generate leaderboard
206
+ progress(0.95, "Generating leaderboard...")
207
+
208
+ leaderboard_data = []
209
+ per_dataset_data = []
210
+
211
+ for tok_choice, ds_results in results.items():
212
+ if not ds_results:
213
+ continue
214
+
215
+ tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
216
+ tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
217
+
218
+ # Aggregate across datasets
219
+ all_fertility = [m["avg_fertility"] for m in ds_results.values()]
220
+ all_compression = [m["avg_compression"] for m in ds_results.values()]
221
+ all_unk = [m["unk_ratio"] for m in ds_results.values()]
222
+
223
+ avg_fertility = statistics.mean(all_fertility)
224
+ avg_compression = statistics.mean(all_compression)
225
+ avg_unk = statistics.mean(all_unk)
226
+
227
+ score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)
228
+
229
+ leaderboard_data.append({
230
+ "name": tok_info.name if tok_info else tok_choice,
231
+ "type": tok_info.type.value if tok_info else "Unknown",
232
+ "org": tok_info.organization if tok_info else "Unknown",
233
+ "score": score,
234
+ "fertility": avg_fertility,
235
+ "compression": avg_compression,
236
+ "unk_ratio": avg_unk,
237
+ "num_datasets": len(ds_results)
238
+ })
239
+
240
+ # Per-dataset row
241
+ per_ds_row = {"Tokenizer": tok_info.name if tok_info else tok_choice}
242
+ for ds_key in selected_datasets:
243
+ ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
244
+ if ds_key in ds_results:
245
+ per_ds_row[ds_name] = round(ds_results[ds_key]["avg_fertility"], 2)
246
+ else:
247
+ per_ds_row[ds_name] = "-"
248
+ per_dataset_data.append(per_ds_row)
249
+
250
+ # Sort by score
251
+ leaderboard_data.sort(key=lambda x: x["score"], reverse=True)
252
+
253
+ # Create HTML tables
254
+ leaderboard_html = generate_leaderboard_html(leaderboard_data)
255
+ per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
256
+
257
+ status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
258
+
259
+ return leaderboard_html, per_dataset_html, "\n".join(status_lines)
260
+
261
+
262
+ def generate_leaderboard_html(data: List[Dict]) -> str:
263
+ """Generate HTML for main leaderboard"""
264
+
265
+ if not data:
266
+ return "<p>No results to display</p>"
267
+
268
+ html = """
269
+ <style>
270
+ .leaderboard-table {
271
+ width: 100%;
272
+ border-collapse: collapse;
273
+ font-family: system-ui, -apple-system, sans-serif;
274
+ margin: 20px 0;
275
+ }
276
+ .leaderboard-table th {
277
+ background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4e 100%);
278
+ color: white;
279
+ padding: 12px 8px;
280
+ text-align: left;
281
+ font-weight: 600;
282
+ }
283
+ .leaderboard-table td {
284
+ padding: 10px 8px;
285
+ border-bottom: 1px solid #e0e0e0;
286
+ }
287
+ .leaderboard-table tr:nth-child(even) {
288
+ background-color: #f8f9fa;
289
+ }
290
+ .leaderboard-table tr:hover {
291
+ background-color: #e8f5e9;
292
+ }
293
+ .rank-1 { background: linear-gradient(90deg, #ffd700 0%, #fff8dc 100%) !important; }
294
+ .rank-2 { background: linear-gradient(90deg, #c0c0c0 0%, #f5f5f5 100%) !important; }
295
+ .rank-3 { background: linear-gradient(90deg, #cd7f32 0%, #ffe4c4 100%) !important; }
296
+ .score-badge {
297
+ background: #2d8f4e;
298
+ color: white;
299
+ padding: 4px 8px;
300
+ border-radius: 12px;
301
+ font-weight: bold;
302
+ }
303
+ .type-badge {
304
+ background: #e3f2fd;
305
+ color: #1565c0;
306
+ padding: 2px 6px;
307
+ border-radius: 4px;
308
+ font-size: 0.85em;
309
+ }
310
+ .metric-good { color: #2e7d32; font-weight: 600; }
311
+ .metric-bad { color: #c62828; }
312
+ </style>
313
+
314
+ <table class="leaderboard-table">
315
+ <thead>
316
+ <tr>
317
+ <th>Rank</th>
318
+ <th>Tokenizer</th>
319
+ <th>Type</th>
320
+ <th>Organization</th>
321
+ <th>Score ↑</th>
322
+ <th>Fertility ↓</th>
323
+ <th>Compression ↑</th>
324
+ <th>UNK Rate ↓</th>
325
+ <th>Datasets</th>
326
+ </tr>
327
+ </thead>
328
+ <tbody>
329
+ """
330
+
331
+ for i, entry in enumerate(data):
332
+ rank = i + 1
333
+ rank_class = f"rank-{rank}" if rank <= 3 else ""
334
+
335
+ fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
336
+ comp_class = "metric-good" if entry["compression"] > 3.5 else ""
337
+ unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
338
+
339
+ html += f"""
340
+ <tr class="{rank_class}">
341
+ <td><strong>#{rank}</strong></td>
342
+ <td><strong>{entry["name"]}</strong></td>
343
+ <td><span class="type-badge">{entry["type"]}</span></td>
344
+ <td>{entry["org"]}</td>
345
+ <td><span class="score-badge">{entry["score"]}</span></td>
346
+ <td class="{fert_class}">{entry["fertility"]:.3f}</td>
347
+ <td class="{comp_class}">{entry["compression"]:.2f}</td>
348
+ <td class="{unk_class}">{entry["unk_ratio"]:.2%}</td>
349
+ <td>{entry["num_datasets"]}</td>
350
+ </tr>
351
+ """
352
+
353
+ html += """
354
+ </tbody>
355
+ </table>
356
+
357
+ <div style="margin-top: 15px; padding: 10px; background: #f5f5f5; border-radius: 8px; font-size: 0.9em;">
358
+ <strong>📊 Metric Guide:</strong><br>
359
+ • <strong>Score:</strong> Overall ranking (0-100, higher = better)<br>
360
+ • <strong>Fertility:</strong> Tokens per word (lower = better, 1.0 ideal for Arabic)<br>
361
+ • <strong>Compression:</strong> Bytes per token (higher = more efficient)<br>
362
+ • <strong>UNK Rate:</strong> Unknown token percentage (lower = better)
363
+ </div>
364
+ """
365
+
366
+ return html
367
+
368
+
369
+ def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
370
+ """Generate HTML for per-dataset fertility table"""
371
+
372
+ if not data:
373
+ return "<p>No per-dataset results</p>"
374
+
375
+ ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
376
+
377
+ html = """
378
+ <style>
379
+ .dataset-table {
380
+ width: 100%;
381
+ border-collapse: collapse;
382
+ font-family: system-ui, -apple-system, sans-serif;
383
+ margin: 20px 0;
384
+ font-size: 0.9em;
385
+ }
386
+ .dataset-table th {
387
+ background: #37474f;
388
+ color: white;
389
+ padding: 10px 6px;
390
+ text-align: center;
391
+ }
392
+ .dataset-table th:first-child {
393
+ text-align: left;
394
+ }
395
+ .dataset-table td {
396
+ padding: 8px 6px;
397
+ text-align: center;
398
+ border-bottom: 1px solid #e0e0e0;
399
+ }
400
+ .dataset-table td:first-child {
401
+ text-align: left;
402
+ font-weight: 500;
403
+ }
404
+ .dataset-table tr:nth-child(even) {
405
+ background-color: #fafafa;
406
+ }
407
+ .fert-excellent { background: #c8e6c9; color: #1b5e20; font-weight: 600; }
408
+ .fert-good { background: #fff9c4; color: #f57f17; }
409
+ .fert-poor { background: #ffcdd2; color: #b71c1c; }
410
+ </style>
411
+
412
+ <h4>📈 Fertility per Dataset (tokens/word - lower is better)</h4>
413
+ <table class="dataset-table">
414
+ <thead>
415
+ <tr>
416
+ <th>Tokenizer</th>
417
+ """
418
+
419
+ for ds_name in ds_names:
420
+ html += f"<th>{ds_name}</th>"
421
+
422
+ html += """
423
+ </tr>
424
+ </thead>
425
+ <tbody>
426
+ """
427
+
428
+ for row in data:
429
+ html += f"<tr><td>{row['Tokenizer']}</td>"
430
+ for ds_name in ds_names:
431
+ val = row.get(ds_name, "-")
432
+ if val != "-":
433
+ if val < 1.8:
434
+ cls = "fert-excellent"
435
+ elif val < 2.5:
436
+ cls = "fert-good"
437
+ else:
438
+ cls = "fert-poor"
439
+ html += f'<td class="{cls}">{val}</td>'
440
+ else:
441
+ html += '<td>-</td>'
442
+ html += "</tr>"
443
+
444
+ html += """
445
+ </tbody>
446
+ </table>
447
+ """
448
+
449
+ return html
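
To make the weighting concrete, here is a small worked example of the scoring formula above. The metric values are illustrative, not from a real run, and assume `leaderboard.py` is importable:

```python
from leaderboard import calculate_leaderboard_score

# fertility=1.6   -> fertility_score   = min(1, 2.0 / 1.6) = 1.00
# compression=4.2 -> compression_score = min(1, 4.2 / 6)   = 0.70
# unk_ratio=0.002 -> unk_score         = 1 - min(1, 0.04)  = 0.96
# score = (1.00*0.45 + 0.70*0.35 + 0.96*0.20) * 100 = 88.7
print(calculate_leaderboard_score(1.6, 4.2, 0.002))  # 88.7
```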
requirements.txt CHANGED
@@ -1 +1,7 @@
- aranizer
+ gradio>=4.0.0
+ transformers>=4.35.0
+ huggingface_hub>=0.19.0
+ datasets>=2.14.0
+ torch
+ sentencepiece
+ protobuf
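
With these dependencies installed, the leaderboard can also be driven headlessly. A minimal sketch; the dataset key here is hypothetical (real keys live in `LEADERBOARD_DATASETS` in config.py), and a no-op callable stands in for `gr.Progress` outside a Gradio event:

```python
from leaderboard import run_leaderboard_evaluation
from tokenizer_manager import tokenizer_manager

tokenizers = tokenizer_manager.get_tokenizer_choices()[:2]  # any two available
datasets = ["arabic_wiki"]  # hypothetical key; pick one from LEADERBOARD_DATASETS

lb_html, per_ds_html, status = run_leaderboard_evaluation(
    datasets, tokenizers, progress=lambda *args, **kwargs: None
)
print(status)
```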
styles.py ADDED
@@ -0,0 +1,526 @@
+ """
+ CSS Styles
+ ==========
+ All custom CSS for the Arabic Tokenizer Arena
+ """
+ 
+ CUSTOM_CSS = """
+ /* ===== ROOT VARIABLES ===== */
+ :root {
+     --primary: #1a5f2a;
+     --primary-light: #2d8f4e;
+     --secondary: #4a90d9;
+     --accent: #f59e0b;
+     --success: #10b981;
+     --warning: #f57c00;
+     --error: #c62828;
+     --bg-primary: #0f1419;
+     --bg-secondary: #1c2128;
+     --bg-card: #22272e;
+     --text-primary: #e6edf3;
+     --text-secondary: #8b949e;
+     --border: #30363d;
+ }
+ 
+ /* ===== HEADER ===== */
+ .header-section {
+     text-align: center;
+     padding: 2rem 1rem;
+     background: linear-gradient(135deg, var(--primary) 0%, var(--primary-light) 100%);
+     border-radius: 16px;
+     margin-bottom: 1.5rem;
+ }
+ 
+ .header-section h1 {
+     font-size: 2.5rem;
+     color: white;
+     margin-bottom: 0.5rem;
+ }
+ 
+ .header-section p {
+     color: rgba(255,255,255,0.9);
+     font-size: 1.1rem;
+ }
+ 
+ /* ===== INFO CARD ===== */
+ .info-card {
+     background: var(--bg-card);
+     border-radius: 12px;
+     padding: 1.5rem;
+     border: 1px solid var(--border);
+ }
+ 
+ .info-header {
+     display: flex;
+     justify-content: space-between;
+     align-items: center;
+     margin-bottom: 1rem;
+     flex-wrap: wrap;
+     gap: 0.5rem;
+ }
+ 
+ .info-header h3 {
+     color: var(--text-primary);
+     margin: 0;
+ }
+ 
+ .org-badge {
+     background: var(--primary);
+     color: white;
+     padding: 0.25rem 0.75rem;
+     border-radius: 20px;
+     font-size: 0.85rem;
+ }
+ 
+ .description {
+     color: var(--text-secondary);
+     line-height: 1.6;
+ }
+ 
+ .info-grid {
+     display: grid;
+     grid-template-columns: repeat(2, 1fr);
+     gap: 1rem;
+     margin: 1rem 0;
+ }
+ 
+ .info-item {
+     display: flex;
+     flex-direction: column;
+ }
+ 
+ .info-label {
+     color: var(--text-secondary);
+     font-size: 0.85rem;
+ }
+ 
+ .info-value {
+     color: var(--text-primary);
+     font-weight: 600;
+ }
+ 
+ .support-native { color: var(--success); }
+ .support-supported { color: var(--secondary); }
+ .support-limited { color: var(--warning); }
+ 
+ /* ===== BADGES ===== */
+ .badge-container {
+     margin-top: 1rem;
+ }
+ 
+ .badge-group {
+     margin-bottom: 0.5rem;
+ }
+ 
+ .badge-label {
+     color: var(--text-secondary);
+     font-size: 0.85rem;
+     margin-right: 0.5rem;
+ }
+ 
+ .badge {
+     display: inline-block;
+     padding: 0.2rem 0.5rem;
+     border-radius: 4px;
+     font-size: 0.75rem;
+     margin-right: 0.25rem;
+     margin-bottom: 0.25rem;
+ }
+ 
+ .badge.dialect {
+     background: rgba(74, 144, 217, 0.2);
+     color: var(--secondary);
+ }
+ 
+ .badge.feature {
+     background: rgba(245, 158, 11, 0.2);
+     color: var(--accent);
+ }
+ 
+ /* ===== METRICS GRID ===== */
+ .metrics-grid {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
+     gap: 1rem;
+     margin: 1rem 0;
+ }
+ 
+ .metric-card {
+     background: var(--bg-card);
+     border-radius: 12px;
+     padding: 1rem;
+     text-align: center;
+     border: 1px solid var(--border);
+     transition: transform 0.2s;
+ }
+ 
+ .metric-card:hover {
+     transform: translateY(-2px);
+ }
+ 
+ .metric-card.excellent {
+     border-color: var(--success);
+     background: linear-gradient(to bottom, rgba(16, 185, 129, 0.1), transparent);
+ }
+ 
+ .metric-card.good {
+     border-color: var(--secondary);
+     background: linear-gradient(to bottom, rgba(74, 144, 217, 0.1), transparent);
+ }
+ 
+ .metric-card.poor {
+     border-color: var(--error);
+     background: linear-gradient(to bottom, rgba(198, 40, 40, 0.1), transparent);
+ }
+ 
+ .metric-card.primary {
+     border-color: var(--primary);
+     background: linear-gradient(to bottom, rgba(26, 95, 42, 0.1), transparent);
+ }
+ 
+ .metric-icon {
+     font-size: 1.5rem;
+     margin-bottom: 0.5rem;
+ }
+ 
+ .metric-value {
+     font-size: 1.5rem;
+     font-weight: 700;
+     color: var(--text-primary);
+ }
+ 
+ .metric-label {
+     font-size: 0.8rem;
+     color: var(--text-secondary);
+     margin-top: 0.25rem;
+ }
+ 
+ .metric-hint {
+     font-size: 0.7rem;
+     color: var(--text-secondary);
+     opacity: 0.7;
+ }
+ 
+ /* ===== TOKEN VISUALIZATION ===== */
+ .token-container {
+     display: flex;
+     flex-wrap: wrap;
+     gap: 0.5rem;
+     padding: 1rem;
+     background: var(--bg-secondary);
+     border-radius: 12px;
+     direction: rtl;
+ }
+ 
+ .token {
+     display: inline-flex;
+     flex-direction: column;
+     align-items: center;
+     padding: 0.5rem 0.75rem;
+     border-radius: 8px;
+     font-family: 'IBM Plex Sans Arabic', monospace;
+     font-size: 1rem;
+     transition: transform 0.2s;
+     cursor: default;
+ }
+ 
+ .token:hover {
+     transform: scale(1.05);
+ }
+ 
+ .token-id {
+     font-size: 0.65rem;
+     opacity: 0.7;
+     margin-top: 0.25rem;
+ }
+ 
+ /* ===== DECODED SECTION ===== */
+ .decoded-section {
+     background: var(--bg-card);
+     border-radius: 12px;
+     padding: 1.5rem;
+     border: 1px solid var(--border);
+ }
+ 
+ .decoded-section h4 {
+     color: var(--text-primary);
+     margin-bottom: 1rem;
+ }
+ 
+ .decoded-text {
+     font-family: 'IBM Plex Sans Arabic', serif;
+     font-size: 1.1rem;
+     line-height: 1.8;
+     color: var(--text-primary);
+ }
+ 
+ .decoded-meta {
+     margin-top: 1rem;
+     font-size: 0.85rem;
+     color: var(--text-secondary);
+ }
+ 
+ /* ===== COMPARISON TABLE ===== */
+ .comparison-container {
+     overflow-x: auto;
+ }
+ 
+ .comparison-table {
+     width: 100%;
+     border-collapse: collapse;
+     margin: 1rem 0;
+ }
+ 
+ .comparison-table th {
+     background: var(--primary);
+     color: white;
+     padding: 0.75rem;
+     text-align: left;
+     font-weight: 600;
+ }
+ 
+ .comparison-table td {
+     padding: 0.75rem;
+     border-bottom: 1px solid var(--border);
+     color: var(--text-primary);
+ }
+ 
+ .comparison-table tr:hover {
+     background: rgba(74, 144, 217, 0.1);
+ }
+ 
+ .comparison-table .rank-1 {
+     background: linear-gradient(90deg, rgba(255, 215, 0, 0.2), transparent);
+ }
+ 
+ .comparison-table .rank-2 {
+     background: linear-gradient(90deg, rgba(192, 192, 192, 0.2), transparent);
+ }
+ 
+ .comparison-table .rank-3 {
+     background: linear-gradient(90deg, rgba(205, 127, 50, 0.2), transparent);
+ }
+ 
+ .comparison-table .excellent {
+     color: var(--success);
+     font-weight: 600;
+ }
+ 
+ .comparison-table .good {
+     color: var(--secondary);
+ }
+ 
+ .comparison-table .poor {
+     color: var(--error);
+ }
+ 
+ /* ===== ABOUT PAGE ===== */
+ .about-container {
+     padding: 1rem;
+ }
+ 
+ .about-header {
+     text-align: center;
+     margin-bottom: 2rem;
+ }
+ 
+ .about-header h2 {
+     color: var(--text-primary);
+     font-size: 2rem;
+     margin-bottom: 0.5rem;
+ }
+ 
+ .about-subtitle {
+     color: var(--text-secondary);
+     font-size: 1.1rem;
+ }
+ 
+ .about-stats {
+     display: flex;
+     justify-content: center;
+     gap: 2rem;
+     margin: 2rem 0;
+     flex-wrap: wrap;
+ }
+ 
+ .stat-card {
+     background: var(--bg-card);
+     border: 1px solid var(--border);
+     border-radius: 12px;
+     padding: 1.5rem 2rem;
+     text-align: center;
+ }
+ 
+ .stat-value {
+     font-size: 2.5rem;
+     font-weight: 700;
+     color: var(--primary-light);
+ }
+ 
+ .stat-label {
+     color: var(--text-secondary);
+     font-size: 0.9rem;
+     margin-top: 0.25rem;
+ }
+ 
+ .about-tokenizers {
+     margin: 2rem 0;
+ }
+ 
+ .about-tokenizers h3 {
+     color: var(--text-primary);
+     margin-bottom: 1rem;
+ }
+ 
+ .tokenizer-grid {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
+     gap: 1.5rem;
+ }
+ 
+ .about-category {
+     background: var(--bg-card);
+     border: 1px solid var(--border);
+     border-radius: 12px;
+     padding: 1rem 1.5rem;
+ }
+ 
+ .about-category h4 {
+     color: var(--primary-light);
+     margin-bottom: 0.75rem;
+     font-size: 1rem;
+ }
+ 
+ .about-category ul {
+     list-style: none;
+     padding: 0;
+     margin: 0;
+ }
+ 
+ .about-category li {
+     color: var(--text-secondary);
+     font-size: 0.9rem;
+     padding: 0.25rem 0;
+     border-bottom: 1px solid var(--border);
+ }
+ 
+ .about-category li:last-child {
+     border-bottom: none;
+ }
+ 
+ .about-features {
+     margin: 2rem 0;
+ }
+ 
+ .about-features h3 {
+     color: var(--text-primary);
+     margin-bottom: 1rem;
+ }
+ 
+ .feature-grid {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+     gap: 1rem;
+ }
+ 
+ .feature-item {
+     display: flex;
+     align-items: center;
+     gap: 0.75rem;
+     padding: 0.75rem 1rem;
+     background: var(--bg-card);
+     border: 1px solid var(--border);
+     border-radius: 8px;
+     color: var(--text-secondary);
+ }
+ 
+ .feature-icon {
+     font-size: 1.25rem;
+ }
+ 
+ .about-usecases {
+     margin: 2rem 0;
+ }
+ 
+ .about-usecases h3 {
+     color: var(--text-primary);
+     margin-bottom: 1rem;
+ }
+ 
+ .usecase-grid {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+     gap: 1rem;
+ }
+ 
+ .usecase-card {
+     background: var(--bg-card);
+     border: 1px solid var(--border);
+     border-radius: 12px;
+     padding: 1.25rem;
+ }
+ 
+ .usecase-card h4 {
+     color: var(--primary-light);
+     margin-bottom: 0.5rem;
+ }
+ 
+ .usecase-card p {
+     color: var(--text-secondary);
+     font-size: 0.9rem;
+     margin: 0;
+ }
+ 
+ .about-footer {
+     text-align: center;
+     margin-top: 2rem;
+     padding-top: 1.5rem;
+     border-top: 1px solid var(--border);
+     color: var(--text-secondary);
+ }
+ 
+ /* ===== UTILITY CLASSES ===== */
+ .warning {
+     background: linear-gradient(to right, rgba(245, 124, 0, 0.1), transparent);
+     border-left: 4px solid var(--warning);
+     padding: 1rem;
+     border-radius: 0 8px 8px 0;
+     color: var(--text-primary);
+ }
+ 
+ .error-card {
+     background: linear-gradient(to right, rgba(198, 40, 40, 0.1), transparent);
+     border-left: 4px solid var(--error);
+     padding: 1rem;
+     border-radius: 0 8px 8px 0;
+ }
+ 
+ .error-card h4 {
+     color: var(--error);
+     margin-bottom: 0.5rem;
+ }
+ 
+ .error-card p {
+     color: var(--text-secondary);
+ }
+ 
+ /* ===== RESPONSIVE ===== */
+ @media (max-width: 768px) {
+     .header-section h1 {
+         font-size: 1.75rem;
+     }
+ 
+     .info-grid {
+         grid-template-columns: 1fr;
+     }
+ 
+     .metrics-grid {
+         grid-template-columns: repeat(2, 1fr);
+     }
+ 
+     .about-stats {
+         flex-direction: column;
+         align-items: center;
+     }
+ }
+ """
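
Since all styling lives in one module-level string, attaching it is a one-liner; a minimal sketch of how app.py presumably wires it up (Gradio's `Blocks` accepts custom CSS via its `css` argument):

```python
import gradio as gr
from styles import CUSTOM_CSS

# Any HTML emitted by ui_components can now use the classes defined above
# (.metric-card, .token, .leaderboard-table, ...).
with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.HTML('<div class="header-section"><h1>🏟️ Arabic Tokenizer Arena Pro</h1></div>')

if __name__ == "__main__":
    demo.launch()
```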
tokenizer_manager.py ADDED
@@ -0,0 +1,86 @@
+ """
+ Tokenizer Manager
+ =================
+ Handles tokenizer loading, caching, and availability checking
+ """
+ 
+ import os
+ from typing import Dict, List, Any
+ from transformers import AutoTokenizer, logging
+ from config import TOKENIZER_REGISTRY, TokenizerInfo
+ 
+ logging.set_verbosity_error()
+ 
+ # HuggingFace authentication
+ HF_TOKEN = os.getenv('HF_TOKEN')
+ if HF_TOKEN:
+     HF_TOKEN = HF_TOKEN.strip()
+     from huggingface_hub import login
+     login(token=HF_TOKEN)
+ 
+ 
+ class TokenizerManager:
+     """Manages tokenizer loading and caching"""
+ 
+     def __init__(self):
+         self._cache: Dict[str, Any] = {}
+         self._available: Dict[str, TokenizerInfo] = {}
+         self._initialize_available_tokenizers()
+ 
+     def _initialize_available_tokenizers(self):
+         """Check which tokenizers are available and can be loaded"""
+         print("🔄 Initializing tokenizer registry...")
+ 
+         for model_id, info in TOKENIZER_REGISTRY.items():
+             try:
+                 _ = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+                 self._available[model_id] = info
+                 print(f" ✓ {info.name}")
+             except Exception as e:
+                 print(f" ✗ {info.name}: {str(e)[:50]}")
+ 
+         print(f"\n✅ Total available tokenizers: {len(self._available)}")
+ 
+     def get_tokenizer(self, model_id: str):
+         """Get tokenizer from cache or load it"""
+         if model_id not in self._cache:
+             self._cache[model_id] = AutoTokenizer.from_pretrained(
+                 model_id,
+                 trust_remote_code=True
+             )
+         return self._cache[model_id]
+ 
+     def get_available_tokenizers(self) -> Dict[str, TokenizerInfo]:
+         """Get all available tokenizers"""
+         return self._available
+ 
+     def get_tokenizer_choices(self) -> List[str]:
+         """Get list of tokenizer display names for dropdown"""
+         return [f"{info.name} ({info.organization})" for info in self._available.values()]
+ 
+     def get_model_id_from_choice(self, choice: str) -> str:
+         """Convert display choice back to model ID"""
+         for model_id, info in self._available.items():
+             if f"{info.name} ({info.organization})" == choice:
+                 return model_id
+         return list(self._available.keys())[0] if self._available else ""
+ 
+     def get_tokenizers_by_type(self) -> Dict[str, List[str]]:
+         """Group available tokenizers by type"""
+         choices = self.get_tokenizer_choices()
+ 
+         arabic_bert = [t for t in choices if any(x in t for x in ['AraBERT', 'CAMeL', 'MARBERT', 'ARBERT', 'Safaya'])]
+         arabic_specific = [t for t in choices if any(x in t for x in ['Aranizer'])]
+         arabic_llms = [t for t in choices if any(x in t for x in ['Jais', 'AceGPT', 'SILMA', 'Fanar', 'StableLM', 'Yehia', 'Atlas'])]
+         multilingual = [t for t in choices if t not in arabic_bert and t not in arabic_specific and t not in arabic_llms]
+ 
+         return {
+             "Arabic BERT Models": arabic_bert,
+             "Arabic Tokenizers": arabic_specific,
+             "Arabic LLMs": arabic_llms,
+             "Multilingual Models": multilingual
+         }
+ 
+ 
+ # Global tokenizer manager instance
+ tokenizer_manager = TokenizerManager()
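
A quick usage sketch of the manager's round trip between dropdown labels and model IDs; the actual display names depend on which registry entries loaded successfully:

```python
from tokenizer_manager import tokenizer_manager

choices = tokenizer_manager.get_tokenizer_choices()        # display names for the UI
model_id = tokenizer_manager.get_model_id_from_choice(choices[0])
tok = tokenizer_manager.get_tokenizer(model_id)            # cached after first load

print(tok.tokenize("مرحبا بالعالم"))  # "Hello, world" in Arabic
```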
ui_components.py ADDED
@@ -0,0 +1,280 @@
+ """
+ UI Components
+ =============
+ HTML generation functions for the Gradio interface
+ """
+ 
+ from typing import List
+ from config import TokenizerInfo, TokenizationMetrics
+ from utils import is_arabic_char
+ 
+ 
+ def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
+     """Generate beautiful HTML visualization of tokens"""
+ 
+     colors = [
+         ('#1a1a2e', '#eaeaea'),
+         ('#16213e', '#f0f0f0'),
+         ('#0f3460', '#ffffff'),
+         ('#533483', '#f5f5f5'),
+         ('#e94560', '#ffffff'),
+         ('#0f4c75', '#f0f0f0'),
+         ('#3282b8', '#ffffff'),
+         ('#bbe1fa', '#1a1a2e'),
+     ]
+ 
+     html_parts = []
+     for i, (token, tid) in enumerate(zip(tokens, token_ids)):
+         bg, fg = colors[i % len(colors)]
+         # Escape & before < and > so tokens containing markup render literally
+         display_token = token.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+         is_arabic = any(is_arabic_char(c) for c in token)
+         direction = 'rtl' if is_arabic else 'ltr'
+ 
+         html_parts.append(f'''
+         <span class="token" style="
+             background: {bg};
+             color: {fg};
+             direction: {direction};
+         " title="ID: {tid}">
+             {display_token}
+             <span class="token-id">{tid}</span>
+         </span>
+         ''')
+ 
+     return f'''
+     <div class="token-container">
+         {''.join(html_parts)}
+     </div>
+     '''
+ 
+ 
+ def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
+     """Generate metrics visualization card"""
+ 
+     fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor"
+     strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor"
+     compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor"
+ 
+     return f'''
+     <div class="metrics-grid">
+         <div class="metric-card primary">
+             <div class="metric-icon">📊</div>
+             <div class="metric-value">{metrics.total_tokens}</div>
+             <div class="metric-label">Total Tokens</div>
+         </div>
+ 
+         <div class="metric-card {fertility_quality}">
+             <div class="metric-icon">🎯</div>
+             <div class="metric-value">{metrics.fertility:.3f}</div>
+             <div class="metric-label">Fertility (tokens/word)</div>
+             <div class="metric-hint">Lower is better (1.0 ideal)</div>
+         </div>
+ 
+         <div class="metric-card {compression_quality}">
+             <div class="metric-icon">📦</div>
+             <div class="metric-value">{metrics.compression_ratio:.2f}</div>
+             <div class="metric-label">Compression (bytes/token)</div>
+             <div class="metric-hint">Higher is better</div>
+         </div>
+ 
+         <div class="metric-card {strr_quality}">
+             <div class="metric-icon">✨</div>
+             <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
+             <div class="metric-label">STRR (Single Token Retention)</div>
+             <div class="metric-hint">Higher is better</div>
+         </div>
+ 
+         <div class="metric-card">
+             <div class="metric-icon">🔤</div>
+             <div class="metric-value">{metrics.char_per_token:.2f}</div>
+             <div class="metric-label">Characters/Token</div>
+         </div>
+ 
+         <div class="metric-card {'excellent' if metrics.oov_percentage == 0 else 'poor' if metrics.oov_percentage > 5 else 'good'}">
+             <div class="metric-icon">❓</div>
+             <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
+             <div class="metric-label">OOV Rate</div>
+             <div class="metric-hint">Lower is better (0% ideal)</div>
+         </div>
+ 
+         <div class="metric-card">
+             <div class="metric-icon">🌍</div>
+             <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
+             <div class="metric-label">Arabic Fertility</div>
+         </div>
+ 
+         <div class="metric-card">
+             <div class="metric-icon">⚡</div>
+             <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
+             <div class="metric-label">Processing Time</div>
+         </div>
+     </div>
+     '''
+ 
+ 
+ def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
+     """Generate tokenizer information card"""
+ 
+     dialect_badges = ''.join([f'<span class="badge dialect">{d}</span>' for d in info.dialect_support])
+     feature_badges = ''.join([f'<span class="badge feature">{f}</span>' for f in info.special_features])
+ 
+     support_class = "native" if info.arabic_support == "Native" else "supported" if info.arabic_support == "Supported" else "limited"
+ 
+     return f'''
+     <div class="info-card">
+         <div class="info-header">
+             <h3>{info.name}</h3>
+             <span class="org-badge">{info.organization}</span>
+         </div>
+ 
+         <p class="description">{info.description}</p>
+ 
+         <div class="info-grid">
+             <div class="info-item">
+                 <span class="info-label">Type:</span>
+                 <span class="info-value">{info.type.value}</span>
+             </div>
+             <div class="info-item">
+                 <span class="info-label">Algorithm:</span>
+                 <span class="info-value">{info.algorithm.value}</span>
+             </div>
+             <div class="info-item">
+                 <span class="info-label">Vocab Size:</span>
+                 <span class="info-value">{info.vocab_size:,}</span>
+             </div>
+             <div class="info-item">
+                 <span class="info-label">Arabic Support:</span>
+                 <span class="info-value support-{support_class}">{info.arabic_support}</span>
+             </div>
+         </div>
+ 
+         <div class="badge-container">
+             <div class="badge-group">
+                 <span class="badge-label">Dialects:</span>
+                 {dialect_badges}
+             </div>
+             <div class="badge-group">
+                 <span class="badge-label">Features:</span>
+                 {feature_badges}
+             </div>
+         </div>
+     </div>
+     '''
+ 
+ 
+ def generate_decoded_section(metrics: TokenizationMetrics) -> str:
+     """Generate decoded output section"""
+     return f'''
+     <div class="decoded-section">
+         <h4>Decoded Output</h4>
+         <div class="decoded-text" dir="auto">{metrics.decoded_text}</div>
+         <div class="decoded-meta">
+             Diacritics preserved: {'✅ Yes' if metrics.diacritic_preservation else '❌ No'}
+         </div>
+     </div>
+     '''
+ 
+ 
+ def generate_about_html(tokenizers_by_type: dict, total_count: int) -> str:
+     """Generate About page HTML"""
+ 
+     # Build tokenizer lists
+     sections = []
+     for category, tokenizers in tokenizers_by_type.items():
+         if tokenizers:
+             items = ''.join([f'<li>{t}</li>' for t in tokenizers[:12]])
+             if len(tokenizers) > 12:
+                 items += f'<li><em>...and {len(tokenizers) - 12} more</em></li>'
+             sections.append(f'''
+             <div class="about-category">
+                 <h4>{category}</h4>
+                 <ul>{items}</ul>
+             </div>
+             ''')
+ 
+     return f'''
+     <div class="about-container">
+         <div class="about-header">
+             <h2>🏟️ Arabic Tokenizer Arena Pro</h2>
+             <p class="about-subtitle">A comprehensive platform for evaluating Arabic tokenizers across multiple dimensions</p>
+         </div>
+ 
+         <div class="about-stats">
+             <div class="stat-card">
+                 <div class="stat-value">{total_count}</div>
+                 <div class="stat-label">Available Tokenizers</div>
+             </div>
+             <div class="stat-card">
+                 <div class="stat-value">8</div>
+                 <div class="stat-label">Evaluation Datasets</div>
+             </div>
+             <div class="stat-card">
+                 <div class="stat-value">8+</div>
+                 <div class="stat-label">Metrics</div>
+             </div>
+         </div>
+ 
+         <div class="about-tokenizers">
+             <h3>📚 Available Tokenizers</h3>
+             <div class="tokenizer-grid">
+                 {''.join(sections)}
+             </div>
+         </div>
+ 
+         <div class="about-features">
+             <h3>✨ Features</h3>
+             <div class="feature-grid">
+                 <div class="feature-item">
+                     <span class="feature-icon">📊</span>
+                     <span>Comprehensive efficiency metrics (fertility, compression, STRR)</span>
+                 </div>
+                 <div class="feature-item">
+                     <span class="feature-icon">🌍</span>
+                     <span>Arabic-specific analysis (dialect support, diacritic preservation)</span>
+                 </div>
+                 <div class="feature-item">
+                     <span class="feature-icon">⚖️</span>
+                     <span>Side-by-side tokenizer comparison</span>
+                 </div>
+                 <div class="feature-item">
+                     <span class="feature-icon">🎨</span>
+                     <span>Beautiful token visualization</span>
+                 </div>
+                 <div class="feature-item">
+                     <span class="feature-icon">🏆</span>
+                     <span>Leaderboard with real HuggingFace datasets</span>
+                 </div>
+                 <div class="feature-item">
+                     <span class="feature-icon">📖</span>
+                     <span>Support for MSA, dialectal, and Classical Arabic</span>
+                 </div>
+             </div>
+         </div>
+ 
+         <div class="about-usecases">
+             <h3>🎯 Use Cases</h3>
+             <div class="usecase-grid">
+                 <div class="usecase-card">
+                     <h4>🔬 Research</h4>
+                     <p>Compare tokenizers for Arabic NLP experiments</p>
+                 </div>
+                 <div class="usecase-card">
+                     <h4>🚀 Production</h4>
+                     <p>Select optimal tokenizer for deployment</p>
+                 </div>
+                 <div class="usecase-card">
+                     <h4>📚 Education</h4>
+                     <p>Understand how different algorithms handle Arabic</p>
+                 </div>
+                 <div class="usecase-card">
+                     <h4>💰 Optimization</h4>
+                     <p>Identify cost-efficient tokenizers for API usage</p>
+                 </div>
+             </div>
+         </div>
+ 
+         <div class="about-footer">
+             <p>Built with ❤️ for the Arabic NLP community</p>
+         </div>
+     </div>
+     '''
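
The HTML builders are pure functions over tokens, IDs, and metric objects, so they can be exercised outside the app; a minimal sketch run from the project root (the model ID is just an example, any HF tokenizer with `convert_ids_to_tokens` works):

```python
from transformers import AutoTokenizer
from ui_components import generate_token_visualization

tok = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")  # example model
ids = tok.encode("اللغة العربية جميلة", add_special_tokens=False)
tokens = tok.convert_ids_to_tokens(ids)

html = generate_token_visualization(tokens, ids)  # render via gr.HTML(html)
```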
utils.py ADDED
@@ -0,0 +1,56 @@
+ """
+ Arabic Text Utilities
+ =====================
+ Helper functions for Arabic text analysis
+ """
+ 
+ import re
+ from typing import List
+ 
+ 
+ def is_arabic_char(char: str) -> bool:
+     """Check if character is Arabic"""
+     if len(char) != 1:
+         return False
+     code = ord(char)
+     return (
+         (0x0600 <= code <= 0x06FF) or  # Arabic
+         (0x0750 <= code <= 0x077F) or  # Arabic Supplement
+         (0x08A0 <= code <= 0x08FF) or  # Arabic Extended-A
+         (0xFB50 <= code <= 0xFDFF) or  # Arabic Presentation Forms-A
+         (0xFE70 <= code <= 0xFEFF)     # Arabic Presentation Forms-B
+     )
+ 
+ 
+ def count_arabic_chars(text: str) -> int:
+     """Count Arabic characters in text"""
+     return sum(1 for c in text if is_arabic_char(c))
+ 
+ 
+ def has_diacritics(text: str) -> bool:
+     """Check if text contains Arabic diacritics (tashkeel)"""
+     diacritics = set('ًٌٍَُِّْٰ')
+     return any(c in diacritics for c in text)
+ 
+ 
+ def normalize_arabic(text: str) -> str:
+     """Basic Arabic normalization"""
+     # Normalize alef variants
+     text = re.sub('[إأآا]', 'ا', text)
+     # Normalize yeh
+     text = re.sub('ى', 'ي', text)
+     # Normalize teh marbuta
+     text = re.sub('ة', 'ه', text)
+     return text
+ 
+ 
+ def get_arabic_words(text: str) -> List[str]:
+     """Extract Arabic words from text"""
+     words = text.split()
+     return [w for w in words if any(is_arabic_char(c) for c in w)]
+ 
+ 
+ def remove_diacritics(text: str) -> str:
+     """Remove Arabic diacritics from text"""
+     diacritics = 'ًٌٍَُِّْٰ'
+     return ''.join(c for c in text if c not in diacritics)
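
Finally, a short sanity check of the text utilities, with expected output in comments:

```python
from utils import has_diacritics, remove_diacritics, normalize_arabic, count_arabic_chars

s = "قَالَ"  # "he said", written with diacritics
print(has_diacritics(s))               # True
print(remove_diacritics(s))            # قال
print(normalize_arabic("مدرسة"))       # مدرسه (teh marbuta -> heh)
print(count_arabic_chars("hi مرحبا"))  # 5
```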