HeshamHaroon committed on
Commit
59baef6
·
verified ·
1 Parent(s): 06d6710

Update app.py

Files changed (1)
  1. app.py +923 -727
app.py CHANGED
@@ -4,10 +4,13 @@ Arabic Tokenizer Arena Pro - Advanced Arabic Tokenization Analysis Platform
4
  A comprehensive research and production-grade tool for evaluating Arabic tokenizers
5
  across multiple dimensions: efficiency, coverage, morphological awareness, and more.
6
 
 
 
7
  Supports:
8
  - Arabic-specific tokenizers (Aranizer, AraBERT, CAMeLBERT, MARBERT, etc.)
9
  - Major LLM tokenizers (Jais, AceGPT, Falcon-Arabic, ALLaM, Qwen, Llama, Mistral, GPT)
10
  - Comprehensive evaluation metrics based on latest research
 
11
  """
12
 
13
  import gradio as gr
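For orientation, everything the app does downstream reduces to one round trip through a Hugging Face tokenizer; a minimal sketch (the model ID is one of the ungated registry entries below, the sample sentence is illustrative):

```python
# Minimal sketch of the per-analysis round trip; xlm-roberta-base appears in
# the registry below and needs no authentication.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
text = "تعلم الآلة يغير العالم"  # illustrative Arabic sentence
ids = tok.encode(text, add_special_tokens=False)
tokens = tok.convert_ids_to_tokens(ids)
print(len(ids), tokens)  # the token count feeds every efficiency metric
print(tok.decode(ids))   # the round trip backs the diacritics-preservation check
```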
@@ -18,6 +21,8 @@ import unicodedata
18
  from typing import Dict, List, Tuple, Optional, Any
19
  from dataclasses import dataclass, field
20
  from enum import Enum
 
 
21
  import os
22
 
23
  # Hugging Face authentication
@@ -30,6 +35,9 @@ if HF_TOKEN:
30
  from transformers import AutoTokenizer, logging
31
  logging.set_verbosity_error()
32
 
 
 
 
33
  # ============================================================================
34
  # DATA CLASSES AND ENUMS
35
  # ============================================================================
@@ -204,30 +212,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
204
  dialect_support=["MSA"],
205
  special_features=["100K vocabulary", "MSA focused"]
206
  ),
207
- "asafaya/bert-base-arabic": TokenizerInfo(
208
- name="Arabic BERT (Safaya)",
209
- model_id="asafaya/bert-base-arabic",
210
- type=TokenizerType.ENCODER_ONLY,
211
- algorithm=TokenizerAlgorithm.WORDPIECE,
212
- vocab_size=32000,
213
- description="Arabic BERT trained on MSA and dialectal Arabic",
214
- organization="Safaya",
215
- arabic_support="Native",
216
- dialect_support=["MSA", "DA"],
217
- special_features=["TPU trained", "Dialect support"]
218
- ),
219
- "UBC-NLP/AraT5-base": TokenizerInfo(
220
- name="AraT5 Base",
221
- model_id="UBC-NLP/AraT5-base",
222
- type=TokenizerType.ENCODER_ONLY,
223
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
224
- vocab_size=110000,
225
- description="Arabic text-to-text transformer for generation tasks",
226
- organization="UBC NLP",
227
- arabic_support="Native",
228
- dialect_support=["MSA", "Tweet"],
229
- special_features=["Text-to-Text", "Generation optimized"]
230
- ),
231
 
232
  # ========== ARABIC-SPECIFIC TOKENIZERS ==========
233
  "riotu-lab/Aranizer-PBE-86k": TokenizerInfo(
@@ -254,30 +238,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
254
  dialect_support=["MSA"],
255
  special_features=["Low fertility", "SentencePiece", "86K vocab"]
256
  ),
257
- "riotu-lab/Aranizer-PBE-32k": TokenizerInfo(
258
- name="Aranizer PBE 32K",
259
- model_id="riotu-lab/Aranizer-PBE-32k",
260
- type=TokenizerType.ARABIC_SPECIFIC,
261
- algorithm=TokenizerAlgorithm.BPE,
262
- vocab_size=32000,
263
- description="Compact PBE tokenizer for Arabic",
264
- organization="RIOTU Lab",
265
- arabic_support="Native",
266
- dialect_support=["MSA"],
267
- special_features=["Compact", "LLM compatible"]
268
- ),
269
- "riotu-lab/Aranizer-SP-32k": TokenizerInfo(
270
- name="Aranizer SP 32K",
271
- model_id="riotu-lab/Aranizer-SP-32k",
272
- type=TokenizerType.ARABIC_SPECIFIC,
273
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
274
- vocab_size=32000,
275
- description="Compact SentencePiece tokenizer for Arabic",
276
- organization="RIOTU Lab",
277
- arabic_support="Native",
278
- dialect_support=["MSA"],
279
- special_features=["Compact", "Efficient"]
280
- ),
281
 
282
  # ========== ARABIC-SPECIFIC LLMs ==========
283
  "ALLaM-AI/ALLaM-7B-Instruct-preview": TokenizerInfo(
@@ -328,18 +288,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
328
  dialect_support=["MSA"],
329
  special_features=["LLaMA-based", "Cultural alignment", "RLHF", "Chat"]
330
  ),
331
- "FreedomIntelligence/AceGPT-7B-chat": TokenizerInfo(
332
- name="AceGPT 7B Chat",
333
- model_id="FreedomIntelligence/AceGPT-7B-chat",
334
- type=TokenizerType.ARABIC_LLM,
335
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
336
- vocab_size=32000,
337
- description="Smaller Arabic-enhanced LLaMA variant with chat",
338
- organization="Freedom Intelligence",
339
- arabic_support="Adapted",
340
- dialect_support=["MSA"],
341
- special_features=["LLaMA-based", "Efficient", "Chat"]
342
- ),
343
  "silma-ai/SILMA-9B-Instruct-v1.0": TokenizerInfo(
344
  name="SILMA 9B Instruct",
345
  model_id="silma-ai/SILMA-9B-Instruct-v1.0",
@@ -352,18 +300,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
352
  dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
353
  special_features=["Gemma-based", "SOTA 9B class", "Efficient"]
354
  ),
355
- "silma-ai/SILMA-Kashif-2B-Instruct-v1.0": TokenizerInfo(
356
- name="SILMA Kashif 2B (RAG)",
357
- model_id="silma-ai/SILMA-Kashif-2B-Instruct-v1.0",
358
- type=TokenizerType.ARABIC_LLM,
359
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
360
- vocab_size=256000,
361
- description="RAG-optimized Arabic model, excellent for context-based QA",
362
- organization="SILMA AI",
363
- arabic_support="Native",
364
- dialect_support=["MSA"],
365
- special_features=["RAG optimized", "12K context", "Compact"]
366
- ),
367
  "QCRI/Fanar-1-9B-Instruct": TokenizerInfo(
368
  name="Fanar 9B Instruct",
369
  model_id="QCRI/Fanar-1-9B-Instruct",
@@ -376,54 +312,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
376
  dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
377
  special_features=["Islamic RAG", "Cultural alignment", "Gemma-based"]
378
  ),
379
- "tiiuae/Falcon-Arabic-7B-Instruct": TokenizerInfo(
380
- name="Falcon Arabic 7B Instruct",
381
- model_id="tiiuae/Falcon-Arabic-7B-Instruct",
382
- type=TokenizerType.ARABIC_LLM,
383
- algorithm=TokenizerAlgorithm.BPE,
384
- vocab_size=97024,
385
- description="SOTA Arabic LLM from TII, outperforms models 4x its size",
386
- organization="Technology Innovation Institute",
387
- arabic_support="Native",
388
- dialect_support=["MSA", "Gulf", "Egyptian", "Levantine", "Maghrebi"],
389
- special_features=["Falcon3-based", "32K context", "DPO aligned"]
390
- ),
391
- "tiiuae/Falcon-Arabic-7B-Base": TokenizerInfo(
392
- name="Falcon Arabic 7B Base",
393
- model_id="tiiuae/Falcon-Arabic-7B-Base",
394
- type=TokenizerType.ARABIC_LLM,
395
- algorithm=TokenizerAlgorithm.BPE,
396
- vocab_size=97024,
397
- description="Base model of Falcon Arabic for fine-tuning",
398
- organization="Technology Innovation Institute",
399
- arabic_support="Native",
400
- dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
401
- special_features=["Falcon3-based", "Fine-tuning ready"]
402
- ),
403
- "CohereForAI/c4ai-command-r7b-arabic-02-2025": TokenizerInfo(
404
- name="Cohere Command R7B Arabic",
405
- model_id="CohereForAI/c4ai-command-r7b-arabic-02-2025",
406
- type=TokenizerType.ARABIC_LLM,
407
- algorithm=TokenizerAlgorithm.BPE,
408
- vocab_size=256000,
409
- description="Cohere's Arabic-optimized model for RAG and enterprise use",
410
- organization="Cohere",
411
- arabic_support="Native",
412
- dialect_support=["MSA"],
413
- special_features=["RAG optimized", "128K context", "Enterprise ready"]
414
- ),
415
- "stabilityai/ar-stablelm-2-chat": TokenizerInfo(
416
- name="Arabic StableLM 2 Chat",
417
- model_id="stabilityai/ar-stablelm-2-chat",
418
- type=TokenizerType.ARABIC_LLM,
419
- algorithm=TokenizerAlgorithm.BPE,
420
- vocab_size=100289,
421
- description="Stability AI's Arabic instruction-tuned 1.6B model",
422
- organization="Stability AI",
423
- arabic_support="Native",
424
- dialect_support=["MSA"],
425
- special_features=["Compact 1.6B", "Chat optimized", "Efficient"]
426
- ),
427
  "Navid-AI/Yehia-7B-preview": TokenizerInfo(
428
  name="Yehia 7B Preview",
429
  model_id="Navid-AI/Yehia-7B-preview",
@@ -450,30 +338,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
450
  dialect_support=["Darija", "MSA"],
451
  special_features=["Moroccan dialect", "Transliteration", "Cultural"]
452
  ),
453
- "MBZUAI-Paris/Atlas-Chat-2B": TokenizerInfo(
454
- name="Atlas-Chat 2B (Darija)",
455
- model_id="MBZUAI-Paris/Atlas-Chat-2B",
456
- type=TokenizerType.ARABIC_LLM,
457
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
458
- vocab_size=256000,
459
- description="Compact Moroccan Arabic model for edge deployment",
460
- organization="MBZUAI Paris",
461
- arabic_support="Native",
462
- dialect_support=["Darija", "MSA"],
463
- special_features=["Compact", "Moroccan dialect", "Edge-ready"]
464
- ),
465
- "MBZUAI-Paris/Atlas-Chat-27B": TokenizerInfo(
466
- name="Atlas-Chat 27B (Darija)",
467
- model_id="MBZUAI-Paris/Atlas-Chat-27B",
468
- type=TokenizerType.ARABIC_LLM,
469
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
470
- vocab_size=256000,
471
- description="Largest Moroccan Arabic model with best performance",
472
- organization="MBZUAI Paris",
473
- arabic_support="Native",
474
- dialect_support=["Darija", "MSA"],
475
- special_features=["27B params", "Moroccan dialect", "SOTA Darija"]
476
- ),
477
 
478
  # ========== MULTILINGUAL LLMs WITH ARABIC SUPPORT ==========
479
  "Qwen/Qwen2.5-7B": TokenizerInfo(
@@ -488,18 +352,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
488
  dialect_support=["MSA"],
489
  special_features=["152K vocab", "128K context", "30+ languages"]
490
  ),
491
- "Qwen/Qwen2.5-14B-Instruct": TokenizerInfo(
492
- name="Qwen 2.5 14B Instruct",
493
- model_id="Qwen/Qwen2.5-14B-Instruct",
494
- type=TokenizerType.MULTILINGUAL_LLM,
495
- algorithm=TokenizerAlgorithm.BPE,
496
- vocab_size=151936,
497
- description="Larger Qwen with enhanced Arabic capabilities",
498
- organization="Alibaba Qwen",
499
- arabic_support="Supported",
500
- dialect_support=["MSA"],
501
- special_features=["14B params", "Strong Arabic", "Instruct tuned"]
502
- ),
503
  "google/gemma-2-9b": TokenizerInfo(
504
  name="Gemma 2 9B",
505
  model_id="google/gemma-2-9b",
@@ -512,42 +364,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
512
  dialect_support=["MSA"],
513
  special_features=["256K vocab", "Efficient architecture"]
514
  ),
515
- "google/gemma-2-9b-it": TokenizerInfo(
516
- name="Gemma 2 9B Instruct",
517
- model_id="google/gemma-2-9b-it",
518
- type=TokenizerType.MULTILINGUAL_LLM,
519
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
520
- vocab_size=256000,
521
- description="Instruction-tuned Gemma with Arabic support",
522
- organization="Google",
523
- arabic_support="Supported",
524
- dialect_support=["MSA"],
525
- special_features=["Instruct tuned", "256K vocab"]
526
- ),
527
- "CohereForAI/aya-expanse-8b": TokenizerInfo(
528
- name="Aya Expanse 8B",
529
- model_id="CohereForAI/aya-expanse-8b",
530
- type=TokenizerType.MULTILINGUAL_LLM,
531
- algorithm=TokenizerAlgorithm.BPE,
532
- vocab_size=256000,
533
- description="Cohere's multilingual model with strong Arabic support",
534
- organization="Cohere",
535
- arabic_support="Supported",
536
- dialect_support=["MSA"],
537
- special_features=["23 languages", "Arabic optimized"]
538
- ),
539
- "CohereForAI/aya-expanse-32b": TokenizerInfo(
540
- name="Aya Expanse 32B",
541
- model_id="CohereForAI/aya-expanse-32b",
542
- type=TokenizerType.MULTILINGUAL_LLM,
543
- algorithm=TokenizerAlgorithm.BPE,
544
- vocab_size=256000,
545
- description="Large Aya model with enhanced multilingual capabilities",
546
- organization="Cohere",
547
- arabic_support="Supported",
548
- dialect_support=["MSA"],
549
- special_features=["32B params", "23 languages"]
550
- ),
551
  "mistralai/Mistral-7B-v0.3": TokenizerInfo(
552
  name="Mistral 7B v0.3",
553
  model_id="mistralai/Mistral-7B-v0.3",
@@ -572,30 +388,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
572
  dialect_support=["MSA"],
573
  special_features=["Tekken tokenizer", "131K vocab", "Multilingual optimized"]
574
  ),
575
- "microsoft/Phi-3.5-mini-instruct": TokenizerInfo(
576
- name="Phi-3.5 Mini Instruct",
577
- model_id="microsoft/Phi-3.5-mini-instruct",
578
- type=TokenizerType.MULTILINGUAL_LLM,
579
- algorithm=TokenizerAlgorithm.BPE,
580
- vocab_size=32064,
581
- description="Microsoft's compact multilingual model",
582
- organization="Microsoft",
583
- arabic_support="Limited",
584
- dialect_support=["MSA"],
585
- special_features=["Compact", "3.8B params"]
586
- ),
587
- "google/mt5-base": TokenizerInfo(
588
- name="mT5 Base",
589
- model_id="google/mt5-base",
590
- type=TokenizerType.MULTILINGUAL_LLM,
591
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
592
- vocab_size=250112,
593
- description="Multilingual T5 covering 101 languages",
594
- organization="Google",
595
- arabic_support="Supported",
596
- dialect_support=["MSA"],
597
- special_features=["250K vocab", "101 languages", "Seq2Seq"]
598
- ),
599
  "xlm-roberta-base": TokenizerInfo(
600
  name="XLM-RoBERTa Base",
601
  model_id="xlm-roberta-base",
@@ -620,8 +412,6 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
620
  dialect_support=["MSA"],
621
  special_features=["Baseline model", "104 languages"]
622
  ),
623
-
624
- # ========== FALCON FAMILY ==========
625
  "tiiuae/falcon-7b": TokenizerInfo(
626
  name="Falcon 7B",
627
  model_id="tiiuae/falcon-7b",
@@ -634,96 +424,109 @@ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
634
  dialect_support=["MSA"],
635
  special_features=["65K vocab", "RefinedWeb trained"]
636
  ),
637
- "tiiuae/falcon-7b-instruct": TokenizerInfo(
638
- name="Falcon 7B Instruct",
639
- model_id="tiiuae/falcon-7b-instruct",
640
- type=TokenizerType.MULTILINGUAL_LLM,
641
- algorithm=TokenizerAlgorithm.BPE,
642
- vocab_size=65024,
643
- description="Instruction-tuned Falcon 7B",
644
- organization="Technology Innovation Institute",
645
- arabic_support="Limited",
646
- dialect_support=["MSA"],
647
- special_features=["Instruct tuned", "Chat ready"]
648
- ),
649
  }
650
 
651
- # Try to load gated/authenticated models
652
- GATED_MODELS = [
653
- ("meta-llama/Meta-Llama-3-8B", TokenizerInfo(
654
- name="Llama 3 8B",
655
- model_id="meta-llama/Meta-Llama-3-8B",
656
- type=TokenizerType.MULTILINGUAL_LLM,
657
- algorithm=TokenizerAlgorithm.BPE,
658
- vocab_size=128256,
659
- description="Meta's latest LLM with improved multilingual",
660
- organization="Meta AI",
661
- arabic_support="Limited",
662
- dialect_support=["MSA"],
663
- special_features=["128K vocab", "Improved tokenizer"]
664
- )),
665
- ("meta-llama/Llama-3.1-8B-Instruct", TokenizerInfo(
666
- name="Llama 3.1 8B Instruct",
667
- model_id="meta-llama/Llama-3.1-8B-Instruct",
668
- type=TokenizerType.MULTILINGUAL_LLM,
669
- algorithm=TokenizerAlgorithm.BPE,
670
- vocab_size=128256,
671
- description="Latest Llama with instruction tuning",
672
- organization="Meta AI",
673
- arabic_support="Limited",
674
- dialect_support=["MSA"],
675
- special_features=["128K context", "Tool use"]
676
- )),
677
- ("meta-llama/Llama-3.2-1B-Instruct", TokenizerInfo(
678
- name="Llama 3.2 1B Instruct",
679
- model_id="meta-llama/Llama-3.2-1B-Instruct",
680
- type=TokenizerType.MULTILINGUAL_LLM,
681
- algorithm=TokenizerAlgorithm.BPE,
682
- vocab_size=128256,
683
- description="Compact Llama for edge deployment",
684
- organization="Meta AI",
685
- arabic_support="Limited",
686
- dialect_support=["MSA"],
687
- special_features=["Compact 1B", "Edge ready"]
688
- )),
689
- ("meta-llama/Llama-3.3-70B-Instruct", TokenizerInfo(
690
- name="Llama 3.3 70B Instruct",
691
- model_id="meta-llama/Llama-3.3-70B-Instruct",
692
- type=TokenizerType.MULTILINGUAL_LLM,
693
- algorithm=TokenizerAlgorithm.BPE,
694
- vocab_size=128256,
695
- description="Large Llama with best Arabic among Llama family",
696
- organization="Meta AI",
697
- arabic_support="Supported",
698
- dialect_support=["MSA"],
699
- special_features=["70B params", "Best Llama Arabic"]
700
- )),
701
- ("meta-llama/Llama-2-7b-hf", TokenizerInfo(
702
- name="Llama 2 7B",
703
- model_id="meta-llama/Llama-2-7b-hf",
704
- type=TokenizerType.MULTILINGUAL_LLM,
705
- algorithm=TokenizerAlgorithm.SENTENCEPIECE,
706
- vocab_size=32000,
707
- description="Meta's Llama 2 base model",
708
- organization="Meta AI",
709
- arabic_support="Limited",
710
- dialect_support=["MSA"],
711
- special_features=["32K vocab", "Foundation model"]
712
- )),
713
- # Additional gated Arabic models
714
- ("CohereLabs/c4ai-command-a-03-2025", TokenizerInfo(
715
- name="Cohere Command A 111B",
716
- model_id="CohereLabs/c4ai-command-a-03-2025",
717
- type=TokenizerType.MULTILINGUAL_LLM,
718
- algorithm=TokenizerAlgorithm.BPE,
719
- vocab_size=256000,
720
- description="Cohere's flagship 111B model with Arabic support",
721
- organization="Cohere",
722
- arabic_support="Supported",
723
- dialect_support=["MSA"],
724
- special_features=["111B params", "256K context", "23 languages"]
725
- )),
726
- ]
727
 
728
  # ============================================================================
729
  # TOKENIZER LOADER AND CACHE
@@ -751,15 +554,6 @@ class TokenizerManager:
751
  except Exception as e:
752
  print(f" βœ— {info.name}: {str(e)[:50]}")
753
 
754
- # Try gated models
755
- for model_id, info in GATED_MODELS:
756
- try:
757
- _ = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
758
- self._available[model_id] = info
759
- print(f" βœ“ {info.name} (gated)")
760
- except Exception as e:
761
- print(f" βœ— {info.name} (gated): {str(e)[:50]}")
762
-
763
  print(f"\nTotal available tokenizers: {len(self._available)}")
764
 
765
  def get_tokenizer(self, model_id: str):
@@ -814,19 +608,8 @@ def has_diacritics(text: str) -> bool:
814
  diacritics = set('ًٌٍَُِّْٰ')
815
  return any(c in diacritics for c in text)
816
 
817
- def normalize_arabic(text: str) -> str:
818
- """Basic Arabic normalization"""
819
- # Normalize alef variants
820
- text = re.sub('[إأآا]', 'ا', text)
821
- # Normalize yeh
822
- text = re.sub('ى', 'ي', text)
823
- # Normalize teh marbuta
824
- text = re.sub('ة', 'ه', text)
825
- return text
826
-
827
  def get_arabic_words(text: str) -> List[str]:
828
  """Extract Arabic words from text"""
829
- # Split on whitespace and filter for words containing Arabic
830
  words = text.split()
831
  return [w for w in words if any(is_arabic_char(c) for c in w)]
832
 
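A quick illustration of what these helpers return on mixed-script input (assuming `is_arabic_char` tests membership in the Arabic Unicode block, which is how the callers above use it):

```python
# Hypothetical mixed-script example: only whitespace-separated "words" that
# contain at least one Arabic character pass the get_arabic_words filter.
sample = "نموذج GPT يدعم العربية منذ 2023"
print(get_arabic_words(sample))  # ['نموذج', 'يدعم', 'العربية', 'منذ']
print(has_diacritics(sample))    # False - no harakat present
print(has_diacritics("كَتَبَ"))     # True - fatha marks from the set above
```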
@@ -920,6 +703,439 @@ def analyze_tokenization(
920
  decoded_text=decoded
921
  )
922
 
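The headline numbers `analyze_tokenization` returns follow the standard definitions; a self-contained sketch using whitespace word boundaries (the app's own implementation may handle edge cases differently):

```python
# Sketch of the three headline metrics, assuming whitespace word splitting
# and STRR = share of words that survive as exactly one token.
def headline_metrics(tokenizer, text: str) -> dict:
    ids = tokenizer.encode(text, add_special_tokens=False)
    words = text.split() or [text]
    per_word = [len(tokenizer.encode(w, add_special_tokens=False)) for w in words]
    return {
        "fertility": len(ids) / len(words),  # tokens per word, lower is better
        "compression_ratio": len(text.encode("utf-8")) / max(len(ids), 1),  # bytes per token
        "strr": sum(n == 1 for n in per_word) / len(words),
    }
```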
923
  # ============================================================================
924
  # UI GENERATION FUNCTIONS
925
  # ============================================================================
@@ -927,9 +1143,8 @@ def analyze_tokenization(
927
  def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
928
  """Generate beautiful HTML visualization of tokens"""
929
 
930
- # Color palette for tokens (alternating for clarity)
931
  colors = [
932
- ('#1a1a2e', '#eaeaea'), # Dark blue bg, light text
933
  ('#16213e', '#f0f0f0'),
934
  ('#0f3460', '#ffffff'),
935
  ('#533483', '#f5f5f5'),
@@ -942,10 +1157,7 @@ def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str
942
  html_parts = []
943
  for i, (token, tid) in enumerate(zip(tokens, token_ids)):
944
  bg, fg = colors[i % len(colors)]
945
- # Escape HTML entities
946
  display_token = token.replace('<', '&lt;').replace('>', '&gt;')
947
-
948
- # Determine if token is Arabic
949
  is_arabic = any(is_arabic_char(c) for c in token)
950
  direction = 'rtl' if is_arabic else 'ltr'
951
 
@@ -969,7 +1181,6 @@ def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str
969
  def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
970
  """Generate metrics visualization card"""
971
 
972
- # Determine quality indicators
973
  fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor"
974
  strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor"
975
  compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor"
@@ -999,34 +1210,33 @@ def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) ->
999
  <div class="metric-card {strr_quality}">
1000
  <div class="metric-icon">✨</div>
1001
  <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
1002
- <div class="metric-label">Single Token Rate (STRR)</div>
1003
  <div class="metric-hint">Higher is better</div>
1004
  </div>
1005
 
1006
  <div class="metric-card">
1007
- <div class="metric-icon">πŸ“</div>
1008
  <div class="metric-value">{metrics.char_per_token:.2f}</div>
1009
  <div class="metric-label">Characters/Token</div>
1010
  </div>
1011
 
1012
- <div class="metric-card">
1013
- <div class="metric-icon">⚑</div>
1014
- <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
1015
- <div class="metric-label">Processing Time</div>
 
1016
  </div>
1017
 
1018
- <div class="metric-card arabic">
1019
- <div class="metric-icon">πŸ”€</div>
1020
  <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
1021
  <div class="metric-label">Arabic Fertility</div>
1022
- <div class="metric-hint">Arabic-specific efficiency</div>
1023
  </div>
1024
 
1025
  <div class="metric-card">
1026
- <div class="metric-icon">{"βœ…" if metrics.oov_percentage == 0 else "⚠️"}</div>
1027
- <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
1028
- <div class="metric-label">OOV Rate</div>
1029
- <div class="metric-hint">Lower is better (0% ideal)</div>
1030
  </div>
1031
  </div>
1032
  '''
@@ -1034,44 +1244,40 @@ def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) ->
1034
  def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
1035
  """Generate tokenizer information card"""
1036
 
1037
- dialect_badges = ' '.join([
1038
- f'<span class="dialect-badge">{d}</span>'
1039
- for d in info.dialect_support
1040
- ])
1041
 
1042
- feature_badges = ' '.join([
1043
- f'<span class="feature-badge">{f}</span>'
1044
- for f in info.special_features
1045
- ])
1046
-
1047
- support_class = info.arabic_support.lower().replace(' ', '-')
1048
 
1049
  return f'''
1050
- <div class="tokenizer-info">
1051
- <div class="tokenizer-header">
1052
  <h3>{info.name}</h3>
1053
  <span class="org-badge">{info.organization}</span>
1054
  </div>
1055
- <p class="tokenizer-desc">{info.description}</p>
1056
- <div class="tokenizer-meta">
1057
- <div class="meta-row">
1058
- <span class="meta-label">Type:</span>
1059
- <span class="meta-value">{info.type.value}</span>
 
 
1060
  </div>
1061
- <div class="meta-row">
1062
- <span class="meta-label">Algorithm:</span>
1063
- <span class="meta-value">{info.algorithm.value}</span>
1064
  </div>
1065
- <div class="meta-row">
1066
- <span class="meta-label">Vocab Size:</span>
1067
- <span class="meta-value">{info.vocab_size:,}</span>
1068
  </div>
1069
- <div class="meta-row">
1070
- <span class="meta-label">Arabic Support:</span>
1071
- <span class="support-badge {support_class}">{info.arabic_support}</span>
1072
  </div>
1073
  </div>
1074
- <div class="tokenizer-badges">
 
1075
  <div class="badge-group">
1076
  <span class="badge-label">Dialects:</span>
1077
  {dialect_badges}
@@ -1084,39 +1290,43 @@ def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
1084
  </div>
1085
  '''
1086
 
1087
- # ============================================================================
1088
- # MAIN ANALYSIS FUNCTION
1089
- # ============================================================================
1090
-
1091
  def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
1092
- """Analyze text with a single tokenizer"""
1093
 
1094
- if not text.strip():
1095
  return (
1096
- "<p class='warning'>Please enter some text to analyze.</p>",
1097
- "",
1098
- "",
1099
- ""
 
  )
1101
 
1102
  model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
1103
- info = tokenizer_manager.get_available_tokenizers()[model_id]
1104
 
1105
  try:
1106
- metrics = analyze_tokenization(text, model_id, info)
1107
 
1108
- # Generate all outputs
1109
- info_html = generate_tokenizer_info_card(info)
1110
- metrics_html = generate_metrics_card(metrics, info)
1111
  tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
1112
 
1113
- # Decoded text output
1114
  decoded_html = f'''
1115
  <div class="decoded-section">
1116
  <h4>Decoded Output</h4>
1117
  <div class="decoded-text" dir="auto">{metrics.decoded_text}</div>
1118
  <div class="decoded-meta">
1119
- <span>Diacritics preserved: {"✅ Yes" if metrics.diacritic_preservation else "❌ No"}</span>
1120
  </div>
1121
  </div>
1122
  '''
@@ -1124,139 +1334,146 @@ def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str
1124
  return info_html, metrics_html, tokens_html, decoded_html
1125
 
1126
  except Exception as e:
1127
- error_html = f'''
1128
- <div class="error-card">
1129
- <h4>Error analyzing with {info.name}</h4>
1130
- <p>{str(e)}</p>
1131
- </div>
1132
- '''
1133
- return error_html, "", "", ""
1134
 
1135
  def compare_tokenizers(tokenizer_choices: List[str], text: str) -> str:
1136
- """Compare multiple tokenizers side by side"""
1137
 
1138
- if not text.strip():
1139
- return "<p class='warning'>Please enter some text to analyze.</p>"
1140
 
1141
  if not tokenizer_choices or len(tokenizer_choices) < 2:
1142
- return "<p class='warning'>Please select at least 2 tokenizers to compare.</p>"
1143
 
1144
  results = []
1145
 
1146
  for choice in tokenizer_choices:
1147
  model_id = tokenizer_manager.get_model_id_from_choice(choice)
1148
- info = tokenizer_manager.get_available_tokenizers()[model_id]
1149
 
1150
- try:
1151
- metrics = analyze_tokenization(text, model_id, info)
1152
- results.append((info, metrics))
1153
- except Exception as e:
1154
- continue
1155
-
1156
- if not results:
1157
- return "<p class='error'>Failed to analyze with any selected tokenizers.</p>"
 
 
1159
- # Sort by fertility (best first)
1160
- results.sort(key=lambda x: x[1].fertility)
 
1162
  # Generate comparison table
1163
- table_rows = []
1164
- for i, (info, metrics) in enumerate(results):
1165
- rank_class = "rank-1" if i == 0 else "rank-2" if i == 1 else "rank-3" if i == 2 else ""
1166
-
1167
- table_rows.append(f'''
1168
- <tr class="{rank_class}">
1169
- <td class="rank-cell">{i + 1}</td>
1170
- <td class="name-cell">
1171
- <strong>{info.name}</strong>
1172
- <span class="org-small">{info.organization}</span>
1173
- </td>
1174
- <td class="metric-cell">{metrics.total_tokens}</td>
1175
- <td class="metric-cell highlight">{metrics.fertility:.3f}</td>
1176
- <td class="metric-cell">{metrics.compression_ratio:.2f}</td>
1177
- <td class="metric-cell">{metrics.single_token_retention_rate:.1%}</td>
1178
- <td class="metric-cell">{metrics.arabic_fertility:.3f}</td>
1179
- <td class="metric-cell">{metrics.oov_percentage:.1f}%</td>
1180
- <td class="metric-cell">{metrics.tokenization_time_ms:.2f}ms</td>
1181
- </tr>
1182
- ''')
1183
-
1184
- return f'''
1185
  <div class="comparison-container">
1186
- <h3>Tokenizer Comparison Results</h3>
1187
- <p class="comparison-subtitle">Ranked by fertility (lower is better)</p>
1188
  <table class="comparison-table">
1189
  <thead>
1190
  <tr>
1191
- <th>#</th>
1192
  <th>Tokenizer</th>
 
1193
  <th>Tokens</th>
1194
  <th>Fertility ↓</th>
1195
- <th>Compression</th>
1196
- <th>STRR</th>
1197
- <th>Arabic Fertility</th>
1198
  <th>OOV %</th>
1199
- <th>Time</th>
1200
  </tr>
1201
  </thead>
1202
  <tbody>
1203
- {''.join(table_rows)}
1204
  </tbody>
1205
  </table>
1206
- <div class="comparison-legend">
1207
- <span class="legend-item"><span class="legend-color rank-1"></span> Best</span>
1208
- <span class="legend-item"><span class="legend-color rank-2"></span> Runner-up</span>
1209
- <span class="legend-item"><span class="legend-color rank-3"></span> Third</span>
1210
- </div>
1211
  </div>
1212
  '''
 
 
1213
 
1214
  # ============================================================================
1215
- # CSS STYLES
1216
  # ============================================================================
1217
 
1218
  CUSTOM_CSS = """
1219
- /* ===== GLOBAL STYLES ===== */
1220
  :root {
1221
- --primary: #0d47a1;
1222
- --primary-light: #1976d2;
1223
- --primary-dark: #002171;
1224
- --accent: #ff6f00;
1225
- --accent-light: #ffa040;
1226
- --success: #2e7d32;
1227
  --warning: #f57c00;
1228
  --error: #c62828;
1229
- --bg-dark: #0a0a0f;
1230
- --bg-card: #12121a;
1231
- --bg-elevated: #1a1a24;
1232
- --text-primary: #f5f5f5;
1233
- --text-secondary: #b0b0b0;
1234
- --border: #2a2a3a;
1235
- --gradient-1: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
1236
- --gradient-2: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
1237
- --gradient-arabic: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
1238
- }
1239
-
1240
- .gradio-container {
1241
- background: var(--bg-dark) !important;
1242
- font-family: 'IBM Plex Sans Arabic', 'Segoe UI', system-ui, sans-serif !important;
1243
  }
1244
 
1245
- /* ===== HEADER STYLES ===== */
1246
  .header-section {
1247
  text-align: center;
1248
- padding: 2rem;
1249
- background: var(--gradient-1);
1250
  border-radius: 16px;
1251
- margin-bottom: 2rem;
1252
  }
1253
 
1254
  .header-section h1 {
1255
  font-size: 2.5rem;
1256
- font-weight: 700;
1257
  color: white;
1258
  margin-bottom: 0.5rem;
1259
- text-shadow: 0 2px 10px rgba(0,0,0,0.3);
1260
  }
1261
 
1262
  .header-section p {
@@ -1264,85 +1481,138 @@ CUSTOM_CSS = """
1264
  font-size: 1.1rem;
1265
  }
1266
 
1267
- /* ===== TOKEN VISUALIZATION ===== */
1268
- .token-container {
1269
- display: flex;
1270
- flex-wrap: wrap;
1271
- gap: 8px;
1272
- padding: 1.5rem;
1273
  background: var(--bg-card);
1274
  border-radius: 12px;
 
1275
  border: 1px solid var(--border);
1276
- direction: rtl;
1277
  }
1278
 
1279
- .token {
1280
- display: inline-flex;
1281
- flex-direction: column;
1282
  align-items: center;
1283
- padding: 8px 12px;
1284
- border-radius: 8px;
1285
- font-family: 'IBM Plex Mono', monospace;
1286
- font-size: 0.95rem;
1287
- transition: transform 0.2s, box-shadow 0.2s;
1288
- cursor: default;
1289
  }
1290
 
1291
- .token:hover {
1292
- transform: translateY(-2px);
1293
- box-shadow: 0 4px 12px rgba(0,0,0,0.3);
1294
  }
1295
 
1296
- .token-id {
1297
- font-size: 0.7rem;
1298
- opacity: 0.7;
1299
- margin-top: 4px;
1300
  }
1301
 
1302
  /* ===== METRICS GRID ===== */
1303
  .metrics-grid {
1304
  display: grid;
1305
- grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
1306
  gap: 1rem;
1307
- padding: 1rem;
1308
  }
1309
 
1310
  .metric-card {
1311
  background: var(--bg-card);
1312
- border: 1px solid var(--border);
1313
  border-radius: 12px;
1314
- padding: 1.25rem;
1315
  text-align: center;
1316
- transition: transform 0.2s, border-color 0.2s;
 
1317
  }
1318
 
1319
  .metric-card:hover {
1320
- transform: translateY(-3px);
1321
- border-color: var(--primary-light);
1322
  }
1323
 
1324
  .metric-card.excellent {
1325
  border-color: var(--success);
1326
- background: linear-gradient(to bottom, rgba(46, 125, 50, 0.1), transparent);
1327
  }
1328
 
1329
  .metric-card.good {
1330
- border-color: var(--primary-light);
1331
- background: linear-gradient(to bottom, rgba(25, 118, 210, 0.1), transparent);
1332
  }
1333
 
1334
  .metric-card.poor {
1335
- border-color: var(--warning);
1336
- background: linear-gradient(to bottom, rgba(245, 124, 0, 0.1), transparent);
1337
  }
1338
 
1339
  .metric-card.primary {
1340
- background: var(--gradient-1);
1341
- }
1342
-
1343
- .metric-card.arabic {
1344
- background: linear-gradient(to bottom, rgba(17, 153, 142, 0.2), transparent);
1345
- border-color: #11998e;
1346
  }
1347
 
1348
  .metric-icon {
@@ -1351,16 +1621,15 @@ CUSTOM_CSS = """
1351
  }
1352
 
1353
  .metric-value {
1354
- font-size: 1.75rem;
1355
  font-weight: 700;
1356
  color: var(--text-primary);
1357
- margin-bottom: 0.25rem;
1358
  }
1359
 
1360
  .metric-label {
1361
- font-size: 0.85rem;
1362
  color: var(--text-secondary);
1363
- margin-bottom: 0.25rem;
1364
  }
1365
 
1366
  .metric-hint {
@@ -1369,244 +1638,117 @@ CUSTOM_CSS = """
1369
  opacity: 0.7;
1370
  }
1371
 
1372
- /* ===== TOKENIZER INFO ===== */
1373
- .tokenizer-info {
1374
- background: var(--bg-card);
1375
- border: 1px solid var(--border);
 
 
 
1376
  border-radius: 12px;
1377
- padding: 1.5rem;
1378
  }
1379
 
1380
- .tokenizer-header {
1381
- display: flex;
 
1382
  align-items: center;
1383
- gap: 1rem;
1384
- margin-bottom: 1rem;
 
  }
1386
 
1387
- .tokenizer-header h3 {
1388
- margin: 0;
1389
- color: var(--text-primary);
1390
- font-size: 1.5rem;
1391
  }
1392
 
1393
- .org-badge {
1394
- background: var(--gradient-1);
1395
- padding: 4px 12px;
1396
- border-radius: 20px;
1397
- font-size: 0.8rem;
1398
- color: white;
1399
  }
1400
 
1401
- .tokenizer-desc {
1402
- color: var(--text-secondary);
1403
- margin-bottom: 1rem;
1404
- line-height: 1.6;
 
 
1405
  }
1406
 
1407
- .tokenizer-meta {
1408
- display: grid;
1409
- grid-template-columns: repeat(2, 1fr);
1410
- gap: 0.75rem;
1411
  margin-bottom: 1rem;
1412
  }
1413
 
1414
- .meta-row {
1415
- display: flex;
1416
- gap: 0.5rem;
1417
- }
1418
-
1419
- .meta-label {
1420
- color: var(--text-secondary);
1421
- font-size: 0.85rem;
1422
- }
1423
-
1424
- .meta-value {
1425
  color: var(--text-primary);
1426
- font-weight: 500;
1427
- }
1428
-
1429
- .support-badge {
1430
- padding: 2px 8px;
1431
- border-radius: 4px;
1432
- font-size: 0.8rem;
1433
- }
1434
-
1435
- .support-badge.native {
1436
- background: var(--success);
1437
- color: white;
1438
- }
1439
-
1440
- .support-badge.adapted {
1441
- background: var(--primary-light);
1442
- color: white;
1443
- }
1444
-
1445
- .support-badge.supported {
1446
- background: var(--warning);
1447
- color: white;
1448
  }
1449
 
1450
- .support-badge.limited {
1451
- background: var(--error);
1452
- color: white;
1453
- }
1454
-
1455
- .tokenizer-badges {
1456
- display: flex;
1457
- flex-direction: column;
1458
- gap: 0.75rem;
1459
- }
1460
-
1461
- .badge-group {
1462
- display: flex;
1463
- flex-wrap: wrap;
1464
- align-items: center;
1465
- gap: 0.5rem;
1466
- }
1467
-
1468
- .badge-label {
1469
- color: var(--text-secondary);
1470
  font-size: 0.85rem;
1471
- }
1472
-
1473
- .dialect-badge, .feature-badge {
1474
- background: var(--bg-elevated);
1475
- border: 1px solid var(--border);
1476
- padding: 4px 10px;
1477
- border-radius: 6px;
1478
- font-size: 0.75rem;
1479
- color: var(--text-primary);
1480
  }
1481
 
1482
  /* ===== COMPARISON TABLE ===== */
1483
  .comparison-container {
1484
- background: var(--bg-card);
1485
- border-radius: 12px;
1486
- padding: 1.5rem;
1487
- border: 1px solid var(--border);
1488
- }
1489
-
1490
- .comparison-container h3 {
1491
- color: var(--text-primary);
1492
- margin-bottom: 0.25rem;
1493
- }
1494
-
1495
- .comparison-subtitle {
1496
- color: var(--text-secondary);
1497
- font-size: 0.9rem;
1498
- margin-bottom: 1.5rem;
1499
  }
1500
 
1501
  .comparison-table {
1502
  width: 100%;
1503
  border-collapse: collapse;
1504
- font-size: 0.9rem;
1505
  }
1506
 
1507
  .comparison-table th {
1508
- background: var(--bg-elevated);
1509
- color: var(--text-secondary);
1510
- padding: 12px 8px;
1511
  text-align: left;
1512
- font-weight: 500;
1513
- border-bottom: 2px solid var(--border);
1514
  }
1515
 
1516
  .comparison-table td {
1517
- padding: 12px 8px;
1518
  border-bottom: 1px solid var(--border);
1519
  color: var(--text-primary);
1520
  }
1521
 
1522
- .comparison-table tr.rank-1 {
1523
- background: linear-gradient(90deg, rgba(46, 125, 50, 0.2), transparent);
1524
  }
1525
 
1526
- .comparison-table tr.rank-2 {
1527
- background: linear-gradient(90deg, rgba(25, 118, 210, 0.15), transparent);
1528
  }
1529
 
1530
- .comparison-table tr.rank-3 {
1531
- background: linear-gradient(90deg, rgba(245, 124, 0, 0.1), transparent);
1532
  }
1533
 
1534
- .rank-cell {
1535
- font-weight: 700;
1536
- text-align: center;
1537
- }
1538
-
1539
- .name-cell strong {
1540
- display: block;
1541
- }
1542
-
1543
- .org-small {
1544
- font-size: 0.75rem;
1545
- color: var(--text-secondary);
1546
- }
1547
-
1548
- .metric-cell {
1549
- text-align: center;
1550
  }
1551
 
1552
- .metric-cell.highlight {
1553
- font-weight: 700;
1554
- color: var(--accent-light);
1555
- }
1556
-
1557
- .comparison-legend {
1558
- display: flex;
1559
- gap: 1.5rem;
1560
- margin-top: 1rem;
1561
- padding-top: 1rem;
1562
- border-top: 1px solid var(--border);
1563
- }
1564
-
1565
- .legend-item {
1566
- display: flex;
1567
- align-items: center;
1568
- gap: 0.5rem;
1569
- font-size: 0.85rem;
1570
- color: var(--text-secondary);
1571
- }
1572
-
1573
- .legend-color {
1574
- width: 16px;
1575
- height: 16px;
1576
- border-radius: 4px;
1577
- }
1578
-
1579
- .legend-color.rank-1 { background: var(--success); }
1580
- .legend-color.rank-2 { background: var(--primary-light); }
1581
- .legend-color.rank-3 { background: var(--warning); }
1582
-
1583
- /* ===== DECODED SECTION ===== */
1584
- .decoded-section {
1585
- background: var(--bg-card);
1586
- border: 1px solid var(--border);
1587
- border-radius: 12px;
1588
- padding: 1.5rem;
1589
- }
1590
-
1591
- .decoded-section h4 {
1592
- color: var(--text-primary);
1593
- margin-bottom: 1rem;
1594
  }
1595
 
1596
- .decoded-text {
1597
- background: var(--bg-elevated);
1598
- padding: 1rem;
1599
- border-radius: 8px;
1600
- font-family: 'IBM Plex Sans Arabic', serif;
1601
- font-size: 1.1rem;
1602
- line-height: 1.8;
1603
- color: var(--text-primary);
1604
  }
1605
 
1606
- .decoded-meta {
1607
- margin-top: 1rem;
1608
- font-size: 0.85rem;
1609
- color: var(--text-secondary);
1610
  }
1611
 
1612
  /* ===== UTILITY CLASSES ===== */
@@ -1661,14 +1803,14 @@ def create_interface():
1661
 
1662
  available_tokenizers = tokenizer_manager.get_tokenizer_choices()
1663
 
1664
- # Group tokenizers by type for better organization
1665
- arabic_specific = [t for t in available_tokenizers if any(x in t for x in ['AraBERT', 'CAMeL', 'MARBERT', 'ARBERT'])]
1666
- arabic_llms = [t for t in available_tokenizers if any(x in t for x in ['Jais', 'AceGPT'])]
1667
  multilingual = [t for t in available_tokenizers if t not in arabic_specific and t not in arabic_llms]
1668
 
1669
  with gr.Blocks(css=CUSTOM_CSS, title="Arabic Tokenizer Arena Pro", theme=gr.themes.Base(
1670
- primary_hue="blue",
1671
- secondary_hue="purple",
1672
  neutral_hue="slate",
1673
  font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
1674
  )) as demo:
@@ -1715,7 +1857,6 @@ def create_interface():
1715
  tokens_output = gr.HTML(label="Token Visualization")
1716
  decoded_output = gr.HTML(label="Decoded Output")
1717
 
1718
- # Event handlers
1719
  sample_dropdown.change(
1720
  lambda x: SAMPLE_TEXTS.get(x, ""),
1721
  inputs=[sample_dropdown],
@@ -1768,7 +1909,70 @@ def create_interface():
1768
  outputs=[comparison_output]
1769
  )
1770
 
1771
- # ===== TAB 3: Metrics Reference =====
1772
  with gr.TabItem("πŸ“– Metrics Guide", id="guide"):
1773
  gr.Markdown("""
1774
  ## Tokenization Evaluation Metrics Guide
@@ -1803,18 +2007,9 @@ def create_interface():
1803
  - *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
1804
  - *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
1805
  - *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
1806
-
1807
- ### Tokenizer Algorithm Types
1808
-
1809
- - **BPE (Byte-Pair Encoding)**: Iteratively merges frequent character pairs
1810
- - **Byte-Level BPE**: BPE applied to UTF-8 bytes instead of characters
1811
- - **WordPiece**: Google's variant, used in BERT models
1812
- - **SentencePiece**: Language-independent, uses unigram model
1813
- - **Unigram**: Probabilistic subword model
1814
- - **Tiktoken**: OpenAI's optimized BPE implementation
1815
  """)
1816
 
1817
- # ===== TAB 4: About =====
1818
  with gr.TabItem("ℹ️ About", id="about"):
1819
  gr.Markdown(f"""
1820
  ## Arabic Tokenizer Arena Pro
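The guide's fertility comparisons can be reproduced outside the UI; a hedged sketch using two IDs from the registry above (sentence illustrative; gated models would additionally need HF_TOKEN):

```python
# Compare fertility for two registry tokenizers on a single sentence.
from transformers import AutoTokenizer

sentence = "يقيس هذا المعيار كفاءة المجزئ على النص العربي"
for model_id in ["riotu-lab/Aranizer-PBE-86k", "xlm-roberta-base"]:
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    n_tokens = len(tok.encode(sentence, add_special_tokens=False))
    print(f"{model_id}: fertility = {n_tokens / len(sentence.split()):.2f}")
```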
@@ -1824,13 +2019,13 @@ def create_interface():
1824
  ### Available Tokenizers: {len(available_tokenizers)}
1825
 
1826
  **Arabic-Specific Models:**
1827
- {chr(10).join(['- ' + t for t in arabic_specific])}
1828
 
1829
  **Arabic LLMs:**
1830
- {chr(10).join(['- ' + t for t in arabic_llms])}
1831
 
1832
  **Multilingual LLMs:**
1833
- {chr(10).join(['- ' + t for t in multilingual])}
1834
 
1835
  ### Features
1836
 
@@ -1838,6 +2033,7 @@ def create_interface():
1838
  ✅ Arabic-specific analysis (dialect support, diacritic preservation)
1839
  ✅ Side-by-side tokenizer comparison
1840
  ✅ Beautiful token visualization

1841
  ✅ Support for MSA, dialectal Arabic, and Classical Arabic
1842
  ✅ Research-backed evaluation methodology
1843
 
@@ -1861,4 +2057,4 @@ def create_interface():
1861
 
1862
  if __name__ == "__main__":
1863
  demo = create_interface()
1864
- demo.launch(share=True)
 
4
  A comprehensive research and production-grade tool for evaluating Arabic tokenizers
5
  across multiple dimensions: efficiency, coverage, morphological awareness, and more.
6
 
7
+ Now with LEADERBOARD - imports real Arabic datasets from HuggingFace!
8
+
9
  Supports:
10
  - Arabic-specific tokenizers (Aranizer, AraBERT, CAMeLBERT, MARBERT, etc.)
11
  - Major LLM tokenizers (Jais, AceGPT, Falcon-Arabic, ALLaM, Qwen, Llama, Mistral, GPT)
12
  - Comprehensive evaluation metrics based on latest research
13
+ - Real dataset benchmarking from HuggingFace
14
  """
15
 
16
  import gradio as gr
 
21
  from typing import Dict, List, Tuple, Optional, Any
22
  from dataclasses import dataclass, field
23
  from enum import Enum
24
+ from collections import defaultdict
25
+ import statistics
26
  import os
27
 
28
  # Hugging Face authentication
 
35
  from transformers import AutoTokenizer, logging
36
  logging.set_verbosity_error()
37
 
38
+ # Import datasets library for leaderboard
39
+ from datasets import load_dataset
40
+
41
  # ============================================================================
42
  # DATA CLASSES AND ENUMS
43
  # ============================================================================
 
212
  dialect_support=["MSA"],
213
  special_features=["100K vocabulary", "MSA focused"]
214
  ),
215
 
216
  # ========== ARABIC-SPECIFIC TOKENIZERS ==========
217
  "riotu-lab/Aranizer-PBE-86k": TokenizerInfo(
 
238
  dialect_support=["MSA"],
239
  special_features=["Low fertility", "SentencePiece", "86K vocab"]
240
  ),
241
 
242
  # ========== ARABIC-SPECIFIC LLMs ==========
243
  "ALLaM-AI/ALLaM-7B-Instruct-preview": TokenizerInfo(
 
288
  dialect_support=["MSA"],
289
  special_features=["LLaMA-based", "Cultural alignment", "RLHF", "Chat"]
290
  ),
291
  "silma-ai/SILMA-9B-Instruct-v1.0": TokenizerInfo(
292
  name="SILMA 9B Instruct",
293
  model_id="silma-ai/SILMA-9B-Instruct-v1.0",
 
300
  dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
301
  special_features=["Gemma-based", "SOTA 9B class", "Efficient"]
302
  ),
303
  "QCRI/Fanar-1-9B-Instruct": TokenizerInfo(
304
  name="Fanar 9B Instruct",
305
  model_id="QCRI/Fanar-1-9B-Instruct",
 
312
  dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
313
  special_features=["Islamic RAG", "Cultural alignment", "Gemma-based"]
314
  ),
315
  "Navid-AI/Yehia-7B-preview": TokenizerInfo(
316
  name="Yehia 7B Preview",
317
  model_id="Navid-AI/Yehia-7B-preview",
 
338
  dialect_support=["Darija", "MSA"],
339
  special_features=["Moroccan dialect", "Transliteration", "Cultural"]
340
  ),
341
 
342
  # ========== MULTILINGUAL LLMs WITH ARABIC SUPPORT ==========
343
  "Qwen/Qwen2.5-7B": TokenizerInfo(
 
352
  dialect_support=["MSA"],
353
  special_features=["152K vocab", "128K context", "30+ languages"]
354
  ),
355
  "google/gemma-2-9b": TokenizerInfo(
356
  name="Gemma 2 9B",
357
  model_id="google/gemma-2-9b",
 
364
  dialect_support=["MSA"],
365
  special_features=["256K vocab", "Efficient architecture"]
366
  ),
367
  "mistralai/Mistral-7B-v0.3": TokenizerInfo(
368
  name="Mistral 7B v0.3",
369
  model_id="mistralai/Mistral-7B-v0.3",
 
388
  dialect_support=["MSA"],
389
  special_features=["Tekken tokenizer", "131K vocab", "Multilingual optimized"]
390
  ),
391
  "xlm-roberta-base": TokenizerInfo(
392
  name="XLM-RoBERTa Base",
393
  model_id="xlm-roberta-base",
 
412
  dialect_support=["MSA"],
413
  special_features=["Baseline model", "104 languages"]
414
  ),
415
  "tiiuae/falcon-7b": TokenizerInfo(
416
  name="Falcon 7B",
417
  model_id="tiiuae/falcon-7b",
 
424
  dialect_support=["MSA"],
425
  special_features=["65K vocab", "RefinedWeb trained"]
426
  ),
427
  }
428
 
429
+ # ============================================================================
430
+ # LEADERBOARD DATASETS CONFIGURATION - Real HuggingFace Datasets
431
+ # ============================================================================
432
+
433
+ LEADERBOARD_DATASETS = {
434
+ # MSA Benchmarks
435
+ "arabic_mmlu": {
436
+ "hf_id": "MBZUAI/ArabicMMLU",
437
+ "name": "ArabicMMLU",
438
+ "category": "MSA Benchmark",
439
+ "text_column": "Question",
440
+ "split": "test",
441
+ "subset": None,
442
+ "samples": 500,
443
+ "description": "Multi-task benchmark from Arab school exams (14,575 MCQs)"
444
+ },
445
+
446
+ # Dialectal Arabic
447
+ "arsentd_lev": {
448
+ "hf_id": "ramybaly/arsentd_lev",
449
+ "name": "ArSenTD-LEV",
450
+ "category": "Levantine Dialect",
451
+ "text_column": "Tweet",
452
+ "split": "train",
453
+ "subset": None,
454
+ "samples": 500,
455
+ "description": "Levantine Arabic tweets (Jordan, Lebanon, Syria, Palestine)"
456
+ },
457
+
458
+ # Classical Arabic
459
+ "athar": {
460
+ "hf_id": "mohamed-khalil/ATHAR",
461
+ "name": "ATHAR Classical",
462
+ "category": "Classical Arabic",
463
+ "text_column": "arabic",
464
+ "split": "train",
465
+ "subset": None,
466
+ "samples": 500,
467
+ "description": "66K classical Arabic sentences with translations"
468
+ },
469
+
470
+ # Question Answering
471
+ "arcd": {
472
+ "hf_id": "arcd",
473
+ "name": "ARCD",
474
+ "category": "QA Dataset",
475
+ "text_column": "context",
476
+ "split": "train",
477
+ "subset": None,
478
+ "samples": 300,
479
+ "description": "Arabic Reading Comprehension Dataset (1,395 questions)"
480
+ },
481
+
482
+ # Poetry
483
+ "ashaar": {
484
+ "hf_id": "arbml/Ashaar_dataset",
485
+ "name": "Ashaar Poetry",
486
+ "category": "Poetry",
487
+ "text_column": "poem_text",
488
+ "split": "train",
489
+ "subset": None,
490
+ "samples": 500,
491
+ "description": "2M+ Arabic poetry verses with meter and theme labels"
492
+ },
493
+
494
+ # Religious - Hadith
495
+ "hadith": {
496
+ "hf_id": "gurgutan/sunnah_ar_en_dataset",
497
+ "name": "Hadith Collection",
498
+ "category": "Religious",
499
+ "text_column": "hadith_text_ar",
500
+ "split": "train",
501
+ "subset": None,
502
+ "samples": 400,
503
+ "description": "50,762 hadiths from 14 authentic books"
504
+ },
505
+
506
+ # Social Media
507
+ "arabic_sentiment": {
508
+ "hf_id": "arbml/Arabic_Sentiment_Twitter_Corpus",
509
+ "name": "Arabic Sentiment",
510
+ "category": "Social Media",
511
+ "text_column": "text",
512
+ "split": "train",
513
+ "subset": None,
514
+ "samples": 500,
515
+ "description": "Arabic Twitter sentiment corpus"
516
+ },
517
+
518
+ # News
519
+ "sanad": {
520
+ "hf_id": "arbml/SANAD",
521
+ "name": "SANAD News",
522
+ "category": "News",
523
+ "text_column": "text",
524
+ "split": "train",
525
+ "subset": "alarabiya",
526
+ "samples": 400,
527
+ "description": "Arabic news articles from Al Arabiya"
528
+ },
529
+ }
530
 
531
  # ============================================================================
532
  # TOKENIZER LOADER AND CACHE
 
554
  except Exception as e:
555
  print(f" βœ— {info.name}: {str(e)[:50]}")
556
 
557
  print(f"\nTotal available tokenizers: {len(self._available)}")
558
 
559
  def get_tokenizer(self, model_id: str):
 
608
  diacritics = set('Ω‹ΩŒΩΩŽΩΩΩ‘Ω’Ω°')
609
  return any(c in diacritics for c in text)
610
 
 
611
  def get_arabic_words(text: str) -> List[str]:
612
  """Extract Arabic words from text"""
 
613
  words = text.split()
614
  return [w for w in words if any(is_arabic_char(c) for c in w)]
615
 
 
703
  decoded_text=decoded
704
  )
705
 
706
+ # ============================================================================
707
+ # LEADERBOARD FUNCTIONS - Import Real Datasets from HuggingFace
708
+ # ============================================================================
709
+
710
+ class HFDatasetLoader:
711
+ """Load Arabic datasets from HuggingFace"""
712
+
713
+ def __init__(self):
714
+ self.cache = {}
715
+
716
+ def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
717
+ """Load texts from a HuggingFace dataset"""
718
+
719
+ if dataset_key in self.cache:
720
+ return self.cache[dataset_key], f"βœ… Loaded {len(self.cache[dataset_key])} samples (cached)"
721
+
722
+ config = LEADERBOARD_DATASETS.get(dataset_key)
723
+ if not config:
724
+ return [], f"❌ Unknown dataset: {dataset_key}"
725
+
726
+ try:
727
+ # Load dataset from HuggingFace
728
+ if config.get("subset"):
729
+ ds = load_dataset(
730
+ config["hf_id"],
731
+ config["subset"],
732
+ split=config["split"],
733
+ trust_remote_code=True
734
+ )
735
+ else:
736
+ ds = load_dataset(
737
+ config["hf_id"],
738
+ split=config["split"],
739
+ trust_remote_code=True
740
+ )
741
+
742
+ texts = []
743
+ text_col = config["text_column"]
744
+
745
+ # Try to find text column
746
+ if text_col not in ds.column_names:
747
+ for col in ["text", "content", "sentence", "arabic", "context", "Tweet", "question", "poem_text", "hadith_text_ar"]:
748
+ if col in ds.column_names:
749
+ text_col = col
750
+ break
751
+
752
+ # Extract texts
753
+ max_samples = config.get("samples", 500)
754
+ for i, item in enumerate(ds):
755
+ if i >= max_samples:
756
+ break
757
+ text = item.get(text_col, "")
758
+ if text and isinstance(text, str) and len(text.strip()) > 10:
759
+ texts.append(text.strip())
760
+
761
+ self.cache[dataset_key] = texts
762
+ return texts, f"βœ… Loaded {len(texts)} samples from HuggingFace"
763
+
764
+ except Exception as e:
765
+ return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"
766
+
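Typical usage of the loader, showing the cache short-circuit on the second call (dataset key taken from LEADERBOARD_DATASETS above; sample counts depend on what the Hub returns):

```python
# First call hits the Hub; the second is served from the in-memory cache.
loader = HFDatasetLoader()
texts, status = loader.load_dataset_texts("athar")
print(status)                  # "✅ Loaded ... samples from HuggingFace"
texts_again, status = loader.load_dataset_texts("athar")
print(status)                  # "✅ Loaded ... samples (cached)"
assert texts_again is texts    # same cached list object
```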
767
+ def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
768
+ """Evaluate a tokenizer on a list of texts"""
769
+
770
+ fertilities = []
771
+ compressions = []
772
+ unk_counts = 0
773
+ total_tokens = 0
774
+
775
+ for text in texts:
776
+ try:
777
+ tokens = tokenizer.encode(text, add_special_tokens=False)
778
+ decoded = tokenizer.convert_ids_to_tokens(tokens)
779
+
780
+ num_tokens = len(tokens)
781
+ num_words = len(text.split()) or 1
782
+ num_bytes = len(text.encode('utf-8'))
783
+
784
+ fertility = num_tokens / num_words
785
+ compression = num_bytes / num_tokens if num_tokens > 0 else 0
786
+
787
+ # Count UNKs
788
+ unk_token = getattr(tokenizer, 'unk_token', '[UNK]')
789
+ unks = sum(1 for t in decoded if t and (t == unk_token or '<unk>' in str(t).lower() or '[unk]' in str(t).lower()))
790
+
791
+ fertilities.append(fertility)
792
+ compressions.append(compression)
793
+ unk_counts += unks
794
+ total_tokens += num_tokens
795
+
796
+ except Exception:
797
+ continue
798
+
799
+ if not fertilities:
800
+ return None
801
+
802
+ return {
803
+ "avg_fertility": statistics.mean(fertilities),
804
+ "std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
805
+ "avg_compression": statistics.mean(compressions),
806
+ "unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
807
+ "samples": len(fertilities)
808
+ }
809
+
810
+ def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
811
+ """Calculate overall score (0-100, higher is better)"""
812
+ # Lower fertility is better (ideal ~1.0 for Arabic)
813
+ fertility_score = max(0, min(1, 2.0 / fertility)) if fertility > 0 else 0
814
+ # Higher compression is better
815
+ compression_score = min(1, compression / 6)
816
+ # Lower UNK is better
817
+ unk_score = 1 - min(1, unk_ratio * 20)
818
+
819
+ # Weighted combination
820
+ score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
821
+ return round(score, 1)
822
+
823
+ def run_leaderboard_evaluation(
824
+ selected_datasets: List[str],
825
+ selected_tokenizers: List[str],
826
+ progress=gr.Progress()
827
+ ) -> Tuple[str, str, str]:
828
+ """
829
+ Run the full leaderboard evaluation with real HF datasets
830
+ Returns: (leaderboard_html, per_dataset_html, status_message)
831
+ """
832
+
833
+ if not selected_datasets:
834
+ return "", "", "⚠️ Please select at least one dataset"
835
+
836
+ if not selected_tokenizers:
837
+ return "", "", "⚠️ Please select at least one tokenizer"
838
+
839
+ loader = HFDatasetLoader()
840
+ results = defaultdict(dict)
841
+
842
+ # Status tracking
843
+ status_lines = []
844
+
845
+ # Load datasets from HuggingFace
846
+ status_lines.append("πŸ“š **Loading Datasets from HuggingFace:**\n")
847
+ loaded_datasets = {}
848
+
849
+ for i, ds_key in enumerate(selected_datasets):
850
+ progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
851
+ texts, msg = loader.load_dataset_texts(ds_key)
852
+ ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
853
+ status_lines.append(f" β€’ {ds_name}: {msg}")
854
+ if texts:
855
+ loaded_datasets[ds_key] = texts
856
+
857
+ if not loaded_datasets:
858
+ return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"
859
+
860
+ # Evaluate tokenizers
861
+ status_lines.append("\nπŸ”„ **Evaluating Tokenizers:**\n")
862
+
863
+ tokenizer_cache = {}
864
+ total_steps = len(selected_tokenizers) * len(loaded_datasets)
865
+ current_step = 0
866
+
867
+ for tok_choice in selected_tokenizers:
868
+ # Get model ID from choice
869
+ tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
870
+ tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
871
+ tok_name = tok_info.name if tok_info else tok_choice
872
+
873
+ # Load tokenizer
874
+ try:
875
+ if tok_id not in tokenizer_cache:
876
+ tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
877
+ tok_id, trust_remote_code=True
878
+ )
879
+ tokenizer = tokenizer_cache[tok_id]
880
+ status_lines.append(f" β€’ {tok_name}: βœ… Loaded")
881
+ except Exception as e:
882
+ status_lines.append(f" β€’ {tok_name}: ❌ Failed ({str(e)[:30]})")
883
+ continue
884
+
885
+ # Evaluate on each dataset
886
+ for ds_key, texts in loaded_datasets.items():
887
+ current_step += 1
888
+ progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
889
+
890
+ metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
891
+ if metrics:
892
+ results[tok_choice][ds_key] = metrics
893
+
894
+ # Generate leaderboard
895
+ progress(0.95, "Generating leaderboard...")
896
+
897
+ leaderboard_data = []
898
+ per_dataset_data = []
899
+
900
+ for tok_choice, ds_results in results.items():
901
+ if not ds_results:
902
+ continue
903
+
904
+ tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
905
+ tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
906
+
907
+ # Aggregate across datasets
908
+ all_fertility = [m["avg_fertility"] for m in ds_results.values()]
909
+ all_compression = [m["avg_compression"] for m in ds_results.values()]
910
+ all_unk = [m["unk_ratio"] for m in ds_results.values()]
911
+
912
+ avg_fertility = statistics.mean(all_fertility)
913
+ avg_compression = statistics.mean(all_compression)
914
+ avg_unk = statistics.mean(all_unk)
915
+
916
+ score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)
917
+
918
+ leaderboard_data.append({
919
+ "name": tok_info.name if tok_info else tok_choice,
920
+ "type": tok_info.type.value if tok_info else "Unknown",
921
+ "org": tok_info.organization if tok_info else "Unknown",
922
+ "score": score,
923
+ "fertility": avg_fertility,
924
+ "compression": avg_compression,
925
+ "unk_ratio": avg_unk,
926
+ "num_datasets": len(ds_results)
927
+ })
928
+
929
+ # Per-dataset row
930
+ per_ds_row = {"Tokenizer": tok_info.name if tok_info else tok_choice}
931
+ for ds_key in selected_datasets:
932
+ ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
933
+ if ds_key in ds_results:
934
+ per_ds_row[ds_name] = round(ds_results[ds_key]["avg_fertility"], 2)
935
+ else:
936
+ per_ds_row[ds_name] = "-"
937
+ per_dataset_data.append(per_ds_row)
938
+
939
+ # Sort by score
940
+ leaderboard_data.sort(key=lambda x: x["score"], reverse=True)
941
+
942
+ # Create HTML tables
943
+ leaderboard_html = generate_leaderboard_html(leaderboard_data)
944
+ per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
945
+
946
+ status_lines.append(f"\nβœ… **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
947
+
948
+ return leaderboard_html, per_dataset_html, "\n".join(status_lines)
949
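+
+ # Usage sketch (hypothetical, outside the Gradio UI; the progress callback is
+ # designed for Gradio events):
+ #   lb_html, per_ds_html, status = run_leaderboard_evaluation(
+ #       ["arabic_mmlu", "arcd"],
+ #       tokenizer_manager.get_tokenizer_choices()[:3],
+ #   )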
+
+ def generate_leaderboard_html(data: List[Dict]) -> str:
+     """Generate HTML for main leaderboard"""
+
+     if not data:
+         return "<p>No results to display</p>"
+
+     html = """
+     <style>
+     .leaderboard-table {
+         width: 100%;
+         border-collapse: collapse;
+         font-family: system-ui, -apple-system, sans-serif;
+         margin: 20px 0;
+     }
+     .leaderboard-table th {
+         background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4e 100%);
+         color: white;
+         padding: 12px 8px;
+         text-align: left;
+         font-weight: 600;
+     }
+     .leaderboard-table td {
+         padding: 10px 8px;
+         border-bottom: 1px solid #e0e0e0;
+     }
+     .leaderboard-table tr:nth-child(even) {
+         background-color: #f8f9fa;
+     }
+     .leaderboard-table tr:hover {
+         background-color: #e8f5e9;
+     }
+     .rank-1 { background: linear-gradient(90deg, #ffd700 0%, #fff8dc 100%) !important; }
+     .rank-2 { background: linear-gradient(90deg, #c0c0c0 0%, #f5f5f5 100%) !important; }
+     .rank-3 { background: linear-gradient(90deg, #cd7f32 0%, #ffe4c4 100%) !important; }
+     .score-badge {
+         background: #2d8f4e;
+         color: white;
+         padding: 4px 8px;
+         border-radius: 12px;
+         font-weight: bold;
+     }
+     .type-badge {
+         background: #e3f2fd;
+         color: #1565c0;
+         padding: 2px 6px;
+         border-radius: 4px;
+         font-size: 0.85em;
+     }
+     .metric-good { color: #2e7d32; font-weight: 600; }
+     .metric-bad { color: #c62828; }
+     </style>
+
+     <table class="leaderboard-table">
+         <thead>
+             <tr>
+                 <th>Rank</th>
+                 <th>Tokenizer</th>
+                 <th>Type</th>
+                 <th>Organization</th>
+                 <th>Score ↑</th>
+                 <th>Fertility ↓</th>
+                 <th>Compression ↑</th>
+                 <th>UNK Rate ↓</th>
+                 <th>Datasets</th>
+             </tr>
+         </thead>
+         <tbody>
+     """
+
+     for i, entry in enumerate(data):
+         rank = i + 1
+         rank_class = f"rank-{rank}" if rank <= 3 else ""
+
+         # Color coding for metrics
+         fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
+         comp_class = "metric-good" if entry["compression"] > 3.5 else ""
+         unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
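+         # Heuristic bands: fertility below 2.0 tokens/word is strong for Arabic,
+         # above 3.0 weak; compression above 3.5 bytes/token is efficient;
+         # an UNK rate under 1% is healthy.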
+
+         html += f"""
+             <tr class="{rank_class}">
+                 <td><strong>#{rank}</strong></td>
+                 <td><strong>{entry["name"]}</strong></td>
+                 <td><span class="type-badge">{entry["type"]}</span></td>
+                 <td>{entry["org"]}</td>
+                 <td><span class="score-badge">{entry["score"]}</span></td>
+                 <td class="{fert_class}">{entry["fertility"]:.3f}</td>
+                 <td class="{comp_class}">{entry["compression"]:.2f}</td>
+                 <td class="{unk_class}">{entry["unk_ratio"]:.2%}</td>
+                 <td>{entry["num_datasets"]}</td>
+             </tr>
+         """
+
+     html += """
+         </tbody>
+     </table>
+
+     <div style="margin-top: 15px; padding: 10px; background: #f5f5f5; border-radius: 8px; font-size: 0.9em;">
+         <strong>📊 Metric Guide:</strong><br>
+         • <strong>Score:</strong> Overall ranking (0-100, higher = better)<br>
+         • <strong>Fertility:</strong> Tokens per word (lower = better, 1.0 ideal for Arabic)<br>
+         • <strong>Compression:</strong> Bytes per token (higher = more efficient)<br>
+         • <strong>UNK Rate:</strong> Unknown token percentage (lower = better)
+     </div>
+     """
+
+     return html
+
+ def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
+     """Generate HTML for per-dataset fertility table"""
+
+     if not data:
+         return "<p>No per-dataset results</p>"
+
+     ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
+
+     html = """
+     <style>
+     .dataset-table {
+         width: 100%;
+         border-collapse: collapse;
+         font-family: system-ui, -apple-system, sans-serif;
+         margin: 20px 0;
+         font-size: 0.9em;
+     }
+     .dataset-table th {
+         background: #37474f;
+         color: white;
+         padding: 10px 6px;
+         text-align: center;
+     }
+     .dataset-table th:first-child {
+         text-align: left;
+     }
+     .dataset-table td {
+         padding: 8px 6px;
+         text-align: center;
+         border-bottom: 1px solid #e0e0e0;
+     }
+     .dataset-table td:first-child {
+         text-align: left;
+         font-weight: 500;
+     }
+     .dataset-table tr:nth-child(even) {
+         background-color: #fafafa;
+     }
+     .fert-excellent { background: #c8e6c9; color: #1b5e20; font-weight: 600; }
+     .fert-good { background: #fff9c4; color: #f57f17; }
+     .fert-poor { background: #ffcdd2; color: #b71c1c; }
+     </style>
+
+     <h4>📈 Fertility per Dataset (tokens/word - lower is better)</h4>
+     <table class="dataset-table">
+         <thead>
+             <tr>
+                 <th>Tokenizer</th>
+     """
+
+     for ds_name in ds_names:
+         html += f"<th>{ds_name}</th>"
+
+     html += """
+             </tr>
+         </thead>
+         <tbody>
+     """
+
+     for row in data:
+         html += f"<tr><td>{row['Tokenizer']}</td>"
+         for ds_name in ds_names:
+             val = row.get(ds_name, "-")
+             if val != "-":
+                 if val < 1.8:
+                     cls = "fert-excellent"
+                 elif val < 2.5:
+                     cls = "fert-good"
+                 else:
+                     cls = "fert-poor"
+                 html += f'<td class="{cls}">{val}</td>'
+             else:
+                 html += '<td>-</td>'
+         html += "</tr>"
+
+     html += """
+         </tbody>
+     </table>
+     """
+
+     return html
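+
+ # Note: these per-dataset bands (< 1.8 excellent, < 2.5 good) are slightly
+ # stricter than the main leaderboard's fertility color thresholds (2.0 / 3.0).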
+
  # ============================================================================
  # UI GENERATION FUNCTIONS
  # ============================================================================
 
  def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
      """Generate beautiful HTML visualization of tokens"""
 
      colors = [
+         ('#1a1a2e', '#eaeaea'),
          ('#16213e', '#f0f0f0'),
          ('#0f3460', '#ffffff'),
          ('#533483', '#f5f5f5'),
  ...
      html_parts = []
      for i, (token, tid) in enumerate(zip(tokens, token_ids)):
          bg, fg = colors[i % len(colors)]
 
          display_token = token.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
 
          is_arabic = any(is_arabic_char(c) for c in token)
          direction = 'rtl' if is_arabic else 'ltr'
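          # Per-token direction: Arabic subwords render right-to-left while
          # Latin tokens stay left-to-right inside the RTL token container.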
 
  ...
  def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
      """Generate metrics visualization card"""
 
      fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor"
      strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor"
      compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor"
  ...
          <div class="metric-card {strr_quality}">
              <div class="metric-icon">✨</div>
              <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
+             <div class="metric-label">STRR (Single Token Retention)</div>
              <div class="metric-hint">Higher is better</div>
          </div>
 
          <div class="metric-card">
+             <div class="metric-icon">🔤</div>
              <div class="metric-value">{metrics.char_per_token:.2f}</div>
              <div class="metric-label">Characters/Token</div>
          </div>
 
+         <div class="metric-card {'excellent' if metrics.oov_percentage == 0 else 'poor' if metrics.oov_percentage > 5 else 'good'}">
+             <div class="metric-icon">❓</div>
+             <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
+             <div class="metric-label">OOV Rate</div>
+             <div class="metric-hint">Lower is better (0% ideal)</div>
          </div>
 
+         <div class="metric-card">
+             <div class="metric-icon">🌍</div>
              <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
              <div class="metric-label">Arabic Fertility</div>
          </div>
 
          <div class="metric-card">
+             <div class="metric-icon">⚡</div>
+             <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
+             <div class="metric-label">Processing Time</div>
          </div>
      </div>
      '''
 
  def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
      """Generate tokenizer information card"""
 
+     dialect_badges = ''.join([f'<span class="badge dialect">{d}</span>' for d in info.dialect_support])
+     feature_badges = ''.join([f'<span class="badge feature">{f}</span>' for f in info.special_features])
 
+     support_class = "native" if info.arabic_support == "Native" else "supported" if info.arabic_support == "Supported" else "limited"
 
      return f'''
+     <div class="info-card">
+         <div class="info-header">
              <h3>{info.name}</h3>
              <span class="org-badge">{info.organization}</span>
          </div>
+
+         <p class="description">{info.description}</p>
+
+         <div class="info-grid">
+             <div class="info-item">
+                 <span class="info-label">Type:</span>
+                 <span class="info-value">{info.type.value}</span>
              </div>
+             <div class="info-item">
+                 <span class="info-label">Algorithm:</span>
+                 <span class="info-value">{info.algorithm.value}</span>
              </div>
+             <div class="info-item">
+                 <span class="info-label">Vocab Size:</span>
+                 <span class="info-value">{info.vocab_size:,}</span>
              </div>
+             <div class="info-item">
+                 <span class="info-label">Arabic Support:</span>
+                 <span class="info-value support-{support_class}">{info.arabic_support}</span>
              </div>
          </div>
+
+         <div class="badge-container">
              <div class="badge-group">
                  <span class="badge-label">Dialects:</span>
                  {dialect_badges}
  ...
      </div>
      '''
 
  def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
+     """Analyze a single tokenizer"""
 
+     if not text or not text.strip():
          return (
+             '<div class="warning">⚠️ Please enter some text to analyze</div>',
+             '', '', ''
+         )
+
+     if not tokenizer_choice:
+         return (
+             '<div class="warning">⚠️ Please select a tokenizer</div>',
+             '', '', ''
          )
 
      model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
+     tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
+
+     if not tokenizer_info:
+         return (
+             '<div class="error-card"><h4>Error</h4><p>Tokenizer not found</p></div>',
+             '', '', ''
+         )
 
      try:
+         metrics = analyze_tokenization(text, model_id, tokenizer_info)
 
+         info_html = generate_tokenizer_info_card(tokenizer_info)
+         metrics_html = generate_metrics_card(metrics, tokenizer_info)
          tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)
 
          decoded_html = f'''
          <div class="decoded-section">
              <h4>Decoded Output</h4>
              <div class="decoded-text" dir="auto">{metrics.decoded_text}</div>
              <div class="decoded-meta">
+                 Diacritics preserved: {'✅ Yes' if metrics.diacritic_preservation else '❌ No'}
              </div>
          </div>
          '''
 
          return info_html, metrics_html, tokens_html, decoded_html
 
      except Exception as e:
+         return (
+             f'<div class="error-card"><h4>Error</h4><p>{str(e)}</p></div>',
+             '', '', ''
+         )
 
  def compare_tokenizers(tokenizer_choices: List[str], text: str) -> str:
+     """Compare multiple tokenizers"""
 
+     if not text or not text.strip():
+         return '<div class="warning">⚠️ Please enter some text to analyze</div>'
 
      if not tokenizer_choices or len(tokenizer_choices) < 2:
+         return '<div class="warning">⚠️ Please select at least 2 tokenizers to compare</div>'
 
      results = []
 
      for choice in tokenizer_choices:
          model_id = tokenizer_manager.get_model_id_from_choice(choice)
+         tokenizer_info = tokenizer_manager.get_available_tokenizers().get(model_id)
 
+         if tokenizer_info:
+             try:
+                 metrics = analyze_tokenization(text, model_id, tokenizer_info)
+                 results.append({
+                     'name': tokenizer_info.name,
+                     'org': tokenizer_info.organization,
+                     'type': tokenizer_info.type.value,
+                     'metrics': metrics
+                 })
+             except Exception as e:
+                 results.append({
+                     'name': tokenizer_info.name,
+                     'org': tokenizer_info.organization,
+                     'type': tokenizer_info.type.value,
+                     'error': str(e)
+                 })
 
+     # Sort by fertility (lower is better)
+     results.sort(key=lambda x: x.get('metrics', TokenizationMetrics(
+         total_tokens=0, total_words=0, total_characters=0, total_bytes=0,
+         fertility=999, compression_ratio=0, char_per_token=0,
+         oov_count=0, oov_percentage=0, single_token_words=0,
+         single_token_retention_rate=0, avg_subwords_per_word=0,
+         max_subwords_per_word=0, continued_words_ratio=0,
+         arabic_char_count=0, arabic_token_count=0, arabic_fertility=0,
+         diacritic_preservation=False, tokenization_time_ms=0
+     )).fertility)
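+     # The placeholder TokenizationMetrics above (fertility=999) makes tokenizers
+     # that raised an error sort to the bottom of the comparison table.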
 
      # Generate comparison table
+     html = '''
      <div class="comparison-container">
          <table class="comparison-table">
              <thead>
                  <tr>
+                     <th>Rank</th>
                      <th>Tokenizer</th>
+                     <th>Type</th>
                      <th>Tokens</th>
                      <th>Fertility ↓</th>
+                     <th>Compression ↑</th>
+                     <th>STRR ↑</th>
                      <th>OOV %</th>
                  </tr>
              </thead>
              <tbody>
+     '''
+
+     for i, result in enumerate(results):
+         rank = i + 1
+         rank_class = 'rank-1' if rank == 1 else 'rank-2' if rank == 2 else 'rank-3' if rank == 3 else ''
+
+         if 'error' in result:
+             html += f'''
+                 <tr class="{rank_class}">
+                     <td>#{rank}</td>
+                     <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
+                     <td>{result['type']}</td>
+                     <td colspan="5" class="error">Error: {result['error']}</td>
+                 </tr>
+             '''
+         else:
+             m = result['metrics']
+             fertility_class = 'excellent' if m.fertility < 1.5 else 'good' if m.fertility < 2.5 else 'poor'
+
+             html += f'''
+                 <tr class="{rank_class}">
+                     <td><strong>#{rank}</strong></td>
+                     <td><strong>{result['name']}</strong><br><small>{result['org']}</small></td>
+                     <td>{result['type']}</td>
+                     <td>{m.total_tokens}</td>
+                     <td class="{fertility_class}">{m.fertility:.3f}</td>
+                     <td>{m.compression_ratio:.2f}</td>
+                     <td>{m.single_token_retention_rate:.1%}</td>
+                     <td>{m.oov_percentage:.1f}%</td>
+                 </tr>
+             '''
+
+     html += '''
              </tbody>
          </table>
      </div>
      '''
+
+     return html
 
  # ============================================================================
+ # CUSTOM CSS
  # ============================================================================
 
  CUSTOM_CSS = """
+ /* ===== ROOT VARIABLES ===== */
  :root {
+     --primary: #1a5f2a;
+     --primary-light: #2d8f4e;
+     --secondary: #4a90d9;
+     --accent: #f59e0b;
+     --success: #10b981;
      --warning: #f57c00;
      --error: #c62828;
+     --bg-primary: #0f1419;
+     --bg-secondary: #1c2128;
+     --bg-card: #22272e;
+     --text-primary: #e6edf3;
+     --text-secondary: #8b949e;
+     --border: #30363d;
  }
 
+ /* ===== HEADER ===== */
  .header-section {
      text-align: center;
+     padding: 2rem 1rem;
+     background: linear-gradient(135deg, var(--primary) 0%, var(--primary-light) 100%);
      border-radius: 16px;
+     margin-bottom: 1.5rem;
  }
 
  .header-section h1 {
      font-size: 2.5rem;
      color: white;
      margin-bottom: 0.5rem;
  }
 
  .header-section p {
      font-size: 1.1rem;
  }
 
+ /* ===== INFO CARD ===== */
+ .info-card {
      background: var(--bg-card);
      border-radius: 12px;
+     padding: 1.5rem;
      border: 1px solid var(--border);
  }
 
+ .info-header {
+     display: flex;
+     justify-content: space-between;
      align-items: center;
+     margin-bottom: 1rem;
  }
 
+ .info-header h3 {
+     color: var(--text-primary);
+     margin: 0;
  }
 
+ .org-badge {
+     background: var(--primary);
+     color: white;
+     padding: 0.25rem 0.75rem;
+     border-radius: 20px;
+     font-size: 0.85rem;
+ }
+
+ .description {
+     color: var(--text-secondary);
+     line-height: 1.6;
+ }
+
+ .info-grid {
+     display: grid;
+     grid-template-columns: repeat(2, 1fr);
+     gap: 1rem;
+     margin: 1rem 0;
+ }
+
+ .info-item {
+     display: flex;
+     flex-direction: column;
+ }
+
+ .info-label {
+     color: var(--text-secondary);
+     font-size: 0.85rem;
+ }
+
+ .info-value {
+     color: var(--text-primary);
+     font-weight: 600;
+ }
+
+ .support-native { color: var(--success); }
+ .support-supported { color: var(--secondary); }
+ .support-limited { color: var(--warning); }
+
+ /* ===== BADGES ===== */
+ .badge-container {
+     margin-top: 1rem;
+ }
+
+ .badge-group {
+     margin-bottom: 0.5rem;
+ }
+
+ .badge-label {
+     color: var(--text-secondary);
+     font-size: 0.85rem;
+     margin-right: 0.5rem;
+ }
+
+ .badge {
+     display: inline-block;
+     padding: 0.2rem 0.5rem;
+     border-radius: 4px;
+     font-size: 0.75rem;
+     margin-right: 0.25rem;
+     margin-bottom: 0.25rem;
+ }
+
+ .badge.dialect {
+     background: rgba(74, 144, 217, 0.2);
+     color: var(--secondary);
+ }
+
+ .badge.feature {
+     background: rgba(245, 158, 11, 0.2);
+     color: var(--accent);
  }
 
  /* ===== METRICS GRID ===== */
  .metrics-grid {
      display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
      gap: 1rem;
+     margin: 1rem 0;
  }
 
  .metric-card {
      background: var(--bg-card);
      border-radius: 12px;
+     padding: 1rem;
      text-align: center;
+     border: 1px solid var(--border);
+     transition: transform 0.2s;
  }
 
  .metric-card:hover {
+     transform: translateY(-2px);
  }
 
  .metric-card.excellent {
      border-color: var(--success);
+     background: linear-gradient(to bottom, rgba(16, 185, 129, 0.1), transparent);
  }
 
  .metric-card.good {
+     border-color: var(--secondary);
+     background: linear-gradient(to bottom, rgba(74, 144, 217, 0.1), transparent);
  }
 
  .metric-card.poor {
+     border-color: var(--error);
+     background: linear-gradient(to bottom, rgba(198, 40, 40, 0.1), transparent);
  }
 
  .metric-card.primary {
+     border-color: var(--primary);
+     background: linear-gradient(to bottom, rgba(26, 95, 42, 0.1), transparent);
  }
 
  .metric-icon {
  ...
  }
 
  .metric-value {
+     font-size: 1.5rem;
      font-weight: 700;
      color: var(--text-primary);
  }
 
  .metric-label {
+     font-size: 0.8rem;
      color: var(--text-secondary);
+     margin-top: 0.25rem;
  }
 
  .metric-hint {
  ...
      opacity: 0.7;
  }
 
+ /* ===== TOKEN VISUALIZATION ===== */
+ .token-container {
+     display: flex;
+     flex-wrap: wrap;
+     gap: 0.5rem;
+     padding: 1rem;
+     background: var(--bg-secondary);
      border-radius: 12px;
+     direction: rtl;
  }
 
+ .token {
+     display: inline-flex;
+     flex-direction: column;
      align-items: center;
+     padding: 0.5rem 0.75rem;
+     border-radius: 8px;
+     font-family: 'IBM Plex Sans Arabic', monospace;
+     font-size: 1rem;
+     transition: transform 0.2s;
+     cursor: default;
  }
 
+ .token:hover {
+     transform: scale(1.05);
  }
 
+ .token-id {
+     font-size: 0.65rem;
+     opacity: 0.7;
+     margin-top: 0.25rem;
  }
 
+ /* ===== DECODED SECTION ===== */
+ .decoded-section {
+     background: var(--bg-card);
+     border-radius: 12px;
+     padding: 1.5rem;
+     border: 1px solid var(--border);
  }
 
+ .decoded-section h4 {
+     color: var(--text-primary);
      margin-bottom: 1rem;
  }
 
+ .decoded-text {
+     font-family: 'IBM Plex Sans Arabic', serif;
+     font-size: 1.1rem;
+     line-height: 1.8;
      color: var(--text-primary);
  }
 
+ .decoded-meta {
+     margin-top: 1rem;
      font-size: 0.85rem;
+     color: var(--text-secondary);
  }
 
  /* ===== COMPARISON TABLE ===== */
  .comparison-container {
+     overflow-x: auto;
  }
 
  .comparison-table {
      width: 100%;
      border-collapse: collapse;
+     margin: 1rem 0;
  }
 
  .comparison-table th {
+     background: var(--primary);
+     color: white;
+     padding: 0.75rem;
      text-align: left;
+     font-weight: 600;
  }
 
  .comparison-table td {
+     padding: 0.75rem;
      border-bottom: 1px solid var(--border);
      color: var(--text-primary);
  }
 
+ .comparison-table tr:hover {
+     background: rgba(74, 144, 217, 0.1);
  }
 
+ .comparison-table .rank-1 {
+     background: linear-gradient(90deg, rgba(255, 215, 0, 0.2), transparent);
  }
 
+ .comparison-table .rank-2 {
+     background: linear-gradient(90deg, rgba(192, 192, 192, 0.2), transparent);
  }
 
+ .comparison-table .rank-3 {
+     background: linear-gradient(90deg, rgba(205, 127, 50, 0.2), transparent);
  }
 
+ .comparison-table .excellent {
+     color: var(--success);
+     font-weight: 600;
  }
 
+ .comparison-table .good {
+     color: var(--secondary);
  }
 
+ .comparison-table .poor {
+     color: var(--error);
  }
 
  /* ===== UTILITY CLASSES ===== */
  ...
 
      available_tokenizers = tokenizer_manager.get_tokenizer_choices()
 
+     # Group tokenizers by type
+     arabic_specific = [t for t in available_tokenizers if any(x in t for x in ['AraBERT', 'CAMeL', 'MARBERT', 'ARBERT', 'Aranizer'])]
+     arabic_llms = [t for t in available_tokenizers if any(x in t for x in ['Jais', 'AceGPT', 'ALLaM', 'SILMA', 'Fanar', 'Yehia', 'Atlas'])]
      multilingual = [t for t in available_tokenizers if t not in arabic_specific and t not in arabic_llms]
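+     # Tokenizers matching none of the substrings above fall into multilingual.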
 
      with gr.Blocks(css=CUSTOM_CSS, title="Arabic Tokenizer Arena Pro", theme=gr.themes.Base(
+         primary_hue="green",
+         secondary_hue="blue",
          neutral_hue="slate",
          font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
      )) as demo:
  ...
                  tokens_output = gr.HTML(label="Token Visualization")
                  decoded_output = gr.HTML(label="Decoded Output")
 
                  sample_dropdown.change(
                      lambda x: SAMPLE_TEXTS.get(x, ""),
                      inputs=[sample_dropdown],
  ...
                      outputs=[comparison_output]
                  )
 
+         # ===== TAB 3: LEADERBOARD - Real HF Datasets =====
+         with gr.TabItem("🏆 Leaderboard", id="leaderboard"):
+             gr.Markdown("""
+             ## 🏆 Arabic Tokenizer Leaderboard
+
+             Evaluate and rank tokenizers using **real Arabic datasets from HuggingFace**.
+             Select datasets and tokenizers below, then click "Run Evaluation" to generate the leaderboard.
+
+             ⚠️ **Note:** First run will download datasets from HuggingFace (may take a few minutes).
+             """)
+
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     gr.Markdown("### 📚 Select Datasets")
+                     dataset_choices = gr.CheckboxGroup(
+                         choices=[(f"{v['name']} ({v['category']})", k) for k, v in LEADERBOARD_DATASETS.items()],
+                         value=["arabic_mmlu", "arsentd_lev", "athar", "arcd"],
+                         label="HuggingFace Datasets",
+                         info="Datasets will be downloaded from HuggingFace"
+                     )
+
+                 with gr.Column(scale=1):
+                     gr.Markdown("### 🔧 Select Tokenizers")
+                     leaderboard_tokenizer_choices = gr.CheckboxGroup(
+                         choices=available_tokenizers,
+                         value=available_tokenizers[:8],
+                         label="Tokenizers to Evaluate"
+                     )
+
+             run_leaderboard_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")
+
+             status_output = gr.Markdown("Click 'Run Evaluation' to start...")
+
+             gr.Markdown("---")
+             gr.Markdown("### 📊 Leaderboard Results")
+
+             leaderboard_output = gr.HTML()
+
+             gr.Markdown("### 📈 Per-Dataset Breakdown")
+             per_dataset_output = gr.HTML()
+
+             run_leaderboard_btn.click(
+                 fn=run_leaderboard_evaluation,
+                 inputs=[dataset_choices, leaderboard_tokenizer_choices],
+                 outputs=[leaderboard_output, per_dataset_output, status_output]
+             )
+
+             gr.Markdown("""
+             ---
+             ### 📖 Dataset Sources (from HuggingFace)
+
+             | Dataset | HuggingFace ID | Category | Description |
+             |---------|----------------|----------|-------------|
+             | ArabicMMLU | `MBZUAI/ArabicMMLU` | Benchmark | Multi-task exam questions (14,575 MCQs) |
+             | ArSenTD-LEV | `ramybaly/arsentd_lev` | Dialectal | Levantine tweets |
+             | ATHAR | `mohamed-khalil/ATHAR` | Classical | 66K classical Arabic sentences |
+             | ARCD | `arcd` | QA | Arabic Reading Comprehension |
+             | Ashaar | `arbml/Ashaar_dataset` | Poetry | 2M+ Arabic poetry verses |
+             | Hadith | `gurgutan/sunnah_ar_en_dataset` | Religious | 50,762 hadiths |
+             | Arabic Sentiment | `arbml/Arabic_Sentiment_Twitter_Corpus` | Social Media | Twitter sentiment |
+             | SANAD | `arbml/SANAD` | News | Arabic news articles |
+             """)
+
+ # ===== TAB 4: Metrics Reference =====
1976
  with gr.TabItem("πŸ“– Metrics Guide", id="guide"):
1977
  gr.Markdown("""
1978
  ## Tokenization Evaluation Metrics Guide
 
2007
  - *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
2008
  - *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
2009
  - *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)
 
 
 
 
 
 
 
 
 
2010
  """)
2011
 
2012
+         # ===== TAB 5: About =====
          with gr.TabItem("ℹ️ About", id="about"):
              gr.Markdown(f"""
              ## Arabic Tokenizer Arena Pro
  ...
              ### Available Tokenizers: {len(available_tokenizers)}
 
              **Arabic-Specific Models:**
+             {chr(10).join(['- ' + t for t in arabic_specific[:10]])}
 
              **Arabic LLMs:**
+             {chr(10).join(['- ' + t for t in arabic_llms[:10]])}
 
              **Multilingual LLMs:**
+             {chr(10).join(['- ' + t for t in multilingual[:10]])}
 
              ### Features
 
              ✅ Arabic-specific analysis (dialect support, diacritic preservation)
              ✅ Side-by-side tokenizer comparison
              ✅ Beautiful token visualization
+             ✅ **NEW: Leaderboard with real HuggingFace datasets**
              ✅ Support for MSA, dialectal Arabic, and Classical Arabic
              ✅ Research-backed evaluation methodology
  ...
 
  if __name__ == "__main__":
      demo = create_interface()
+     demo.launch()