"""
Leaderboard Module
==================
Evaluate tokenizers on real HuggingFace Arabic datasets
"""
import statistics
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer
from config import LEADERBOARD_DATASETS
from tokenizer_manager import tokenizer_manager
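
# NOTE: LEADERBOARD_DATASETS is defined in config.py (not shown here). Judging by how it
# is consumed below, each entry is assumed to follow roughly this shape; the key and
# dataset ID in this sketch are hypothetical placeholders, not the real config values:
#
#     "arabic_wiki": {                       # hypothetical key
#         "name": "Arabic Wikipedia",        # display name used in the result tables
#         "hf_id": "org/dataset-id",         # HuggingFace dataset ID (placeholder)
#         "subset": None,                    # optional dataset configuration name
#         "split": "train",                  # split passed to load_dataset
#         "text_column": "text",             # column that holds the Arabic text
#         "samples": 500,                    # max samples to evaluate (defaults to 500)
#     }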
class HFDatasetLoader:
"""Load Arabic datasets from HuggingFace"""
def __init__(self):
self.cache = {}
def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
"""Load texts from a HuggingFace dataset"""
if dataset_key in self.cache:
return self.cache[dataset_key], f"✅ Loaded {len(self.cache[dataset_key])} samples (cached)"
config = LEADERBOARD_DATASETS.get(dataset_key)
if not config:
return [], f"❌ Unknown dataset: {dataset_key}"
try:
# Load dataset from HuggingFace
if config.get("subset"):
ds = load_dataset(
config["hf_id"],
config["subset"],
split=config["split"],
trust_remote_code=True
)
else:
ds = load_dataset(
config["hf_id"],
split=config["split"],
trust_remote_code=True
)
texts = []
text_col = config["text_column"]
            # Fall back to a common text column name if the configured one is missing
if text_col not in ds.column_names:
for col in ["text", "content", "sentence", "arabic", "context", "Tweet", "question", "poem_text", "hadith_text_ar"]:
if col in ds.column_names:
text_col = col
break
# Extract texts
max_samples = config.get("samples", 500)
for i, item in enumerate(ds):
if i >= max_samples:
break
text = item.get(text_col, "")
if text and isinstance(text, str) and len(text.strip()) > 10:
texts.append(text.strip())
self.cache[dataset_key] = texts
return texts, f"✅ Loaded {len(texts)} samples from HuggingFace"
except Exception as e:
return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"
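
# Usage sketch for the loader (assumes network access and that the key passed in exists in
# LEADERBOARD_DATASETS; "arabic_wiki" is a hypothetical key used for illustration only):
#
#     loader = HFDatasetLoader()
#     texts, status = loader.load_dataset_texts("arabic_wiki")
#     print(status)   # e.g. "✅ Loaded 500 samples from HuggingFace"
#     texts, status = loader.load_dataset_texts("arabic_wiki")   # second call is served from the in-memory cache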
def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
"""Evaluate a tokenizer on a list of texts"""
fertilities = []
compressions = []
unk_counts = 0
total_tokens = 0
for text in texts:
try:
tokens = tokenizer.encode(text, add_special_tokens=False)
decoded = tokenizer.convert_ids_to_tokens(tokens)
num_tokens = len(tokens)
num_words = len(text.split()) or 1
num_bytes = len(text.encode('utf-8'))
fertility = num_tokens / num_words
compression = num_bytes / num_tokens if num_tokens > 0 else 0
            # Count unknown tokens, matching both the tokenizer's unk_token and common <unk>/[UNK] spellings
unk_token = getattr(tokenizer, 'unk_token', '[UNK]')
unks = sum(1 for t in decoded if t and (t == unk_token or '<unk>' in str(t).lower() or '[unk]' in str(t).lower()))
fertilities.append(fertility)
compressions.append(compression)
unk_counts += unks
total_tokens += num_tokens
except Exception:
continue
if not fertilities:
return None
return {
"avg_fertility": statistics.mean(fertilities),
"std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
"avg_compression": statistics.mean(compressions),
"unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
"samples": len(fertilities)
}
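
# Usage sketch (assumes the AraBERT tokenizer "aubmindlab/bert-base-arabertv02" is reachable;
# any Arabic-capable tokenizer works, and the exact values depend entirely on the input texts):
#
#     tok = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
#     metrics = evaluate_tokenizer_on_texts(tok, ["هذه جملة عربية قصيرة للتجربة."])
#     # metrics -> {"avg_fertility": ..., "std_fertility": 0, "avg_compression": ...,
#     #             "unk_ratio": ..., "samples": 1}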
def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
"""Calculate overall score (0-100, higher is better)"""
    # Fertility: lower is better (ideal ~1.0 tokens/word for Arabic); 2.0 or below receives full credit
fertility_score = max(0, min(1, 2.0 / fertility)) if fertility > 0 else 0
    # Compression: higher is better; 6 bytes/token or more receives full credit
compression_score = min(1, compression / 6)
    # UNK ratio: lower is better; 5% or more unknown tokens scores zero
unk_score = 1 - min(1, unk_ratio * 20)
# Weighted combination
score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
return round(score, 1)
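
# Worked example of the scoring formula above (illustrative numbers, not real results):
#   fertility   = 1.6   -> fertility_score   = min(1, 2.0 / 1.6)       = 1.00
#   compression = 4.2   -> compression_score = min(1, 4.2 / 6)         = 0.70
#   unk_ratio   = 0.005 -> unk_score         = 1 - min(1, 0.005 * 20)  = 0.90
#   score = (1.00 * 0.45 + 0.70 * 0.35 + 0.90 * 0.20) * 100 = 87.5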
def run_leaderboard_evaluation(
selected_datasets: List[str],
selected_tokenizers: List[str],
progress=gr.Progress()
) -> Tuple[str, str, str]:
"""
Run the full leaderboard evaluation with real HF datasets
Returns: (leaderboard_html, per_dataset_html, status_message)
"""
if not selected_datasets:
return "", "", "⚠️ Please select at least one dataset"
if not selected_tokenizers:
return "", "", "⚠️ Please select at least one tokenizer"
loader = HFDatasetLoader()
results = defaultdict(dict)
# Status tracking
status_lines = []
# Load datasets from HuggingFace
status_lines.append("📚 **Loading Datasets from HuggingFace:**\n")
loaded_datasets = {}
for i, ds_key in enumerate(selected_datasets):
progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
texts, msg = loader.load_dataset_texts(ds_key)
ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
status_lines.append(f" • {ds_name}: {msg}")
if texts:
loaded_datasets[ds_key] = texts
if not loaded_datasets:
return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"
# Evaluate tokenizers
status_lines.append("\n🔄 **Evaluating Tokenizers:**\n")
tokenizer_cache = {}
total_steps = len(selected_tokenizers) * len(loaded_datasets)
current_step = 0
for tok_choice in selected_tokenizers:
# Get model ID from choice
tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
tok_name = tok_info.name if tok_info else tok_choice
# Load tokenizer
try:
if tok_id not in tokenizer_cache:
tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
tok_id, trust_remote_code=True
)
tokenizer = tokenizer_cache[tok_id]
status_lines.append(f" • {tok_name}: ✅ Loaded")
except Exception as e:
status_lines.append(f" • {tok_name}: ❌ Failed ({str(e)[:30]})")
continue
# Evaluate on each dataset
for ds_key, texts in loaded_datasets.items():
current_step += 1
progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
if metrics:
results[tok_choice][ds_key] = metrics
# Generate leaderboard
progress(0.95, "Generating leaderboard...")
leaderboard_data = []
per_dataset_data = []
for tok_choice, ds_results in results.items():
if not ds_results:
continue
tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
# Aggregate across datasets
all_fertility = [m["avg_fertility"] for m in ds_results.values()]
all_compression = [m["avg_compression"] for m in ds_results.values()]
all_unk = [m["unk_ratio"] for m in ds_results.values()]
avg_fertility = statistics.mean(all_fertility)
avg_compression = statistics.mean(all_compression)
avg_unk = statistics.mean(all_unk)
score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)
leaderboard_data.append({
"name": tok_info.name if tok_info else tok_choice,
"type": tok_info.type.value if tok_info else "Unknown",
"org": tok_info.organization if tok_info else "Unknown",
"score": score,
"fertility": avg_fertility,
"compression": avg_compression,
"unk_ratio": avg_unk,
"num_datasets": len(ds_results)
})
# Per-dataset row
per_ds_row = {"Tokenizer": tok_info.name if tok_info else tok_choice}
for ds_key in selected_datasets:
ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
if ds_key in ds_results:
per_ds_row[ds_name] = round(ds_results[ds_key]["avg_fertility"], 2)
else:
per_ds_row[ds_name] = "-"
per_dataset_data.append(per_ds_row)
# Sort by score
leaderboard_data.sort(key=lambda x: x["score"], reverse=True)
# Create HTML tables
leaderboard_html = generate_leaderboard_html(leaderboard_data)
per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
return leaderboard_html, per_dataset_html, "\n".join(status_lines)
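
# In the Gradio app this function is typically attached to a button click or triggered on
# startup. A hypothetical wiring sketch (the actual component names live in app.py):
#
#     run_btn.click(
#         fn=run_leaderboard_evaluation,
#         inputs=[dataset_checkboxes, tokenizer_checkboxes],
#         outputs=[leaderboard_html_box, per_dataset_html_box, status_markdown],
#     )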
def generate_leaderboard_html(data: List[Dict]) -> str:
"""Generate HTML for main leaderboard - clean professional design"""
if not data:
return "<p>No results to display</p>"
html = """
<style>
.leaderboard-table {
width: 100%;
border-collapse: collapse;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
font-size: 14px;
margin: 16px 0;
}
.leaderboard-table th {
background: #2c3e50;
color: #fff;
padding: 12px 10px;
text-align: left;
font-weight: 500;
border-bottom: 2px solid #1a252f;
}
.leaderboard-table td {
padding: 10px;
border-bottom: 1px solid #e9ecef;
color: #333;
}
.leaderboard-table tr:nth-child(even) {
background-color: #f8f9fa;
}
.leaderboard-table tr:hover {
background-color: #eef2f7;
}
.leaderboard-table .rank-1 td { background: #f0f7ff; }
.leaderboard-table .rank-2 td { background: #f5f5f5; }
.leaderboard-table .rank-3 td { background: #fdf8f3; }
.score-badge {
background: #2c3e50;
color: #fff;
padding: 4px 10px;
border-radius: 4px;
font-weight: 600;
font-size: 13px;
}
.type-badge {
background: #e9ecef;
color: #495057;
padding: 3px 8px;
border-radius: 3px;
font-size: 12px;
}
.metric-good { color: #198754; font-weight: 500; }
.metric-bad { color: #dc3545; font-weight: 500; }
.rank-medal { font-size: 16px; margin-right: 4px; }
</style>
<table class="leaderboard-table">
<thead>
<tr>
<th>Rank</th>
<th>Tokenizer</th>
<th>Type</th>
<th>Organization</th>
<th>Score</th>
<th>Fertility</th>
<th>Compression</th>
<th>UNK Rate</th>
<th>Datasets</th>
</tr>
</thead>
<tbody>
"""
for i, entry in enumerate(data):
rank = i + 1
rank_class = f"rank-{rank}" if rank <= 3 else ""
# Medal for top 3
if rank == 1:
rank_display = '<span class="rank-medal">🥇</span> 1'
elif rank == 2:
rank_display = '<span class="rank-medal">🥈</span> 2'
elif rank == 3:
rank_display = '<span class="rank-medal">🥉</span> 3'
else:
rank_display = f"#{rank}"
fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
comp_class = "metric-good" if entry["compression"] > 3.5 else ""
unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
html += f"""
<tr class="{rank_class}">
<td><strong>{rank_display}</strong></td>
<td><strong>{entry["name"]}</strong></td>
<td><span class="type-badge">{entry["type"]}</span></td>
<td>{entry["org"]}</td>
<td><span class="score-badge">{entry["score"]}</span></td>
<td class="{fert_class}">{entry["fertility"]:.3f}</td>
<td class="{comp_class}">{entry["compression"]:.2f}</td>
<td class="{unk_class}">{entry["unk_ratio"]:.2%}</td>
<td>{entry["num_datasets"]}</td>
</tr>
"""
html += """
</tbody>
</table>
<div style="margin-top: 12px; padding: 12px 16px; background: #f8f9fa; border-left: 3px solid #2c3e50; font-size: 13px; color: #495057;">
<strong>Metrics:</strong>
Score (0-100, higher=better) •
Fertility (tokens/word, lower=better) •
Compression (bytes/token, higher=better) •
UNK Rate (lower=better)
</div>
"""
return html
def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
"""Generate HTML for per-dataset fertility table - clean professional design"""
if not data:
return "<p>No per-dataset results</p>"
ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
html = """
<style>
.dataset-table {
width: 100%;
border-collapse: collapse;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
font-size: 13px;
margin: 16px 0;
}
.dataset-table th {
background: #495057;
color: #fff;
padding: 10px 8px;
text-align: center;
font-weight: 500;
}
.dataset-table th:first-child {
text-align: left;
}
.dataset-table td {
padding: 8px;
text-align: center;
border-bottom: 1px solid #e9ecef;
color: #333;
}
.dataset-table td:first-child {
text-align: left;
font-weight: 500;
}
.dataset-table tr:nth-child(even) {
background-color: #f8f9fa;
}
.dataset-table tr:hover {
background-color: #eef2f7;
}
.fert-excellent { background: #d4edda; color: #155724; font-weight: 500; }
.fert-good { background: #fff3cd; color: #856404; font-weight: 500; }
.fert-poor { background: #f8d7da; color: #721c24; font-weight: 500; }
</style>
<table class="dataset-table">
<thead>
<tr>
<th>Tokenizer</th>
"""
for ds_name in ds_names:
html += f"<th>{ds_name}</th>"
html += """
</tr>
</thead>
<tbody>
"""
for row in data:
html += f"<tr><td>{row['Tokenizer']}</td>"
for ds_name in ds_names:
val = row.get(ds_name, "-")
if val != "-":
if val < 1.8:
cls = "fert-excellent"
elif val < 2.5:
cls = "fert-good"
else:
cls = "fert-poor"
html += f'<td class="{cls}">{val}</td>'
else:
html += '<td>-</td>'
html += "</tr>"
html += """
</tbody>
</table>
"""
return html
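

if __name__ == "__main__":
    # Minimal offline smoke test: exercises only the pure scoring/HTML code, with no network
    # access and no Gradio event context. The numbers are illustrative, not real results.
    demo_score = calculate_leaderboard_score(fertility=1.6, compression=4.2, unk_ratio=0.005)
    demo_entry = {
        "name": "Demo Tokenizer",
        "type": "BPE",
        "org": "Example Org",
        "score": demo_score,
        "fertility": 1.6,
        "compression": 4.2,
        "unk_ratio": 0.005,
        "num_datasets": 1,
    }
    print(f"Demo score: {demo_score}")
    print(generate_leaderboard_html([demo_entry])[:300] + " ...")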