"""
Leaderboard Module
==================
Evaluate tokenizers on real HuggingFace Arabic datasets
"""
import statistics
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer
from config import LEADERBOARD_DATASETS
from tokenizer_manager import tokenizer_manager
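
# NOTE: LEADERBOARD_DATASETS is defined in config.py (not shown here). Judging by how it
# is consumed below, each entry is assumed to follow roughly this shape; the key and
# dataset ID in this sketch are hypothetical placeholders, not the real config values:
#
#     "arabic_wiki": {                       # hypothetical key
#         "name": "Arabic Wikipedia",        # display name used in the result tables
#         "hf_id": "org/dataset-id",         # HuggingFace dataset ID (placeholder)
#         "subset": None,                    # optional dataset configuration name
#         "split": "train",                  # split passed to load_dataset
#         "text_column": "text",             # column that holds the Arabic text
#         "samples": 500,                    # max samples to evaluate (defaults to 500)
#     }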
class HFDatasetLoader:
"""Load Arabic datasets from HuggingFace"""
def __init__(self):
self.cache = {}
def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
"""Load texts from a HuggingFace dataset"""
if dataset_key in self.cache:
return self.cache[dataset_key], f"✅ Loaded {len(self.cache[dataset_key])} samples (cached)"
config = LEADERBOARD_DATASETS.get(dataset_key)
if not config:
return [], f"❌ Unknown dataset: {dataset_key}"
try:
# Load dataset from HuggingFace
if config.get("subset"):
ds = load_dataset(
config["hf_id"],
config["subset"],
split=config["split"],
trust_remote_code=True
)
else:
ds = load_dataset(
config["hf_id"],
split=config["split"],
trust_remote_code=True
)
texts = []
text_col = config["text_column"]
            # Fall back to a common text column name if the configured one is missing
if text_col not in ds.column_names:
for col in ["text", "content", "sentence", "arabic", "context", "Tweet", "question", "poem_text", "hadith_text_ar"]:
if col in ds.column_names:
text_col = col
break
# Extract texts
max_samples = config.get("samples", 500)
for i, item in enumerate(ds):
if i >= max_samples:
break
text = item.get(text_col, "")
if text and isinstance(text, str) and len(text.strip()) > 10:
texts.append(text.strip())
self.cache[dataset_key] = texts
return texts, f"✅ Loaded {len(texts)} samples from HuggingFace"
except Exception as e:
return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"
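
# Usage sketch for the loader (assumes network access and that the key passed in exists in
# LEADERBOARD_DATASETS; "arabic_wiki" is a hypothetical key used for illustration only):
#
#     loader = HFDatasetLoader()
#     texts, status = loader.load_dataset_texts("arabic_wiki")
#     print(status)   # e.g. "✅ Loaded 500 samples from HuggingFace"
#     texts, status = loader.load_dataset_texts("arabic_wiki")   # second call is served from the in-memory cache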
def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
"""Evaluate a tokenizer on a list of texts"""
fertilities = []
compressions = []
unk_counts = 0
total_tokens = 0
for text in texts:
try:
tokens = tokenizer.encode(text, add_special_tokens=False)
decoded = tokenizer.convert_ids_to_tokens(tokens)
num_tokens = len(tokens)
num_words = len(text.split()) or 1
num_bytes = len(text.encode('utf-8'))
fertility = num_tokens / num_words
compression = num_bytes / num_tokens if num_tokens > 0 else 0
            # Count unknown tokens, matching both the tokenizer's unk_token and common <unk>/[UNK] spellings
unk_token = getattr(tokenizer, 'unk_token', '[UNK]')
unks = sum(1 for t in decoded if t and (t == unk_token or '<unk>' in str(t).lower() or '[unk]' in str(t).lower()))
fertilities.append(fertility)
compressions.append(compression)
unk_counts += unks
total_tokens += num_tokens
except Exception:
continue
if not fertilities:
return None
return {
"avg_fertility": statistics.mean(fertilities),
"std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
"avg_compression": statistics.mean(compressions),
"unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
"samples": len(fertilities)
}
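
# Usage sketch (assumes the AraBERT tokenizer "aubmindlab/bert-base-arabertv02" is reachable;
# any Arabic-capable tokenizer works, and the exact values depend entirely on the input texts):
#
#     tok = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
#     metrics = evaluate_tokenizer_on_texts(tok, ["هذه جملة عربية قصيرة للتجربة."])
#     # metrics -> {"avg_fertility": ..., "std_fertility": 0, "avg_compression": ...,
#     #             "unk_ratio": ..., "samples": 1}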
def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
"""Calculate overall score (0-100, higher is better)"""
    # Fertility: lower is better (ideal ~1.0 tokens/word for Arabic); 2.0 or below receives full credit
fertility_score = max(0, min(1, 2.0 / fertility)) if fertility > 0 else 0
    # Compression: higher is better; 6 bytes/token or more receives full credit
compression_score = min(1, compression / 6)
    # UNK ratio: lower is better; 5% or more unknown tokens scores zero
unk_score = 1 - min(1, unk_ratio * 20)
# Weighted combination
score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
return round(score, 1)
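
# Worked example of the scoring formula above (illustrative numbers, not real results):
#   fertility   = 1.6   -> fertility_score   = min(1, 2.0 / 1.6)       = 1.00
#   compression = 4.2   -> compression_score = min(1, 4.2 / 6)         = 0.70
#   unk_ratio   = 0.005 -> unk_score         = 1 - min(1, 0.005 * 20)  = 0.90
#   score = (1.00 * 0.45 + 0.70 * 0.35 + 0.90 * 0.20) * 100 = 87.5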
def run_leaderboard_evaluation(
selected_datasets: List[str],
selected_tokenizers: List[str],
progress=gr.Progress()
) -> Tuple[str, str, str]:
"""
Run the full leaderboard evaluation with real HF datasets
Returns: (leaderboard_html, per_dataset_html, status_message)
"""
if not selected_datasets:
return "", "", "⚠️ Please select at least one dataset"
if not selected_tokenizers:
return "", "", "⚠️ Please select at least one tokenizer"
loader = HFDatasetLoader()
results = defaultdict(dict)
# Status tracking
status_lines = []
# Load datasets from HuggingFace
status_lines.append("📚 **Loading Datasets from HuggingFace:**\n")
loaded_datasets = {}
for i, ds_key in enumerate(selected_datasets):
progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
texts, msg = loader.load_dataset_texts(ds_key)
ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
status_lines.append(f" • {ds_name}: {msg}")
if texts:
loaded_datasets[ds_key] = texts
if not loaded_datasets:
return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"
# Evaluate tokenizers
status_lines.append("\n🔄 **Evaluating Tokenizers:**\n")
tokenizer_cache = {}
total_steps = len(selected_tokenizers) * len(loaded_datasets)
current_step = 0
for tok_choice in selected_tokenizers:
# Get model ID from choice
tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
tok_name = tok_info.name if tok_info else tok_choice
# Load tokenizer
try:
if tok_id not in tokenizer_cache:
tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
tok_id, trust_remote_code=True
)
tokenizer = tokenizer_cache[tok_id]
status_lines.append(f" • {tok_name}: ✅ Loaded")
except Exception as e:
status_lines.append(f" • {tok_name}: ❌ Failed ({str(e)[:30]})")
continue
# Evaluate on each dataset
for ds_key, texts in loaded_datasets.items():
current_step += 1
progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
if metrics:
results[tok_choice][ds_key] = metrics
# Generate leaderboard
progress(0.95, "Generating leaderboard...")
leaderboard_data = []
per_dataset_data = []
for tok_choice, ds_results in results.items():
if not ds_results:
continue
tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
# Aggregate across datasets
all_fertility = [m["avg_fertility"] for m in ds_results.values()]
all_compression = [m["avg_compression"] for m in ds_results.values()]
all_unk = [m["unk_ratio"] for m in ds_results.values()]
avg_fertility = statistics.mean(all_fertility)
avg_compression = statistics.mean(all_compression)
avg_unk = statistics.mean(all_unk)
score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)
leaderboard_data.append({
"name": tok_info.name if tok_info else tok_choice,
"type": tok_info.type.value if tok_info else "Unknown",
"org": tok_info.organization if tok_info else "Unknown",
"score": score,
"fertility": avg_fertility,
"compression": avg_compression,
"unk_ratio": avg_unk,
"num_datasets": len(ds_results)
})
# Per-dataset row
per_ds_row = {"Tokenizer": tok_info.name if tok_info else tok_choice}
for ds_key in selected_datasets:
ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
if ds_key in ds_results:
per_ds_row[ds_name] = round(ds_results[ds_key]["avg_fertility"], 2)
else:
per_ds_row[ds_name] = "-"
per_dataset_data.append(per_ds_row)
# Sort by score
leaderboard_data.sort(key=lambda x: x["score"], reverse=True)
# Create HTML tables
leaderboard_html = generate_leaderboard_html(leaderboard_data)
per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)
status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
return leaderboard_html, per_dataset_html, "\n".join(status_lines)
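
# In the Gradio app this function is typically attached to a button click or triggered on
# startup. A hypothetical wiring sketch (the actual component names live in app.py):
#
#     run_btn.click(
#         fn=run_leaderboard_evaluation,
#         inputs=[dataset_checkboxes, tokenizer_checkboxes],
#         outputs=[leaderboard_html_box, per_dataset_html_box, status_markdown],
#     )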
def generate_leaderboard_html(data: List[Dict]) -> str:
"""Generate HTML for main leaderboard - clean professional design"""
if not data:
return "<p>No results to display</p>"
html = """
<style>
.leaderboard-table {
width: 100%;
border-collapse: collapse;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
font-size: 14px;
margin: 16px 0;
}
.leaderboard-table th {
background: #2c3e50;
color: #fff;
padding: 12px 10px;
text-align: left;
font-weight: 500;
border-bottom: 2px solid #1a252f;
}
.leaderboard-table td {
padding: 10px;
border-bottom: 1px solid #e9ecef;
color: #333;
}
.leaderboard-table tr:nth-child(even) {
background-color: #f8f9fa;
}
.leaderboard-table tr:hover {
background-color: #eef2f7;
}
.leaderboard-table .rank-1 td { background: #f0f7ff; }
.leaderboard-table .rank-2 td { background: #f5f5f5; }
.leaderboard-table .rank-3 td { background: #fdf8f3; }
.score-badge {
background: #2c3e50;
color: #fff;
padding: 4px 10px;
border-radius: 4px;
font-weight: 600;
font-size: 13px;
}
.type-badge {
background: #e9ecef;
color: #495057;
padding: 3px 8px;
border-radius: 3px;
font-size: 12px;
}
.metric-good { color: #198754; font-weight: 500; }
.metric-bad { color: #dc3545; font-weight: 500; }
.rank-medal { font-size: 16px; margin-right: 4px; }
</style>
<table class="leaderboard-table">
<thead>
<tr>
<th>Rank</th>
<th>Tokenizer</th>
<th>Type</th>
<th>Organization</th>
<th>Score</th>
<th>Fertility</th>
<th>Compression</th>
<th>UNK Rate</th>
<th>Datasets</th>
</tr>
</thead>
<tbody>
"""
for i, entry in enumerate(data):
rank = i + 1
rank_class = f"rank-{rank}" if rank <= 3 else ""
# Medal for top 3
if rank == 1:
rank_display = '<span class="rank-medal">🥇</span> 1'
elif rank == 2:
rank_display = '<span class="rank-medal">🥈</span> 2'
elif rank == 3:
rank_display = '<span class="rank-medal">🥉</span> 3'
else:
rank_display = f"#{rank}"
fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
comp_class = "metric-good" if entry["compression"] > 3.5 else ""
unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""
html += f"""
<tr class="{rank_class}">
<td><strong>{rank_display}</strong></td>
<td><strong>{entry["name"]}</strong></td>
<td><span class="type-badge">{entry["type"]}</span></td>
<td>{entry["org"]}</td>
<td><span class="score-badge">{entry["score"]}</span></td>
<td class="{fert_class}">{entry["fertility"]:.3f}</td>
<td class="{comp_class}">{entry["compression"]:.2f}</td>
<td class="{unk_class}">{entry["unk_ratio"]:.2%}</td>
<td>{entry["num_datasets"]}</td>
</tr>
"""
html += """
</tbody>
</table>
<div style="margin-top: 12px; padding: 12px 16px; background: #f8f9fa; border-left: 3px solid #2c3e50; font-size: 13px; color: #495057;">
<strong>Metrics:</strong>
Score (0-100, higher=better) •
Fertility (tokens/word, lower=better) •
Compression (bytes/token, higher=better) •
UNK Rate (lower=better)
</div>
"""
return html
def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
"""Generate HTML for per-dataset fertility table - clean professional design"""
if not data:
return "<p>No per-dataset results</p>"
ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]
html = """
<style>
.dataset-table {
width: 100%;
border-collapse: collapse;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
font-size: 13px;
margin: 16px 0;
}
.dataset-table th {
background: #495057;
color: #fff;
padding: 10px 8px;
text-align: center;
font-weight: 500;
}
.dataset-table th:first-child {
text-align: left;
}
.dataset-table td {
padding: 8px;
text-align: center;
border-bottom: 1px solid #e9ecef;
color: #333;
}
.dataset-table td:first-child {
text-align: left;
font-weight: 500;
}
.dataset-table tr:nth-child(even) {
background-color: #f8f9fa;
}
.dataset-table tr:hover {
background-color: #eef2f7;
}
.fert-excellent { background: #d4edda; color: #155724; font-weight: 500; }
.fert-good { background: #fff3cd; color: #856404; font-weight: 500; }
.fert-poor { background: #f8d7da; color: #721c24; font-weight: 500; }
</style>
<table class="dataset-table">
<thead>
<tr>
<th>Tokenizer</th>
"""
for ds_name in ds_names:
html += f"<th>{ds_name}</th>"
html += """
</tr>
</thead>
<tbody>
"""
for row in data:
html += f"<tr><td>{row['Tokenizer']}</td>"
for ds_name in ds_names:
val = row.get(ds_name, "-")
if val != "-":
if val < 1.8:
cls = "fert-excellent"
elif val < 2.5:
cls = "fert-good"
else:
cls = "fert-poor"
html += f'<td class="{cls}">{val}</td>'
else:
html += '<td>-</td>'
html += "</tr>"
html += """
</tbody>
</table>
"""
return html
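

if __name__ == "__main__":
    # Minimal offline smoke test: exercises only the pure scoring/HTML code, with no network
    # access and no Gradio event context. The numbers are illustrative, not real results.
    demo_score = calculate_leaderboard_score(fertility=1.6, compression=4.2, unk_ratio=0.005)
    demo_entry = {
        "name": "Demo Tokenizer",
        "type": "BPE",
        "org": "Example Org",
        "score": demo_score,
        "fertility": 1.6,
        "compression": 4.2,
        "unk_ratio": 0.005,
        "num_datasets": 1,
    }
    print(f"Demo score: {demo_score}")
    print(generate_leaderboard_html([demo_entry])[:300] + " ...")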