Spaces:
Running
Running
| """ | |
| Leaderboard Module | |
| ================== | |
| Evaluate tokenizers on real HuggingFace Arabic datasets | |
| """ | |
| import statistics | |
| from typing import Dict, List, Tuple, Optional | |
| from collections import defaultdict | |
| import gradio as gr | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer | |
| from config import LEADERBOARD_DATASETS | |
| from tokenizer_manager import tokenizer_manager | |
class HFDatasetLoader:
    """Load Arabic datasets from HuggingFace and cache the extracted texts.

    Dataset settings are read from ``LEADERBOARD_DATASETS``. The code below
    reads the keys ``hf_id``, ``split`` and ``text_column`` from each entry,
    plus the optional keys ``subset`` and ``samples``.
    """

    # Column names tried, in this order, when the configured text column is
    # not present in the loaded dataset.
    FALLBACK_TEXT_COLUMNS = (
        "text",
        "content",
        "sentence",
        "arabic",
        "context",
        "Tweet",
        "question",
        "poem_text",
        "hadith_text_ar",
    )

    def __init__(self):
        # dataset_key -> list of cleaned texts from an earlier successful load
        self.cache = {}

    @staticmethod
    def _resolve_text_column(column_names, preferred):
        """Return the column to read text from, or None if none is usable."""
        if preferred in column_names:
            return preferred
        for candidate in HFDatasetLoader.FALLBACK_TEXT_COLUMNS:
            if candidate in column_names:
                return candidate
        return None

    def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
        """Load texts from a HuggingFace dataset.

        Args:
            dataset_key: Key into ``LEADERBOARD_DATASETS``.

        Returns:
            ``(texts, status_message)``. ``texts`` is empty when the key is
            unknown, when loading fails, or when no text column can be found;
            the message then starts with "❌" and explains why. At most
            ``config["samples"]`` rows (default 500) are inspected, and only
            string values longer than 10 characters after stripping are kept.
        """
        if dataset_key in self.cache:
            cached = self.cache[dataset_key]
            return cached, f"✅ Loaded {len(cached)} samples (cached)"

        config = LEADERBOARD_DATASETS.get(dataset_key)
        if not config:
            return [], f"❌ Unknown dataset: {dataset_key}"

        # Safe label for error messages: a config missing "hf_id" must not
        # raise a second KeyError from inside the except handler.
        hf_label = config.get("hf_id", dataset_key)

        try:
            # The subset name is an optional second positional argument.
            load_args = [config["hf_id"]]
            if config.get("subset"):
                load_args.append(config["subset"])

            # NOTE(review): trust_remote_code=True allows the dataset
            # repository to run its own loading script on this machine. Keep
            # LEADERBOARD_DATASETS restricted to trusted repositories.
            ds = load_dataset(
                *load_args,
                split=config["split"],
                trust_remote_code=True,
            )

            text_col = self._resolve_text_column(
                ds.column_names, config["text_column"]
            )
            if text_col is None:
                # Report the problem instead of caching an empty result that
                # would look like a successful load of zero samples.
                return [], (
                    f"❌ Error loading {hf_label}: "
                    f"no text column found (tried '{config['text_column']}')"
                )

            max_samples = config.get("samples", 500)
            texts = []
            for i, item in enumerate(ds):
                # The limit applies to rows inspected, not to rows kept.
                if i >= max_samples:
                    break
                text = item.get(text_col, "")
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if len(text) > 10:
                    texts.append(text)

            self.cache[dataset_key] = texts
            return texts, f"✅ Loaded {len(texts)} samples from HuggingFace"
        except Exception as e:
            # Boundary around third-party loading code: any failure becomes a
            # status message so one bad dataset does not abort the whole run.
            return [], f"❌ Error loading {hf_label}: {str(e)[:80]}"
def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
    """Evaluate a tokenizer on a list of texts.

    Args:
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``convert_ids_to_tokens(ids)``; its ``unk_token`` attribute is
            used when present.
        texts: Samples to tokenize.

    Returns:
        ``None`` when no sample could be scored, otherwise a dict with
        ``avg_fertility`` (tokens per whitespace-separated word),
        ``std_fertility``, ``avg_compression`` (UTF-8 bytes per token),
        ``unk_ratio`` (unknown tokens / all tokens) and ``samples`` (number of
        samples that contributed).
    """
    # Loop-invariant: resolve the tokenizer's UNK marker once.
    unk_token = getattr(tokenizer, 'unk_token', '[UNK]')

    fertilities = []
    compressions = []
    unk_counts = 0
    total_tokens = 0

    for text in texts:
        try:
            token_ids = tokenizer.encode(text, add_special_tokens=False)
            token_strs = tokenizer.convert_ids_to_tokens(token_ids)

            num_tokens = len(token_ids)
            if num_tokens == 0:
                # A sample that produces no tokens has no meaningful fertility
                # or compression. Recording 0 for both would pull the mean
                # fertility toward "better" and the mean compression toward
                # "worse", so the sample is left out.
                continue

            num_words = len(text.split()) or 1
            num_bytes = len(text.encode('utf-8'))

            unks = 0
            for tok in token_strs:
                if not tok:
                    continue
                lowered = str(tok).lower()
                if tok == unk_token or '<unk>' in lowered or '[unk]' in lowered:
                    unks += 1
        except Exception:
            # Deliberate best-effort: a sample the tokenizer cannot handle is
            # skipped so it does not abort the evaluation of the others.
            continue

        fertilities.append(num_tokens / num_words)
        compressions.append(num_bytes / num_tokens)
        unk_counts += unks
        total_tokens += num_tokens

    if not fertilities:
        return None

    return {
        "avg_fertility": statistics.mean(fertilities),
        # stdev needs at least two data points
        "std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
        "avg_compression": statistics.mean(compressions),
        "unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
        "samples": len(fertilities)
    }
def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
    """Calculate overall score (0-100, higher is better).

    Each metric is mapped to a sub-score clamped to [0, 1], so the result
    stays inside 0-100 even for out-of-range inputs.

    Args:
        fertility: Average tokens per word; lower is better.
        compression: Average bytes per token; higher is better.
        unk_ratio: Fraction of unknown tokens; lower is better.

    Returns:
        Weighted score rounded to one decimal place.
    """
    def clamp01(value: float) -> float:
        return max(0.0, min(1.0, value))

    # Lower fertility is better: full credit at or below 2.0 tokens/word,
    # falling off as 2/fertility above that. Non-positive fertility scores 0.
    fertility_score = clamp01(2.0 / fertility) if fertility > 0 else 0.0
    # Higher compression is better: full credit at 6 bytes/token or more.
    compression_score = clamp01(compression / 6)
    # Lower UNK is better: the sub-score reaches 0 at a 5% unknown rate.
    unk_score = 1 - clamp01(unk_ratio * 20)

    # Weighted combination
    score = (fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20) * 100
    return round(score, 1)
def run_leaderboard_evaluation(
    selected_datasets: List[str],
    selected_tokenizers: List[str],
    progress=gr.Progress()
) -> Tuple[str, str, str]:
    """
    Run the full leaderboard evaluation with real HF datasets.

    Args:
        selected_datasets: Keys into ``LEADERBOARD_DATASETS``.
        selected_tokenizers: Tokenizer choices, resolved to model IDs through
            ``tokenizer_manager.get_model_id_from_choice``.
        progress: Callable invoked as ``progress(fraction, description)``.

    Returns: (leaderboard_html, per_dataset_html, status_message)
        The two HTML strings are empty when nothing was selected or when no
        dataset could be loaded.
    """
    if not selected_datasets:
        return "", "", "⚠️ Please select at least one dataset"
    if not selected_tokenizers:
        return "", "", "⚠️ Please select at least one tokenizer"

    def display_name(ds_key):
        # Fall back to the raw key so an unknown dataset shows up in the
        # status report instead of raising KeyError.
        return LEADERBOARD_DATASETS.get(ds_key, {}).get("name", ds_key)

    # NOTE(review): a fresh loader per call means HFDatasetLoader.cache is
    # never reused between evaluations. Share one instance if that is wanted.
    loader = HFDatasetLoader()
    results = defaultdict(dict)
    tok_infos = {}  # tok_choice -> tokenizer info (or None), reused below

    # Status tracking
    status_lines = []

    # Load datasets from HuggingFace (first 30% of the progress bar)
    status_lines.append("📚 **Loading Datasets from HuggingFace:**\n")
    loaded_datasets = {}
    for i, ds_key in enumerate(selected_datasets):
        progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
        texts, msg = loader.load_dataset_texts(ds_key)
        status_lines.append(f" • {display_name(ds_key)}: {msg}")
        if texts:
            loaded_datasets[ds_key] = texts

    if not loaded_datasets:
        return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"

    # Evaluate tokenizers (progress 30% -> 90%)
    status_lines.append("\n🔄 **Evaluating Tokenizers:**\n")
    tokenizer_cache = {}
    total_steps = len(selected_tokenizers) * len(loaded_datasets)
    current_step = 0

    for tok_choice in selected_tokenizers:
        # Get model ID from choice
        tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
        tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
        tok_infos[tok_choice] = tok_info
        tok_name = tok_info.name if tok_info else tok_choice

        # Load tokenizer
        try:
            if tok_id not in tokenizer_cache:
                # NOTE(review): trust_remote_code=True lets the model
                # repository execute custom code on this machine. Only offer
                # tokenizers from trusted repositories.
                tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
                    tok_id, trust_remote_code=True
                )
            tokenizer = tokenizer_cache[tok_id]
            status_lines.append(f" • {tok_name}: ✅ Loaded")
        except Exception as e:
            # One tokenizer failing to load must not abort the others.
            status_lines.append(f" • {tok_name}: ❌ Failed ({str(e)[:30]})")
            continue

        # Evaluate on each dataset
        for ds_key, texts in loaded_datasets.items():
            current_step += 1
            progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")
            metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
            if metrics:
                results[tok_choice][ds_key] = metrics

    # Generate leaderboard
    progress(0.95, "Generating leaderboard...")

    # Only datasets present in the config get a column in the per-dataset
    # table; unknown keys have no display name and no results.
    table_datasets = [k for k in selected_datasets if k in LEADERBOARD_DATASETS]

    leaderboard_data = []
    per_dataset_data = []
    for tok_choice, ds_results in results.items():
        if not ds_results:
            continue
        tok_info = tok_infos.get(tok_choice)
        tok_name = tok_info.name if tok_info else tok_choice

        # Aggregate across datasets (unweighted mean of per-dataset averages)
        metrics_list = list(ds_results.values())
        avg_fertility = statistics.mean(m["avg_fertility"] for m in metrics_list)
        avg_compression = statistics.mean(m["avg_compression"] for m in metrics_list)
        avg_unk = statistics.mean(m["unk_ratio"] for m in metrics_list)
        score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)

        leaderboard_data.append({
            "name": tok_name,
            "type": tok_info.type.value if tok_info else "Unknown",
            "org": tok_info.organization if tok_info else "Unknown",
            "score": score,
            "fertility": avg_fertility,
            "compression": avg_compression,
            "unk_ratio": avg_unk,
            "num_datasets": len(ds_results)
        })

        # Per-dataset row: fertility per dataset, "-" where not evaluated
        per_ds_row = {"Tokenizer": tok_name}
        for ds_key in table_datasets:
            ds_name = display_name(ds_key)
            if ds_key in ds_results:
                per_ds_row[ds_name] = round(ds_results[ds_key]["avg_fertility"], 2)
            else:
                per_ds_row[ds_name] = "-"
        per_dataset_data.append(per_ds_row)

    # Sort by score
    leaderboard_data.sort(key=lambda x: x["score"], reverse=True)

    # Create HTML tables
    leaderboard_html = generate_leaderboard_html(leaderboard_data)
    per_dataset_html = generate_per_dataset_html(per_dataset_data, table_datasets)

    status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
    return leaderboard_html, per_dataset_html, "\n".join(status_lines)
def generate_leaderboard_html(data: List[Dict]) -> str:
    """Generate HTML for main leaderboard - clean professional design.

    Args:
        data: Leaderboard entries in display order. Each entry provides the
            keys ``name``, ``type``, ``org``, ``score``, ``fertility``,
            ``compression``, ``unk_ratio`` and ``num_datasets``.

    Returns:
        An HTML fragment (style block, table and legend), or a short
        placeholder paragraph when ``data`` is empty. Text fields are
        HTML-escaped before being inserted into the markup.
    """
    from html import escape

    if not data:
        return "<p>No results to display</p>"

    parts = ["""
<style>
.leaderboard-table {
    width: 100%;
    border-collapse: collapse;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
    font-size: 14px;
    margin: 16px 0;
}
.leaderboard-table th {
    background: #2c3e50;
    color: #fff;
    padding: 12px 10px;
    text-align: left;
    font-weight: 500;
    border-bottom: 2px solid #1a252f;
}
.leaderboard-table td {
    padding: 10px;
    border-bottom: 1px solid #e9ecef;
    color: #333;
}
.leaderboard-table tr:nth-child(even) {
    background-color: #f8f9fa;
}
.leaderboard-table tr:hover {
    background-color: #eef2f7;
}
.leaderboard-table .rank-1 td { background: #f0f7ff; }
.leaderboard-table .rank-2 td { background: #f5f5f5; }
.leaderboard-table .rank-3 td { background: #fdf8f3; }
.score-badge {
    background: #2c3e50;
    color: #fff;
    padding: 4px 10px;
    border-radius: 4px;
    font-weight: 600;
    font-size: 13px;
}
.type-badge {
    background: #e9ecef;
    color: #495057;
    padding: 3px 8px;
    border-radius: 3px;
    font-size: 12px;
}
.metric-good { color: #198754; font-weight: 500; }
.metric-bad { color: #dc3545; font-weight: 500; }
.rank-medal { font-size: 16px; margin-right: 4px; }
</style>
<table class="leaderboard-table">
<thead>
<tr>
<th>Rank</th>
<th>Tokenizer</th>
<th>Type</th>
<th>Organization</th>
<th>Score</th>
<th>Fertility</th>
<th>Compression</th>
<th>UNK Rate</th>
<th>Datasets</th>
</tr>
</thead>
<tbody>
"""]

    # Medal for top 3
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}

    for rank, entry in enumerate(data, start=1):
        if rank in medals:
            rank_class = f"rank-{rank}"
            rank_display = f'<span class="rank-medal">{medals[rank]}</span> {rank}'
        else:
            rank_class = ""
            rank_display = f"#{rank}"

        fertility = entry["fertility"]
        compression = entry["compression"]
        unk_ratio = entry["unk_ratio"]

        # Colour thresholds: green for good values, red for poor ones,
        # no class in between.
        if fertility < 2.0:
            fert_class = "metric-good"
        elif fertility > 3.0:
            fert_class = "metric-bad"
        else:
            fert_class = ""

        comp_class = "metric-good" if compression > 3.5 else ""

        if unk_ratio < 0.01:
            unk_class = "metric-good"
        elif unk_ratio > 0.05:
            unk_class = "metric-bad"
        else:
            unk_class = ""

        # Escape text fields so names containing markup cannot alter the page.
        name = escape(str(entry["name"]))
        tok_type = escape(str(entry["type"]))
        org = escape(str(entry["org"]))

        parts.append(
            f'\n<tr class="{rank_class}">\n'
            f"<td><strong>{rank_display}</strong></td>\n"
            f"<td><strong>{name}</strong></td>\n"
            f'<td><span class="type-badge">{tok_type}</span></td>\n'
            f"<td>{org}</td>\n"
            f'<td><span class="score-badge">{entry["score"]}</span></td>\n'
            f'<td class="{fert_class}">{fertility:.3f}</td>\n'
            f'<td class="{comp_class}">{compression:.2f}</td>\n'
            f'<td class="{unk_class}">{unk_ratio:.2%}</td>\n'
            f'<td>{entry["num_datasets"]}</td>\n'
            "</tr>\n"
        )

    parts.append("""
</tbody>
</table>
<div style="margin-top: 12px; padding: 12px 16px; background: #f8f9fa; border-left: 3px solid #2c3e50; font-size: 13px; color: #495057;">
<strong>Metrics:</strong>
Score (0-100, higher=better) •
Fertility (tokens/word, lower=better) •
Compression (bytes/token, higher=better) •
UNK Rate (lower=better)
</div>
""")
    return "".join(parts)
def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
    """Generate HTML for per-dataset fertility table - clean professional design.

    Args:
        data: One dict per tokenizer with a ``"Tokenizer"`` entry plus one
            entry per dataset display name holding the fertility value, or
            ``"-"`` when the tokenizer was not evaluated on that dataset.
        dataset_keys: Keys into ``LEADERBOARD_DATASETS`` that define the
            columns and their order. A key missing from the config is shown
            under its raw key.

    Returns:
        An HTML fragment (style block and table), or a short placeholder
        paragraph when ``data`` is empty. Text is HTML-escaped.
    """
    from html import escape

    if not data:
        return "<p>No per-dataset results</p>"

    # Fall back to the raw key rather than raising KeyError on unknown keys.
    ds_names = [LEADERBOARD_DATASETS.get(k, {}).get("name", k) for k in dataset_keys]

    parts = ["""
<style>
.dataset-table {
    width: 100%;
    border-collapse: collapse;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
    font-size: 13px;
    margin: 16px 0;
}
.dataset-table th {
    background: #495057;
    color: #fff;
    padding: 10px 8px;
    text-align: center;
    font-weight: 500;
}
.dataset-table th:first-child {
    text-align: left;
}
.dataset-table td {
    padding: 8px;
    text-align: center;
    border-bottom: 1px solid #e9ecef;
    color: #333;
}
.dataset-table td:first-child {
    text-align: left;
    font-weight: 500;
}
.dataset-table tr:nth-child(even) {
    background-color: #f8f9fa;
}
.dataset-table tr:hover {
    background-color: #eef2f7;
}
.fert-excellent { background: #d4edda; color: #155724; font-weight: 500; }
.fert-good { background: #fff3cd; color: #856404; font-weight: 500; }
.fert-poor { background: #f8d7da; color: #721c24; font-weight: 500; }
</style>
<table class="dataset-table">
<thead>
<tr>
<th>Tokenizer</th>
"""]

    parts.extend(f"<th>{escape(str(ds_name))}</th>" for ds_name in ds_names)
    parts.append("""
</tr>
</thead>
<tbody>
""")

    for row in data:
        cells = [f"<tr><td>{escape(str(row['Tokenizer']))}</td>"]
        for ds_name in ds_names:
            val = row.get(ds_name, "-")
            # Only real numbers get a colour band; the "-" placeholder and
            # anything else non-numeric is rendered as a plain cell.
            if isinstance(val, (int, float)) and not isinstance(val, bool):
                if val < 1.8:
                    cls = "fert-excellent"
                elif val < 2.5:
                    cls = "fert-good"
                else:
                    cls = "fert-poor"
                cells.append(f'<td class="{cls}">{val}</td>')
            else:
                cells.append(f"<td>{escape(str(val))}</td>")
        cells.append("</tr>")
        parts.append("".join(cells))

    parts.append("""
</tbody>
</table>
""")
    return "".join(parts)