"""
Leaderboard Module
==================
Evaluate tokenizers on real HuggingFace Arabic datasets
"""

import json
import os
import statistics
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer

from config import LEADERBOARD_DATASETS
from tokenizer_manager import tokenizer_manager


# JSON file holding the results of user-submitted tokenizers, keyed by model
# ID. It lives in the same directory as this module.
SUBMISSIONS_FILE = os.path.join(os.path.dirname(__file__), "submissions.json")

# JSON file holding the last rendered leaderboard (both HTML tables plus the
# status text). It also lives in the same directory as this module.
LEADERBOARD_CACHE_FILE = os.path.join(os.path.dirname(__file__), "leaderboard_cache.json")


def load_submitted_tokenizers() -> Dict[str, Dict]:
    """Load submitted tokenizers from persistent storage.

    Reads ``SUBMISSIONS_FILE`` and returns its content, which is expected to
    be a JSON object mapping a model ID to the stored submission record.

    Returns:
        The stored submissions, or an empty dict when the file is missing,
        unreadable, not valid UTF-8/JSON, or does not contain a JSON object.
    """
    try:
        with open(SUBMISSIONS_FILE, 'r', encoding='utf-8') as f:
            submissions = json.load(f)
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # OSError covers a missing or unreadable file.
        return {}

    # Callers index the result by model ID and iterate ``.items()``, so any
    # other JSON value (list, null, ...) is treated as "no submissions".
    return submissions if isinstance(submissions, dict) else {}


def save_submitted_tokenizer(model_id: str, data: Dict) -> None:
    """Save a submitted tokenizer to persistent storage.

    The record is merged into the submissions already stored in
    ``SUBMISSIONS_FILE`` (replacing any previous record for ``model_id``).

    Args:
        model_id: Key under which the submission is stored.
        data: JSON-serializable submission record.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized to JSON. The
            existing file is left untouched in that case.
    """
    submissions = load_submitted_tokenizers()
    if not isinstance(submissions, dict):
        # A file holding something other than a JSON object cannot be merged
        # into; start over rather than fail on item assignment.
        submissions = {}
    submissions[model_id] = data

    # Serialize before touching the file: opening it with 'w' truncates it,
    # so a serialization error halfway through would wipe every submission.
    payload = json.dumps(submissions, indent=2, ensure_ascii=False)

    # Write to a sibling temp file and swap it in, so a failed or partial
    # write never replaces the previous, valid file.
    tmp_path = SUBMISSIONS_FILE + ".tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        os.replace(tmp_path, SUBMISSIONS_FILE)
    except IOError as e:
        print(f"Warning: Could not save submission: {e}")


def load_leaderboard_cache() -> Optional[Dict]:
    """Load cached leaderboard results.

    Returns:
        The JSON object stored in ``LEADERBOARD_CACHE_FILE``, or ``None``
        when the file is missing, unreadable, not valid UTF-8/JSON, or does
        not contain a JSON object.
    """
    try:
        with open(LEADERBOARD_CACHE_FILE, 'r', encoding='utf-8') as f:
            cache = json.load(f)
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # OSError covers a missing or unreadable file.
        return None

    # The declared return type is a dict; reject any other JSON value so
    # callers can rely on ``.get`` being available.
    return cache if isinstance(cache, dict) else None


def save_leaderboard_cache(leaderboard_html: str, per_dataset_html: str, status: str) -> None:
    """Write the rendered leaderboard to the cache file.

    The two HTML tables and the status text are stored as one JSON object in
    ``LEADERBOARD_CACHE_FILE``. An I/O failure is reported with a printed
    warning and is not raised.
    """
    payload = dict(
        leaderboard_html=leaderboard_html,
        per_dataset_html=per_dataset_html,
        status=status,
    )
    try:
        with open(LEADERBOARD_CACHE_FILE, 'w', encoding='utf-8') as cache_file:
            json.dump(payload, cache_file, ensure_ascii=False)
    except IOError as err:
        print(f"Warning: Could not save leaderboard cache: {err}")


class HFDatasetLoader:
    """Fetch text samples for the datasets declared in LEADERBOARD_DATASETS.

    Extracted texts are kept in ``self.cache`` (dataset key -> list of texts)
    so a dataset is downloaded at most once per loader instance.
    """

    def __init__(self):
        self.cache = {}

    def load_dataset_texts(self, dataset_key: str) -> Tuple[List[str], str]:
        """Return ``(texts, status_message)`` for one configured dataset.

        On an unknown key or any loading error the text list is empty and the
        message starts with a cross mark; nothing is raised.
        """
        if dataset_key in self.cache:
            cached_texts = self.cache[dataset_key]
            return cached_texts, f"✅ Loaded {len(cached_texts)} samples (cached)"

        config = LEADERBOARD_DATASETS.get(dataset_key)
        if not config:
            return [], f"❌ Unknown dataset: {dataset_key}"

        try:
            # The subset name, when configured, is passed as the second
            # positional argument of load_dataset.
            if config.get("subset"):
                positional = [config["hf_id"], config["subset"]]
            else:
                positional = [config["hf_id"]]
            ds = load_dataset(
                *positional,
                split=config["split"],
                trust_remote_code=True
            )

            # Fall back to the first well-known column name present when the
            # configured text column does not exist in this dataset.
            text_col = config["text_column"]
            if text_col not in ds.column_names:
                fallback_columns = (
                    "text", "content", "sentence", "arabic", "context",
                    "Tweet", "question", "poem_text", "hadith_text_ar",
                )
                text_col = next(
                    (col for col in fallback_columns if col in ds.column_names),
                    text_col,
                )

            # Only the first `max_samples` rows are inspected; rows without a
            # usable string longer than 10 characters are dropped.
            max_samples = config.get("samples", 500)
            collected = []
            for row_index, row in enumerate(ds):
                if row_index >= max_samples:
                    break
                raw = row.get(text_col, "")
                if not raw or not isinstance(raw, str):
                    continue
                cleaned = raw.strip()
                if len(cleaned) > 10:
                    collected.append(cleaned)

            self.cache[dataset_key] = collected
            return collected, f"✅ Loaded {len(collected)} samples from HuggingFace"

        except Exception as e:
            return [], f"❌ Error loading {config['hf_id']}: {str(e)[:80]}"


def evaluate_tokenizer_on_texts(tokenizer, texts: List[str]) -> Optional[Dict]:
    """Evaluate a tokenizer on a list of texts.

    For every text the function computes fertility (tokens per
    whitespace-separated word) and compression (UTF-8 bytes per token), and
    counts tokens that look like the unknown token.

    Args:
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``convert_ids_to_tokens(ids)``. Its ``unk_token`` attribute is
            used when present, otherwise ``'[UNK]'`` is assumed.
        texts: Texts to tokenize.

    Returns:
        A dict with ``avg_fertility``, ``std_fertility``, ``avg_compression``,
        ``unk_ratio`` and ``samples`` (number of texts that were measured), or
        ``None`` when no text could be measured.
    """
    # The unknown-token marker does not depend on the text; look it up once.
    unk_token = getattr(tokenizer, 'unk_token', '[UNK]')

    fertilities = []
    compressions = []
    unk_counts = 0
    total_tokens = 0

    for text in texts:
        try:
            token_ids = tokenizer.encode(text, add_special_tokens=False)
            token_strings = tokenizer.convert_ids_to_tokens(token_ids)

            num_tokens = len(token_ids)
            if num_tokens == 0:
                # A sample that produced no tokens would be recorded as
                # fertility 0 (the best possible value) and compression 0,
                # skewing both averages; it carries no measurement, so skip it.
                continue

            # `or 1` avoids dividing by zero for text without any words.
            num_words = len(text.split()) or 1
            num_bytes = len(text.encode('utf-8'))

            fertility = num_tokens / num_words
            compression = num_bytes / num_tokens

            # Count UNKs: exact match with the tokenizer's marker, or any
            # token containing the common "<unk>" / "[unk]" spellings.
            unks = 0
            for tok in token_strings:
                if not tok:
                    continue
                lowered = str(tok).lower()
                if tok == unk_token or '<unk>' in lowered or '[unk]' in lowered:
                    unks += 1

            fertilities.append(fertility)
            compressions.append(compression)
            unk_counts += unks
            total_tokens += num_tokens

        except Exception:
            # Best effort: a text the tokenizer cannot handle is left out of
            # the statistics instead of aborting the whole evaluation.
            continue

    if not fertilities:
        return None

    return {
        "avg_fertility": statistics.mean(fertilities),
        "std_fertility": statistics.stdev(fertilities) if len(fertilities) > 1 else 0,
        "avg_compression": statistics.mean(compressions),
        "unk_ratio": unk_counts / total_tokens if total_tokens > 0 else 0,
        "samples": len(fertilities)
    }


def calculate_leaderboard_score(fertility: float, compression: float, unk_ratio: float) -> float:
    """Combine the three metrics into one score, rounded to one decimal.

    Each metric is first mapped to a component score capped at 1:
    fertility via ``2.0 / fertility`` (0 when fertility is not positive),
    compression via ``compression / 6``, and UNK ratio via
    ``1 - unk_ratio * 20``. The components are weighted 45% / 35% / 20%
    and scaled by 100.
    """
    # Fewer tokens per word is better; anything at or below 2.0 gets full marks.
    if fertility > 0:
        fertility_score = max(0, min(1, 2.0 / fertility))
    else:
        fertility_score = 0

    # More bytes per token is better; 6 or more gets full marks.
    compression_score = min(1, compression / 6)

    # Fewer unknown tokens is better; a ratio of 5% or more scores zero.
    unk_penalty = min(1, unk_ratio * 20)
    unk_score = 1 - unk_penalty

    weighted = fertility_score * 0.45 + compression_score * 0.35 + unk_score * 0.20
    return round(weighted * 100, 1)


def get_cached_leaderboard(progress=gr.Progress()) -> Tuple[str, str, str]:
    """
    Get leaderboard results from cache if available.
    If no cache exists, shows a message to run evaluation.

    The ``progress`` argument is accepted but not used by this function.

    Returns: (leaderboard_html, per_dataset_html, status_message)
    """
    cache = load_leaderboard_cache()
    if cache and isinstance(cache, dict):
        # NOTE: the cached HTML is returned as stored. Tokenizers submitted
        # after the cache was written are not merged in here; they show up
        # once the full evaluation is run again and the cache is rewritten.
        # A missing or null status must not break the string concatenation.
        cached_status = cache.get("status") or ""
        return (
            cache.get("leaderboard_html", ""),
            cache.get("per_dataset_html", ""),
            str(cached_status) + "\n\n📦 *Loaded from cache. Click 'Re-evaluate All' to refresh.*"
        )

    # No cache exists - show message to run evaluation.
    # The dataset count is read from the configuration so the text stays
    # correct when datasets are added or removed.
    dataset_count = len(LEADERBOARD_DATASETS)
    no_data_html = f"""
    <div style="text-align: center; padding: 40px; background: #22272e; border-radius: 12px; border: 1px solid #30363d;">
        <p style="color: #8b949e; font-size: 16px; margin-bottom: 16px;">📊 No evaluation data available yet.</p>
        <p style="color: #e6edf3; font-size: 14px;">Click <strong>"Re-evaluate All"</strong> button above to run the full evaluation.</p>
        <p style="color: #8b949e; font-size: 12px; margin-top: 12px;">This will evaluate all tokenizers on all {dataset_count} Arabic datasets (~5-10 minutes).</p>
    </div>
    """
    return (
        no_data_html,
        no_data_html,
        "⚠️ **No cached results found.** Click 'Re-evaluate All' to run the evaluation."
    )


def _per_dataset_fertility_row(display_name: str, dataset_keys: List[str], metrics_by_dataset: Dict) -> Dict:
    """Build one per-dataset table row: tokenizer name plus rounded fertility (or "-") per dataset."""
    row = {"Tokenizer": display_name}
    for ds_key in dataset_keys:
        ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
        if ds_key in metrics_by_dataset:
            row[ds_name] = round(metrics_by_dataset[ds_key]["avg_fertility"], 2)
        else:
            row[ds_name] = "-"
    return row


def run_leaderboard_evaluation(
    progress=gr.Progress()
) -> Tuple[str, str, str]:
    """
    Run the full leaderboard evaluation with real HF datasets
    Evaluates ALL tokenizers on ALL datasets

    Steps: load every configured dataset, load and evaluate every tokenizer
    offered by ``tokenizer_manager``, aggregate the per-dataset metrics into
    one score per tokenizer, append the stored user submissions, render both
    HTML tables and write them to the leaderboard cache.

    Tokenizers that fail to load and stored submissions that are malformed
    are reported in the status text and skipped.

    Returns: (leaderboard_html, per_dataset_html, status_message)
    """

    # Use ALL datasets
    selected_datasets = list(LEADERBOARD_DATASETS.keys())

    # Use ALL available tokenizers
    selected_tokenizers = tokenizer_manager.get_tokenizer_choices()

    loader = HFDatasetLoader()
    results = defaultdict(dict)

    # Status tracking
    status_lines = []

    # Load datasets from HuggingFace (first 30% of the progress bar)
    status_lines.append("📚 **Loading Datasets from HuggingFace:**\n")
    loaded_datasets = {}

    for i, ds_key in enumerate(selected_datasets):
        progress((i + 1) / len(selected_datasets) * 0.3, f"Loading {ds_key}...")
        texts, msg = loader.load_dataset_texts(ds_key)
        ds_name = LEADERBOARD_DATASETS[ds_key]["name"]
        status_lines.append(f"  • {ds_name}: {msg}")
        if texts:
            loaded_datasets[ds_key] = texts

    if not loaded_datasets:
        return "", "", "\n".join(status_lines) + "\n\n❌ No datasets loaded successfully"

    # Evaluate tokenizers (progress 30% -> 90%)
    status_lines.append("\n🔄 **Evaluating Tokenizers:**\n")

    tokenizer_cache = {}
    total_steps = len(selected_tokenizers) * len(loaded_datasets)
    current_step = 0

    for tok_choice in selected_tokenizers:
        # Get model ID from choice
        tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
        tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
        tok_name = tok_info.name if tok_info else tok_choice

        # Load tokenizer; a failure is reported and the tokenizer is skipped
        try:
            if tok_id not in tokenizer_cache:
                tokenizer_cache[tok_id] = AutoTokenizer.from_pretrained(
                    tok_id, trust_remote_code=True
                )
            tokenizer = tokenizer_cache[tok_id]
            status_lines.append(f"  • {tok_name}: ✅ Loaded")
        except Exception as e:
            status_lines.append(f"  • {tok_name}: ❌ Failed ({str(e)[:30]})")
            continue

        # Evaluate on each dataset
        for ds_key, texts in loaded_datasets.items():
            current_step += 1
            progress(0.3 + (current_step / total_steps) * 0.6, f"Evaluating {tok_name} on {ds_key}...")

            metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
            if metrics:
                results[tok_choice][ds_key] = metrics

    # Generate leaderboard
    progress(0.95, "Generating leaderboard...")

    leaderboard_data = []
    per_dataset_data = []

    for tok_choice, ds_results in results.items():
        if not ds_results:
            continue

        tok_id = tokenizer_manager.get_model_id_from_choice(tok_choice)
        tok_info = tokenizer_manager.get_available_tokenizers().get(tok_id)
        display_name = tok_info.name if tok_info else tok_choice

        # Aggregate across datasets: each dataset counts equally
        avg_fertility = statistics.mean(m["avg_fertility"] for m in ds_results.values())
        avg_compression = statistics.mean(m["avg_compression"] for m in ds_results.values())
        avg_unk = statistics.mean(m["unk_ratio"] for m in ds_results.values())

        score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)

        leaderboard_data.append({
            "name": display_name,
            "type": tok_info.type.value if tok_info else "Unknown",
            "org": tok_info.organization if tok_info else "Unknown",
            "score": score,
            "fertility": avg_fertility,
            "compression": avg_compression,
            "unk_ratio": avg_unk,
            "num_datasets": len(ds_results)
        })

        per_dataset_data.append(
            _per_dataset_fertility_row(display_name, selected_datasets, ds_results)
        )

    # Add submitted tokenizers to the leaderboard
    submitted = load_submitted_tokenizers()
    if not isinstance(submitted, dict):
        submitted = {}

    for model_id, sub_data in submitted.items():
        # The submissions file may have been edited by hand or written by an
        # older version; one bad record must not discard the whole evaluation.
        try:
            sub_name = sub_data["name"]

            # Check if already in leaderboard (avoid duplicates)
            if any(d["name"] == sub_name for d in leaderboard_data):
                continue

            # These values are sorted on and number-formatted later, so they
            # have to be numeric.
            numeric = {
                key: sub_data[key]
                for key in ("score", "fertility", "compression", "unk_ratio")
            }
            if not all(isinstance(value, (int, float)) for value in numeric.values()):
                raise TypeError("non-numeric metric")

            per_dataset_results = sub_data.get("per_dataset", {})
            entry = {
                "name": sub_name,
                "type": sub_data.get("type", "Custom"),
                "org": sub_data.get("org", "Community"),
                "score": numeric["score"],
                "fertility": numeric["fertility"],
                "compression": numeric["compression"],
                "unk_ratio": numeric["unk_ratio"],
                "num_datasets": len(per_dataset_results)
            }
            per_ds_row = _per_dataset_fertility_row(
                sub_name, selected_datasets, per_dataset_results
            )
        except (KeyError, TypeError, AttributeError) as exc:
            status_lines.append(f"  • {model_id}: ⚠️ Skipped malformed submission ({str(exc)[:30]})")
            continue

        leaderboard_data.append(entry)
        per_dataset_data.append(per_ds_row)

    # Sort by score
    leaderboard_data.sort(key=lambda x: x["score"], reverse=True)

    # Create HTML tables
    leaderboard_html = generate_leaderboard_html(leaderboard_data)
    per_dataset_html = generate_per_dataset_html(per_dataset_data, selected_datasets)

    status_lines.append(f"\n✅ **Evaluation Complete!** Evaluated {len(results)} tokenizers on {len(loaded_datasets)} datasets.")
    status_message = "\n".join(status_lines)

    # Save results to cache
    save_leaderboard_cache(leaderboard_html, per_dataset_html, status_message)

    return leaderboard_html, per_dataset_html, status_message


def generate_leaderboard_html(data: List[Dict]) -> str:
    """Generate HTML for main leaderboard - dark theme design.

    Args:
        data: Leaderboard entries in display order (first entry is rank 1).
            Each entry provides ``name``, ``type``, ``org``, ``score``,
            ``fertility``, ``compression``, ``unk_ratio`` and
            ``num_datasets``.

    Returns:
        An HTML fragment with a ``<style>`` block, the ranking table and a
        metrics legend, or a short placeholder paragraph when ``data`` is
        empty.
    """
    # Imported locally so the module's import block stays untouched.
    from html import escape

    if not data:
        return "<p style='color: #e6edf3;'>No results to display</p>"

    # Fragments are collected and joined once at the end.
    parts = ["""
    <style>
        .leaderboard-table {
            width: 100%;
            border-collapse: collapse;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            font-size: 14px;
            margin: 16px 0;
        }
        .leaderboard-table th {
            background: #1a5f2a;
            color: #fff;
            padding: 12px 10px;
            text-align: left;
            font-weight: 500;
            border-bottom: 2px solid #145022;
        }
        .leaderboard-table td {
            padding: 10px;
            border-bottom: 1px solid #30363d;
            color: #e6edf3;
        }
        .leaderboard-table tr:nth-child(even) {
            background-color: #1c2128;
        }
        .leaderboard-table tr:nth-child(odd) {
            background-color: #22272e;
        }
        .leaderboard-table tr:hover {
            background-color: #2d333b;
        }
        .leaderboard-table .rank-1 td { background: rgba(255, 215, 0, 0.15); }
        .leaderboard-table .rank-2 td { background: rgba(192, 192, 192, 0.15); }
        .leaderboard-table .rank-3 td { background: rgba(205, 127, 50, 0.15); }
        .score-badge {
            background: #2d8f4e;
            color: #fff;
            padding: 4px 10px;
            border-radius: 4px;
            font-weight: 600;
            font-size: 13px;
        }
        .type-badge {
            background: #30363d;
            color: #8b949e;
            padding: 3px 8px;
            border-radius: 3px;
            font-size: 12px;
        }
        .metric-good { color: #10b981; font-weight: 500; }
        .metric-bad { color: #f87171; font-weight: 500; }
        .rank-medal { font-size: 16px; margin-right: 4px; }
    </style>

    <table class="leaderboard-table">
        <thead>
            <tr>
                <th>Rank</th>
                <th>Tokenizer</th>
                <th>Type</th>
                <th>Organization</th>
                <th>Score</th>
                <th>Fertility</th>
                <th>Compression</th>
                <th>UNK Rate</th>
                <th>Datasets</th>
            </tr>
        </thead>
        <tbody>
    """]

    for i, entry in enumerate(data):
        rank = i + 1
        rank_class = f"rank-{rank}" if rank <= 3 else ""

        # Medal for top 3
        if rank == 1:
            rank_display = '<span class="rank-medal">🥇</span> 1'
        elif rank == 2:
            rank_display = '<span class="rank-medal">🥈</span> 2'
        elif rank == 3:
            rank_display = '<span class="rank-medal">🥉</span> 3'
        else:
            rank_display = f"#{rank}"

        fert_class = "metric-good" if entry["fertility"] < 2.0 else "metric-bad" if entry["fertility"] > 3.0 else ""
        comp_class = "metric-good" if entry["compression"] > 3.5 else ""
        unk_class = "metric-good" if entry["unk_ratio"] < 0.01 else "metric-bad" if entry["unk_ratio"] > 0.05 else ""

        # Name, type and organization can come from user submissions, so
        # they are escaped before being placed into the markup.
        name = escape(str(entry["name"]))
        tok_type = escape(str(entry["type"]))
        org = escape(str(entry["org"]))

        parts.append(f"""
            <tr class="{rank_class}">
                <td><strong>{rank_display}</strong></td>
                <td><strong>{name}</strong></td>
                <td><span class="type-badge">{tok_type}</span></td>
                <td>{org}</td>
                <td><span class="score-badge">{entry["score"]}</span></td>
                <td class="{fert_class}">{entry["fertility"]:.3f}</td>
                <td class="{comp_class}">{entry["compression"]:.2f}</td>
                <td class="{unk_class}">{entry["unk_ratio"]:.2%}</td>
                <td>{entry["num_datasets"]}</td>
            </tr>
        """)

    parts.append("""
        </tbody>
    </table>

    <div style="margin-top: 12px; padding: 12px 16px; background: #22272e; border-left: 3px solid #2d8f4e; font-size: 13px; color: #8b949e; border-radius: 0 8px 8px 0;">
        <strong style="color: #e6edf3;">Metrics:</strong>
        Score (0-100, higher=better) •
        Fertility (tokens/word, lower=better) •
        Compression (bytes/token, higher=better) •
        UNK Rate (lower=better)
    </div>
    """)

    return "".join(parts)


def generate_per_dataset_html(data: List[Dict], dataset_keys: List[str]) -> str:
    """Generate HTML for per-dataset fertility table - dark theme design.

    Args:
        data: One dict per tokenizer with a ``"Tokenizer"`` entry (display
            name) and, per dataset display name, either a fertility number
            or ``"-"`` when there is no result.
        dataset_keys: Keys into ``LEADERBOARD_DATASETS`` selecting the
            columns, in display order.

    Returns:
        An HTML fragment with a ``<style>`` block and the table, or a short
        placeholder paragraph when ``data`` is empty.
    """
    # Imported locally so the module's import block stays untouched.
    from html import escape

    if not data:
        return "<p style='color: #e6edf3;'>No per-dataset results</p>"

    ds_names = [LEADERBOARD_DATASETS[k]["name"] for k in dataset_keys]

    # Fragments are collected and joined once at the end.
    parts = ["""
    <style>
        .dataset-table {
            width: 100%;
            border-collapse: collapse;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            font-size: 13px;
            margin: 16px 0;
        }
        .dataset-table th {
            background: #1a5f2a;
            color: #fff;
            padding: 10px 8px;
            text-align: center;
            font-weight: 500;
        }
        .dataset-table th:first-child {
            text-align: left;
        }
        .dataset-table td {
            padding: 8px;
            text-align: center;
            border-bottom: 1px solid #30363d;
            color: #e6edf3;
        }
        .dataset-table td:first-child {
            text-align: left;
            font-weight: 500;
        }
        .dataset-table tr:nth-child(even) {
            background-color: #1c2128;
        }
        .dataset-table tr:nth-child(odd) {
            background-color: #22272e;
        }
        .dataset-table tr:hover {
            background-color: #2d333b;
        }
        .fert-excellent { background: rgba(16, 185, 129, 0.25); color: #34d399; font-weight: 500; }
        .fert-good { background: rgba(245, 158, 11, 0.25); color: #fbbf24; font-weight: 500; }
        .fert-poor { background: rgba(248, 113, 113, 0.25); color: #f87171; font-weight: 500; }
    </style>

    <table class="dataset-table">
        <thead>
            <tr>
                <th>Tokenizer</th>
    """]

    parts.extend(f"<th>{ds_name}</th>" for ds_name in ds_names)

    parts.append("""
            </tr>
        </thead>
        <tbody>
    """)

    for row in data:
        # The tokenizer name can come from a user submission, so it is
        # escaped before being placed into the markup.
        parts.append(f"<tr><td>{escape(str(row['Tokenizer']))}</td>")
        for ds_name in ds_names:
            val = row.get(ds_name, "-")
            if val != "-":
                # Colour bands: below 1.8 excellent, below 2.5 good, else poor.
                if val < 1.8:
                    cls = "fert-excellent"
                elif val < 2.5:
                    cls = "fert-good"
                else:
                    cls = "fert-poor"
                parts.append(f'<td class="{cls}">{val}</td>')
            else:
                parts.append('<td>-</td>')
        parts.append("</tr>")

    parts.append("""
        </tbody>
    </table>
    """)

    return "".join(parts)


def evaluate_submitted_tokenizer(
    model_id: str,
    model_name: str,
    organization: str,
    model_type: str,
    progress=gr.Progress()
) -> Tuple[str, str]:
    """
    Evaluate a user-submitted tokenizer on ALL datasets

    Loads the tokenizer for ``model_id``, evaluates it on every configured
    dataset that can be loaded, stores the result with
    ``save_submitted_tokenizer`` and renders a result card.

    Args:
        model_id: HuggingFace model ID of the tokenizer (required).
        model_name: Optional display name; defaults to the last path segment
            of ``model_id``.
        organization: Optional organization; defaults to the part of
            ``model_id`` before the first ``/``, or "Unknown".
        model_type: Optional tokenizer type label; stored as "Custom" when
            empty.
        progress: Progress callback.

    Returns: (result_html, status_message)
        ``result_html`` is empty when the submission could not be evaluated.
    """

    if not model_id or not model_id.strip():
        return "", "❌ Please enter a HuggingFace model ID (e.g., 'google/gemma-2-9b')"

    model_id = model_id.strip()

    # Use ALL datasets
    selected_datasets = list(LEADERBOARD_DATASETS.keys())

    # Try to load the tokenizer
    progress(0.1, f"Loading tokenizer: {model_id}...")

    try:
        # SECURITY NOTE(review): `model_id` is user-supplied and
        # trust_remote_code=True allows code shipped in that repository to
        # run in this process. Left unchanged because some tokenizers need
        # it; consider an allow-list or disabling it for submissions.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    except Exception as e:
        return "", f"❌ Failed to load tokenizer '{model_id}': {str(e)[:100]}"

    # Get tokenizer info
    vocab_size = tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else len(tokenizer)
    display_name = model_name.strip() if model_name and model_name.strip() else model_id.split('/')[-1]
    org = organization.strip() if organization and organization.strip() else (model_id.split('/')[0] if '/' in model_id else "Unknown")

    progress(0.2, "Loading datasets...")

    # Load datasets; ones that fail to load or are empty are left out
    loader = HFDatasetLoader()
    loaded_datasets = {}

    for ds_key in selected_datasets:
        texts, _ = loader.load_dataset_texts(ds_key)
        if texts:
            loaded_datasets[ds_key] = texts

    if not loaded_datasets:
        return "", "❌ Failed to load any datasets for evaluation"

    # Evaluate (progress 40% -> 80%)
    progress(0.4, "Evaluating tokenizer...")

    all_fertility = []
    all_compression = []
    all_unk = []
    per_dataset_results = {}

    for ds_index, (ds_key, texts) in enumerate(loaded_datasets.items()):
        # Progress follows the dataset position so it keeps advancing even
        # when a dataset yields no metrics.
        progress(0.4 + (ds_index / len(loaded_datasets)) * 0.4,
                 f"Evaluating on {LEADERBOARD_DATASETS[ds_key]['name']}...")

        metrics = evaluate_tokenizer_on_texts(tokenizer, texts)
        if metrics:
            per_dataset_results[ds_key] = metrics
            all_fertility.append(metrics["avg_fertility"])
            all_compression.append(metrics["avg_compression"])
            all_unk.append(metrics["unk_ratio"])

    if not all_fertility:
        return "", "❌ Evaluation failed - no valid results"

    # Calculate overall metrics: each dataset counts equally
    avg_fertility = statistics.mean(all_fertility)
    avg_compression = statistics.mean(all_compression)
    avg_unk = statistics.mean(all_unk)
    score = calculate_leaderboard_score(avg_fertility, avg_compression, avg_unk)

    progress(0.9, "Saving results...")

    # Save submission to persistent storage
    submission_data = {
        "name": display_name,
        "org": org,
        "type": model_type or "Custom",
        "vocab_size": vocab_size,
        "score": score,
        "fertility": avg_fertility,
        "compression": avg_compression,
        "unk_ratio": avg_unk,
        "per_dataset": per_dataset_results
    }
    save_submitted_tokenizer(model_id, submission_data)

    # Generate result HTML
    result_html = generate_submission_result_html(
        display_name, org, model_type, vocab_size, score,
        avg_fertility, avg_compression, avg_unk,
        per_dataset_results, selected_datasets
    )

    # Report the datasets that actually produced metrics (the same number is
    # stored with the submission), not merely the ones that loaded.
    status = f"✅ **{display_name}** has been evaluated on {len(per_dataset_results)} datasets and added to the leaderboard! Refresh the Leaderboard tab to see the updated rankings."

    return result_html, status


def generate_submission_result_html(
    name: str, org: str, model_type: str, vocab_size: int, score: float,
    fertility: float, compression: float, unk_ratio: float,
    per_dataset: Dict, dataset_keys: List[str]
) -> str:
    """Generate HTML for submission results - dark theme design.

    Args:
        name: Display name of the tokenizer.
        org: Organization shown next to the name.
        model_type: Tokenizer type label; "Custom" is shown when empty.
        vocab_size: Vocabulary size, rendered with thousands separators.
        score: Overall score; selects the colour and quality label.
        fertility: Average tokens per word.
        compression: Average bytes per token.
        unk_ratio: Share of unknown tokens, rendered as a percentage.
        per_dataset: Metrics per dataset key; each value provides
            ``avg_fertility``, ``avg_compression`` and ``samples``.
        dataset_keys: Dataset keys in display order; keys without an entry
            in ``per_dataset`` are left out of the table.

    Returns:
        An HTML fragment with the header, metric cards, model details and
        the per-dataset table.
    """
    # Imported locally so the module's import block stays untouched.
    from html import escape

    # Determine score quality
    if score >= 70:
        score_color = "#10b981"
        score_label = "Excellent"
    elif score >= 50:
        score_color = "#4a90d9"
        score_label = "Good"
    elif score >= 30:
        score_color = "#f59e0b"
        score_label = "Fair"
    else:
        score_color = "#f87171"
        score_label = "Needs Improvement"

    # Name, organization and type are typed in by the submitter, so they are
    # escaped before being placed into the markup.
    safe_name = escape(str(name))
    safe_org = escape(str(org))
    safe_type = escape(str(model_type or 'Custom'))

    # Fragments are collected and joined once at the end.
    parts = [f"""
    <div style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;">
        <div style="background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4e 100%); color: white; padding: 24px; border-radius: 12px; margin-bottom: 20px;">
            <h2 style="margin: 0 0 8px 0; font-size: 24px;">📊 Evaluation Results</h2>
            <p style="margin: 0; opacity: 0.9; font-size: 14px;">{safe_name} by {safe_org}</p>
        </div>

        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 16px; margin-bottom: 24px;">
            <div style="background: #22272e; padding: 20px; border-radius: 8px; text-align: center; border-left: 4px solid {score_color};">
                <div style="font-size: 32px; font-weight: 700; color: {score_color};">{score}</div>
                <div style="font-size: 12px; color: #8b949e; margin-top: 4px;">Overall Score</div>
                <div style="font-size: 11px; color: {score_color}; font-weight: 500;">{score_label}</div>
            </div>
            <div style="background: #22272e; padding: 20px; border-radius: 8px; text-align: center;">
                <div style="font-size: 28px; font-weight: 600; color: #e6edf3;">{fertility:.3f}</div>
                <div style="font-size: 12px; color: #8b949e; margin-top: 4px;">Fertility</div>
                <div style="font-size: 11px; color: #8b949e;">tokens/word</div>
            </div>
            <div style="background: #22272e; padding: 20px; border-radius: 8px; text-align: center;">
                <div style="font-size: 28px; font-weight: 600; color: #e6edf3;">{compression:.2f}</div>
                <div style="font-size: 12px; color: #8b949e; margin-top: 4px;">Compression</div>
                <div style="font-size: 11px; color: #8b949e;">bytes/token</div>
            </div>
            <div style="background: #22272e; padding: 20px; border-radius: 8px; text-align: center;">
                <div style="font-size: 28px; font-weight: 600; color: #e6edf3;">{unk_ratio:.2%}</div>
                <div style="font-size: 12px; color: #8b949e; margin-top: 4px;">UNK Rate</div>
                <div style="font-size: 11px; color: #8b949e;">unknown tokens</div>
            </div>
        </div>

        <div style="background: #22272e; padding: 16px; border-radius: 8px; margin-bottom: 20px; border: 1px solid #30363d;">
            <h4 style="margin: 0 0 12px 0; color: #e6edf3; font-size: 14px;">Model Details</h4>
            <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px; font-size: 13px;">
                <div><span style="color: #8b949e;">Type:</span> <strong style="color: #e6edf3;">{safe_type}</strong></div>
                <div><span style="color: #8b949e;">Vocab Size:</span> <strong style="color: #e6edf3;">{vocab_size:,}</strong></div>
            </div>
        </div>

        <h4 style="margin: 0 0 12px 0; color: #e6edf3; font-size: 14px;">📈 Per-Dataset Results</h4>
        <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
            <thead>
                <tr style="background: #1a5f2a; color: white;">
                    <th style="padding: 10px; text-align: left;">Dataset</th>
                    <th style="padding: 10px; text-align: center;">Fertility</th>
                    <th style="padding: 10px; text-align: center;">Compression</th>
                    <th style="padding: 10px; text-align: center;">Samples</th>
                </tr>
            </thead>
            <tbody>
    """]

    for ds_key in dataset_keys:
        if ds_key in per_dataset:
            m = per_dataset[ds_key]
            ds_name = LEADERBOARD_DATASETS[ds_key]["name"]

            # Colour bands: below 1.8 green, below 2.5 amber, else red.
            fert_val = m["avg_fertility"]
            if fert_val < 1.8:
                fert_style = "background: rgba(16, 185, 129, 0.25); color: #34d399;"
            elif fert_val < 2.5:
                fert_style = "background: rgba(245, 158, 11, 0.25); color: #fbbf24;"
            else:
                fert_style = "background: rgba(248, 113, 113, 0.25); color: #f87171;"

            parts.append(f"""
                <tr style="border-bottom: 1px solid #30363d; background: #22272e;">
                    <td style="padding: 10px; color: #e6edf3;">{ds_name}</td>
                    <td style="padding: 10px; text-align: center; {fert_style} font-weight: 500;">{fert_val:.3f}</td>
                    <td style="padding: 10px; text-align: center; color: #e6edf3;">{m["avg_compression"]:.2f}</td>
                    <td style="padding: 10px; text-align: center; color: #8b949e;">{m["samples"]}</td>
                </tr>
            """)

    parts.append("""
            </tbody>
        </table>
    </div>
    """)

    return "".join(parts)