# app.py
# Main Gradio dashboard for IndoNLP Space

import os
import json
import traceback
from typing import Optional

import gradio as gr
import pandas as pd
import plotly.express as px

from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod
from models import summarization as summ_mod
from metrics.evaluate import (
    evaluate_classification,
    evaluate_ner,
    evaluate_qa,
    load_leaderboard,
    save_leaderboard,
)

# Ensure metrics dir
os.makedirs("metrics", exist_ok=True)
LEADERBOARD_PATH = "metrics/leaderboard.json"
leaderboard = load_leaderboard(LEADERBOARD_PATH)


# Utility to create a plot from the leaderboard DataFrame
def leaderboard_plot(metric: str = "f1"):
    if leaderboard.empty:
        return px.bar(title="No leaderboard data")
    if metric not in leaderboard.columns:
        metric = "f1"
    fig = px.bar(
        leaderboard.sort_values(metric, ascending=False),
        x="model",
        y=metric,
        color="task",
        text=metric,
        title=f"Leaderboard by {metric.upper()}",
    )
    return fig


# Gradio UI
with gr.Blocks(title="IndoNLP Dashboard") as demo:
    gr.Markdown("# 🇮🇩 IndoNLP Dashboard\nTry Indonesian NLP models, run benchmarks, and visualize metrics.")

    with gr.Tab("Overview"):
        gr.Markdown("## Leaderboard & Comparison")
        metric_choice = gr.Radio(choices=["f1", "accuracy", "precision", "recall"], value="f1")
        leaderboard_plot_el = gr.Plot(value=leaderboard_plot("f1"))
        metric_choice.change(leaderboard_plot, inputs=[metric_choice], outputs=[leaderboard_plot_el])

    with gr.Tab("Try Models"):
        gr.Markdown("### Interactive Inference")
        task = gr.Dropdown(choices=["sentiment", "ner", "qa", "summarization"], value="sentiment", label="Task")
        model = gr.Dropdown(choices=[], label="Model")
        input_text = gr.Textbox(lines=6, placeholder="Type Indonesian text here...", label="Input Text")
        qa_question = gr.Textbox(lines=2, placeholder="Question (for QA)", visible=False, label="Question")
        run_btn = gr.Button("Run")
        output = gr.HTML(label="Output")

        # Update model choices per task and toggle the QA question box.
        def update_models_for_task(t):
            if t == "sentiment":
                keys = list(sentiment_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            if t == "ner":
                keys = list(ner_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            if t == "qa":
                keys = list(qa_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=True)
            if t == "summarization":
                keys = list(summ_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            return gr.update(choices=[]), gr.update(visible=False)

        task.change(update_models_for_task, inputs=[task], outputs=[model, qa_question])

        def run_model(task, model_key, text, question):
            try:
                if task == "sentiment":
                    res = sentiment_mod.predict(text, model_key)
                    # Format as HTML. NOTE: the original table markup was lost in
                    # extraction; the rows below are a reconstruction and assume each
                    # prediction is a dict with "label" and "score" keys.
                    rows = [f"<tr><td>{p['label']}</td><td>{p['score']:.3f}</td></tr>" for p in res]
                    return "<table><tr><th>Label</th><th>Score</th></tr>" + "".join(rows) + "</table>"
                if task == "ner":
                    # Reconstructed branch; assumes pipeline-style entity dicts.
                    res = ner_mod.predict(text, model_key)
                    rows = [f"<tr><td>{e['word']}</td><td>{e['entity_group']}</td><td>{e['score']:.3f}</td></tr>" for e in res]
                    return "<table><tr><th>Text</th><th>Entity</th><th>Score</th></tr>" + "".join(rows) + "</table>"
                if task == "qa":
                    # Reconstructed branch; assumes a dict with "answer" and "score".
                    res = qa_mod.predict(text, question, model_key)
                    return f"<p><b>Answer:</b> {res['answer']} (score: {res['score']:.3f})</p>"
                if task == "summarization":
                    # Reconstructed from the surviving `{summ}` fragment.
                    summ = summ_mod.predict(text, model_key)
                    return f"<p>{summ}</p>"
                return "Unsupported task"
            except Exception as e:
                return f"Error: {e}\n{traceback.format_exc()}"
        run_btn.click(run_model, inputs=[task, model, input_text, qa_question], outputs=[output])
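
    # Both tabs fill their model dropdowns from each model module's AVAILABLE_MODELS
    # registry. A minimal sketch of the assumed shape (display name -> model id); the
    # names below are placeholders, not the actual entries in models/:
    #
    #   AVAILABLE_MODELS = {
    #       "IndoBERT (fine-tuned)": "your-org/indonesian-sentiment-model",  # placeholder
    #   }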
with gr.Tab("Benchmark / Evaluate"):
gr.Markdown("Upload CSV (classification): columns `text`,`label`. For NER provide CoNLL-like TSV or JSONL.\nFor QA provide JSONL with `context`,`question`,`answers`.")
        file_in = gr.File(label="Upload file")
        bench_task = gr.Dropdown(choices=["sentiment", "ner", "qa"], value="sentiment", label="Task")
        bench_model = gr.Dropdown(choices=[], label="Model")
        run_eval = gr.Button("Evaluate")
        eval_output = gr.JSON()
        cm_plot = gr.Plot()

        # Update model choices when the benchmark task changes.
        def update_models_for_eval(t):
            if t == "sentiment":
                keys = list(sentiment_mod.AVAILABLE_MODELS.keys())
            elif t == "ner":
                keys = list(ner_mod.AVAILABLE_MODELS.keys())
            elif t == "qa":
                keys = list(qa_mod.AVAILABLE_MODELS.keys())
            else:
                return gr.update(choices=[])
            return gr.update(choices=keys, value=keys[0])

        bench_task.change(update_models_for_eval, inputs=[bench_task], outputs=[bench_model])

        def run_evaluation(file, task, model_key):
            global leaderboard
            if file is None:
                return {"error": "No file uploaded"}, px.imshow([[0]])
            try:
                if task == "sentiment":
                    # gr.File may hand back a tempfile-like object or a plain path string.
                    df = pd.read_csv(file.name if hasattr(file, "name") else file)
                    res = evaluate_classification(df, model_key)
                    # Append the result to the leaderboard and persist it.
                    new_entry = {"task": "Sentiment", "model": model_key, **res["metrics"]}
                    leaderboard = pd.concat([leaderboard, pd.DataFrame([new_entry])], ignore_index=True)
                    save_leaderboard(leaderboard, LEADERBOARD_PATH)
                    # Build the confusion matrix plot.
                    cm = res.get("confusion_matrix")
                    labels = res.get("labels")
                    fig = px.imshow(cm, x=labels, y=labels, text_auto=True, title="Confusion Matrix")
                    return res, fig
                if task == "ner":
                    # Accept JSONL of entities or CoNLL TSV; evaluate_ner will parse.
                    res = evaluate_ner(file, model_key)
                    return res, px.imshow(res.get("confusion_matrix", [[0]]), text_auto=True)
                if task == "qa":
                    res = evaluate_qa(file, model_key)
                    return res, px.imshow([[res.get("f1", 0)]], title="QA F1")
                return {"error": "Unsupported task"}, px.imshow([[0]])
            except Exception as e:
                return {"error": str(e), "traceback": traceback.format_exc()}, px.imshow([[0]])

        run_eval.click(run_evaluation, inputs=[file_in, bench_task, bench_model], outputs=[eval_output, cm_plot])
with gr.Tab("About & Debugging"):
gr.Markdown("## Notes & Debugging")
gr.Markdown("""
- If a model fails to load, check model slug and Space `HF_TOKEN` setting.
- Large models require GPU in Space settings to avoid OOM.
- For private models set `HF_TOKEN` as a secret in Space settings.
""")


if __name__ == "__main__":
    demo.launch()