# app.py
# Main Gradio dashboard for IndoNLP Space

import os
import json
import traceback
from typing import Optional

import gradio as gr
import pandas as pd
import plotly.express as px

from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod
from models import summarization as summ_mod
from metrics.evaluate import (
    evaluate_classification,
    evaluate_ner,
    evaluate_qa,
    load_leaderboard,
    save_leaderboard,
)

# Ensure metrics dir
os.makedirs("metrics", exist_ok=True)
LEADERBOARD_PATH = "metrics/leaderboard.json"
leaderboard = load_leaderboard(LEADERBOARD_PATH)


# Utility to create a plot from the leaderboard DataFrame
def leaderboard_plot(metric: str = "f1"):
    if leaderboard.empty:
        return px.bar(title="No leaderboard data")
    if metric not in leaderboard.columns:
        metric = "f1"
    fig = px.bar(
        leaderboard.sort_values(metric, ascending=False),
        x="model",
        y=metric,
        color="task",
        text=metric,
        title=f"Leaderboard by {metric.upper()}",
    )
    return fig


# Gradio UI
with gr.Blocks(title="IndoNLP Dashboard") as demo:
    gr.Markdown("# 🇮🇩 IndoNLP Dashboard\nTry Indonesian NLP models, run benchmarks, and visualize metrics.")

    with gr.Tab("Overview"):
        gr.Markdown("## Leaderboard & Comparison")
        metric_choice = gr.Radio(choices=["f1", "accuracy", "precision", "recall"], value="f1")
        leaderboard_plot_el = gr.Plot(value=leaderboard_plot("f1"))
        metric_choice.change(leaderboard_plot, inputs=[metric_choice], outputs=[leaderboard_plot_el])

    with gr.Tab("Try Models"):
        gr.Markdown("### Interactive Inference")
        task = gr.Dropdown(choices=["sentiment", "ner", "qa", "summarization"], value="sentiment", label="Task")
        model = gr.Dropdown(choices=[], label="Model")
        input_text = gr.Textbox(lines=6, placeholder="Type Indonesian text here...", label="Input Text")
        qa_question = gr.Textbox(lines=2, placeholder="Question (for QA)", visible=False, label="Question")
        run_btn = gr.Button("Run")
        output = gr.HTML(label="Output")

        # Update model choices per task and toggle the QA question box.
        def update_models_for_task(t):
            if t == "sentiment":
                keys = list(sentiment_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            if t == "ner":
                keys = list(ner_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            if t == "qa":
                keys = list(qa_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=True)
            if t == "summarization":
                keys = list(summ_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            return gr.update(choices=[]), gr.update(visible=False)

        task.change(update_models_for_task, inputs=[task], outputs=[model, qa_question])

        def run_model(task, model_key, text, question):
            try:
                if task == "sentiment":
                    res = sentiment_mod.predict(text, model_key)
                    # Format as HTML. NOTE: the original table markup was lost in
                    # extraction; the rows below are a reconstruction and assume each
                    # prediction is a dict with "label" and "score" keys.
                    rows = [f"<tr><td>{p['label']}</td><td>{p['score']:.3f}</td></tr>" for p in res]
                    return "<table><tr><th>Label</th><th>Score</th></tr>" + "".join(rows) + "</table>"
                if task == "ner":
                    # Reconstructed branch; assumes pipeline-style entity dicts.
                    res = ner_mod.predict(text, model_key)
                    rows = [f"<tr><td>{e['word']}</td><td>{e['entity_group']}</td><td>{e['score']:.3f}</td></tr>" for e in res]
                    return "<table><tr><th>Text</th><th>Entity</th><th>Score</th></tr>" + "".join(rows) + "</table>"
                if task == "qa":
                    # Reconstructed branch; assumes a dict with "answer" and "score".
                    res = qa_mod.predict(text, question, model_key)
                    return f"<p><b>Answer:</b> {res['answer']} (score: {res['score']:.3f})</p>"
                if task == "summarization":
                    # Reconstructed from the surviving `{summ}` fragment.
                    summ = summ_mod.predict(text, model_key)
                    return f"<p>{summ}</p>"
                return "Unsupported task"
            except Exception as e:
                return f"Error: {e}\n{traceback.format_exc()}"
        run_btn.click(run_model, inputs=[task, model, input_text, qa_question], outputs=[output])
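
    # Both tabs fill their model dropdowns from each model module's AVAILABLE_MODELS
    # registry. A minimal sketch of the assumed shape (display name -> model id); the
    # names below are placeholders, not the actual entries in models/:
    #
    #   AVAILABLE_MODELS = {
    #       "IndoBERT (fine-tuned)": "your-org/indonesian-sentiment-model",  # placeholder
    #   }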
with gr.Tab("Benchmark / Evaluate"):
gr.Markdown("Upload CSV (classification): columns `text`,`label`. For NER provide CoNLL-like TSV or JSONL.\nFor QA provide JSONL with `context`,`question`,`answers`.")
        file_in = gr.File(label="Upload file")
        bench_task = gr.Dropdown(choices=["sentiment", "ner", "qa"], value="sentiment", label="Task")
        bench_model = gr.Dropdown(choices=[], label="Model")
        run_eval = gr.Button("Evaluate")
        eval_output = gr.JSON()
        cm_plot = gr.Plot()

        # Update model choices when the benchmark task changes.
        def update_models_for_eval(t):
            if t == "sentiment":
                keys = list(sentiment_mod.AVAILABLE_MODELS.keys())
            elif t == "ner":
                keys = list(ner_mod.AVAILABLE_MODELS.keys())
            elif t == "qa":
                keys = list(qa_mod.AVAILABLE_MODELS.keys())
            else:
                return gr.update(choices=[])
            return gr.update(choices=keys, value=keys[0])

        bench_task.change(update_models_for_eval, inputs=[bench_task], outputs=[bench_model])

        def run_evaluation(file, task, model_key):
            global leaderboard
            if file is None:
                return {"error": "No file uploaded"}, px.imshow([[0]])
            try:
                if task == "sentiment":
                    # gr.File may hand back a tempfile-like object or a plain path string.
                    df = pd.read_csv(file.name if hasattr(file, "name") else file)
                    res = evaluate_classification(df, model_key)
                    # Append the result to the leaderboard and persist it.
                    new_entry = {"task": "Sentiment", "model": model_key, **res["metrics"]}
                    leaderboard = pd.concat([leaderboard, pd.DataFrame([new_entry])], ignore_index=True)
                    save_leaderboard(leaderboard, LEADERBOARD_PATH)
                    # Build the confusion matrix plot.
                    cm = res.get("confusion_matrix")
                    labels = res.get("labels")
                    fig = px.imshow(cm, x=labels, y=labels, text_auto=True, title="Confusion Matrix")
                    return res, fig
                if task == "ner":
                    # Accept JSONL of entities or CoNLL TSV; evaluate_ner will parse.
                    res = evaluate_ner(file, model_key)
                    return res, px.imshow(res.get("confusion_matrix", [[0]]), text_auto=True)
                if task == "qa":
                    res = evaluate_qa(file, model_key)
                    return res, px.imshow([[res.get("f1", 0)]], title="QA F1")
                return {"error": "Unsupported task"}, px.imshow([[0]])
            except Exception as e:
                return {"error": str(e), "traceback": traceback.format_exc()}, px.imshow([[0]])

        run_eval.click(run_evaluation, inputs=[file_in, bench_task, bench_model], outputs=[eval_output, cm_plot])
with gr.Tab("About & Debugging"):
gr.Markdown("## Notes & Debugging")
gr.Markdown("""
- If a model fails to load, check model slug and Space `HF_TOKEN` setting.
- Large models require GPU in Space settings to avoid OOM.
- For private models set `HF_TOKEN` as a secret in Space settings.
""")


if __name__ == "__main__":
    demo.launch()