# app.py # Main Gradio dashboard for IndoNLP Space import os import json import traceback from typing import Optional import gradio as gr import pandas as pd import plotly.express as px from models import sentiment as sentiment_mod from models import ner as ner_mod from models import qa as qa_mod from models import summarization as summ_mod from metrics.evaluate import evaluate_classification, evaluate_ner, evaluate_qa, load_leaderboard, save_leaderboard # Ensure metrics dir os.makedirs("metrics", exist_ok=True) LEADERBOARD_PATH = "metrics/leaderboard.json" leaderboard = load_leaderboard(LEADERBOARD_PATH) # Utility to create a plot from leaderboard def leaderboard_plot(metric: str = "f1"): if leaderboard.empty: return px.bar(title="No leaderboard data") if metric not in leaderboard.columns: metric = "f1" fig = px.bar(leaderboard.sort_values(metric, ascending=False), x="model", y=metric, color="task", text=metric, title=f"Leaderboard by {metric.upper()}") return fig # Gradio UI with gr.Blocks(title="IndoNLP Dashboard") as demo: gr.Markdown("# 🇮🇩 IndoNLP Dashboard\nTry Indonesian NLP models, run benchmarks, and visualize metrics.") with gr.Tab("Overview"): gr.Markdown("## Leaderboard & Comparison") metric_choice = gr.Radio(choices=["f1", "accuracy", "precision", "recall"], value="f1") leaderboard_plot_el = gr.Plot(value=leaderboard_plot("f1")) metric_choice.change(lambda m: leaderboard_plot(m), inputs=[metric_choice], outputs=[leaderboard_plot_el]) with gr.Tab("Try Models"): gr.Markdown("### Interactive Inference") task = gr.Dropdown(choices=["sentiment", "ner", "qa", "summarization"], value="sentiment", label="Task") model = gr.Dropdown(choices=[], label="Model") input_text = gr.Textbox(lines=6, placeholder="Type Indonesian text here...", label="Input Text") qa_question = gr.Textbox(lines=2, placeholder="Question (for QA)", visible=False, label="Question") run_btn = gr.Button("Run") output = gr.HTML(label="Output") # update model choices per task def update_models_for_task(t): if t == "sentiment": return gr.update(choices=list(sentiment_mod.AVAILABLE_MODELS.keys()), value=list(sentiment_mod.AVAILABLE_MODELS.keys())[0]), gr.update(visible=False) if t == "ner": return gr.update(choices=list(ner_mod.AVAILABLE_MODELS.keys()), value=list(ner_mod.AVAILABLE_MODELS.keys())[0]), gr.update(visible=False) if t == "qa": return gr.update(choices=list(qa_mod.AVAILABLE_MODELS.keys()), value=list(qa_mod.AVAILABLE_MODELS.keys())[0]), gr.update(visible=True) if t == "summarization": return gr.update(choices=list(summ_mod.AVAILABLE_MODELS.keys()), value=list(summ_mod.AVAILABLE_MODELS.keys())[0]), gr.update(visible=False) return gr.update(choices=[]), gr.update(visible=False) task.change(lambda t: update_models_for_task(t), inputs=[task], outputs=[model, qa_question]) def run_model(task, model_key, text, question): try: if task == "sentiment": res = sentiment_mod.predict(text, model_key) # format as HTML rows = [f"
  • {label}: {score:.4f}
  • " for label, score in res] return f"Sentiment (top scores):" if task == "ner": ents = ner_mod.predict(text, model_key) # render token-highlighted HTML html = ner_mod.render_ner_html(text, ents) return html if task == "qa": ans = qa_mod.predict(text, question, model_key) return f"Answer: {ans.get('answer')}
    score: {ans.get('score'):.4f}" if task == "summarization": summ = summ_mod.predict(text, model_key) return f"Summary:

    {summ}

    " return "Unsupported task" except Exception as e: return f"
    Error: {e}\n{traceback.format_exc()}
    " run_btn.click(run_model, inputs=[task, model, input_text, qa_question], outputs=[output]) with gr.Tab("Benchmark / Evaluate"): gr.Markdown("Upload CSV (classification): columns `text`,`label`. For NER provide CoNLL-like TSV or JSONL.\nFor QA provide JSONL with `context`,`question`,`answers`.") file_in = gr.File(label="Upload file") bench_task = gr.Dropdown(choices=["sentiment", "ner", "qa"], value="sentiment") bench_model = gr.Dropdown(choices=[], label="Model") run_eval = gr.Button("Evaluate") eval_output = gr.JSON() cm_plot = gr.Plot() def update_models_for_eval(t): if t == "sentiment": return gr.update(choices=list(sentiment_mod.AVAILABLE_MODELS.keys()), value=list(sentiment_mod.AVAILABLE_MODELS.keys())[0]) if t == "ner": return gr.update(choices=list(ner_mod.AVAILABLE_MODELS.keys()), value=list(ner_mod.AVAILABLE_MODELS.keys())[0]) if t == "qa": return gr.update(choices=list(qa_mod.AVAILABLE_MODELS.keys()), value=list(qa_mod.AVAILABLE_MODELS.keys())[0]) return gr.update(choices=[]) bench_task.change(lambda t: update_models_for_eval(t), inputs=[bench_task], outputs=[bench_model]) def run_evaluation(file, task, model_key): if file is None: return {"error": "No file uploaded"}, px.imshow([[0]]) try: if task == "sentiment": df = pd.read_csv(file.name if hasattr(file, 'name') else file.file) res = evaluate_classification(df, model_key) # update leaderboard new_entry = {"task": "Sentiment", "model": model_key, **res['metrics']} # append and save global leaderboard leaderboard = pd.concat([leaderboard, pd.DataFrame([new_entry])], ignore_index=True) save_leaderboard(leaderboard, LEADERBOARD_PATH) # build confusion matrix plot cm = res.get('confusion_matrix') labels = res.get('labels') fig = px.imshow(cm, x=labels, y=labels, text_auto=True, title="Confusion Matrix") return res, fig if task == "ner": # accept JSONL of entities or CoNLL TSV; evaluate_ner will parse res = evaluate_ner(file, model_key) return res, px.imshow(res.get('confusion_matrix', [[0]]), text_auto=True) if task == "qa": res = evaluate_qa(file, model_key) return res, px.imshow([[res.get('f1',0)]], title="QA F1") return {"error": "Unsupported task"}, px.imshow([[0]]) except Exception as e: return {"error": str(e), "traceback": traceback.format_exc()}, px.imshow([[0]]) run_eval.click(run_evaluation, inputs=[file_in, bench_task, bench_model], outputs=[eval_output, cm_plot]) with gr.Tab("About & Debugging"): gr.Markdown("## Notes & Debugging") gr.Markdown(""" - If a model fails to load, check model slug and Space `HF_TOKEN` setting. - Large models require GPU in Space settings to avoid OOM. - For private models set `HF_TOKEN` as a secret in Space settings. """) if __name__ == "__main__": demo.launch()