# metrics/evaluate.py
# Evaluation utilities for classification, NER, and QA. Produces standard metrics and confusion matrices.
import os
import json
from typing import Dict, Any
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod

def compute_classification_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    return {"accuracy": float(acc), "precision": float(prec), "recall": float(rec), "f1": float(f1)}
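
# A minimal illustration (not part of the original module): on toy lists the helper
# returns plain floats, e.g.
#   compute_classification_metrics(["pos", "neg", "pos"], ["pos", "pos", "pos"])
#   -> {"accuracy": 0.667, "precision": ..., "recall": ..., "f1": ...}
# where precision/recall/f1 are support-weighted averages over the label set.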

def evaluate_classification(df: pd.DataFrame, model_key: str, text_col: str = 'text', label_col: str = 'label') -> Dict[str, Any]:
    # df must have text and label columns
    texts = df[text_col].astype(str).tolist()
    y_true = df[label_col].astype(str).tolist()
    y_pred = []
    for t in texts:
        preds = sentiment_mod.predict(t, model_key)
        if preds:
            # preds is a list of (label, score) tuples, pick top
            y_pred.append(preds[0][0])
        else:
            y_pred.append('__empty__')
    metrics = compute_classification_metrics(y_true, y_pred)
    labels = sorted(set(y_true))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return {"metrics": metrics, "confusion_matrix": cm.tolist(), "labels": labels, **metrics}
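
# Illustrative usage only; 'indobert' is a placeholder for whatever keys
# models/sentiment.py actually exposes:
#   df = pd.DataFrame({"text": ["mantap sekali", "buruk sekali"], "label": ["positive", "negative"]})
#   result = evaluate_classification(df, model_key='indobert')
#   result["metrics"]           -> {"accuracy": ..., "precision": ..., "recall": ..., "f1": ...}
#   result["confusion_matrix"]  -> rows/columns follow result["labels"] (sorted gold labels)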

def evaluate_ner(file, model_key: str) -> Dict[str, Any]:
    # Accepts a file-like object. Expects JSONL with objects: {"text": "...", "entities": [{"start": int, "end": int, "label": "..."}, ...]},
    # or CoNLL-style TSV where each line is "token<TAB>label" and sentences are separated by blank lines.
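    # Illustrative input lines (values and offsets are examples, not taken from the original repo):
    #   JSONL: {"text": "Joko tinggal di Jakarta", "entities": [{"start": 16, "end": 23, "label": "LOC"}]}
    #   CoNLL/TSV:
    #       Joko    B-PER
    #       tinggal O
    #       di      O
    #       Jakarta B-LOC
    #       <blank line between sentences>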
    import io
    if hasattr(file, 'name'):
        file_obj = open(file.name, 'r', encoding='utf-8')
    else:
        file_obj = io.TextIOWrapper(file.file, encoding='utf-8')
    gold_entities = []
    texts = []
    # Try JSONL first
    file_obj.seek(0)
    first_chars = file_obj.read(2)
    file_obj.seek(0)
    if first_chars.strip().startswith('{'):
        # JSONL
        for line in file_obj:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            texts.append(obj.get('text', ''))
            gold_entities.append(obj.get('entities', []))
    else:
        # CoNLL-like TSV: token\tlabel per line, sentences separated by blank lines. The text is reconstructed naively.
        tokens = []
        labels = []
        sentences = []
        for line in file_obj:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append((tokens, labels))
                    tokens = []
                    labels = []
                continue
            parts = line.split()  # token label
            if len(parts) >= 2:
                tokens.append(parts[0])
                labels.append(parts[-1])
        # flush the final sentence if the file does not end with a blank line
        if tokens:
            sentences.append((tokens, labels))
        # convert tokens/labels to text and character-offset entities (simple);
        # note that gold labels keep their BIO prefixes here (e.g. B-PER)
        for toks, labs in sentences:
            text = ' '.join(toks)
            texts.append(text)
            ents = []
            char_idx = 0
            for tok, lab in zip(toks, labs):
                start = text.find(tok, char_idx)
                if start == -1:
                    start = char_idx
                end = start + len(tok)
                char_idx = end
                if lab != 'O':
                    ents.append({'start': start, 'end': end, 'label': lab})
            gold_entities.append(ents)
    # Now run model predictions and normalize them to character spans
    pred_entities = []
    for text in texts:
        preds = ner_mod.predict(text, model_key)
        # normalize preds to spans
        spans = []
        for p in preds:
            if 'start' in p and 'end' in p:
                spans.append({'start': p['start'], 'end': p['end'], 'label': p.get('entity_group') or p.get('entity')})
            elif 'word' in p:
                # no offsets available: locate the word in the text (first occurrence)
                w = p['word']
                i = text.find(w)
                if i >= 0:
                    spans.append({'start': i, 'end': i + len(w), 'label': p.get('entity_group') or p.get('entity')})
        pred_entities.append(spans)
    # simple span-level evaluation (exact match on start, end, and label)
    tp = 0
    fp = 0
    fn = 0
    for gold, pred in zip(gold_entities, pred_entities):
        gold_set = {(g['start'], g['end'], g['label']) for g in gold}
        pred_set = {(p['start'], p['end'], p['label']) for p in pred}
        tp += len(gold_set & pred_set)
        fp += len(pred_set - gold_set)
        fn += len(gold_set - pred_set)
    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    return {"precision": prec, "recall": rec, "f1": f1, "tp": tp, "fp": fp, "fn": fn, "confusion_matrix": [[tp, fp], [fn, 0]]}
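
# Illustrative call (the key 'indobert-ner' is a placeholder, not a key defined in this repo).
# evaluate_ner accepts anything exposing .name or .file, e.g. a Gradio upload or a plain
# handle from open():
#   with open('ner_gold.jsonl', 'r', encoding='utf-8') as fh:
#       scores = evaluate_ner(fh, model_key='indobert-ner')
#   scores -> {"precision": ..., "recall": ..., "f1": ..., "tp": ..., "fp": ..., "fn": ..., "confusion_matrix": [[tp, fp], [fn, 0]]}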

def evaluate_qa(file, model_key: str) -> Dict[str, Any]:
    # Accepts JSONL of {"context": ..., "question": ..., "answers": [{"text": ..., "answer_start": ...}, ...]}
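    # Illustrative JSONL line (values are examples only):
    #   {"context": "Soekarno lahir di Surabaya.", "question": "Di mana Soekarno lahir?",
    #    "answers": [{"text": "Surabaya", "answer_start": 18}]}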
    import io
    if hasattr(file, 'name'):
        f = open(file.name, 'r', encoding='utf-8')
    else:
        f = io.TextIOWrapper(file.file, encoding='utf-8')
    examples = []
    for line in f:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        examples.append(obj)
    # run the model over every example and aggregate exact-match and token-overlap F1

    def f1_score(a, b):
        # token-overlap F1 between a predicted and a gold answer string
        at = a.split()
        bt = b.split()
        common = sum(1 for t in at if t in bt)
        if common == 0:
            return 0.0
        prec = common / len(at)
        rec = common / len(bt)
        return 2 * prec * rec / (prec + rec)

    total_em = 0
    total_f1 = 0
    n = 0
    for ex in examples:
        context = ex.get('context', '')
        question = ex.get('question', '')
        gold_answers = [a['text'] for a in ex.get('answers', []) if a.get('text')]
        pred = qa_mod.predict(context, question, model_key)
        pred_text = pred.get('answer', '')
        # exact match: the stripped prediction equals one of the gold answer strings
        em = 1 if pred_text.strip() in gold_answers else 0
        # token-overlap F1 against each gold answer; keep the best score
        f1 = max(f1_score(pred_text, g) for g in gold_answers) if gold_answers else 0.0
        total_em += em
        total_f1 += f1
        n += 1
    return {"exact_match": total_em / n if n else 0.0, "f1": total_f1 / n if n else 0.0}
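
# Worked example of the token-overlap F1 above (illustrative only):
#   pred = "di Jakarta", gold = "Jakarta"
#   common = 1, precision = 1/2, recall = 1/1, F1 = 2 * 0.5 * 1.0 / 1.5 ≈ 0.667
#   exact_match would be 0 here because the stripped prediction != any gold answer string.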

def load_leaderboard(path: str):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        # default
        df = pd.DataFrame([{"task": "Sentiment", "model": "indobert", "f1": 0.88, "accuracy": 0.87}])
        df.to_json(path, orient='records')
        return df
    return pd.read_json(path)

def save_leaderboard(df, path: str):
    # write the leaderboard back to disk in the same format load_leaderboard reads
    df.to_json(path, orient='records')
    return df
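
# Illustrative round trip (the path is a placeholder, not taken from the original code):
#   lb = load_leaderboard('leaderboard.json')
#   new_row = pd.DataFrame([{"task": "NER", "model": "my-ner-model", "f1": 0.80, "accuracy": 0.0}])
#   save_leaderboard(pd.concat([lb, new_row], ignore_index=True), 'leaderboard.json')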