# metrics/evaluate.py
# Evaluation utilities for classification, NER, and QA. Produces standard metrics and confusion matrices.

import os
import json
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod


def compute_classification_metrics(y_true, y_pred):
    # Weighted-average precision/recall/F1 plus plain accuracy.
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    return {"accuracy": float(acc), "precision": float(prec), "recall": float(rec), "f1": float(f1)}


def evaluate_classification(df: pd.DataFrame, model_key: str,
                            text_col: str = 'text', label_col: str = 'label') -> Dict[str, Any]:
    # df must have text and label columns.
    texts = df[text_col].astype(str).tolist()
    y_true = df[label_col].astype(str).tolist()
    y_pred = []
    for t in texts:
        preds = sentiment_mod.predict(t, model_key)
        if preds:
            # preds is a list of (label, score) tuples; pick the top one.
            y_pred.append(preds[0][0])
        else:
            y_pred.append('__empty__')
    metrics = compute_classification_metrics(y_true, y_pred)
    # Include predicted labels as well, so predictions outside the gold label set
    # (e.g. '__empty__') still show up in the confusion matrix.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return {"metrics": metrics, "confusion_matrix": cm.tolist(), "labels": labels, **metrics}


def evaluate_ner(file, model_key: str) -> Dict[str, Any]:
    # Accepts a file-like object. Expects JSONL with objects:
    #   {"text": "...", "entities": [{"start": int, "end": int, "label": "..."}, ...]}
    # or a CoNLL-like TSV where each line is "token\tlabel" and sentences are separated by blank lines.
    import io
    if hasattr(file, 'name'):
        file_obj = open(file.name, 'r', encoding='utf-8')
    else:
        file_obj = io.TextIOWrapper(file.file, encoding='utf-8')

    gold_entities = []
    texts = []

    # Try JSONL first: peek at the first characters to detect a JSON object.
    file_obj.seek(0)
    first_chars = file_obj.read(2)
    file_obj.seek(0)
    if first_chars.strip().startswith('{'):
        # JSONL
        for line in file_obj:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            texts.append(obj.get('text', ''))
            gold_entities.append(obj.get('entities', []))
    else:
        # CoNLL-like TSV: token\tlabel per line, sentences separated by blank lines.
        # Reconstruct the text naively by joining tokens with spaces.
        tokens = []
        labels = []
        sentences = []
        for line in file_obj:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append((tokens, labels))
                    tokens = []
                    labels = []
                continue
            parts = line.split()  # token label
            if len(parts) >= 2:
                tokens.append(parts[0])
                labels.append(parts[-1])
        # Flush the last sentence if the file does not end with a blank line.
        if tokens:
            sentences.append((tokens, labels))

        # Convert tokens/labels to text and character-offset entities (simple heuristic).
        for toks, labs in sentences:
            text = ' '.join(toks)
            texts.append(text)
            ents = []
            char_idx = 0
            for tok, lab in zip(toks, labs):
                start = text.find(tok, char_idx)
                if start == -1:
                    start = char_idx
                end = start + len(tok)
                char_idx = end
                if lab != 'O':
                    ents.append({'start': start, 'end': end, 'label': lab})
            gold_entities.append(ents)

    # Run model predictions and compute span-level precision/recall/F1 (exact span match).
    pred_entities = []
    for text in texts:
        preds = ner_mod.predict(text, model_key)
        # Normalize predictions to character spans.
        spans = []
        for p in preds:
            if 'start' in p and 'end' in p:
                spans.append({'start': p['start'], 'end': p['end'],
                              'label': p.get('entity_group') or p.get('entity')})
            elif 'word' in p:
                # No offsets provided: locate the word in the text (first occurrence).
                w = p['word']
                i = text.find(w)
                if i >= 0:
                    spans.append({'start': i, 'end': i + len(w),
                                  'label': p.get('entity_group') or p.get('entity')})
        pred_entities.append(spans)

    # Simple span-level evaluation (exact match on start, end, and label).
    tp = fp = fn = 0
    for gold, pred in zip(gold_entities, pred_entities):
        gold_set = {(g['start'], g['end'], g['label']) for g in gold}
        pred_set = {(p['start'], p['end'], p['label']) for p in pred}
        tp += len(gold_set & pred_set)
        fp += len(pred_set - gold_set)
        fn += len(gold_set - pred_set)
    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    return {"precision": prec, "recall": rec, "f1": f1, "tp": tp, "fp": fp, "fn": fn,
            "confusion_matrix": [[tp, fp], [fn, 0]]}


def evaluate_qa(file, model_key: str) -> Dict[str, Any]:
    # Accepts JSONL of {"context": ..., "question": ..., "answers": [{"text": ..., "answer_start": ...}, ...]}.
    import io
    if hasattr(file, 'name'):
        f = open(file.name, 'r', encoding='utf-8')
    else:
        f = io.TextIOWrapper(file.file, encoding='utf-8')
    examples = []
    for line in f:
        line = line.strip()
        if not line:
            continue
        examples.append(json.loads(line))

    def f1_score(a, b):
        # Token-overlap F1 between a predicted answer and one gold answer.
        at = a.split()
        bt = b.split()
        common = sum(1 for t in at if t in bt)
        if common == 0:
            return 0.0
        prec = common / len(at)
        rec = common / len(bt)
        return 2 * prec * rec / (prec + rec)

    # Run the model over every example and average exact match and token F1.
    total_em = 0
    total_f1 = 0.0
    n = 0
    for ex in examples:
        context = ex.get('context', '')
        question = ex.get('question', '')
        gold_answers = [a['text'] for a in ex.get('answers', []) if a.get('text')]
        pred = qa_mod.predict(context, question, model_key)
        pred_text = pred.get('answer', '')
        # Exact match against any gold answer, plus the best token-overlap F1.
        em = 1 if pred_text.strip() in gold_answers else 0
        f1 = max(f1_score(pred_text, g) for g in gold_answers) if gold_answers else 0.0
        total_em += em
        total_f1 += f1
        n += 1
    return {"exact_match": total_em / n if n else 0.0, "f1": total_f1 / n if n else 0.0}


def load_leaderboard(path: str):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        # Seed with a default entry so the UI has something to display.
        df = pd.DataFrame([{"task": "Sentiment", "model": "indobert", "f1": 0.88, "accuracy": 0.87}])
        df.to_json(path, orient='records')
        return df
    return pd.read_json(path)


def save_leaderboard(df, path: str):
    # Persist the leaderboard to disk and return the DataFrame.
    df.to_json(path, orient='records')
    return df
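

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the evaluation API. It assumes
# models.sentiment.predict(text, model_key) behaves as used above (returning a
# list of (label, score) tuples); the model key, sample rows, and leaderboard
# path below are hypothetical placeholders.
if __name__ == "__main__":
    # Tiny labelled DataFrame for a smoke-test run of the classification metrics.
    sample_df = pd.DataFrame({
        "text": ["great service, very satisfied", "the product was disappointing"],
        "label": ["positive", "negative"],
    })
    report = evaluate_classification(sample_df, model_key="indobert")
    print(json.dumps(report["metrics"], indent=2))

    # Append the run to a local leaderboard file and persist it.
    lb_path = "leaderboard.json"
    lb = load_leaderboard(lb_path)
    row = {"task": "Sentiment", "model": "indobert", **report["metrics"]}
    lb = pd.concat([lb, pd.DataFrame([row])], ignore_index=True)
    save_leaderboard(lb, lb_path)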