# metrics/evaluate.py
# Evaluation utilities for classification, NER, and QA. Produces standard
# metrics and confusion matrices.
import os
import json
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod


def compute_classification_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    return {"accuracy": float(acc), "precision": float(prec), "recall": float(rec), "f1": float(f1)}


def evaluate_classification(df: pd.DataFrame, model_key: str, text_col: str = 'text', label_col: str = 'label') -> Dict[str, Any]:
    # df must have text and label columns
    texts = df[text_col].astype(str).tolist()
    y_true = df[label_col].astype(str).tolist()
    y_pred = []
    for t in texts:
        preds = sentiment_mod.predict(t, model_key)
        if preds:
            # preds is a list of (label, score) tuples; pick the top one
            y_pred.append(preds[0][0])
        else:
            y_pred.append('__empty__')
    metrics = compute_classification_metrics(y_true, y_pred)
    labels = sorted(set(y_true))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # metrics are returned both nested (for display) and flattened at the top level
    return {"metrics": metrics, "confusion_matrix": cm.tolist(), "labels": labels, **metrics}


def evaluate_ner(file, model_key: str) -> Dict[str, Any]:
    # Accepts a file-like object. Expects JSONL with objects of the form
    #   {"text": "...", "entities": [{"start": int, "end": int, "label": "..."}, ...]}
    # or a CoNLL-style TSV with one "token<TAB>label" pair per line and
    # sentences separated by blank lines.
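    # Illustrative inputs (hypothetical text and labels):
    #   JSONL:  {"text": "Joko lahir di Solo", "entities": [{"start": 0, "end": 4, "label": "PER"}]}
    #   CoNLL:  "Joko<TAB>B-PER", "lahir<TAB>O", "di<TAB>O", "Solo<TAB>B-LOC", then a blank line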
    import io
    if hasattr(file, 'name'):
        file_obj = open(file.name, 'r', encoding='utf-8')
    else:
        file_obj = io.TextIOWrapper(file.file, encoding='utf-8')
    gold_entities = []
    texts = []
    # Sniff the format from the first non-whitespace characters
    file_obj.seek(0)
    head = file_obj.read(256)
    file_obj.seek(0)
    if head.strip().startswith('{'):
        # JSONL
        for line in file_obj:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            texts.append(obj.get('text', ''))
            gold_entities.append(obj.get('entities', []))
    else:
        # CoNLL-like TSV: token\tlabel per line, sentences separated by blank
        # lines. The text is reconstructed naively by joining tokens with spaces.
        tokens = []
        labels = []
        sentences = []
        for line in file_obj:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append((tokens, labels))
                tokens, labels = [], []
                continue
            parts = line.split()  # token ... label
            if len(parts) >= 2:
                tokens.append(parts[0])
                labels.append(parts[-1])
        if tokens:
            # flush the last sentence when the file has no trailing blank line
            sentences.append((tokens, labels))
        # Convert tokens/labels to text plus character-offset entities (simple:
        # each non-O token becomes its own entity; B-/I- spans are not merged,
        # so gold labels must match the model's output scheme for exact comparison)
        for toks, labs in sentences:
            text = ' '.join(toks)
            texts.append(text)
            ents = []
            char_idx = 0
            for tok, lab in zip(toks, labs):
                start = text.find(tok, char_idx)
                if start == -1:
                    start = char_idx
                end = start + len(tok)
                char_idx = end
                if lab != 'O':
                    ents.append({'start': start, 'end': end, 'label': lab})
            gold_entities.append(ents)
    file_obj.close()
    # Run model predictions and normalize them to character spans. Hugging Face
    # token-classification pipelines yield dicts with 'start'/'end' and either
    # 'entity_group' (aggregated) or 'entity' keys; 'word' is a fallback.
    pred_entities = []
    for text in texts:
        preds = ner_mod.predict(text, model_key)
        spans = []
        for p in preds:
            if 'start' in p and 'end' in p:
                spans.append({'start': p['start'], 'end': p['end'], 'label': p.get('entity_group') or p.get('entity')})
            elif 'word' in p:
                # no offsets available: fall back to the first occurrence in the text
                w = p['word']
                i = text.find(w)
                if i >= 0:
                    spans.append({'start': i, 'end': i + len(w), 'label': p.get('entity_group') or p.get('entity')})
        pred_entities.append(spans)
    # Span-level evaluation: exact match on (start, end, label) triples
    tp = fp = fn = 0
    for gold, pred in zip(gold_entities, pred_entities):
        gold_set = {(g['start'], g['end'], g['label']) for g in gold}
        pred_set = {(p['start'], p['end'], p['label']) for p in pred}
        tp += len(gold_set & pred_set)
        fp += len(pred_set - gold_set)
        fn += len(gold_set - pred_set)
    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    # 2x2 layout [[tp, fp], [fn, tn]]; true negatives are undefined for spans, hence 0
    return {"precision": prec, "recall": rec, "f1": f1, "tp": tp, "fp": fp, "fn": fn, "confusion_matrix": [[tp, fp], [fn, 0]]}


def evaluate_qa(file, model_key: str) -> Dict[str, Any]:
    # Accepts JSONL of {"context": ..., "question": ..., "answers": [{"text": ..., "answer_start": ...}, ...]}
    import io
    if hasattr(file, 'name'):
        f = open(file.name, 'r', encoding='utf-8')
    else:
        f = io.TextIOWrapper(file.file, encoding='utf-8')
    examples = []
    for line in f:
        line = line.strip()
        if not line:
            continue
        examples.append(json.loads(line))
    f.close()
    # Run the model and compute exact match plus token-overlap F1
    from collections import Counter

    def token_f1(a, b):
        # Multiset token overlap (SQuAD-style); the Counter intersection keeps
        # precision and recall in [0, 1] even when tokens repeat.
        at = a.split()
        bt = b.split()
        common = sum((Counter(at) & Counter(bt)).values())
        if common == 0:
            return 0.0
        prec = common / len(at)
        rec = common / len(bt)
        return 2 * prec * rec / (prec + rec)

    total_em = 0
    total_f1 = 0.0
    n = 0
    for ex in examples:
        context = ex.get('context', '')
        question = ex.get('question', '')
        gold_answers = [a['text'] for a in ex.get('answers', []) if a.get('text')]
        pred = qa_mod.predict(context, question, model_key)
        pred_text = pred.get('answer', '')
        # exact match against any gold answer (whitespace-trimmed)
        em = 1 if pred_text.strip() in {g.strip() for g in gold_answers} else 0
        f1 = max(token_f1(pred_text, g) for g in gold_answers) if gold_answers else 0.0
        total_em += em
        total_f1 += f1
        n += 1
    return {"exact_match": total_em / n if n else 0.0, "f1": total_f1 / n if n else 0.0}


def load_leaderboard(path: str):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        # seed a default leaderboard so the UI has something to render
        df = pd.DataFrame([{"task": "Sentiment", "model": "indobert", "f1": 0.88, "accuracy": 0.87}])
        df.to_json(path, orient='records')
        return df
    return pd.read_json(path)


def save_leaderboard(df, path: str):
    # persist the leaderboard in the same orient that load_leaderboard reads
    df.to_json(path, orient='records')
    return df
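
# Round-trip sketch (hypothetical path):
#   df = load_leaderboard('leaderboard.json')
#   save_leaderboard(df, 'leaderboard.json')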