# metrics/evaluate.py
# Evaluation utilities for classification, NER, and QA. Produces standard metrics and confusion matrices.
import os
import json
from typing import Dict, Any
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod

def compute_classification_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    return {"accuracy": float(acc), "precision": float(prec), "recall": float(rec), "f1": float(f1)}
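
# A minimal illustration (not part of the original module): on toy lists the helper
# returns plain floats, e.g.
#   compute_classification_metrics(["pos", "neg", "pos"], ["pos", "pos", "pos"])
#   -> {"accuracy": 0.667, "precision": ..., "recall": ..., "f1": ...}
# where precision/recall/f1 are support-weighted averages over the label set.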

def evaluate_classification(df: pd.DataFrame, model_key: str, text_col: str = 'text', label_col: str = 'label') -> Dict[str, Any]:
    # df must have text and label columns
    texts = df[text_col].astype(str).tolist()
    y_true = df[label_col].astype(str).tolist()
    y_pred = []
    for t in texts:
        preds = sentiment_mod.predict(t, model_key)
        if preds:
            # preds is a list of (label, score) tuples, pick top
            y_pred.append(preds[0][0])
        else:
            y_pred.append('__empty__')
    metrics = compute_classification_metrics(y_true, y_pred)
    labels = sorted(set(y_true))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return {"metrics": metrics, "confusion_matrix": cm.tolist(), "labels": labels, **metrics}
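
# Illustrative usage only; 'indobert' is a placeholder for whatever keys
# models/sentiment.py actually exposes:
#   df = pd.DataFrame({"text": ["mantap sekali", "buruk sekali"], "label": ["positive", "negative"]})
#   result = evaluate_classification(df, model_key='indobert')
#   result["metrics"]           -> {"accuracy": ..., "precision": ..., "recall": ..., "f1": ...}
#   result["confusion_matrix"]  -> rows/columns follow result["labels"] (sorted gold labels)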

def evaluate_ner(file, model_key: str) -> Dict[str, Any]:
    # Accepts a file-like object. Expects JSONL with objects: {"text": "...", "entities": [{"start": int, "end": int, "label": "..."}, ...]},
    # or CoNLL-style TSV where each line is "token<TAB>label" and sentences are separated by blank lines.
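    # Illustrative input lines (values and offsets are examples, not taken from the original repo):
    #   JSONL: {"text": "Joko tinggal di Jakarta", "entities": [{"start": 16, "end": 23, "label": "LOC"}]}
    #   CoNLL/TSV:
    #       Joko    B-PER
    #       tinggal O
    #       di      O
    #       Jakarta B-LOC
    #       <blank line between sentences>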
    import io
    if hasattr(file, 'name'):
        file_obj = open(file.name, 'r', encoding='utf-8')
    else:
        file_obj = io.TextIOWrapper(file.file, encoding='utf-8')
    gold_entities = []
    texts = []
    # Try JSONL first
    file_obj.seek(0)
    first_chars = file_obj.read(2)
    file_obj.seek(0)
    if first_chars.strip().startswith('{'):
        # JSONL
        for line in file_obj:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            texts.append(obj.get('text', ''))
            gold_entities.append(obj.get('entities', []))
    else:
        # CoNLL-like TSV: token\tlabel per line, sentences separated by blank lines. The text is reconstructed naively.
        tokens = []
        labels = []
        sentences = []
        for line in file_obj:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append((tokens, labels))
                    tokens = []
                    labels = []
                continue
            parts = line.split()  # token label
            if len(parts) >= 2:
                tokens.append(parts[0])
                labels.append(parts[-1])
        # flush the final sentence if the file does not end with a blank line
        if tokens:
            sentences.append((tokens, labels))
        # convert tokens/labels to text and character-offset entities (simple);
        # note that gold labels keep their BIO prefixes here (e.g. B-PER)
        for toks, labs in sentences:
            text = ' '.join(toks)
            texts.append(text)
            ents = []
            char_idx = 0
            for tok, lab in zip(toks, labs):
                start = text.find(tok, char_idx)
                if start == -1:
                    start = char_idx
                end = start + len(tok)
                char_idx = end
                if lab != 'O':
                    ents.append({'start': start, 'end': end, 'label': lab})
            gold_entities.append(ents)
    # Now run model predictions and normalize them to character spans
    pred_entities = []
    for text in texts:
        preds = ner_mod.predict(text, model_key)
        # normalize preds to spans
        spans = []
        for p in preds:
            if 'start' in p and 'end' in p:
                spans.append({'start': p['start'], 'end': p['end'], 'label': p.get('entity_group') or p.get('entity')})
            elif 'word' in p:
                # no offsets available: locate the word in the text (first occurrence)
                w = p['word']
                i = text.find(w)
                if i >= 0:
                    spans.append({'start': i, 'end': i + len(w), 'label': p.get('entity_group') or p.get('entity')})
        pred_entities.append(spans)
    # simple span-level evaluation (exact match on start, end, and label)
    tp = 0
    fp = 0
    fn = 0
    for gold, pred in zip(gold_entities, pred_entities):
        gold_set = {(g['start'], g['end'], g['label']) for g in gold}
        pred_set = {(p['start'], p['end'], p['label']) for p in pred}
        tp += len(gold_set & pred_set)
        fp += len(pred_set - gold_set)
        fn += len(gold_set - pred_set)
    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    return {"precision": prec, "recall": rec, "f1": f1, "tp": tp, "fp": fp, "fn": fn, "confusion_matrix": [[tp, fp], [fn, 0]]}
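
# Illustrative call (the key 'indobert-ner' is a placeholder, not a key defined in this repo).
# evaluate_ner accepts anything exposing .name or .file, e.g. a Gradio upload or a plain
# handle from open():
#   with open('ner_gold.jsonl', 'r', encoding='utf-8') as fh:
#       scores = evaluate_ner(fh, model_key='indobert-ner')
#   scores -> {"precision": ..., "recall": ..., "f1": ..., "tp": ..., "fp": ..., "fn": ..., "confusion_matrix": [[tp, fp], [fn, 0]]}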

def evaluate_qa(file, model_key: str) -> Dict[str, Any]:
    # Accepts JSONL of {"context": ..., "question": ..., "answers": [{"text": ..., "answer_start": ...}, ...]}
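    # Illustrative JSONL line (values are examples only):
    #   {"context": "Soekarno lahir di Surabaya.", "question": "Di mana Soekarno lahir?",
    #    "answers": [{"text": "Surabaya", "answer_start": 18}]}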
    import io
    if hasattr(file, 'name'):
        f = open(file.name, 'r', encoding='utf-8')
    else:
        f = io.TextIOWrapper(file.file, encoding='utf-8')
    examples = []
    for line in f:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        examples.append(obj)
    # run the model over every example and aggregate exact-match and token-overlap F1

    def f1_score(a, b):
        # token-overlap F1 between a predicted and a gold answer string
        at = a.split()
        bt = b.split()
        common = sum(1 for t in at if t in bt)
        if common == 0:
            return 0.0
        prec = common / len(at)
        rec = common / len(bt)
        return 2 * prec * rec / (prec + rec)

    total_em = 0
    total_f1 = 0
    n = 0
    for ex in examples:
        context = ex.get('context', '')
        question = ex.get('question', '')
        gold_answers = [a['text'] for a in ex.get('answers', []) if a.get('text')]
        pred = qa_mod.predict(context, question, model_key)
        pred_text = pred.get('answer', '')
        # exact match: the stripped prediction equals one of the gold answer strings
        em = 1 if pred_text.strip() in gold_answers else 0
        # token-overlap F1 against each gold answer; keep the best score
        f1 = max(f1_score(pred_text, g) for g in gold_answers) if gold_answers else 0.0
        total_em += em
        total_f1 += f1
        n += 1
    return {"exact_match": total_em / n if n else 0.0, "f1": total_f1 / n if n else 0.0}
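
# Worked example of the token-overlap F1 above (illustrative only):
#   pred = "di Jakarta", gold = "Jakarta"
#   common = 1, precision = 1/2, recall = 1/1, F1 = 2 * 0.5 * 1.0 / 1.5 ≈ 0.667
#   exact_match would be 0 here because the stripped prediction != any gold answer string.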

def load_leaderboard(path: str):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        # default
        df = pd.DataFrame([{"task": "Sentiment", "model": "indobert", "f1": 0.88, "accuracy": 0.87}])
        df.to_json(path, orient='records')
        return df
    return pd.read_json(path)

def save_leaderboard(df, path: str):
    # write the leaderboard back to disk in the same format load_leaderboard reads
    df.to_json(path, orient='records')
    return df
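
# Illustrative round trip (the path is a placeholder, not taken from the original code):
#   lb = load_leaderboard('leaderboard.json')
#   new_row = pd.DataFrame([{"task": "NER", "model": "my-ner-model", "f1": 0.80, "accuracy": 0.0}])
#   save_leaderboard(pd.concat([lb, new_row], ignore_index=True), 'leaderboard.json')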