# metrics/evaluate.py
# Evaluation utilities for classification, NER, and QA. Produces standard metrics and confusion matrices.

import os
import json
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod


def compute_classification_metrics(y_true, y_pred):
    # Weighted-average precision/recall/F1 plus plain accuracy.
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    return {"accuracy": float(acc), "precision": float(prec), "recall": float(rec), "f1": float(f1)}


def evaluate_classification(df: pd.DataFrame, model_key: str,
                            text_col: str = 'text', label_col: str = 'label') -> Dict[str, Any]:
    # df must have text and label columns.
    texts = df[text_col].astype(str).tolist()
    y_true = df[label_col].astype(str).tolist()
    y_pred = []
    for t in texts:
        preds = sentiment_mod.predict(t, model_key)
        if preds:
            # preds is a list of (label, score) tuples; pick the top one.
            y_pred.append(preds[0][0])
        else:
            y_pred.append('__empty__')
    metrics = compute_classification_metrics(y_true, y_pred)
    # Include predicted labels as well, so predictions outside the gold label set
    # (e.g. '__empty__') still show up in the confusion matrix.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return {"metrics": metrics, "confusion_matrix": cm.tolist(), "labels": labels, **metrics}


def evaluate_ner(file, model_key: str) -> Dict[str, Any]:
    # Accepts a file-like object. Expects JSONL with objects:
    #   {"text": "...", "entities": [{"start": int, "end": int, "label": "..."}, ...]}
    # or a CoNLL-like TSV where each line is "token\tlabel" and sentences are separated by blank lines.
    import io
    if hasattr(file, 'name'):
        file_obj = open(file.name, 'r', encoding='utf-8')
    else:
        file_obj = io.TextIOWrapper(file.file, encoding='utf-8')

    gold_entities = []
    texts = []

    # Try JSONL first: peek at the first characters to detect a JSON object.
    file_obj.seek(0)
    first_chars = file_obj.read(2)
    file_obj.seek(0)
    if first_chars.strip().startswith('{'):
        # JSONL
        for line in file_obj:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            texts.append(obj.get('text', ''))
            gold_entities.append(obj.get('entities', []))
    else:
        # CoNLL-like TSV: token\tlabel per line, sentences separated by blank lines.
        # Reconstruct the text naively by joining tokens with spaces.
        tokens = []
        labels = []
        sentences = []
        for line in file_obj:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append((tokens, labels))
                    tokens = []
                    labels = []
                continue
            parts = line.split()  # token label
            if len(parts) >= 2:
                tokens.append(parts[0])
                labels.append(parts[-1])
        # Flush the last sentence if the file does not end with a blank line.
        if tokens:
            sentences.append((tokens, labels))

        # Convert tokens/labels to text and character-offset entities (simple heuristic).
        for toks, labs in sentences:
            text = ' '.join(toks)
            texts.append(text)
            ents = []
            char_idx = 0
            for tok, lab in zip(toks, labs):
                start = text.find(tok, char_idx)
                if start == -1:
                    start = char_idx
                end = start + len(tok)
                char_idx = end
                if lab != 'O':
                    ents.append({'start': start, 'end': end, 'label': lab})
            gold_entities.append(ents)

    # Run model predictions and compute span-level precision/recall/F1 (exact span match).
    pred_entities = []
    for text in texts:
        preds = ner_mod.predict(text, model_key)
        # Normalize predictions to character spans.
        spans = []
        for p in preds:
            if 'start' in p and 'end' in p:
                spans.append({'start': p['start'], 'end': p['end'],
                              'label': p.get('entity_group') or p.get('entity')})
            elif 'word' in p:
                # No offsets provided: locate the word in the text (first occurrence).
                w = p['word']
                i = text.find(w)
                if i >= 0:
                    spans.append({'start': i, 'end': i + len(w),
                                  'label': p.get('entity_group') or p.get('entity')})
        pred_entities.append(spans)

    # Simple span-level evaluation (exact match on start, end, and label).
    tp = fp = fn = 0
    for gold, pred in zip(gold_entities, pred_entities):
        gold_set = {(g['start'], g['end'], g['label']) for g in gold}
        pred_set = {(p['start'], p['end'], p['label']) for p in pred}
        tp += len(gold_set & pred_set)
        fp += len(pred_set - gold_set)
        fn += len(gold_set - pred_set)
    prec = tp / (tp + fp) if tp + fp > 0 else 0.0
    rec = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
    return {"precision": prec, "recall": rec, "f1": f1, "tp": tp, "fp": fp, "fn": fn,
            "confusion_matrix": [[tp, fp], [fn, 0]]}


def evaluate_qa(file, model_key: str) -> Dict[str, Any]:
    # Accepts JSONL of {"context": ..., "question": ..., "answers": [{"text": ..., "answer_start": ...}, ...]}.
    import io
    if hasattr(file, 'name'):
        f = open(file.name, 'r', encoding='utf-8')
    else:
        f = io.TextIOWrapper(file.file, encoding='utf-8')
    examples = []
    for line in f:
        line = line.strip()
        if not line:
            continue
        examples.append(json.loads(line))

    def f1_score(a, b):
        # Token-overlap F1 between a predicted answer and one gold answer.
        at = a.split()
        bt = b.split()
        common = sum(1 for t in at if t in bt)
        if common == 0:
            return 0.0
        prec = common / len(at)
        rec = common / len(bt)
        return 2 * prec * rec / (prec + rec)

    # Run the model over every example and average exact match and token F1.
    total_em = 0
    total_f1 = 0.0
    n = 0
    for ex in examples:
        context = ex.get('context', '')
        question = ex.get('question', '')
        gold_answers = [a['text'] for a in ex.get('answers', []) if a.get('text')]
        pred = qa_mod.predict(context, question, model_key)
        pred_text = pred.get('answer', '')
        # Exact match against any gold answer, plus the best token-overlap F1.
        em = 1 if pred_text.strip() in gold_answers else 0
        f1 = max(f1_score(pred_text, g) for g in gold_answers) if gold_answers else 0.0
        total_em += em
        total_f1 += f1
        n += 1
    return {"exact_match": total_em / n if n else 0.0, "f1": total_f1 / n if n else 0.0}


def load_leaderboard(path: str):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        # Seed with a default entry so the UI has something to display.
        df = pd.DataFrame([{"task": "Sentiment", "model": "indobert", "f1": 0.88, "accuracy": 0.87}])
        df.to_json(path, orient='records')
        return df
    return pd.read_json(path)


def save_leaderboard(df, path: str):
    # Persist the leaderboard to disk and return the DataFrame.
    df.to_json(path, orient='records')
    return df
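

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the evaluation API. It assumes
# models.sentiment.predict(text, model_key) behaves as used above (returning a
# list of (label, score) tuples); the model key, sample rows, and leaderboard
# path below are hypothetical placeholders.
if __name__ == "__main__":
    # Tiny labelled DataFrame for a smoke-test run of the classification metrics.
    sample_df = pd.DataFrame({
        "text": ["great service, very satisfied", "the product was disappointing"],
        "label": ["positive", "negative"],
    })
    report = evaluate_classification(sample_df, model_key="indobert")
    print(json.dumps(report["metrics"], indent=2))

    # Append the run to a local leaderboard file and persist it.
    lb_path = "leaderboard.json"
    lb = load_leaderboard(lb_path)
    row = {"task": "Sentiment", "model": "indobert", **report["metrics"]}
    lb = pd.concat([lb, pd.DataFrame([row])], ignore_index=True)
    save_leaderboard(lb, lb_path)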