# train.py
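"""
Fine-tuning pipeline for ESG sentiment classification.

Expects a DataFrame with two text columns, "sentence" and "sentiment",
splits it 80/20 (stratified), fine-tunes each requested Hugging Face
checkpoint (via the HF Trainer, with a plain PyTorch fallback), evaluates
accuracy / precision / recall / F1 on the held-out split, and saves each
fine-tuned model under saved_models/.
"""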
import os
import time
import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ----------------------------------------------------------
# UTILITIES
# ----------------------------------------------------------
def clean_text(series):
    """Coerce a pandas Series to clean strings, mapping missing values to ""."""
    return (
        series.fillna("")                   # handle NaN/None before casting to str
        .astype(str)
        .replace({"nan": "", "None": ""})   # drop stringified null markers
    )
# ----------------------------------------------------------
# PREPROCESSING
# ----------------------------------------------------------
def preprocess(df_train, df_val, model_name, max_length=256):
    """Tokenize both splits and encode the string sentiments into integer labels."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Convert to simple datasets with only the needed columns
    ds_train = Dataset.from_pandas(df_train[["sentence", "sentiment"]])
    ds_val = Dataset.from_pandas(df_val[["sentence", "sentiment"]])

    # Tokenization
    def tokenize(batch):
        return tokenizer(
            [str(x) for x in batch["sentence"]],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    ds_train = ds_train.map(tokenize, batched=True)
    ds_val = ds_val.map(tokenize, batched=True)

    # Rename sentiment to labels (the field name the model expects)
    ds_train = ds_train.rename_column("sentiment", "labels")
    ds_val = ds_val.rename_column("sentiment", "labels")

    # Encode labels. class_encode_column sorts the unique label strings, and the
    # stratified split guarantees both splits contain the same label set, so the
    # train/val id mappings stay consistent.
    ds_train = ds_train.class_encode_column("labels")
    ds_val = ds_val.class_encode_column("labels")

    # Keep only the fields the model expects
    keep_cols = {"input_ids", "attention_mask", "labels"}
    remove_cols = set(ds_train.column_names) - keep_cols
    ds_train = ds_train.remove_columns(list(remove_cols))
    ds_val = ds_val.remove_columns(list(remove_cols))

    label_list = ds_train.features["labels"].names
    return tokenizer, ds_train, ds_val, label_list
# ----------------------------------------------------------
# FALLBACK PYTORCH TRAINING
# ----------------------------------------------------------
def train_fallback(model, ds_train, device):
    """Plain PyTorch training loop used when the HF Trainer is unavailable or fails."""
    from torch.optim import AdamW
    from torch.utils.data import DataLoader

    # Fallback hyperparameters
    batch_size = 8
    lr = 2e-5
    epochs = 3

    def clean_record(r):
        # Keep only the tensors the model's forward pass accepts
        return {
            "input_ids": r["input_ids"],
            "attention_mask": r["attention_mask"],
            "labels": r["labels"],
        }

    train_data = [clean_record(ds_train[i]) for i in range(len(ds_train))]

    def collate(batch):
        # Stack each field across the batch into a LongTensor
        return {
            key: torch.tensor([b[key] for b in batch], dtype=torch.long)
            for key in batch[0]
        }

    loader = DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate,
    )

    optimizer = AdamW(model.parameters(), lr=lr)
    model.to(device)
    model.train()

    for epoch in range(1, epochs + 1):
        print(f"Epoch {epoch}/{epochs}")
        loop = tqdm(loader)
        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loop.set_postfix(loss=float(loss))
    return model
# ----------------------------------------------------------
# EVALUATION
# ----------------------------------------------------------
def evaluate(model, tokenizer, df_val, device, label_list):
    """Score the fine-tuned model on the held-out split, one sentence at a time."""
    preds, golds = [], []
    label2id = {label: i for i, label in enumerate(label_list)}
    model.eval()
    model.to(device)

    for _, row in df_val.iterrows():
        inputs = tokenizer(
            row["sentence"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
        ).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        pred = int(logits.argmax(dim=-1))
        gold = int(label2id[row["sentiment"]])
        preds.append(pred)
        golds.append(gold)

    acc = accuracy_score(golds, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        golds, preds, average="weighted", zero_division=0
    )
    return {
        "accuracy": float(acc),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
        "total_eval_rows": len(df_val),
    }
# ----------------------------------------------------------
# MAIN TRAINING PIPELINE
# ----------------------------------------------------------
def run_training(df, max_rows, models_to_test):
    """Fine-tune each requested model on the ESG sentiment data and collect metrics."""
    logs = []
    start = time.time()

    # Safety checks: reject model families this pipeline does not support
    # and checkpoints too large to fine-tune on CPU.
    UNSUPPORTED = ["llama", "qwen", "phi", "gpt", "mistral", "mixtral"]
    TOO_LARGE = ["large", "xl", "xxl", "deberta-v3-large"]
    for m in models_to_test:
        if any(x in m.lower() for x in UNSUPPORTED):
            return {"logs": f"❌ Unsupported model: {m}", "results": {}}
        if any(x in m.lower() for x in TOO_LARGE):
            return {"logs": f"❌ Too large for CPU: {m}", "results": {}}

    # Limit dataset size and clean the text columns
    df = df.head(max_rows).copy()
    df["sentence"] = clean_text(df["sentence"])
    df["sentiment"] = clean_text(df["sentiment"])

    df_train, df_val = train_test_split(
        df, test_size=0.2, stratify=df["sentiment"]
    )

    results = {}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for model_name in models_to_test:
        logs.append(f"\n🔵 Training {model_name}")
        tokenizer, ds_train, ds_val, labels = preprocess(
            df_train, df_val, model_name
        )

        # Load model safely
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            ignore_mismatched_sizes=True,  # prevent weight-shape crashes when replacing the head
        )

        # Try the HF Trainer first; fall back to the plain PyTorch loop on failure
        try:
            from transformers import Trainer, TrainingArguments

            args = TrainingArguments(
                output_dir=f"out/{model_name}",
                num_train_epochs=3,
                per_device_train_batch_size=8,
                learning_rate=2e-5,
                save_strategy="no",
                logging_steps=50,
                disable_tqdm=True,
            )
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=ds_train,
            )
            trainer.train()
        except Exception as e:
            logs.append(f"Trainer failed → fallback: {e}")
            model = train_fallback(model, ds_train, device)

        metrics = evaluate(model, tokenizer, df_val, device, labels)
        results[model_name] = metrics

        # Save the fine-tuned model and tokenizer
        save_dir = f"saved_models/{model_name.replace('/', '_')}"
        os.makedirs(save_dir, exist_ok=True)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        logs.append(f"💾 Saved model to {save_dir}")

    total = (time.time() - start) / 60
    logs.append(f"\n⏱ Total Time: {total:.2f} min")
    return {"logs": "\n".join(logs), "results": results}