# train.py
import os
import time

import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# ----------------------------------------------------------
# UTILITIES
# ----------------------------------------------------------
def clean_text(series):
    # Normalize a pandas Series to plain strings; NaN/None become empty strings.
    return (
        series.fillna("")
        .astype(str)
        .replace({"nan": "", "None": ""})
    )


# ----------------------------------------------------------
# PREPROCESSING
# ----------------------------------------------------------
def preprocess(df_train, df_val, model_name, max_length=256):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Convert to simple datasets with only the needed columns
    ds_train = Dataset.from_pandas(df_train[["sentence", "sentiment"]])
    ds_val = Dataset.from_pandas(df_val[["sentence", "sentiment"]])

    # Tokenization
    def tokenize(batch):
        return tokenizer(
            [str(x) for x in batch["sentence"]],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    ds_train = ds_train.map(tokenize, batched=True)
    ds_val = ds_val.map(tokenize, batched=True)

    # Rename sentiment to labels
    ds_train = ds_train.rename_column("sentiment", "labels")
    ds_val = ds_val.rename_column("sentiment", "labels")

    # Encode labels (class_encode_column sorts label names, so the train/val
    # mappings agree as long as both splits contain the same label set)
    ds_train = ds_train.class_encode_column("labels")
    ds_val = ds_val.class_encode_column("labels")

    # Keep only the model-expected fields
    keep_cols = {"input_ids", "attention_mask", "labels"}
    remove_cols = set(ds_train.column_names) - keep_cols
    ds_train = ds_train.remove_columns(list(remove_cols))
    ds_val = ds_val.remove_columns(list(remove_cols))

    label_list = ds_train.features["labels"].names
    return tokenizer, ds_train, ds_val, label_list


# ----------------------------------------------------------
# FALLBACK PYTORCH TRAINING
# ----------------------------------------------------------
def train_fallback(model, ds_train, device):
    from torch.optim import AdamW
    from torch.utils.data import DataLoader

    # Fallback hyperparameters
    batch_size = 8
    lr = 2e-5
    epochs = 3

    def clean_record(r):
        return {
            "input_ids": r["input_ids"],
            "attention_mask": r["attention_mask"],
            "labels": r["labels"],
        }

    train_data = [clean_record(ds_train[i]) for i in range(len(ds_train))]

    def collate(batch):
        return {
            key: torch.tensor([b[key] for b in batch], dtype=torch.long)
            for key in batch[0]
        }

    loader = DataLoader(
        train_data, batch_size=batch_size, shuffle=True, collate_fn=collate
    )

    optimizer = AdamW(model.parameters(), lr=lr)
    model.to(device)
    model.train()

    for epoch in range(1, epochs + 1):
        print(f"Epoch {epoch}/{epochs}")
        loop = tqdm(loader)
        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loop.set_postfix(loss=float(loss))

    return model


# ----------------------------------------------------------
# EVALUATION
# ----------------------------------------------------------
def evaluate(model, tokenizer, df_val, device, label_list):
    preds, golds = [], []
    label2id = {label: i for i, label in enumerate(label_list)}

    model.eval()
    model.to(device)

    for _, row in df_val.iterrows():
        inputs = tokenizer(
            row["sentence"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
        ).to(device)

        with torch.no_grad():
            logits = model(**inputs).logits

        pred = int(logits.argmax(dim=-1))
        gold = int(label2id[row["sentiment"]])
        preds.append(pred)
        golds.append(gold)

    acc = accuracy_score(golds, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        golds, preds, average="weighted", zero_division=0
    )

    return {
        "accuracy": float(acc),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
        "total_eval_rows": len(df_val),
    }


# ----------------------------------------------------------
# MAIN TRAINING PIPELINE
# ----------------------------------------------------------
def run_training(df, max_rows, models_to_test):
    logs = []
    start = time.time()

    # Safety checks
    UNSUPPORTED = ["llama", "qwen", "phi", "gpt", "mistral", "mixtral"]
    TOO_LARGE = ["large", "xl", "xxl", "deberta-v3-large"]

    for m in models_to_test:
        if any(x in m.lower() for x in UNSUPPORTED):
            return {"logs": f"āŒ Unsupported model: {m}", "results": {}}
        if any(x in m.lower() for x in TOO_LARGE):
            return {"logs": f"āŒ Too large for CPU: {m}", "results": {}}

    # Limit dataset (copy the slice to avoid SettingWithCopyWarning)
    df = df.head(max_rows).copy()
    df["sentence"] = clean_text(df["sentence"])
    df["sentiment"] = clean_text(df["sentiment"])

    df_train, df_val = train_test_split(
        df, test_size=0.2, stratify=df["sentiment"]
    )

    results = {}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for model_name in models_to_test:
        logs.append(f"\nšŸ”µ Training {model_name}")

        tokenizer, ds_train, ds_val, labels = preprocess(
            df_train, df_val, model_name
        )

        # Load model safely
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            ignore_mismatched_sizes=True,  # Prevent weight-shape crash when replacing the head
        )

        # Try the Hugging Face Trainer first
        try:
            from transformers import Trainer, TrainingArguments

            args = TrainingArguments(
                output_dir=f"out/{model_name}",
                num_train_epochs=3,
                per_device_train_batch_size=8,
                learning_rate=2e-5,
                save_strategy="no",
                logging_steps=50,
                disable_tqdm=True,
            )
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=ds_train,
            )
            trainer.train()
        except Exception as e:
            logs.append(f"Trainer failed → fallback: {e}")
            model = train_fallback(model, ds_train, device)

        metrics = evaluate(model, tokenizer, df_val, device, labels)
        results[model_name] = metrics

        # Save fine-tuned model
        save_dir = f"saved_models/{model_name.replace('/', '_')}"
        os.makedirs(save_dir, exist_ok=True)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        logs.append(f"šŸ’¾ Saved model to {save_dir}")

    total = (time.time() - start) / 60
    logs.append(f"\nā± Total Time: {total:.2f} min")

    return {"logs": "\n".join(logs), "results": results}
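

# ----------------------------------------------------------
# EXAMPLE USAGE (illustrative sketch)
# ----------------------------------------------------------
# A minimal sketch of how run_training() might be invoked. The CSV path, the
# row cap, and the model list below are assumptions for illustration, not part
# of the pipeline above; adjust them to your own data and hardware. The input
# DataFrame is expected to have "sentence" and "sentiment" columns.
if __name__ == "__main__":
    # Hypothetical dataset file with "sentence" and "sentiment" columns.
    df = pd.read_csv("data/sentiment.csv")

    output = run_training(
        df,
        max_rows=2000,                                # cap rows for a quick CPU run
        models_to_test=["distilbert-base-uncased"],   # small model that passes the size checks
    )

    print(output["logs"])
    for model_name, metrics in output["results"].items():
        print(model_name, metrics)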