# train.py
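"""Fine-tune and compare small sequence-classification models on a
DataFrame with "sentence" and "sentiment" columns. Training uses the
Hugging Face Trainer when possible and falls back to a plain PyTorch
loop; each model is evaluated on a held-out split and saved to disk."""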
import os
import time
import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ----------------------------------------------------------
# UTILITIES
# ----------------------------------------------------------
def clean_text(series):
    # Coerce everything to strings and blank out the "nan"/"None"
    # artefacts that the string cast produces for missing values.
    return (
        series.astype(str)
        .replace("nan", "")
        .replace("None", "")
    )
# ----------------------------------------------------------
# PREPROCESSING
# ----------------------------------------------------------
def preprocess(df_train, df_val, model_name, max_length=256):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Convert to simple datasets with only the needed columns
    ds_train = Dataset.from_pandas(df_train[["sentence", "sentiment"]])
    ds_val = Dataset.from_pandas(df_val[["sentence", "sentiment"]])
    # Tokenization
    def tokenize(batch):
        return tokenizer(
            [str(x) for x in batch["sentence"]],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
    ds_train = ds_train.map(tokenize, batched=True)
    ds_val = ds_val.map(tokenize, batched=True)
    # Rename sentiment to labels
    ds_train = ds_train.rename_column("sentiment", "labels")
    ds_val = ds_val.rename_column("sentiment", "labels")
    # Encode labels (each split is encoded separately; the stratified
    # split keeps the same label set in both, so the mappings agree)
    ds_train = ds_train.class_encode_column("labels")
    ds_val = ds_val.class_encode_column("labels")
    # Keep only model-expected fields
    keep_cols = {"input_ids", "attention_mask", "labels"}
    remove_cols = set(ds_train.column_names) - keep_cols
    ds_train = ds_train.remove_columns(list(remove_cols))
    ds_val = ds_val.remove_columns(list(remove_cols))
    label_list = ds_train.features["labels"].names
    return tokenizer, ds_train, ds_val, label_list
# ----------------------------------------------------------
# FALLBACK PYTORCH TRAINING
# ----------------------------------------------------------
def train_fallback(model, ds_train, device):
    from torch.optim import AdamW
    from torch.utils.data import DataLoader
    # Fallback hyperparameters
    batch_size = 8
    lr = 2e-5
    epochs = 3
    def clean_record(r):
        return {
            "input_ids": r["input_ids"],
            "attention_mask": r["attention_mask"],
            "labels": r["labels"],
        }
    train_data = [clean_record(ds_train[i]) for i in range(len(ds_train))]
    def collate(batch):
        return {
            key: torch.tensor([b[key] for b in batch], dtype=torch.long)
            for key in batch[0]
        }
    loader = DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate,
    )
    optimizer = AdamW(model.parameters(), lr=lr)
    model.to(device)
    model.train()
    for epoch in range(1, epochs + 1):
        print(f"Epoch {epoch}/{epochs}")
        loop = tqdm(loader)
        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loop.set_postfix(loss=float(loss))
    return model
# ----------------------------------------------------------
# EVALUATION
# ----------------------------------------------------------
def evaluate(model, tokenizer, df_val, device, label_list):
    preds, golds = [], []
    label2id = {label: i for i, label in enumerate(label_list)}
    model.eval()
    model.to(device)
    for _, row in df_val.iterrows():
        inputs = tokenizer(
            row["sentence"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
        ).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        pred = int(logits.argmax(dim=-1))
        gold = int(label2id[row["sentiment"]])
        preds.append(pred)
        golds.append(gold)
    acc = accuracy_score(golds, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        golds, preds, average="weighted", zero_division=0
    )
    return {
        "accuracy": float(acc),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
        "total_eval_rows": len(df_val),
    }
# ----------------------------------------------------------
# MAIN TRAINING PIPELINE
# ----------------------------------------------------------
def run_training(df, max_rows, models_to_test):
    logs = []
    start = time.time()
    # Safety checks
    UNSUPPORTED = ["llama", "qwen", "phi", "gpt", "mistral", "mixtral"]
    TOO_LARGE = ["large", "xl", "xxl", "deberta-v3-large"]
    for m in models_to_test:
        if any(x in m.lower() for x in UNSUPPORTED):
            return {"logs": f"❌ Unsupported model: {m}", "results": {}}
        if any(x in m.lower() for x in TOO_LARGE):
            return {"logs": f"❌ Too large for CPU: {m}", "results": {}}
    # Limit dataset (copy so the cleaning below does not mutate a slice
    # of the caller's frame)
    df = df.head(max_rows).copy()
    df["sentence"] = clean_text(df["sentence"])
    df["sentiment"] = clean_text(df["sentiment"])
    df_train, df_val = train_test_split(
        df, test_size=0.2, stratify=df["sentiment"]
    )
    results = {}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for model_name in models_to_test:
        logs.append(f"\n🔵 Training {model_name}")
        tokenizer, ds_train, ds_val, labels = preprocess(
            df_train, df_val, model_name
        )
        # Load model safely
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            ignore_mismatched_sizes=True,  # Prevent weight-shape crash
        )
        # Try Trainer first
        try:
            from transformers import Trainer, TrainingArguments
            args = TrainingArguments(
                output_dir=f"out/{model_name}",
                num_train_epochs=3,
                per_device_train_batch_size=8,
                learning_rate=2e-5,
                save_strategy="no",
                logging_steps=50,
                disable_tqdm=True,
            )
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=ds_train,
            )
            trainer.train()
        except Exception as e:
            logs.append(f"Trainer failed → fallback: {e}")
            model = train_fallback(model, ds_train, device)
        metrics = evaluate(model, tokenizer, df_val, device, labels)
        results[model_name] = metrics
        # Save fine-tuned model
        save_dir = f"saved_models/{model_name.replace('/', '_')}"
        os.makedirs(save_dir, exist_ok=True)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        logs.append(f"💾 Saved model to {save_dir}")
    total = (time.time() - start) / 60
    logs.append(f"\n⏱ Total Time: {total:.2f} min")
    return {"logs": "\n".join(logs), "results": results}
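# ----------------------------------------------------------
# EXAMPLE USAGE (illustrative sketch, not part of the pipeline)
# ----------------------------------------------------------
# A minimal way to drive run_training, assuming a CSV with "sentence"
# and "sentiment" columns; "reviews.csv" and the row limit are
# hypothetical, and the checkpoint is one small CPU-friendly example.
if __name__ == "__main__":
    df = pd.read_csv("reviews.csv")  # hypothetical input file
    output = run_training(
        df,
        max_rows=500,
        models_to_test=["distilbert-base-uncased"],
    )
    print(output["logs"])
    print(output["results"])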