# train.py
import os
import time
import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import AutoTokenizer, AutoModelForSequenceClassification


# ----------------------------------------------------------
# UTILITIES
# ----------------------------------------------------------
def clean_text(series):
    # Fill missing values before the string cast (otherwise NaN becomes
    # the literal string "nan"), then drop stray "nan"/"None" values.
    return (
        series.fillna("")
        .astype(str)
        .replace({"nan": "", "None": ""})
    )


# ----------------------------------------------------------
# PREPROCESSING
# ----------------------------------------------------------
def preprocess(df_train, df_val, model_name, max_length=256):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Convert to simple dataset with only the needed columns
    ds_train = Dataset.from_pandas(df_train[["sentence", "sentiment"]])
    ds_val = Dataset.from_pandas(df_val[["sentence", "sentiment"]])

    # Tokenization
    def tokenize(batch):
        return tokenizer(
            [str(x) for x in batch["sentence"]],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    ds_train = ds_train.map(tokenize, batched=True)
    ds_val = ds_val.map(tokenize, batched=True)

    # Rename sentiment to labels
    ds_train = ds_train.rename_column("sentiment", "labels")
    ds_val = ds_val.rename_column("sentiment", "labels")

    # Encode labels; class_encode_column sorts the label names, so the
    # stratified train/val splits end up with the same string→id mapping
    ds_train = ds_train.class_encode_column("labels")
    ds_val = ds_val.class_encode_column("labels")

    # KEEP ONLY model-expected fields
    keep_cols = {"input_ids", "attention_mask", "labels"}
    remove_cols = set(ds_train.column_names) - keep_cols

    ds_train = ds_train.remove_columns(list(remove_cols))
    ds_val = ds_val.remove_columns(list(remove_cols))

    label_list = ds_train.features["labels"].names
    return tokenizer, ds_train, ds_val, label_list


# ----------------------------------------------------------
# FALLBACK PYTORCH TRAINING
# ----------------------------------------------------------
def train_fallback(model, ds_train, device):
    from torch.optim import AdamW
    from torch.utils.data import DataLoader

    # Fallback hyperparameters
    batch_size = 8
    lr = 2e-5
    epochs = 3

    # Materialize each example as a plain dict with only the fields the
    # model expects (fine for small datasets; everything stays in memory)
    def clean_record(r):
        return {
            "input_ids": r["input_ids"],
            "attention_mask": r["attention_mask"],
            "labels": r["labels"],
        }

    train_data = [clean_record(ds_train[i]) for i in range(len(ds_train))]

    def collate(batch):
        return {
            key: torch.tensor([b[key] for b in batch], dtype=torch.long)
            for key in batch[0]
        }

    loader = DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate
    )

    optimizer = AdamW(model.parameters(), lr=lr)

    model.to(device)
    model.train()

    for epoch in range(1, epochs + 1):
        print(f"Epoch {epoch}/{epochs}")
        loop = tqdm(loader)
        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loop.set_postfix(loss=float(loss))

    return model


# ----------------------------------------------------------
# EVALUATION
# ----------------------------------------------------------
def evaluate(model, tokenizer, df_val, device, label_list):
    preds, golds = [], []
    label2id = {label: i for i, label in enumerate(label_list)}

    model.eval()
    model.to(device)

    for _, row in df_val.iterrows():
        inputs = tokenizer(
            row["sentence"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256
        ).to(device)

        with torch.no_grad():
            logits = model(**inputs).logits

        pred = int(logits.argmax(dim=-1))
        gold = int(label2id[row["sentiment"]])

        preds.append(pred)
        golds.append(gold)

    acc = accuracy_score(golds, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        golds, preds, average="weighted", zero_division=0
    )

    return {
        "accuracy": float(acc),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
        "total_eval_rows": len(df_val),
    }


# ----------------------------------------------------------
# MAIN TRAINING PIPELINE
# ----------------------------------------------------------
def run_training(df, max_rows, models_to_test):
    logs = []
    start = time.time()

    # Safety checks
    UNSUPPORTED = ["llama", "qwen", "phi", "gpt", "mistral", "mixtral"]
    TOO_LARGE = ["large", "xl", "xxl", "deberta-v3-large"]

    for m in models_to_test:
        if any(x in m.lower() for x in UNSUPPORTED):
            return {"logs": f"❌ Unsupported model: {m}", "results": {}}
        if any(x in m.lower() for x in TOO_LARGE):
            return {"logs": f"❌ Too large for CPU: {m}", "results": {}}

    # Limit dataset (copy so the column assignments below do not
    # trigger pandas' SettingWithCopyWarning on a slice)
    df = df.head(max_rows).copy()
    df["sentence"] = clean_text(df["sentence"])
    df["sentiment"] = clean_text(df["sentiment"])

    df_train, df_val = train_test_split(
        df, test_size=0.2, stratify=df["sentiment"]
    )

    results = {}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for model_name in models_to_test:
        logs.append(f"\n🔵 Training {model_name}")

        tokenizer, ds_train, ds_val, labels = preprocess(
            df_train, df_val, model_name
        )

        # Load model safely; ignore_mismatched_sizes re-initializes the
        # classification head instead of crashing on a weight-shape mismatch
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            ignore_mismatched_sizes=True,
        )

        # Try Trainer first
        try:
            from transformers import Trainer, TrainingArguments

            args = TrainingArguments(
                output_dir=f"out/{model_name}",
                num_train_epochs=3,
                per_device_train_batch_size=8,
                learning_rate=2e-5,
                save_strategy="no",
                logging_steps=50,
                disable_tqdm=True
            )

            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=ds_train
            )

            trainer.train()

        except Exception as e:
            logs.append(f"Trainer failed → fallback: {e}")
            model = train_fallback(model, ds_train, device)

        metrics = evaluate(model, tokenizer, df_val, device, labels)
        results[model_name] = metrics

        # Save fine-tuned model
        save_dir = f"saved_models/{model_name.replace('/', '_')}"
        os.makedirs(save_dir, exist_ok=True)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)

        logs.append(f"💾 Saved model to {save_dir}")

    total = (time.time() - start) / 60
    logs.append(f"\n⏱ Total Time: {total:.2f} min")

    return {"logs": "\n".join(logs), "results": results}