aliya-ra commited on
Commit
d5861d2
·
verified ·
1 Parent(s): 32e910f

Model save

Browse files
Files changed (3) hide show
  1. all_results.json +9 -0
  2. train_results.json +9 -0
  3. trainer_state.json +749 -0
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.7769582932142868,
5
+ "train_runtime": 3235.4669,
6
+ "train_samples": 422,
7
+ "train_samples_per_second": 0.12,
8
+ "train_steps_per_second": 0.03
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.7769582932142868,
5
+ "train_runtime": 3235.4669,
6
+ "train_samples": 422,
7
+ "train_samples_per_second": 0.12,
8
+ "train_steps_per_second": 0.03
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,749 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 25,
7
+ "global_step": 97,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0103359173126615,
14
+ "grad_norm": 3.4776461124420166,
15
+ "learning_rate": 0.0001,
16
+ "loss": 0.2022,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.020671834625323,
21
+ "grad_norm": 18.074466705322266,
22
+ "learning_rate": 9.896907216494846e-05,
23
+ "loss": 2.2463,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.031007751937984496,
28
+ "grad_norm": 25.333541870117188,
29
+ "learning_rate": 9.793814432989691e-05,
30
+ "loss": 3.0288,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.041343669250646,
35
+ "grad_norm": 5.342225551605225,
36
+ "learning_rate": 9.690721649484537e-05,
37
+ "loss": 0.2791,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.05167958656330749,
42
+ "grad_norm": 16.2089786529541,
43
+ "learning_rate": 9.587628865979382e-05,
44
+ "loss": 2.3307,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.06201550387596899,
49
+ "grad_norm": 7.590736389160156,
50
+ "learning_rate": 9.484536082474227e-05,
51
+ "loss": 0.5745,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.07235142118863049,
56
+ "grad_norm": 12.122525215148926,
57
+ "learning_rate": 9.381443298969073e-05,
58
+ "loss": 1.5927,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.082687338501292,
63
+ "grad_norm": 8.720907211303711,
64
+ "learning_rate": 9.278350515463918e-05,
65
+ "loss": 0.6878,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.09302325581395349,
70
+ "grad_norm": 16.62272834777832,
71
+ "learning_rate": 9.175257731958763e-05,
72
+ "loss": 0.6334,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.10335917312661498,
77
+ "grad_norm": 30.24419403076172,
78
+ "learning_rate": 9.072164948453609e-05,
79
+ "loss": 3.2154,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.11369509043927649,
84
+ "grad_norm": 6.775628089904785,
85
+ "learning_rate": 8.969072164948454e-05,
86
+ "loss": 0.4518,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.12403100775193798,
91
+ "grad_norm": 4.872361183166504,
92
+ "learning_rate": 8.865979381443299e-05,
93
+ "loss": 0.392,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.1343669250645995,
98
+ "grad_norm": 10.680950164794922,
99
+ "learning_rate": 8.762886597938145e-05,
100
+ "loss": 1.3972,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.14470284237726097,
105
+ "grad_norm": 2.141592264175415,
106
+ "learning_rate": 8.65979381443299e-05,
107
+ "loss": 0.1657,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.15503875968992248,
112
+ "grad_norm": 10.623418807983398,
113
+ "learning_rate": 8.556701030927835e-05,
114
+ "loss": 1.2684,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.165374677002584,
119
+ "grad_norm": 8.769332885742188,
120
+ "learning_rate": 8.453608247422681e-05,
121
+ "loss": 0.9647,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.17571059431524547,
126
+ "grad_norm": 9.13052749633789,
127
+ "learning_rate": 8.350515463917527e-05,
128
+ "loss": 0.96,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.18604651162790697,
133
+ "grad_norm": 2.1440060138702393,
134
+ "learning_rate": 8.247422680412371e-05,
135
+ "loss": 0.2028,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.19638242894056848,
140
+ "grad_norm": 6.703273773193359,
141
+ "learning_rate": 8.144329896907217e-05,
142
+ "loss": 0.3948,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.20671834625322996,
147
+ "grad_norm": 4.315890312194824,
148
+ "learning_rate": 8.041237113402063e-05,
149
+ "loss": 0.3812,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.21705426356589147,
154
+ "grad_norm": 3.0346481800079346,
155
+ "learning_rate": 7.938144329896907e-05,
156
+ "loss": 0.3261,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.22739018087855298,
161
+ "grad_norm": 9.196224212646484,
162
+ "learning_rate": 7.835051546391753e-05,
163
+ "loss": 1.0096,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.23772609819121446,
168
+ "grad_norm": 6.244184494018555,
169
+ "learning_rate": 7.731958762886599e-05,
170
+ "loss": 0.6852,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.24806201550387597,
175
+ "grad_norm": 9.198563575744629,
176
+ "learning_rate": 7.628865979381443e-05,
177
+ "loss": 1.3265,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.25839793281653745,
182
+ "grad_norm": 8.720876693725586,
183
+ "learning_rate": 7.525773195876289e-05,
184
+ "loss": 0.9248,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.25839793281653745,
189
+ "eval_accuracy": 0.43902439024390244,
190
+ "eval_loss": 0.9799439311027527,
191
+ "eval_runtime": 96.9318,
192
+ "eval_samples_per_second": 0.423,
193
+ "eval_steps_per_second": 0.217,
194
+ "step": 25
195
+ },
196
+ {
197
+ "epoch": 0.268733850129199,
198
+ "grad_norm": 5.846583843231201,
199
+ "learning_rate": 7.422680412371135e-05,
200
+ "loss": 0.4893,
201
+ "step": 26
202
+ },
203
+ {
204
+ "epoch": 0.27906976744186046,
205
+ "grad_norm": 6.042626857757568,
206
+ "learning_rate": 7.319587628865979e-05,
207
+ "loss": 0.6472,
208
+ "step": 27
209
+ },
210
+ {
211
+ "epoch": 0.28940568475452194,
212
+ "grad_norm": 5.381496429443359,
213
+ "learning_rate": 7.216494845360825e-05,
214
+ "loss": 0.581,
215
+ "step": 28
216
+ },
217
+ {
218
+ "epoch": 0.2997416020671835,
219
+ "grad_norm": 3.3443357944488525,
220
+ "learning_rate": 7.113402061855671e-05,
221
+ "loss": 0.464,
222
+ "step": 29
223
+ },
224
+ {
225
+ "epoch": 0.31007751937984496,
226
+ "grad_norm": 3.9191789627075195,
227
+ "learning_rate": 7.010309278350515e-05,
228
+ "loss": 0.3809,
229
+ "step": 30
230
+ },
231
+ {
232
+ "epoch": 0.32041343669250644,
233
+ "grad_norm": 4.756009578704834,
234
+ "learning_rate": 6.907216494845361e-05,
235
+ "loss": 0.5774,
236
+ "step": 31
237
+ },
238
+ {
239
+ "epoch": 0.330749354005168,
240
+ "grad_norm": 3.4911065101623535,
241
+ "learning_rate": 6.804123711340207e-05,
242
+ "loss": 0.2184,
243
+ "step": 32
244
+ },
245
+ {
246
+ "epoch": 0.34108527131782945,
247
+ "grad_norm": 8.012396812438965,
248
+ "learning_rate": 6.701030927835051e-05,
249
+ "loss": 0.7935,
250
+ "step": 33
251
+ },
252
+ {
253
+ "epoch": 0.35142118863049093,
254
+ "grad_norm": 11.829768180847168,
255
+ "learning_rate": 6.597938144329897e-05,
256
+ "loss": 1.2495,
257
+ "step": 34
258
+ },
259
+ {
260
+ "epoch": 0.36175710594315247,
261
+ "grad_norm": 5.451401710510254,
262
+ "learning_rate": 6.494845360824743e-05,
263
+ "loss": 0.6983,
264
+ "step": 35
265
+ },
266
+ {
267
+ "epoch": 0.37209302325581395,
268
+ "grad_norm": 5.636115550994873,
269
+ "learning_rate": 6.391752577319587e-05,
270
+ "loss": 0.7448,
271
+ "step": 36
272
+ },
273
+ {
274
+ "epoch": 0.38242894056847543,
275
+ "grad_norm": 3.40533185005188,
276
+ "learning_rate": 6.288659793814433e-05,
277
+ "loss": 0.3883,
278
+ "step": 37
279
+ },
280
+ {
281
+ "epoch": 0.39276485788113696,
282
+ "grad_norm": 4.675537586212158,
283
+ "learning_rate": 6.185567010309279e-05,
284
+ "loss": 0.3897,
285
+ "step": 38
286
+ },
287
+ {
288
+ "epoch": 0.40310077519379844,
289
+ "grad_norm": 6.9506731033325195,
290
+ "learning_rate": 6.0824742268041234e-05,
291
+ "loss": 1.0316,
292
+ "step": 39
293
+ },
294
+ {
295
+ "epoch": 0.4134366925064599,
296
+ "grad_norm": 6.975220680236816,
297
+ "learning_rate": 5.979381443298969e-05,
298
+ "loss": 0.9857,
299
+ "step": 40
300
+ },
301
+ {
302
+ "epoch": 0.42377260981912146,
303
+ "grad_norm": 5.598294734954834,
304
+ "learning_rate": 5.876288659793815e-05,
305
+ "loss": 0.7466,
306
+ "step": 41
307
+ },
308
+ {
309
+ "epoch": 0.43410852713178294,
310
+ "grad_norm": 6.038373947143555,
311
+ "learning_rate": 5.7731958762886594e-05,
312
+ "loss": 0.7051,
313
+ "step": 42
314
+ },
315
+ {
316
+ "epoch": 0.4444444444444444,
317
+ "grad_norm": 12.064945220947266,
318
+ "learning_rate": 5.670103092783505e-05,
319
+ "loss": 1.6664,
320
+ "step": 43
321
+ },
322
+ {
323
+ "epoch": 0.45478036175710596,
324
+ "grad_norm": 10.026833534240723,
325
+ "learning_rate": 5.567010309278351e-05,
326
+ "loss": 1.1244,
327
+ "step": 44
328
+ },
329
+ {
330
+ "epoch": 0.46511627906976744,
331
+ "grad_norm": 6.381928443908691,
332
+ "learning_rate": 5.4639175257731955e-05,
333
+ "loss": 0.796,
334
+ "step": 45
335
+ },
336
+ {
337
+ "epoch": 0.4754521963824289,
338
+ "grad_norm": 5.155933380126953,
339
+ "learning_rate": 5.360824742268041e-05,
340
+ "loss": 0.5749,
341
+ "step": 46
342
+ },
343
+ {
344
+ "epoch": 0.48578811369509045,
345
+ "grad_norm": 9.592456817626953,
346
+ "learning_rate": 5.257731958762887e-05,
347
+ "loss": 1.2015,
348
+ "step": 47
349
+ },
350
+ {
351
+ "epoch": 0.49612403100775193,
352
+ "grad_norm": 4.840389728546143,
353
+ "learning_rate": 5.1546391752577315e-05,
354
+ "loss": 0.3744,
355
+ "step": 48
356
+ },
357
+ {
358
+ "epoch": 0.5064599483204134,
359
+ "grad_norm": 5.06065034866333,
360
+ "learning_rate": 5.051546391752577e-05,
361
+ "loss": 0.753,
362
+ "step": 49
363
+ },
364
+ {
365
+ "epoch": 0.5167958656330749,
366
+ "grad_norm": 3.183397054672241,
367
+ "learning_rate": 4.948453608247423e-05,
368
+ "loss": 0.2727,
369
+ "step": 50
370
+ },
371
+ {
372
+ "epoch": 0.5167958656330749,
373
+ "eval_accuracy": 0.43902439024390244,
374
+ "eval_loss": 0.8627240061759949,
375
+ "eval_runtime": 114.166,
376
+ "eval_samples_per_second": 0.359,
377
+ "eval_steps_per_second": 0.184,
378
+ "step": 50
379
+ },
380
+ {
381
+ "epoch": 0.5271317829457365,
382
+ "grad_norm": 5.550570011138916,
383
+ "learning_rate": 4.845360824742268e-05,
384
+ "loss": 0.6691,
385
+ "step": 51
386
+ },
387
+ {
388
+ "epoch": 0.537467700258398,
389
+ "grad_norm": 3.9809603691101074,
390
+ "learning_rate": 4.7422680412371134e-05,
391
+ "loss": 0.3063,
392
+ "step": 52
393
+ },
394
+ {
395
+ "epoch": 0.5478036175710594,
396
+ "grad_norm": 3.5835907459259033,
397
+ "learning_rate": 4.639175257731959e-05,
398
+ "loss": 0.4591,
399
+ "step": 53
400
+ },
401
+ {
402
+ "epoch": 0.5581395348837209,
403
+ "grad_norm": 6.936642169952393,
404
+ "learning_rate": 4.536082474226804e-05,
405
+ "loss": 0.6994,
406
+ "step": 54
407
+ },
408
+ {
409
+ "epoch": 0.5684754521963824,
410
+ "grad_norm": 5.786340713500977,
411
+ "learning_rate": 4.4329896907216494e-05,
412
+ "loss": 0.8737,
413
+ "step": 55
414
+ },
415
+ {
416
+ "epoch": 0.5788113695090439,
417
+ "grad_norm": 4.627039909362793,
418
+ "learning_rate": 4.329896907216495e-05,
419
+ "loss": 0.7124,
420
+ "step": 56
421
+ },
422
+ {
423
+ "epoch": 0.5891472868217055,
424
+ "grad_norm": 2.9738147258758545,
425
+ "learning_rate": 4.2268041237113404e-05,
426
+ "loss": 0.2404,
427
+ "step": 57
428
+ },
429
+ {
430
+ "epoch": 0.599483204134367,
431
+ "grad_norm": 4.692768096923828,
432
+ "learning_rate": 4.1237113402061855e-05,
433
+ "loss": 0.5189,
434
+ "step": 58
435
+ },
436
+ {
437
+ "epoch": 0.6098191214470284,
438
+ "grad_norm": 4.4202561378479,
439
+ "learning_rate": 4.020618556701031e-05,
440
+ "loss": 0.5649,
441
+ "step": 59
442
+ },
443
+ {
444
+ "epoch": 0.6201550387596899,
445
+ "grad_norm": 10.227799415588379,
446
+ "learning_rate": 3.9175257731958764e-05,
447
+ "loss": 1.4142,
448
+ "step": 60
449
+ },
450
+ {
451
+ "epoch": 0.6304909560723514,
452
+ "grad_norm": 3.4684948921203613,
453
+ "learning_rate": 3.8144329896907216e-05,
454
+ "loss": 0.4295,
455
+ "step": 61
456
+ },
457
+ {
458
+ "epoch": 0.6408268733850129,
459
+ "grad_norm": 3.624891519546509,
460
+ "learning_rate": 3.7113402061855674e-05,
461
+ "loss": 0.5815,
462
+ "step": 62
463
+ },
464
+ {
465
+ "epoch": 0.6511627906976745,
466
+ "grad_norm": 4.949793815612793,
467
+ "learning_rate": 3.6082474226804125e-05,
468
+ "loss": 0.7469,
469
+ "step": 63
470
+ },
471
+ {
472
+ "epoch": 0.661498708010336,
473
+ "grad_norm": 5.192891597747803,
474
+ "learning_rate": 3.5051546391752576e-05,
475
+ "loss": 0.6927,
476
+ "step": 64
477
+ },
478
+ {
479
+ "epoch": 0.6718346253229974,
480
+ "grad_norm": 5.358509063720703,
481
+ "learning_rate": 3.4020618556701034e-05,
482
+ "loss": 0.6752,
483
+ "step": 65
484
+ },
485
+ {
486
+ "epoch": 0.6821705426356589,
487
+ "grad_norm": 4.354433536529541,
488
+ "learning_rate": 3.2989690721649485e-05,
489
+ "loss": 0.5655,
490
+ "step": 66
491
+ },
492
+ {
493
+ "epoch": 0.6925064599483204,
494
+ "grad_norm": 5.125238418579102,
495
+ "learning_rate": 3.1958762886597937e-05,
496
+ "loss": 0.4504,
497
+ "step": 67
498
+ },
499
+ {
500
+ "epoch": 0.7028423772609819,
501
+ "grad_norm": 2.409564256668091,
502
+ "learning_rate": 3.0927835051546395e-05,
503
+ "loss": 0.1813,
504
+ "step": 68
505
+ },
506
+ {
507
+ "epoch": 0.7131782945736435,
508
+ "grad_norm": 5.781549453735352,
509
+ "learning_rate": 2.9896907216494846e-05,
510
+ "loss": 0.9296,
511
+ "step": 69
512
+ },
513
+ {
514
+ "epoch": 0.7235142118863049,
515
+ "grad_norm": 4.349827289581299,
516
+ "learning_rate": 2.8865979381443297e-05,
517
+ "loss": 0.7134,
518
+ "step": 70
519
+ },
520
+ {
521
+ "epoch": 0.7338501291989664,
522
+ "grad_norm": 2.254243850708008,
523
+ "learning_rate": 2.7835051546391755e-05,
524
+ "loss": 0.2136,
525
+ "step": 71
526
+ },
527
+ {
528
+ "epoch": 0.7441860465116279,
529
+ "grad_norm": 4.898633003234863,
530
+ "learning_rate": 2.6804123711340206e-05,
531
+ "loss": 0.44,
532
+ "step": 72
533
+ },
534
+ {
535
+ "epoch": 0.7545219638242894,
536
+ "grad_norm": 2.420814275741577,
537
+ "learning_rate": 2.5773195876288658e-05,
538
+ "loss": 0.1553,
539
+ "step": 73
540
+ },
541
+ {
542
+ "epoch": 0.7648578811369509,
543
+ "grad_norm": 11.088196754455566,
544
+ "learning_rate": 2.4742268041237116e-05,
545
+ "loss": 1.8921,
546
+ "step": 74
547
+ },
548
+ {
549
+ "epoch": 0.7751937984496124,
550
+ "grad_norm": 7.634116172790527,
551
+ "learning_rate": 2.3711340206185567e-05,
552
+ "loss": 1.2523,
553
+ "step": 75
554
+ },
555
+ {
556
+ "epoch": 0.7751937984496124,
557
+ "eval_accuracy": 0.4634146341463415,
558
+ "eval_loss": 0.8188944458961487,
559
+ "eval_runtime": 116.0417,
560
+ "eval_samples_per_second": 0.353,
561
+ "eval_steps_per_second": 0.181,
562
+ "step": 75
563
+ },
564
+ {
565
+ "epoch": 0.7855297157622739,
566
+ "grad_norm": 2.6209282875061035,
567
+ "learning_rate": 2.268041237113402e-05,
568
+ "loss": 0.2504,
569
+ "step": 76
570
+ },
571
+ {
572
+ "epoch": 0.7958656330749354,
573
+ "grad_norm": 6.270251750946045,
574
+ "learning_rate": 2.1649484536082476e-05,
575
+ "loss": 1.0045,
576
+ "step": 77
577
+ },
578
+ {
579
+ "epoch": 0.8062015503875969,
580
+ "grad_norm": 9.073151588439941,
581
+ "learning_rate": 2.0618556701030927e-05,
582
+ "loss": 1.2293,
583
+ "step": 78
584
+ },
585
+ {
586
+ "epoch": 0.8165374677002584,
587
+ "grad_norm": 7.216558933258057,
588
+ "learning_rate": 1.9587628865979382e-05,
589
+ "loss": 1.1463,
590
+ "step": 79
591
+ },
592
+ {
593
+ "epoch": 0.8268733850129198,
594
+ "grad_norm": 5.713375568389893,
595
+ "learning_rate": 1.8556701030927837e-05,
596
+ "loss": 0.6553,
597
+ "step": 80
598
+ },
599
+ {
600
+ "epoch": 0.8372093023255814,
601
+ "grad_norm": 6.03870964050293,
602
+ "learning_rate": 1.7525773195876288e-05,
603
+ "loss": 0.8425,
604
+ "step": 81
605
+ },
606
+ {
607
+ "epoch": 0.8475452196382429,
608
+ "grad_norm": 4.563985347747803,
609
+ "learning_rate": 1.6494845360824743e-05,
610
+ "loss": 0.5306,
611
+ "step": 82
612
+ },
613
+ {
614
+ "epoch": 0.8578811369509044,
615
+ "grad_norm": 4.291099548339844,
616
+ "learning_rate": 1.5463917525773197e-05,
617
+ "loss": 0.6263,
618
+ "step": 83
619
+ },
620
+ {
621
+ "epoch": 0.8682170542635659,
622
+ "grad_norm": 3.6169426441192627,
623
+ "learning_rate": 1.4432989690721649e-05,
624
+ "loss": 0.4065,
625
+ "step": 84
626
+ },
627
+ {
628
+ "epoch": 0.8785529715762274,
629
+ "grad_norm": 3.3586318492889404,
630
+ "learning_rate": 1.3402061855670103e-05,
631
+ "loss": 0.5428,
632
+ "step": 85
633
+ },
634
+ {
635
+ "epoch": 0.8888888888888888,
636
+ "grad_norm": 4.2463226318359375,
637
+ "learning_rate": 1.2371134020618558e-05,
638
+ "loss": 0.7002,
639
+ "step": 86
640
+ },
641
+ {
642
+ "epoch": 0.8992248062015504,
643
+ "grad_norm": 4.7999958992004395,
644
+ "learning_rate": 1.134020618556701e-05,
645
+ "loss": 0.7588,
646
+ "step": 87
647
+ },
648
+ {
649
+ "epoch": 0.9095607235142119,
650
+ "grad_norm": 4.986218452453613,
651
+ "learning_rate": 1.0309278350515464e-05,
652
+ "loss": 0.5885,
653
+ "step": 88
654
+ },
655
+ {
656
+ "epoch": 0.9198966408268734,
657
+ "grad_norm": 4.087247848510742,
658
+ "learning_rate": 9.278350515463918e-06,
659
+ "loss": 0.4967,
660
+ "step": 89
661
+ },
662
+ {
663
+ "epoch": 0.9302325581395349,
664
+ "grad_norm": 10.784661293029785,
665
+ "learning_rate": 8.247422680412371e-06,
666
+ "loss": 1.0732,
667
+ "step": 90
668
+ },
669
+ {
670
+ "epoch": 0.9405684754521964,
671
+ "grad_norm": 4.600091934204102,
672
+ "learning_rate": 7.216494845360824e-06,
673
+ "loss": 0.4195,
674
+ "step": 91
675
+ },
676
+ {
677
+ "epoch": 0.9509043927648578,
678
+ "grad_norm": 5.874855041503906,
679
+ "learning_rate": 6.185567010309279e-06,
680
+ "loss": 0.7885,
681
+ "step": 92
682
+ },
683
+ {
684
+ "epoch": 0.9612403100775194,
685
+ "grad_norm": 4.497281074523926,
686
+ "learning_rate": 5.154639175257732e-06,
687
+ "loss": 0.8218,
688
+ "step": 93
689
+ },
690
+ {
691
+ "epoch": 0.9715762273901809,
692
+ "grad_norm": 5.87256383895874,
693
+ "learning_rate": 4.123711340206186e-06,
694
+ "loss": 0.6213,
695
+ "step": 94
696
+ },
697
+ {
698
+ "epoch": 0.9819121447028424,
699
+ "grad_norm": 5.737720966339111,
700
+ "learning_rate": 3.0927835051546395e-06,
701
+ "loss": 0.876,
702
+ "step": 95
703
+ },
704
+ {
705
+ "epoch": 0.9922480620155039,
706
+ "grad_norm": 4.816653728485107,
707
+ "learning_rate": 2.061855670103093e-06,
708
+ "loss": 0.5846,
709
+ "step": 96
710
+ },
711
+ {
712
+ "epoch": 1.0,
713
+ "grad_norm": 5.251804351806641,
714
+ "learning_rate": 1.0309278350515464e-06,
715
+ "loss": 0.5277,
716
+ "step": 97
717
+ },
718
+ {
719
+ "epoch": 1.0,
720
+ "step": 97,
721
+ "total_flos": 0.0,
722
+ "train_loss": 0.7769582932142868,
723
+ "train_runtime": 3235.4669,
724
+ "train_samples_per_second": 0.12,
725
+ "train_steps_per_second": 0.03
726
+ }
727
+ ],
728
+ "logging_steps": 1,
729
+ "max_steps": 97,
730
+ "num_input_tokens_seen": 0,
731
+ "num_train_epochs": 1,
732
+ "save_steps": 100,
733
+ "stateful_callbacks": {
734
+ "TrainerControl": {
735
+ "args": {
736
+ "should_epoch_stop": false,
737
+ "should_evaluate": false,
738
+ "should_log": false,
739
+ "should_save": true,
740
+ "should_training_stop": true
741
+ },
742
+ "attributes": {}
743
+ }
744
+ },
745
+ "total_flos": 0.0,
746
+ "train_batch_size": 1,
747
+ "trial_name": null,
748
+ "trial_params": null
749
+ }