{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 470,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0425531914893617,
      "grad_norm": 0.4270687699317932,
      "learning_rate": 0.00019999602855426865,
      "loss": 1.013,
      "step": 10
    },
    {
      "epoch": 0.0851063829787234,
      "grad_norm": 0.4152718782424927,
      "learning_rate": 0.00019998411453252217,
      "loss": 0.8289,
      "step": 20
    },
    {
      "epoch": 0.1276595744680851,
      "grad_norm": 0.7277560234069824,
      "learning_rate": 0.0001999642588810784,
      "loss": 0.5959,
      "step": 30
    },
    {
      "epoch": 0.1702127659574468,
      "grad_norm": 0.5505673885345459,
      "learning_rate": 0.00019993646317705016,
      "loss": 0.459,
      "step": 40
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 0.528052031993866,
      "learning_rate": 0.00019990072962822007,
      "loss": 0.3775,
      "step": 50
    },
    {
      "epoch": 0.2553191489361702,
      "grad_norm": 0.6307681202888489,
      "learning_rate": 0.00019985706107286514,
      "loss": 0.3285,
      "step": 60
    },
    {
      "epoch": 0.2978723404255319,
      "grad_norm": 0.6954013109207153,
      "learning_rate": 0.00019980546097953132,
      "loss": 0.2855,
      "step": 70
    },
    {
      "epoch": 0.3404255319148936,
      "grad_norm": 0.6790465116500854,
      "learning_rate": 0.000199745933446758,
      "loss": 0.2782,
      "step": 80
    },
    {
      "epoch": 0.3829787234042553,
      "grad_norm": 1.324937105178833,
      "learning_rate": 0.0001996784832027525,
      "loss": 0.2635,
      "step": 90
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 0.8779314756393433,
      "learning_rate": 0.00019960311560501454,
      "loss": 0.1861,
      "step": 100
    },
    {
      "epoch": 0.46808510638297873,
      "grad_norm": 0.693745493888855,
      "learning_rate": 0.00019951983663991056,
      "loss": 0.2001,
      "step": 110
    },
    {
      "epoch": 0.5106382978723404,
      "grad_norm": 1.0649502277374268,
      "learning_rate": 0.00019942865292219838,
      "loss": 0.1378,
      "step": 120
    },
    {
      "epoch": 0.5531914893617021,
      "grad_norm": 0.6962260007858276,
      "learning_rate": 0.0001993295716945017,
      "loss": 0.1579,
      "step": 130
    },
    {
      "epoch": 0.5957446808510638,
      "grad_norm": 0.7934479713439941,
      "learning_rate": 0.00019922260082673497,
      "loss": 0.092,
      "step": 140
    },
    {
      "epoch": 0.6382978723404256,
      "grad_norm": 1.1331907510757446,
      "learning_rate": 0.000199107748815478,
      "loss": 0.1208,
      "step": 150
    },
    {
      "epoch": 0.6808510638297872,
      "grad_norm": 1.3689247369766235,
      "learning_rate": 0.00019898502478330152,
      "loss": 0.0874,
      "step": 160
    },
    {
      "epoch": 0.723404255319149,
      "grad_norm": 0.5304535031318665,
      "learning_rate": 0.00019885443847804211,
      "loss": 0.0881,
      "step": 170
    },
    {
      "epoch": 0.7659574468085106,
      "grad_norm": 0.6805845499038696,
      "learning_rate": 0.0001987160002720283,
      "loss": 0.0584,
      "step": 180
    },
    {
      "epoch": 0.8085106382978723,
      "grad_norm": 0.2527499198913574,
      "learning_rate": 0.00019856972116125653,
      "loss": 0.08,
      "step": 190
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 0.799462616443634,
      "learning_rate": 0.0001984156127645178,
      "loss": 0.0556,
      "step": 200
    },
    {
      "epoch": 0.8936170212765957,
      "grad_norm": 0.936975359916687,
      "learning_rate": 0.0001982536873224748,
      "loss": 0.0945,
      "step": 210
    },
    {
      "epoch": 0.9361702127659575,
      "grad_norm": 0.8067993521690369,
      "learning_rate": 0.00019808395769668963,
      "loss": 0.0495,
      "step": 220
    },
    {
      "epoch": 0.9787234042553191,
      "grad_norm": 0.45767834782600403,
      "learning_rate": 0.00019790643736860227,
      "loss": 0.0617,
      "step": 230
    },
    {
      "epoch": 1.0212765957446808,
      "grad_norm": 0.9198794364929199,
      "learning_rate": 0.00019772114043845965,
      "loss": 0.0467,
      "step": 240
    },
    {
      "epoch": 1.0638297872340425,
      "grad_norm": 0.7327796816825867,
      "learning_rate": 0.0001975280816241959,
      "loss": 0.0391,
      "step": 250
    },
    {
      "epoch": 1.1063829787234043,
      "grad_norm": 0.8003076910972595,
      "learning_rate": 0.00019732727626026305,
      "loss": 0.0428,
      "step": 260
    },
    {
      "epoch": 1.148936170212766,
      "grad_norm": 0.10251367837190628,
      "learning_rate": 0.0001971187402964132,
      "loss": 0.032,
      "step": 270
    },
    {
      "epoch": 1.1914893617021276,
      "grad_norm": 0.45093855261802673,
      "learning_rate": 0.00019690249029643162,
      "loss": 0.0673,
      "step": 280
    },
    {
      "epoch": 1.2340425531914894,
      "grad_norm": 0.4845767915248871,
      "learning_rate": 0.0001966785434368211,
      "loss": 0.033,
      "step": 290
    },
    {
      "epoch": 1.2765957446808511,
      "grad_norm": 0.31195056438446045,
      "learning_rate": 0.00019644691750543767,
      "loss": 0.0261,
      "step": 300
    },
    {
      "epoch": 1.3191489361702127,
      "grad_norm": 0.14839951694011688,
      "learning_rate": 0.00019620763090007762,
      "loss": 0.0298,
      "step": 310
    },
    {
      "epoch": 1.3617021276595744,
      "grad_norm": 0.20573872327804565,
      "learning_rate": 0.00019596070262701626,
      "loss": 0.0155,
      "step": 320
    },
    {
      "epoch": 1.4042553191489362,
      "grad_norm": 0.47702595591545105,
      "learning_rate": 0.00019570615229949842,
      "loss": 0.0369,
      "step": 330
    },
    {
      "epoch": 1.4468085106382977,
      "grad_norm": 0.7073186039924622,
      "learning_rate": 0.00019544400013618023,
      "loss": 0.0302,
      "step": 340
    },
    {
      "epoch": 1.4893617021276595,
      "grad_norm": 0.1539478451013565,
      "learning_rate": 0.00019517426695952358,
      "loss": 0.0223,
      "step": 350
    },
    {
      "epoch": 1.5319148936170213,
      "grad_norm": 0.5202814340591431,
      "learning_rate": 0.00019489697419414182,
      "loss": 0.0263,
      "step": 360
    },
    {
      "epoch": 1.574468085106383,
      "grad_norm": 0.968192458152771,
      "learning_rate": 0.00019461214386509842,
      "loss": 0.044,
      "step": 370
    },
    {
      "epoch": 1.6170212765957448,
      "grad_norm": 0.5662522912025452,
      "learning_rate": 0.00019431979859615726,
      "loss": 0.0421,
      "step": 380
    },
    {
      "epoch": 1.6595744680851063,
      "grad_norm": 0.42925137281417847,
      "learning_rate": 0.00019401996160798573,
      "loss": 0.0606,
      "step": 390
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 0.5803830027580261,
      "learning_rate": 0.00019371265671631037,
      "loss": 0.0392,
      "step": 400
    },
    {
      "epoch": 1.7446808510638299,
      "grad_norm": 0.4235450327396393,
      "learning_rate": 0.00019339790833002515,
      "loss": 0.0286,
      "step": 410
    },
    {
      "epoch": 1.7872340425531914,
      "grad_norm": 0.519207775592804,
      "learning_rate": 0.00019307574144925287,
      "loss": 0.0522,
      "step": 420
    },
    {
      "epoch": 1.8297872340425532,
      "grad_norm": 0.2344844490289688,
      "learning_rate": 0.00019274618166335912,
      "loss": 0.0281,
      "step": 430
    },
    {
      "epoch": 1.872340425531915,
      "grad_norm": 0.1990007758140564,
      "learning_rate": 0.00019240925514892,
      "loss": 0.0229,
      "step": 440
    },
    {
      "epoch": 1.9148936170212765,
      "grad_norm": 0.10929415374994278,
      "learning_rate": 0.00019206498866764288,
      "loss": 0.0258,
      "step": 450
    },
    {
      "epoch": 1.9574468085106385,
      "grad_norm": 0.4308103024959564,
      "learning_rate": 0.00019171340956424074,
      "loss": 0.0167,
      "step": 460
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.46525439620018005,
      "learning_rate": 0.0001913545457642601,
      "loss": 0.0283,
      "step": 470
    }
  ],
  "logging_steps": 10,
  "max_steps": 3525,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.889350100631552e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}