diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17080 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 30, + "global_step": 2346, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00128, + "grad_norm": 0.02388751693069935, + "learning_rate": 0.0, + "loss": 0.0005, + "step": 1 + }, + { + "epoch": 0.00256, + "grad_norm": 0.02199248969554901, + "learning_rate": 5.633802816901409e-07, + "loss": 0.0004, + "step": 2 + }, + { + "epoch": 0.00384, + "grad_norm": 0.019646070897579193, + "learning_rate": 1.1267605633802817e-06, + "loss": 0.0004, + "step": 3 + }, + { + "epoch": 0.00512, + "grad_norm": 0.020258139818906784, + "learning_rate": 1.6901408450704227e-06, + "loss": 0.0004, + "step": 4 + }, + { + "epoch": 0.0064, + "grad_norm": 0.01908482424914837, + "learning_rate": 2.2535211267605635e-06, + "loss": 0.0004, + "step": 5 + }, + { + "epoch": 0.00768, + "grad_norm": 0.015439272858202457, + "learning_rate": 2.8169014084507046e-06, + "loss": 0.0004, + "step": 6 + }, + { + "epoch": 0.00896, + "grad_norm": 0.013040938414633274, + "learning_rate": 3.3802816901408454e-06, + "loss": 0.0004, + "step": 7 + }, + { + "epoch": 0.01024, + "grad_norm": 0.01280244067311287, + "learning_rate": 3.943661971830986e-06, + "loss": 0.0004, + "step": 8 + }, + { + "epoch": 0.01152, + "grad_norm": 0.010930157266557217, + "learning_rate": 4.507042253521127e-06, + "loss": 0.0004, + "step": 9 + }, + { + "epoch": 0.0128, + "grad_norm": 0.011020738631486893, + "learning_rate": 5.070422535211268e-06, + "loss": 0.0004, + "step": 10 + }, + { + "epoch": 0.01408, + "grad_norm": 0.011919221840798855, + "learning_rate": 5.633802816901409e-06, + "loss": 0.0004, + "step": 11 + }, + { + "epoch": 0.01536, + "grad_norm": 0.016522666439414024, + "learning_rate": 6.197183098591549e-06, + "loss": 0.0004, + "step": 12 + }, + { + "epoch": 0.01664, + "grad_norm": 0.016699526458978653, + "learning_rate": 6.760563380281691e-06, + "loss": 0.0004, + "step": 13 + }, + { + "epoch": 0.01792, + "grad_norm": 0.01769128441810608, + "learning_rate": 7.3239436619718316e-06, + "loss": 0.0004, + "step": 14 + }, + { + "epoch": 0.0192, + "grad_norm": 0.018868286162614822, + "learning_rate": 7.887323943661972e-06, + "loss": 0.0004, + "step": 15 + }, + { + "epoch": 0.02048, + "grad_norm": 0.017053455114364624, + "learning_rate": 8.450704225352114e-06, + "loss": 0.0004, + "step": 16 + }, + { + "epoch": 0.02176, + "grad_norm": 0.01509174332022667, + "learning_rate": 9.014084507042254e-06, + "loss": 0.0004, + "step": 17 + }, + { + "epoch": 0.02304, + "grad_norm": 0.011613738723099232, + "learning_rate": 9.577464788732394e-06, + "loss": 0.0003, + "step": 18 + }, + { + "epoch": 0.02432, + "grad_norm": 0.006196283735334873, + "learning_rate": 1.0140845070422535e-05, + "loss": 0.0004, + "step": 19 + }, + { + "epoch": 0.0256, + "grad_norm": 0.0016393736004829407, + "learning_rate": 1.0704225352112675e-05, + "loss": 0.0003, + "step": 20 + }, + { + "epoch": 0.02688, + "grad_norm": 0.004450325388461351, + "learning_rate": 1.1267605633802819e-05, + "loss": 0.0003, + "step": 21 + }, + { + "epoch": 0.02816, + "grad_norm": 0.010002792812883854, + "learning_rate": 1.1830985915492958e-05, + "loss": 0.0003, + "step": 22 + }, + { + "epoch": 0.02944, + "grad_norm": 0.011458332650363445, + "learning_rate": 1.2394366197183098e-05, + "loss": 0.0003, + "step": 23 + }, + { + "epoch": 0.03072, + "grad_norm": 0.011857348494231701, + "learning_rate": 1.2957746478873242e-05, + "loss": 0.0003, + "step": 24 + }, + { + "epoch": 0.032, + "grad_norm": 0.012468946166336536, + "learning_rate": 1.3521126760563382e-05, + "loss": 0.0003, + "step": 25 + }, + { + "epoch": 0.03328, + "grad_norm": 0.010764599777758121, + "learning_rate": 1.4084507042253522e-05, + "loss": 0.0003, + "step": 26 + }, + { + "epoch": 0.03456, + "grad_norm": 0.007388254161924124, + "learning_rate": 1.4647887323943663e-05, + "loss": 0.0003, + "step": 27 + }, + { + "epoch": 0.03584, + "grad_norm": 0.002933664945885539, + "learning_rate": 1.5211267605633803e-05, + "loss": 0.0003, + "step": 28 + }, + { + "epoch": 0.03712, + "grad_norm": 0.001106169424019754, + "learning_rate": 1.5774647887323945e-05, + "loss": 0.0003, + "step": 29 + }, + { + "epoch": 0.0384, + "grad_norm": 0.004174572881311178, + "learning_rate": 1.6338028169014086e-05, + "loss": 0.0003, + "step": 30 + }, + { + "epoch": 0.0384, + "eval_loss": 1.3955299854278564, + "eval_runtime": 45.1592, + "eval_samples_per_second": 11.116, + "eval_steps_per_second": 1.395, + "step": 30 + }, + { + "epoch": 0.03968, + "grad_norm": 0.0063258083537220955, + "learning_rate": 1.6901408450704228e-05, + "loss": 0.0003, + "step": 31 + }, + { + "epoch": 0.04096, + "grad_norm": 0.007857787422835827, + "learning_rate": 1.7464788732394366e-05, + "loss": 0.0003, + "step": 32 + }, + { + "epoch": 0.04224, + "grad_norm": 0.00830362644046545, + "learning_rate": 1.8028169014084508e-05, + "loss": 0.0003, + "step": 33 + }, + { + "epoch": 0.04352, + "grad_norm": 0.007050361018627882, + "learning_rate": 1.859154929577465e-05, + "loss": 0.0003, + "step": 34 + }, + { + "epoch": 0.0448, + "grad_norm": 0.004895492922514677, + "learning_rate": 1.9154929577464788e-05, + "loss": 0.0003, + "step": 35 + }, + { + "epoch": 0.04608, + "grad_norm": 0.0021485399920493364, + "learning_rate": 1.9718309859154933e-05, + "loss": 0.0003, + "step": 36 + }, + { + "epoch": 0.04736, + "grad_norm": 0.0009553782292641699, + "learning_rate": 2.028169014084507e-05, + "loss": 0.0003, + "step": 37 + }, + { + "epoch": 0.04864, + "grad_norm": 0.0037018507719039917, + "learning_rate": 2.0845070422535212e-05, + "loss": 0.0003, + "step": 38 + }, + { + "epoch": 0.04992, + "grad_norm": 0.006912225391715765, + "learning_rate": 2.140845070422535e-05, + "loss": 0.0003, + "step": 39 + }, + { + "epoch": 0.0512, + "grad_norm": 0.004844357259571552, + "learning_rate": 2.1971830985915496e-05, + "loss": 0.0002, + "step": 40 + }, + { + "epoch": 0.05248, + "grad_norm": 0.005295700393617153, + "learning_rate": 2.2535211267605637e-05, + "loss": 0.0003, + "step": 41 + }, + { + "epoch": 0.05376, + "grad_norm": 0.003574399510398507, + "learning_rate": 2.3098591549295775e-05, + "loss": 0.0002, + "step": 42 + }, + { + "epoch": 0.05504, + "grad_norm": 0.001562813762575388, + "learning_rate": 2.3661971830985917e-05, + "loss": 0.0003, + "step": 43 + }, + { + "epoch": 0.05632, + "grad_norm": 0.0006343662971630692, + "learning_rate": 2.422535211267606e-05, + "loss": 0.0003, + "step": 44 + }, + { + "epoch": 0.0576, + "grad_norm": 0.00290254526771605, + "learning_rate": 2.4788732394366197e-05, + "loss": 0.0003, + "step": 45 + }, + { + "epoch": 0.05888, + "grad_norm": 0.0033715504687279463, + "learning_rate": 2.5352112676056342e-05, + "loss": 0.0003, + "step": 46 + }, + { + "epoch": 0.06016, + "grad_norm": 0.0034714580979198217, + "learning_rate": 2.5915492957746483e-05, + "loss": 0.0002, + "step": 47 + }, + { + "epoch": 0.06144, + "grad_norm": 0.003244715742766857, + "learning_rate": 2.647887323943662e-05, + "loss": 0.0003, + "step": 48 + }, + { + "epoch": 0.06272, + "grad_norm": 0.001325926510617137, + "learning_rate": 2.7042253521126763e-05, + "loss": 0.0002, + "step": 49 + }, + { + "epoch": 0.064, + "grad_norm": 0.00045873725321143866, + "learning_rate": 2.76056338028169e-05, + "loss": 0.0003, + "step": 50 + }, + { + "epoch": 0.06528, + "grad_norm": 0.0007484716479666531, + "learning_rate": 2.8169014084507043e-05, + "loss": 0.0003, + "step": 51 + }, + { + "epoch": 0.06656, + "grad_norm": 0.0027240754570811987, + "learning_rate": 2.8732394366197188e-05, + "loss": 0.0003, + "step": 52 + }, + { + "epoch": 0.06784, + "grad_norm": 0.002920588944107294, + "learning_rate": 2.9295774647887326e-05, + "loss": 0.0003, + "step": 53 + }, + { + "epoch": 0.06912, + "grad_norm": 0.0020773897413164377, + "learning_rate": 2.9859154929577468e-05, + "loss": 0.0003, + "step": 54 + }, + { + "epoch": 0.0704, + "grad_norm": 0.0013842222979292274, + "learning_rate": 3.0422535211267606e-05, + "loss": 0.0003, + "step": 55 + }, + { + "epoch": 0.07168, + "grad_norm": 0.0004421753983478993, + "learning_rate": 3.0985915492957744e-05, + "loss": 0.0003, + "step": 56 + }, + { + "epoch": 0.07296, + "grad_norm": 0.0009825655724853277, + "learning_rate": 3.154929577464789e-05, + "loss": 0.0002, + "step": 57 + }, + { + "epoch": 0.07424, + "grad_norm": 0.0016538031632080674, + "learning_rate": 3.2112676056338034e-05, + "loss": 0.0003, + "step": 58 + }, + { + "epoch": 0.07552, + "grad_norm": 0.0019267118768766522, + "learning_rate": 3.267605633802817e-05, + "loss": 0.0003, + "step": 59 + }, + { + "epoch": 0.0768, + "grad_norm": 0.001848129671998322, + "learning_rate": 3.323943661971831e-05, + "loss": 0.0002, + "step": 60 + }, + { + "epoch": 0.0768, + "eval_loss": 1.2682050466537476, + "eval_runtime": 40.917, + "eval_samples_per_second": 12.269, + "eval_steps_per_second": 1.54, + "step": 60 + }, + { + "epoch": 0.07808, + "grad_norm": 0.0008087851456366479, + "learning_rate": 3.3802816901408456e-05, + "loss": 0.0002, + "step": 61 + }, + { + "epoch": 0.07936, + "grad_norm": 0.00034242391120642424, + "learning_rate": 3.4366197183098594e-05, + "loss": 0.0003, + "step": 62 + }, + { + "epoch": 0.08064, + "grad_norm": 0.0011758297914639115, + "learning_rate": 3.492957746478873e-05, + "loss": 0.0002, + "step": 63 + }, + { + "epoch": 0.08192, + "grad_norm": 0.0014905749121680856, + "learning_rate": 3.549295774647888e-05, + "loss": 0.0002, + "step": 64 + }, + { + "epoch": 0.0832, + "grad_norm": 0.0013011845294386148, + "learning_rate": 3.6056338028169015e-05, + "loss": 0.0002, + "step": 65 + }, + { + "epoch": 0.08448, + "grad_norm": 0.0011716658482328057, + "learning_rate": 3.661971830985916e-05, + "loss": 0.0003, + "step": 66 + }, + { + "epoch": 0.08576, + "grad_norm": 0.00024143150949385017, + "learning_rate": 3.71830985915493e-05, + "loss": 0.0002, + "step": 67 + }, + { + "epoch": 0.08704, + "grad_norm": 0.0006570953410118818, + "learning_rate": 3.774647887323944e-05, + "loss": 0.0002, + "step": 68 + }, + { + "epoch": 0.08832, + "grad_norm": 0.0012625143863260746, + "learning_rate": 3.8309859154929575e-05, + "loss": 0.0003, + "step": 69 + }, + { + "epoch": 0.0896, + "grad_norm": 0.0007922302465885878, + "learning_rate": 3.887323943661972e-05, + "loss": 0.0003, + "step": 70 + }, + { + "epoch": 0.09088, + "grad_norm": 0.0008460975368507206, + "learning_rate": 3.9436619718309865e-05, + "loss": 0.0002, + "step": 71 + }, + { + "epoch": 0.09216, + "grad_norm": 0.00019190074817743152, + "learning_rate": 4e-05, + "loss": 0.0003, + "step": 72 + }, + { + "epoch": 0.09344, + "grad_norm": 0.00025078540784306824, + "learning_rate": 3.99999828375461e-05, + "loss": 0.0002, + "step": 73 + }, + { + "epoch": 0.09472, + "grad_norm": 0.0007434043800458312, + "learning_rate": 3.999993135021711e-05, + "loss": 0.0003, + "step": 74 + }, + { + "epoch": 0.096, + "grad_norm": 0.0010244562290608883, + "learning_rate": 3.999984553811122e-05, + "loss": 0.0003, + "step": 75 + }, + { + "epoch": 0.09728, + "grad_norm": 0.000786862859968096, + "learning_rate": 3.999972540139207e-05, + "loss": 0.0003, + "step": 76 + }, + { + "epoch": 0.09856, + "grad_norm": 0.0001873165019787848, + "learning_rate": 3.9999570940288754e-05, + "loss": 0.0002, + "step": 77 + }, + { + "epoch": 0.09984, + "grad_norm": 0.000646409927867353, + "learning_rate": 3.999938215509581e-05, + "loss": 0.0003, + "step": 78 + }, + { + "epoch": 0.10112, + "grad_norm": 0.0005671542603522539, + "learning_rate": 3.999915904617326e-05, + "loss": 0.0003, + "step": 79 + }, + { + "epoch": 0.1024, + "grad_norm": 0.00029301070026122034, + "learning_rate": 3.999890161394654e-05, + "loss": 0.0002, + "step": 80 + }, + { + "epoch": 0.10368, + "grad_norm": 0.0006586903473362327, + "learning_rate": 3.9998609858906565e-05, + "loss": 0.0002, + "step": 81 + }, + { + "epoch": 0.10496, + "grad_norm": 0.00024197845777962357, + "learning_rate": 3.99982837816097e-05, + "loss": 0.0003, + "step": 82 + }, + { + "epoch": 0.10624, + "grad_norm": 0.0005213619442656636, + "learning_rate": 3.999792338267774e-05, + "loss": 0.0002, + "step": 83 + }, + { + "epoch": 0.10752, + "grad_norm": 0.0006407280452549458, + "learning_rate": 3.9997528662797956e-05, + "loss": 0.0002, + "step": 84 + }, + { + "epoch": 0.1088, + "grad_norm": 0.00033941137371584773, + "learning_rate": 3.999709962272305e-05, + "loss": 0.0003, + "step": 85 + }, + { + "epoch": 0.11008, + "grad_norm": 0.00016794790280982852, + "learning_rate": 3.9996636263271184e-05, + "loss": 0.0002, + "step": 86 + }, + { + "epoch": 0.11136, + "grad_norm": 0.0004396963631734252, + "learning_rate": 3.9996138585325935e-05, + "loss": 0.0002, + "step": 87 + }, + { + "epoch": 0.11264, + "grad_norm": 0.00031045766081660986, + "learning_rate": 3.999560658983637e-05, + "loss": 0.0002, + "step": 88 + }, + { + "epoch": 0.11392, + "grad_norm": 0.00012366034206934273, + "learning_rate": 3.9995040277816956e-05, + "loss": 0.0002, + "step": 89 + }, + { + "epoch": 0.1152, + "grad_norm": 0.00011263033957220614, + "learning_rate": 3.999443965034762e-05, + "loss": 0.0002, + "step": 90 + }, + { + "epoch": 0.1152, + "eval_loss": 1.2403218746185303, + "eval_runtime": 40.6319, + "eval_samples_per_second": 12.355, + "eval_steps_per_second": 1.551, + "step": 90 + }, + { + "epoch": 0.11648, + "grad_norm": 0.0001996486244024709, + "learning_rate": 3.9993804708573724e-05, + "loss": 0.0003, + "step": 91 + }, + { + "epoch": 0.11776, + "grad_norm": 0.0002045943110715598, + "learning_rate": 3.999313545370607e-05, + "loss": 0.0002, + "step": 92 + }, + { + "epoch": 0.11904, + "grad_norm": 0.00012093196710338816, + "learning_rate": 3.999243188702086e-05, + "loss": 0.0002, + "step": 93 + }, + { + "epoch": 0.12032, + "grad_norm": 0.00030544595210812986, + "learning_rate": 3.9991694009859784e-05, + "loss": 0.0002, + "step": 94 + }, + { + "epoch": 0.1216, + "grad_norm": 0.0002122317673638463, + "learning_rate": 3.9990921823629916e-05, + "loss": 0.0002, + "step": 95 + }, + { + "epoch": 0.12288, + "grad_norm": 0.0001355641143163666, + "learning_rate": 3.999011532980377e-05, + "loss": 0.0002, + "step": 96 + }, + { + "epoch": 0.12416, + "grad_norm": 0.0002007813163800165, + "learning_rate": 3.998927452991927e-05, + "loss": 0.0003, + "step": 97 + }, + { + "epoch": 0.12544, + "grad_norm": 0.0002493359206710011, + "learning_rate": 3.998839942557979e-05, + "loss": 0.0002, + "step": 98 + }, + { + "epoch": 0.12672, + "grad_norm": 0.0001009925763355568, + "learning_rate": 3.9987490018454085e-05, + "loss": 0.0002, + "step": 99 + }, + { + "epoch": 0.128, + "grad_norm": 0.00012175164738437161, + "learning_rate": 3.998654631027635e-05, + "loss": 0.0002, + "step": 100 + }, + { + "epoch": 0.12928, + "grad_norm": 0.0003556807350832969, + "learning_rate": 3.9985568302846166e-05, + "loss": 0.0002, + "step": 101 + }, + { + "epoch": 0.13056, + "grad_norm": 0.0003202245570719242, + "learning_rate": 3.9984555998028544e-05, + "loss": 0.0003, + "step": 102 + }, + { + "epoch": 0.13184, + "grad_norm": 0.0003585947270039469, + "learning_rate": 3.998350939775389e-05, + "loss": 0.0002, + "step": 103 + }, + { + "epoch": 0.13312, + "grad_norm": 0.00013465459051076323, + "learning_rate": 3.9982428504018006e-05, + "loss": 0.0003, + "step": 104 + }, + { + "epoch": 0.1344, + "grad_norm": 0.00012200856144772843, + "learning_rate": 3.998131331888208e-05, + "loss": 0.0002, + "step": 105 + }, + { + "epoch": 0.13568, + "grad_norm": 0.000345123145962134, + "learning_rate": 3.998016384447271e-05, + "loss": 0.0002, + "step": 106 + }, + { + "epoch": 0.13696, + "grad_norm": 0.00037582783261314034, + "learning_rate": 3.997898008298188e-05, + "loss": 0.0003, + "step": 107 + }, + { + "epoch": 0.13824, + "grad_norm": 0.00011709384125424549, + "learning_rate": 3.997776203666694e-05, + "loss": 0.0002, + "step": 108 + }, + { + "epoch": 0.13952, + "grad_norm": 0.0005938383401371539, + "learning_rate": 3.9976509707850645e-05, + "loss": 0.0002, + "step": 109 + }, + { + "epoch": 0.1408, + "grad_norm": 0.0001890748826554045, + "learning_rate": 3.9975223098921094e-05, + "loss": 0.0002, + "step": 110 + }, + { + "epoch": 0.14208, + "grad_norm": 0.00018463779997546226, + "learning_rate": 3.997390221233178e-05, + "loss": 0.0002, + "step": 111 + }, + { + "epoch": 0.14336, + "grad_norm": 0.0003491342649795115, + "learning_rate": 3.997254705060155e-05, + "loss": 0.0002, + "step": 112 + }, + { + "epoch": 0.14464, + "grad_norm": 0.00032335062860511243, + "learning_rate": 3.997115761631462e-05, + "loss": 0.0002, + "step": 113 + }, + { + "epoch": 0.14592, + "grad_norm": 0.00013327212946023792, + "learning_rate": 3.996973391212056e-05, + "loss": 0.0002, + "step": 114 + }, + { + "epoch": 0.1472, + "grad_norm": 0.0003823594015557319, + "learning_rate": 3.996827594073428e-05, + "loss": 0.0002, + "step": 115 + }, + { + "epoch": 0.14848, + "grad_norm": 0.00010233603097731248, + "learning_rate": 3.996678370493604e-05, + "loss": 0.0002, + "step": 116 + }, + { + "epoch": 0.14976, + "grad_norm": 0.00015697085473220795, + "learning_rate": 3.996525720757145e-05, + "loss": 0.0002, + "step": 117 + }, + { + "epoch": 0.15104, + "grad_norm": 9.921671880874783e-05, + "learning_rate": 3.996369645155145e-05, + "loss": 0.0002, + "step": 118 + }, + { + "epoch": 0.15232, + "grad_norm": 0.00010817526344908401, + "learning_rate": 3.99621014398523e-05, + "loss": 0.0002, + "step": 119 + }, + { + "epoch": 0.1536, + "grad_norm": 0.0003328252932988107, + "learning_rate": 3.9960472175515596e-05, + "loss": 0.0003, + "step": 120 + }, + { + "epoch": 0.1536, + "eval_loss": 1.2306902408599854, + "eval_runtime": 41.0777, + "eval_samples_per_second": 12.221, + "eval_steps_per_second": 1.534, + "step": 120 + }, + { + "epoch": 0.15488, + "grad_norm": 0.0001208317480632104, + "learning_rate": 3.995880866164824e-05, + "loss": 0.0002, + "step": 121 + }, + { + "epoch": 0.15616, + "grad_norm": 0.00010686482710298151, + "learning_rate": 3.995711090142246e-05, + "loss": 0.0002, + "step": 122 + }, + { + "epoch": 0.15744, + "grad_norm": 0.000191230108612217, + "learning_rate": 3.9955378898075776e-05, + "loss": 0.0002, + "step": 123 + }, + { + "epoch": 0.15872, + "grad_norm": 0.00043578416807577014, + "learning_rate": 3.995361265491102e-05, + "loss": 0.0002, + "step": 124 + }, + { + "epoch": 0.16, + "grad_norm": 0.00012811734632123262, + "learning_rate": 3.99518121752963e-05, + "loss": 0.0002, + "step": 125 + }, + { + "epoch": 0.16128, + "grad_norm": 0.00011695229477481917, + "learning_rate": 3.994997746266502e-05, + "loss": 0.0002, + "step": 126 + }, + { + "epoch": 0.16256, + "grad_norm": 0.00026602434809319675, + "learning_rate": 3.994810852051589e-05, + "loss": 0.0002, + "step": 127 + }, + { + "epoch": 0.16384, + "grad_norm": 0.00020878778013866395, + "learning_rate": 3.9946205352412836e-05, + "loss": 0.0002, + "step": 128 + }, + { + "epoch": 0.16512, + "grad_norm": 0.00013080754433758557, + "learning_rate": 3.994426796198511e-05, + "loss": 0.0002, + "step": 129 + }, + { + "epoch": 0.1664, + "grad_norm": 0.00017335913435090333, + "learning_rate": 3.994229635292718e-05, + "loss": 0.0002, + "step": 130 + }, + { + "epoch": 0.16768, + "grad_norm": 0.0001642439019633457, + "learning_rate": 3.99402905289988e-05, + "loss": 0.0002, + "step": 131 + }, + { + "epoch": 0.16896, + "grad_norm": 0.00011353264562785625, + "learning_rate": 3.993825049402494e-05, + "loss": 0.0002, + "step": 132 + }, + { + "epoch": 0.17024, + "grad_norm": 0.0003552866692189127, + "learning_rate": 3.993617625189584e-05, + "loss": 0.0002, + "step": 133 + }, + { + "epoch": 0.17152, + "grad_norm": 0.0002389036671957001, + "learning_rate": 3.993406780656694e-05, + "loss": 0.0002, + "step": 134 + }, + { + "epoch": 0.1728, + "grad_norm": 0.00018962110334541649, + "learning_rate": 3.993192516205892e-05, + "loss": 0.0002, + "step": 135 + }, + { + "epoch": 0.17408, + "grad_norm": 0.000491205370053649, + "learning_rate": 3.9929748322457674e-05, + "loss": 0.0002, + "step": 136 + }, + { + "epoch": 0.17536, + "grad_norm": 0.00025262351846322417, + "learning_rate": 3.992753729191431e-05, + "loss": 0.0002, + "step": 137 + }, + { + "epoch": 0.17664, + "grad_norm": 0.0002617812715470791, + "learning_rate": 3.992529207464512e-05, + "loss": 0.0003, + "step": 138 + }, + { + "epoch": 0.17792, + "grad_norm": 0.0002854976919479668, + "learning_rate": 3.992301267493159e-05, + "loss": 0.0002, + "step": 139 + }, + { + "epoch": 0.1792, + "grad_norm": 0.0001424322254024446, + "learning_rate": 3.9920699097120414e-05, + "loss": 0.0002, + "step": 140 + }, + { + "epoch": 0.18048, + "grad_norm": 0.00023581937421113253, + "learning_rate": 3.991835134562344e-05, + "loss": 0.0002, + "step": 141 + }, + { + "epoch": 0.18176, + "grad_norm": 0.0003542240883689374, + "learning_rate": 3.991596942491768e-05, + "loss": 0.0002, + "step": 142 + }, + { + "epoch": 0.18304, + "grad_norm": 0.0003476674319244921, + "learning_rate": 3.9913553339545315e-05, + "loss": 0.0002, + "step": 143 + }, + { + "epoch": 0.18432, + "grad_norm": 0.00037666381103917956, + "learning_rate": 3.991110309411368e-05, + "loss": 0.0003, + "step": 144 + }, + { + "epoch": 0.1856, + "grad_norm": 0.00011861531675094739, + "learning_rate": 3.990861869329524e-05, + "loss": 0.0002, + "step": 145 + }, + { + "epoch": 0.18688, + "grad_norm": 0.0004715830145869404, + "learning_rate": 3.99061001418276e-05, + "loss": 0.0002, + "step": 146 + }, + { + "epoch": 0.18816, + "grad_norm": 0.00014487896987702698, + "learning_rate": 3.990354744451347e-05, + "loss": 0.0002, + "step": 147 + }, + { + "epoch": 0.18944, + "grad_norm": 0.00035398933687247336, + "learning_rate": 3.990096060622071e-05, + "loss": 0.0002, + "step": 148 + }, + { + "epoch": 0.19072, + "grad_norm": 0.0005021999822929502, + "learning_rate": 3.989833963188225e-05, + "loss": 0.0002, + "step": 149 + }, + { + "epoch": 0.192, + "grad_norm": 0.00015732903557363898, + "learning_rate": 3.989568452649613e-05, + "loss": 0.0002, + "step": 150 + }, + { + "epoch": 0.192, + "eval_loss": 1.2276506423950195, + "eval_runtime": 41.6549, + "eval_samples_per_second": 12.051, + "eval_steps_per_second": 1.512, + "step": 150 + }, + { + "epoch": 0.19328, + "grad_norm": 0.0005945006851106882, + "learning_rate": 3.989299529512548e-05, + "loss": 0.0002, + "step": 151 + }, + { + "epoch": 0.19456, + "grad_norm": 0.0004206049197819084, + "learning_rate": 3.98902719428985e-05, + "loss": 0.0002, + "step": 152 + }, + { + "epoch": 0.19584, + "grad_norm": 0.0005308086983859539, + "learning_rate": 3.988751447500845e-05, + "loss": 0.0002, + "step": 153 + }, + { + "epoch": 0.19712, + "grad_norm": 0.0003311938198748976, + "learning_rate": 3.988472289671367e-05, + "loss": 0.0002, + "step": 154 + }, + { + "epoch": 0.1984, + "grad_norm": 0.00020781578496098518, + "learning_rate": 3.988189721333751e-05, + "loss": 0.0003, + "step": 155 + }, + { + "epoch": 0.19968, + "grad_norm": 0.0002495672379154712, + "learning_rate": 3.9879037430268394e-05, + "loss": 0.0002, + "step": 156 + }, + { + "epoch": 0.20096, + "grad_norm": 0.0003674014296848327, + "learning_rate": 3.987614355295975e-05, + "loss": 0.0002, + "step": 157 + }, + { + "epoch": 0.20224, + "grad_norm": 0.00011461845133453608, + "learning_rate": 3.987321558693002e-05, + "loss": 0.0002, + "step": 158 + }, + { + "epoch": 0.20352, + "grad_norm": 0.00014985799498390406, + "learning_rate": 3.9870253537762666e-05, + "loss": 0.0002, + "step": 159 + }, + { + "epoch": 0.2048, + "grad_norm": 0.00023186200996860862, + "learning_rate": 3.9867257411106126e-05, + "loss": 0.0002, + "step": 160 + }, + { + "epoch": 0.20608, + "grad_norm": 0.00011972602806054056, + "learning_rate": 3.986422721267384e-05, + "loss": 0.0002, + "step": 161 + }, + { + "epoch": 0.20736, + "grad_norm": 0.0001768232323229313, + "learning_rate": 3.986116294824421e-05, + "loss": 0.0002, + "step": 162 + }, + { + "epoch": 0.20864, + "grad_norm": 0.00023504432465415448, + "learning_rate": 3.985806462366061e-05, + "loss": 0.0002, + "step": 163 + }, + { + "epoch": 0.20992, + "grad_norm": 0.00016327628691215068, + "learning_rate": 3.9854932244831334e-05, + "loss": 0.0002, + "step": 164 + }, + { + "epoch": 0.2112, + "grad_norm": 0.00029715322307311, + "learning_rate": 3.985176581772967e-05, + "loss": 0.0002, + "step": 165 + }, + { + "epoch": 0.21248, + "grad_norm": 0.00016738087288103998, + "learning_rate": 3.984856534839378e-05, + "loss": 0.0002, + "step": 166 + }, + { + "epoch": 0.21376, + "grad_norm": 0.0007520260405726731, + "learning_rate": 3.984533084292677e-05, + "loss": 0.0002, + "step": 167 + }, + { + "epoch": 0.21504, + "grad_norm": 0.0003500195743981749, + "learning_rate": 3.984206230749664e-05, + "loss": 0.0002, + "step": 168 + }, + { + "epoch": 0.21632, + "grad_norm": 0.0006380021804943681, + "learning_rate": 3.9838759748336306e-05, + "loss": 0.0002, + "step": 169 + }, + { + "epoch": 0.2176, + "grad_norm": 0.0008250039536505938, + "learning_rate": 3.983542317174354e-05, + "loss": 0.0003, + "step": 170 + }, + { + "epoch": 0.21888, + "grad_norm": 0.00023163011064752936, + "learning_rate": 3.983205258408097e-05, + "loss": 0.0002, + "step": 171 + }, + { + "epoch": 0.22016, + "grad_norm": 0.0009266913402825594, + "learning_rate": 3.982864799177614e-05, + "loss": 0.0002, + "step": 172 + }, + { + "epoch": 0.22144, + "grad_norm": 0.0007819707971066236, + "learning_rate": 3.9825209401321375e-05, + "loss": 0.0002, + "step": 173 + }, + { + "epoch": 0.22272, + "grad_norm": 0.0009147466043941677, + "learning_rate": 3.9821736819273865e-05, + "loss": 0.0002, + "step": 174 + }, + { + "epoch": 0.224, + "grad_norm": 0.0006302861729636788, + "learning_rate": 3.9818230252255614e-05, + "loss": 0.0002, + "step": 175 + }, + { + "epoch": 0.22528, + "grad_norm": 0.0007533888565376401, + "learning_rate": 3.981468970695343e-05, + "loss": 0.0002, + "step": 176 + }, + { + "epoch": 0.22656, + "grad_norm": 0.0003364177537150681, + "learning_rate": 3.981111519011892e-05, + "loss": 0.0002, + "step": 177 + }, + { + "epoch": 0.22784, + "grad_norm": 0.0015544546768069267, + "learning_rate": 3.9807506708568476e-05, + "loss": 0.0002, + "step": 178 + }, + { + "epoch": 0.22912, + "grad_norm": 0.00048260428593494, + "learning_rate": 3.9803864269183244e-05, + "loss": 0.0002, + "step": 179 + }, + { + "epoch": 0.2304, + "grad_norm": 0.0009548643138259649, + "learning_rate": 3.980018787890914e-05, + "loss": 0.0002, + "step": 180 + }, + { + "epoch": 0.2304, + "eval_loss": 1.2241315841674805, + "eval_runtime": 63.3318, + "eval_samples_per_second": 7.927, + "eval_steps_per_second": 0.995, + "step": 180 + }, + { + "epoch": 0.23168, + "grad_norm": 0.0013649332104250789, + "learning_rate": 3.9796477544756814e-05, + "loss": 0.0002, + "step": 181 + }, + { + "epoch": 0.23296, + "grad_norm": 0.000332722527673468, + "learning_rate": 3.979273327380165e-05, + "loss": 0.0002, + "step": 182 + }, + { + "epoch": 0.23424, + "grad_norm": 0.0010766334598883986, + "learning_rate": 3.9788955073183747e-05, + "loss": 0.0002, + "step": 183 + }, + { + "epoch": 0.23552, + "grad_norm": 0.0004133693582843989, + "learning_rate": 3.97851429501079e-05, + "loss": 0.0002, + "step": 184 + }, + { + "epoch": 0.2368, + "grad_norm": 0.00047085783444345, + "learning_rate": 3.9781296911843606e-05, + "loss": 0.0002, + "step": 185 + }, + { + "epoch": 0.23808, + "grad_norm": 0.000769197242334485, + "learning_rate": 3.977741696572501e-05, + "loss": 0.0002, + "step": 186 + }, + { + "epoch": 0.23936, + "grad_norm": 0.0002509950427338481, + "learning_rate": 3.9773503119150955e-05, + "loss": 0.0002, + "step": 187 + }, + { + "epoch": 0.24064, + "grad_norm": 0.0010357450228184462, + "learning_rate": 3.976955537958489e-05, + "loss": 0.0002, + "step": 188 + }, + { + "epoch": 0.24192, + "grad_norm": 0.0006930662202648818, + "learning_rate": 3.9765573754554924e-05, + "loss": 0.0002, + "step": 189 + }, + { + "epoch": 0.2432, + "grad_norm": 0.0009113008854910731, + "learning_rate": 3.9761558251653774e-05, + "loss": 0.0002, + "step": 190 + }, + { + "epoch": 0.24448, + "grad_norm": 0.0005375745822675526, + "learning_rate": 3.975750887853876e-05, + "loss": 0.0002, + "step": 191 + }, + { + "epoch": 0.24576, + "grad_norm": 0.0002424489357508719, + "learning_rate": 3.9753425642931786e-05, + "loss": 0.0002, + "step": 192 + }, + { + "epoch": 0.24704, + "grad_norm": 0.000845865230076015, + "learning_rate": 3.974930855261935e-05, + "loss": 0.0002, + "step": 193 + }, + { + "epoch": 0.24832, + "grad_norm": 0.0004812710976693779, + "learning_rate": 3.974515761545247e-05, + "loss": 0.0002, + "step": 194 + }, + { + "epoch": 0.2496, + "grad_norm": 0.00042488076724112034, + "learning_rate": 3.974097283934675e-05, + "loss": 0.0002, + "step": 195 + }, + { + "epoch": 0.25088, + "grad_norm": 0.00011899595847353339, + "learning_rate": 3.9736754232282295e-05, + "loss": 0.0002, + "step": 196 + }, + { + "epoch": 0.25216, + "grad_norm": 0.000173148131580092, + "learning_rate": 3.9732501802303735e-05, + "loss": 0.0002, + "step": 197 + }, + { + "epoch": 0.25344, + "grad_norm": 0.0010396325960755348, + "learning_rate": 3.9728215557520194e-05, + "loss": 0.0002, + "step": 198 + }, + { + "epoch": 0.25472, + "grad_norm": 0.0009005111642181873, + "learning_rate": 3.972389550610528e-05, + "loss": 0.0002, + "step": 199 + }, + { + "epoch": 0.256, + "grad_norm": 0.000827074924018234, + "learning_rate": 3.9719541656297076e-05, + "loss": 0.0002, + "step": 200 + }, + { + "epoch": 0.25728, + "grad_norm": 0.00014234821719583124, + "learning_rate": 3.9715154016398114e-05, + "loss": 0.0002, + "step": 201 + }, + { + "epoch": 0.25856, + "grad_norm": 0.0020800912752747536, + "learning_rate": 3.9710732594775334e-05, + "loss": 0.0002, + "step": 202 + }, + { + "epoch": 0.25984, + "grad_norm": 0.0010654254583641887, + "learning_rate": 3.970627739986014e-05, + "loss": 0.0002, + "step": 203 + }, + { + "epoch": 0.26112, + "grad_norm": 0.001251160865649581, + "learning_rate": 3.970178844014831e-05, + "loss": 0.0002, + "step": 204 + }, + { + "epoch": 0.2624, + "grad_norm": 0.0004991651512682438, + "learning_rate": 3.969726572420002e-05, + "loss": 0.0002, + "step": 205 + }, + { + "epoch": 0.26368, + "grad_norm": 0.0012509931111708283, + "learning_rate": 3.9692709260639815e-05, + "loss": 0.0002, + "step": 206 + }, + { + "epoch": 0.26496, + "grad_norm": 0.00045681357732973993, + "learning_rate": 3.968811905815659e-05, + "loss": 0.0003, + "step": 207 + }, + { + "epoch": 0.26624, + "grad_norm": 0.00115011737216264, + "learning_rate": 3.968349512550359e-05, + "loss": 0.0002, + "step": 208 + }, + { + "epoch": 0.26752, + "grad_norm": 0.00021435614326037467, + "learning_rate": 3.967883747149837e-05, + "loss": 0.0002, + "step": 209 + }, + { + "epoch": 0.2688, + "grad_norm": 0.0015478826826438308, + "learning_rate": 3.9674146105022795e-05, + "loss": 0.0002, + "step": 210 + }, + { + "epoch": 0.2688, + "eval_loss": 1.2069844007492065, + "eval_runtime": 42.1506, + "eval_samples_per_second": 11.91, + "eval_steps_per_second": 1.495, + "step": 210 + }, + { + "epoch": 0.27008, + "grad_norm": 0.00039821109385229647, + "learning_rate": 3.966942103502301e-05, + "loss": 0.0002, + "step": 211 + }, + { + "epoch": 0.27136, + "grad_norm": 0.0013712712097913027, + "learning_rate": 3.966466227050945e-05, + "loss": 0.0002, + "step": 212 + }, + { + "epoch": 0.27264, + "grad_norm": 0.0006687208078801632, + "learning_rate": 3.965986982055677e-05, + "loss": 0.0002, + "step": 213 + }, + { + "epoch": 0.27392, + "grad_norm": 0.0006046419148333371, + "learning_rate": 3.96550436943039e-05, + "loss": 0.0002, + "step": 214 + }, + { + "epoch": 0.2752, + "grad_norm": 0.0005272756097838283, + "learning_rate": 3.965018390095396e-05, + "loss": 0.0002, + "step": 215 + }, + { + "epoch": 0.27648, + "grad_norm": 0.00027221845812164247, + "learning_rate": 3.964529044977429e-05, + "loss": 0.0002, + "step": 216 + }, + { + "epoch": 0.27776, + "grad_norm": 0.00022938432812225074, + "learning_rate": 3.9640363350096396e-05, + "loss": 0.0002, + "step": 217 + }, + { + "epoch": 0.27904, + "grad_norm": 0.0005330294952727854, + "learning_rate": 3.963540261131596e-05, + "loss": 0.0002, + "step": 218 + }, + { + "epoch": 0.28032, + "grad_norm": 0.0008340986678376794, + "learning_rate": 3.963040824289281e-05, + "loss": 0.0002, + "step": 219 + }, + { + "epoch": 0.2816, + "grad_norm": 0.00021592361736111343, + "learning_rate": 3.96253802543509e-05, + "loss": 0.0002, + "step": 220 + }, + { + "epoch": 0.28288, + "grad_norm": 0.0010405798675492406, + "learning_rate": 3.9620318655278315e-05, + "loss": 0.0002, + "step": 221 + }, + { + "epoch": 0.28416, + "grad_norm": 0.00037682682159356773, + "learning_rate": 3.96152234553272e-05, + "loss": 0.0002, + "step": 222 + }, + { + "epoch": 0.28544, + "grad_norm": 0.0005944096483290195, + "learning_rate": 3.961009466421379e-05, + "loss": 0.0002, + "step": 223 + }, + { + "epoch": 0.28672, + "grad_norm": 0.0008876877254806459, + "learning_rate": 3.9604932291718396e-05, + "loss": 0.0002, + "step": 224 + }, + { + "epoch": 0.288, + "grad_norm": 0.0004229796468280256, + "learning_rate": 3.959973634768534e-05, + "loss": 0.0002, + "step": 225 + }, + { + "epoch": 0.28928, + "grad_norm": 0.0004196965601295233, + "learning_rate": 3.959450684202296e-05, + "loss": 0.0002, + "step": 226 + }, + { + "epoch": 0.29056, + "grad_norm": 0.0007056231843307614, + "learning_rate": 3.958924378470363e-05, + "loss": 0.0002, + "step": 227 + }, + { + "epoch": 0.29184, + "grad_norm": 0.0003414407547097653, + "learning_rate": 3.958394718576366e-05, + "loss": 0.0002, + "step": 228 + }, + { + "epoch": 0.29312, + "grad_norm": 0.00045374143519438803, + "learning_rate": 3.957861705530335e-05, + "loss": 0.0002, + "step": 229 + }, + { + "epoch": 0.2944, + "grad_norm": 0.00032401952194049954, + "learning_rate": 3.9573253403486944e-05, + "loss": 0.0002, + "step": 230 + }, + { + "epoch": 0.29568, + "grad_norm": 0.00038617721293121576, + "learning_rate": 3.956785624054259e-05, + "loss": 0.0002, + "step": 231 + }, + { + "epoch": 0.29696, + "grad_norm": 0.0002595918485894799, + "learning_rate": 3.956242557676235e-05, + "loss": 0.0002, + "step": 232 + }, + { + "epoch": 0.29824, + "grad_norm": 0.0007822422194294631, + "learning_rate": 3.9556961422502176e-05, + "loss": 0.0002, + "step": 233 + }, + { + "epoch": 0.29952, + "grad_norm": 0.0009492220124229789, + "learning_rate": 3.955146378818188e-05, + "loss": 0.0002, + "step": 234 + }, + { + "epoch": 0.3008, + "grad_norm": 0.00015348431770689785, + "learning_rate": 3.9545932684285105e-05, + "loss": 0.0002, + "step": 235 + }, + { + "epoch": 0.30208, + "grad_norm": 0.0009721404640004039, + "learning_rate": 3.954036812135934e-05, + "loss": 0.0002, + "step": 236 + }, + { + "epoch": 0.30336, + "grad_norm": 0.0006310763419605792, + "learning_rate": 3.9534770110015876e-05, + "loss": 0.0002, + "step": 237 + }, + { + "epoch": 0.30464, + "grad_norm": 0.00045944383600726724, + "learning_rate": 3.9529138660929764e-05, + "loss": 0.0002, + "step": 238 + }, + { + "epoch": 0.30592, + "grad_norm": 0.0009684551623649895, + "learning_rate": 3.9523473784839846e-05, + "loss": 0.0002, + "step": 239 + }, + { + "epoch": 0.3072, + "grad_norm": 0.0011421306990087032, + "learning_rate": 3.95177754925487e-05, + "loss": 0.0002, + "step": 240 + }, + { + "epoch": 0.3072, + "eval_loss": 1.1997703313827515, + "eval_runtime": 42.6282, + "eval_samples_per_second": 11.776, + "eval_steps_per_second": 1.478, + "step": 240 + }, + { + "epoch": 0.30848, + "grad_norm": 0.0007120371446944773, + "learning_rate": 3.9512043794922614e-05, + "loss": 0.0002, + "step": 241 + }, + { + "epoch": 0.30976, + "grad_norm": 0.00023797668109182268, + "learning_rate": 3.9506278702891594e-05, + "loss": 0.0002, + "step": 242 + }, + { + "epoch": 0.31104, + "grad_norm": 0.0002528477052692324, + "learning_rate": 3.9500480227449316e-05, + "loss": 0.0002, + "step": 243 + }, + { + "epoch": 0.31232, + "grad_norm": 0.0001783139305189252, + "learning_rate": 3.949464837965313e-05, + "loss": 0.0002, + "step": 244 + }, + { + "epoch": 0.3136, + "grad_norm": 0.0013822782784700394, + "learning_rate": 3.9488783170624e-05, + "loss": 0.0002, + "step": 245 + }, + { + "epoch": 0.31488, + "grad_norm": 0.0027772309258580208, + "learning_rate": 3.948288461154654e-05, + "loss": 0.0002, + "step": 246 + }, + { + "epoch": 0.31616, + "grad_norm": 0.0026890826411545277, + "learning_rate": 3.947695271366894e-05, + "loss": 0.0002, + "step": 247 + }, + { + "epoch": 0.31744, + "grad_norm": 0.0009026412153616548, + "learning_rate": 3.947098748830296e-05, + "loss": 0.0002, + "step": 248 + }, + { + "epoch": 0.31872, + "grad_norm": 0.0014957863604649901, + "learning_rate": 3.946498894682394e-05, + "loss": 0.0002, + "step": 249 + }, + { + "epoch": 0.32, + "grad_norm": 0.0031202055979520082, + "learning_rate": 3.945895710067072e-05, + "loss": 0.0002, + "step": 250 + }, + { + "epoch": 0.32128, + "grad_norm": 0.0037764040753245354, + "learning_rate": 3.9452891961345666e-05, + "loss": 0.0002, + "step": 251 + }, + { + "epoch": 0.32256, + "grad_norm": 0.003359750611707568, + "learning_rate": 3.944679354041464e-05, + "loss": 0.0002, + "step": 252 + }, + { + "epoch": 0.32384, + "grad_norm": 0.002477930160239339, + "learning_rate": 3.944066184950695e-05, + "loss": 0.0002, + "step": 253 + }, + { + "epoch": 0.32512, + "grad_norm": 0.002216948429122567, + "learning_rate": 3.9434496900315364e-05, + "loss": 0.0002, + "step": 254 + }, + { + "epoch": 0.3264, + "grad_norm": 0.0025624523404985666, + "learning_rate": 3.942829870459606e-05, + "loss": 0.0002, + "step": 255 + }, + { + "epoch": 0.32768, + "grad_norm": 0.002335009863600135, + "learning_rate": 3.9422067274168634e-05, + "loss": 0.0002, + "step": 256 + }, + { + "epoch": 0.32896, + "grad_norm": 0.0010357327992096543, + "learning_rate": 3.941580262091603e-05, + "loss": 0.0002, + "step": 257 + }, + { + "epoch": 0.33024, + "grad_norm": 0.0002824495895765722, + "learning_rate": 3.940950475678459e-05, + "loss": 0.0002, + "step": 258 + }, + { + "epoch": 0.33152, + "grad_norm": 0.000942192564252764, + "learning_rate": 3.940317369378391e-05, + "loss": 0.0002, + "step": 259 + }, + { + "epoch": 0.3328, + "grad_norm": 0.0009988127276301384, + "learning_rate": 3.939680944398699e-05, + "loss": 0.0002, + "step": 260 + }, + { + "epoch": 0.33408, + "grad_norm": 0.0008213394903577864, + "learning_rate": 3.9390412019530046e-05, + "loss": 0.0002, + "step": 261 + }, + { + "epoch": 0.33536, + "grad_norm": 0.00039377162465825677, + "learning_rate": 3.938398143261258e-05, + "loss": 0.0002, + "step": 262 + }, + { + "epoch": 0.33664, + "grad_norm": 0.0008121069986373186, + "learning_rate": 3.937751769549733e-05, + "loss": 0.0002, + "step": 263 + }, + { + "epoch": 0.33792, + "grad_norm": 0.002108998131006956, + "learning_rate": 3.937102082051026e-05, + "loss": 0.0002, + "step": 264 + }, + { + "epoch": 0.3392, + "grad_norm": 0.003128256183117628, + "learning_rate": 3.9364490820040506e-05, + "loss": 0.0002, + "step": 265 + }, + { + "epoch": 0.34048, + "grad_norm": 0.003833349095657468, + "learning_rate": 3.935792770654039e-05, + "loss": 0.0002, + "step": 266 + }, + { + "epoch": 0.34176, + "grad_norm": 0.005173802375793457, + "learning_rate": 3.935133149252536e-05, + "loss": 0.0002, + "step": 267 + }, + { + "epoch": 0.34304, + "grad_norm": 0.008912354707717896, + "learning_rate": 3.934470219057401e-05, + "loss": 0.0002, + "step": 268 + }, + { + "epoch": 0.34432, + "grad_norm": 0.011784413829445839, + "learning_rate": 3.933803981332801e-05, + "loss": 0.0002, + "step": 269 + }, + { + "epoch": 0.3456, + "grad_norm": 0.0093743447214365, + "learning_rate": 3.933134437349211e-05, + "loss": 0.0002, + "step": 270 + }, + { + "epoch": 0.3456, + "eval_loss": 1.220482349395752, + "eval_runtime": 43.5388, + "eval_samples_per_second": 11.53, + "eval_steps_per_second": 1.447, + "step": 270 + }, + { + "epoch": 0.34688, + "grad_norm": 0.003992204554378986, + "learning_rate": 3.9324615883834103e-05, + "loss": 0.0002, + "step": 271 + }, + { + "epoch": 0.34816, + "grad_norm": 0.002077931072562933, + "learning_rate": 3.9317854357184815e-05, + "loss": 0.0002, + "step": 272 + }, + { + "epoch": 0.34944, + "grad_norm": 0.006616830825805664, + "learning_rate": 3.9311059806438065e-05, + "loss": 0.0002, + "step": 273 + }, + { + "epoch": 0.35072, + "grad_norm": 0.0070667387917637825, + "learning_rate": 3.930423224455065e-05, + "loss": 0.0002, + "step": 274 + }, + { + "epoch": 0.352, + "grad_norm": 0.0025246667210012674, + "learning_rate": 3.929737168454232e-05, + "loss": 0.0002, + "step": 275 + }, + { + "epoch": 0.35328, + "grad_norm": 0.003516591852530837, + "learning_rate": 3.929047813949575e-05, + "loss": 0.0002, + "step": 276 + }, + { + "epoch": 0.35456, + "grad_norm": 0.005735184997320175, + "learning_rate": 3.928355162255649e-05, + "loss": 0.0002, + "step": 277 + }, + { + "epoch": 0.35584, + "grad_norm": 0.003947160672396421, + "learning_rate": 3.9276592146933004e-05, + "loss": 0.0002, + "step": 278 + }, + { + "epoch": 0.35712, + "grad_norm": 0.0008040797547437251, + "learning_rate": 3.92695997258966e-05, + "loss": 0.0002, + "step": 279 + }, + { + "epoch": 0.3584, + "grad_norm": 0.0020477804355323315, + "learning_rate": 3.9262574372781383e-05, + "loss": 0.0002, + "step": 280 + }, + { + "epoch": 0.35968, + "grad_norm": 0.003859796328470111, + "learning_rate": 3.9255516100984285e-05, + "loss": 0.0002, + "step": 281 + }, + { + "epoch": 0.36096, + "grad_norm": 0.003972633741796017, + "learning_rate": 3.924842492396499e-05, + "loss": 0.0002, + "step": 282 + }, + { + "epoch": 0.36224, + "grad_norm": 0.0018808410968631506, + "learning_rate": 3.924130085524596e-05, + "loss": 0.0002, + "step": 283 + }, + { + "epoch": 0.36352, + "grad_norm": 0.0007452194113284349, + "learning_rate": 3.923414390841235e-05, + "loss": 0.0002, + "step": 284 + }, + { + "epoch": 0.3648, + "grad_norm": 0.0026158560067415237, + "learning_rate": 3.922695409711202e-05, + "loss": 0.0002, + "step": 285 + }, + { + "epoch": 0.36608, + "grad_norm": 0.003892722772434354, + "learning_rate": 3.921973143505552e-05, + "loss": 0.0002, + "step": 286 + }, + { + "epoch": 0.36736, + "grad_norm": 0.004123721271753311, + "learning_rate": 3.921247593601601e-05, + "loss": 0.0002, + "step": 287 + }, + { + "epoch": 0.36864, + "grad_norm": 0.0037682715337723494, + "learning_rate": 3.92051876138293e-05, + "loss": 0.0002, + "step": 288 + }, + { + "epoch": 0.36992, + "grad_norm": 0.002960368525236845, + "learning_rate": 3.919786648239377e-05, + "loss": 0.0002, + "step": 289 + }, + { + "epoch": 0.3712, + "grad_norm": 0.0015276469057425857, + "learning_rate": 3.9190512555670374e-05, + "loss": 0.0002, + "step": 290 + }, + { + "epoch": 0.37248, + "grad_norm": 0.00017276291328016669, + "learning_rate": 3.918312584768261e-05, + "loss": 0.0002, + "step": 291 + }, + { + "epoch": 0.37376, + "grad_norm": 0.0009839094709604979, + "learning_rate": 3.917570637251647e-05, + "loss": 0.0002, + "step": 292 + }, + { + "epoch": 0.37504, + "grad_norm": 0.001523509039543569, + "learning_rate": 3.916825414432046e-05, + "loss": 0.0002, + "step": 293 + }, + { + "epoch": 0.37632, + "grad_norm": 0.0014921202091500163, + "learning_rate": 3.916076917730552e-05, + "loss": 0.0002, + "step": 294 + }, + { + "epoch": 0.3776, + "grad_norm": 0.001001936150714755, + "learning_rate": 3.915325148574502e-05, + "loss": 0.0002, + "step": 295 + }, + { + "epoch": 0.37888, + "grad_norm": 0.0005724078509956598, + "learning_rate": 3.914570108397474e-05, + "loss": 0.0002, + "step": 296 + }, + { + "epoch": 0.38016, + "grad_norm": 0.001378412009216845, + "learning_rate": 3.9138117986392856e-05, + "loss": 0.0002, + "step": 297 + }, + { + "epoch": 0.38144, + "grad_norm": 0.0024734761100262403, + "learning_rate": 3.913050220745986e-05, + "loss": 0.0002, + "step": 298 + }, + { + "epoch": 0.38272, + "grad_norm": 0.0031675021164119244, + "learning_rate": 3.9122853761698595e-05, + "loss": 0.0002, + "step": 299 + }, + { + "epoch": 0.384, + "grad_norm": 0.0029399548657238483, + "learning_rate": 3.911517266369416e-05, + "loss": 0.0002, + "step": 300 + }, + { + "epoch": 0.384, + "eval_loss": 1.1859415769577026, + "eval_runtime": 42.3844, + "eval_samples_per_second": 11.844, + "eval_steps_per_second": 1.486, + "step": 300 + }, + { + "epoch": 0.38528, + "grad_norm": 0.0010896556777879596, + "learning_rate": 3.910745892809396e-05, + "loss": 0.0002, + "step": 301 + }, + { + "epoch": 0.38656, + "grad_norm": 0.0017165422905236483, + "learning_rate": 3.9099712569607614e-05, + "loss": 0.0002, + "step": 302 + }, + { + "epoch": 0.38784, + "grad_norm": 0.004119697492569685, + "learning_rate": 3.9091933603006964e-05, + "loss": 0.0002, + "step": 303 + }, + { + "epoch": 0.38912, + "grad_norm": 0.006195970810949802, + "learning_rate": 3.9084122043126026e-05, + "loss": 0.0002, + "step": 304 + }, + { + "epoch": 0.3904, + "grad_norm": 0.007094519678503275, + "learning_rate": 3.9076277904860965e-05, + "loss": 0.0002, + "step": 305 + }, + { + "epoch": 0.39168, + "grad_norm": 0.005756770726293325, + "learning_rate": 3.906840120317008e-05, + "loss": 0.0002, + "step": 306 + }, + { + "epoch": 0.39296, + "grad_norm": 0.0035659433342516422, + "learning_rate": 3.906049195307376e-05, + "loss": 0.0002, + "step": 307 + }, + { + "epoch": 0.39424, + "grad_norm": 0.0014575119130313396, + "learning_rate": 3.905255016965447e-05, + "loss": 0.0002, + "step": 308 + }, + { + "epoch": 0.39552, + "grad_norm": 0.0005094299558550119, + "learning_rate": 3.9044575868056705e-05, + "loss": 0.0002, + "step": 309 + }, + { + "epoch": 0.3968, + "grad_norm": 0.002270213095471263, + "learning_rate": 3.903656906348698e-05, + "loss": 0.0002, + "step": 310 + }, + { + "epoch": 0.39808, + "grad_norm": 0.003336275927722454, + "learning_rate": 3.902852977121378e-05, + "loss": 0.0002, + "step": 311 + }, + { + "epoch": 0.39936, + "grad_norm": 0.002990037202835083, + "learning_rate": 3.902045800656755e-05, + "loss": 0.0002, + "step": 312 + }, + { + "epoch": 0.40064, + "grad_norm": 0.002952362410724163, + "learning_rate": 3.901235378494065e-05, + "loss": 0.0002, + "step": 313 + }, + { + "epoch": 0.40192, + "grad_norm": 0.0025355329271405935, + "learning_rate": 3.9004217121787354e-05, + "loss": 0.0002, + "step": 314 + }, + { + "epoch": 0.4032, + "grad_norm": 0.002028323709964752, + "learning_rate": 3.899604803262377e-05, + "loss": 0.0002, + "step": 315 + }, + { + "epoch": 0.40448, + "grad_norm": 0.001996732084080577, + "learning_rate": 3.898784653302786e-05, + "loss": 0.0002, + "step": 316 + }, + { + "epoch": 0.40576, + "grad_norm": 0.0018237540498375893, + "learning_rate": 3.897961263863938e-05, + "loss": 0.0002, + "step": 317 + }, + { + "epoch": 0.40704, + "grad_norm": 0.002562148030847311, + "learning_rate": 3.897134636515987e-05, + "loss": 0.0002, + "step": 318 + }, + { + "epoch": 0.40832, + "grad_norm": 0.00395054230466485, + "learning_rate": 3.896304772835263e-05, + "loss": 0.0002, + "step": 319 + }, + { + "epoch": 0.4096, + "grad_norm": 0.004901579115539789, + "learning_rate": 3.895471674404264e-05, + "loss": 0.0002, + "step": 320 + }, + { + "epoch": 0.41088, + "grad_norm": 0.0055976551957428455, + "learning_rate": 3.894635342811657e-05, + "loss": 0.0002, + "step": 321 + }, + { + "epoch": 0.41216, + "grad_norm": 0.006354920100420713, + "learning_rate": 3.893795779652278e-05, + "loss": 0.0002, + "step": 322 + }, + { + "epoch": 0.41344, + "grad_norm": 0.008057119324803352, + "learning_rate": 3.892952986527122e-05, + "loss": 0.0002, + "step": 323 + }, + { + "epoch": 0.41472, + "grad_norm": 0.00950048491358757, + "learning_rate": 3.892106965043344e-05, + "loss": 0.0002, + "step": 324 + }, + { + "epoch": 0.416, + "grad_norm": 0.009841698221862316, + "learning_rate": 3.891257716814256e-05, + "loss": 0.0002, + "step": 325 + }, + { + "epoch": 0.41728, + "grad_norm": 0.008819937705993652, + "learning_rate": 3.890405243459322e-05, + "loss": 0.0002, + "step": 326 + }, + { + "epoch": 0.41856, + "grad_norm": 0.004502292722463608, + "learning_rate": 3.889549546604159e-05, + "loss": 0.0002, + "step": 327 + }, + { + "epoch": 0.41984, + "grad_norm": 0.001475517638027668, + "learning_rate": 3.888690627880527e-05, + "loss": 0.0002, + "step": 328 + }, + { + "epoch": 0.42112, + "grad_norm": 0.004811195190995932, + "learning_rate": 3.8878284889263326e-05, + "loss": 0.0002, + "step": 329 + }, + { + "epoch": 0.4224, + "grad_norm": 0.003931612242013216, + "learning_rate": 3.886963131385623e-05, + "loss": 0.0002, + "step": 330 + }, + { + "epoch": 0.4224, + "eval_loss": 1.1920558214187622, + "eval_runtime": 43.3287, + "eval_samples_per_second": 11.586, + "eval_steps_per_second": 1.454, + "step": 330 + }, + { + "epoch": 0.42368, + "grad_norm": 0.00033131142845377326, + "learning_rate": 3.886094556908581e-05, + "loss": 0.0002, + "step": 331 + }, + { + "epoch": 0.42496, + "grad_norm": 0.003252562368288636, + "learning_rate": 3.8852227671515274e-05, + "loss": 0.0002, + "step": 332 + }, + { + "epoch": 0.42624, + "grad_norm": 0.005851993802934885, + "learning_rate": 3.8843477637769106e-05, + "loss": 0.0002, + "step": 333 + }, + { + "epoch": 0.42752, + "grad_norm": 0.00618679728358984, + "learning_rate": 3.8834695484533104e-05, + "loss": 0.0002, + "step": 334 + }, + { + "epoch": 0.4288, + "grad_norm": 0.0034405761398375034, + "learning_rate": 3.8825881228554295e-05, + "loss": 0.0002, + "step": 335 + }, + { + "epoch": 0.43008, + "grad_norm": 0.0008351071155630052, + "learning_rate": 3.8817034886640924e-05, + "loss": 0.0002, + "step": 336 + }, + { + "epoch": 0.43136, + "grad_norm": 0.0040811290964484215, + "learning_rate": 3.880815647566244e-05, + "loss": 0.0002, + "step": 337 + }, + { + "epoch": 0.43264, + "grad_norm": 0.0059448895044624805, + "learning_rate": 3.879924601254943e-05, + "loss": 0.0002, + "step": 338 + }, + { + "epoch": 0.43392, + "grad_norm": 0.006282978691160679, + "learning_rate": 3.87903035142936e-05, + "loss": 0.0002, + "step": 339 + }, + { + "epoch": 0.4352, + "grad_norm": 0.00391614343971014, + "learning_rate": 3.878132899794776e-05, + "loss": 0.0002, + "step": 340 + }, + { + "epoch": 0.43648, + "grad_norm": 0.0001543579564895481, + "learning_rate": 3.877232248062576e-05, + "loss": 0.0002, + "step": 341 + }, + { + "epoch": 0.43776, + "grad_norm": 0.0030169414822012186, + "learning_rate": 3.876328397950249e-05, + "loss": 0.0002, + "step": 342 + }, + { + "epoch": 0.43904, + "grad_norm": 0.004486779682338238, + "learning_rate": 3.8754213511813815e-05, + "loss": 0.0002, + "step": 343 + }, + { + "epoch": 0.44032, + "grad_norm": 0.004293914418667555, + "learning_rate": 3.874511109485658e-05, + "loss": 0.0002, + "step": 344 + }, + { + "epoch": 0.4416, + "grad_norm": 0.0029261987656354904, + "learning_rate": 3.873597674598853e-05, + "loss": 0.0002, + "step": 345 + }, + { + "epoch": 0.44288, + "grad_norm": 0.0002709278487600386, + "learning_rate": 3.872681048262832e-05, + "loss": 0.0002, + "step": 346 + }, + { + "epoch": 0.44416, + "grad_norm": 0.0035741126630455256, + "learning_rate": 3.871761232225544e-05, + "loss": 0.0002, + "step": 347 + }, + { + "epoch": 0.44544, + "grad_norm": 0.005375510081648827, + "learning_rate": 3.870838228241025e-05, + "loss": 0.0002, + "step": 348 + }, + { + "epoch": 0.44672, + "grad_norm": 0.004629251081496477, + "learning_rate": 3.869912038069387e-05, + "loss": 0.0002, + "step": 349 + }, + { + "epoch": 0.448, + "grad_norm": 0.0019839161541312933, + "learning_rate": 3.868982663476817e-05, + "loss": 0.0002, + "step": 350 + }, + { + "epoch": 0.44928, + "grad_norm": 0.0009309860761277378, + "learning_rate": 3.868050106235578e-05, + "loss": 0.0002, + "step": 351 + }, + { + "epoch": 0.45056, + "grad_norm": 0.00313609279692173, + "learning_rate": 3.867114368123998e-05, + "loss": 0.0002, + "step": 352 + }, + { + "epoch": 0.45184, + "grad_norm": 0.0037451910320669413, + "learning_rate": 3.866175450926474e-05, + "loss": 0.0002, + "step": 353 + }, + { + "epoch": 0.45312, + "grad_norm": 0.002794757019728422, + "learning_rate": 3.865233356433464e-05, + "loss": 0.0002, + "step": 354 + }, + { + "epoch": 0.4544, + "grad_norm": 0.0012516654096543789, + "learning_rate": 3.864288086441485e-05, + "loss": 0.0002, + "step": 355 + }, + { + "epoch": 0.45568, + "grad_norm": 0.00018904400349128991, + "learning_rate": 3.8633396427531096e-05, + "loss": 0.0002, + "step": 356 + }, + { + "epoch": 0.45696, + "grad_norm": 0.0005176289123483002, + "learning_rate": 3.862388027176962e-05, + "loss": 0.0002, + "step": 357 + }, + { + "epoch": 0.45824, + "grad_norm": 0.00018633867148309946, + "learning_rate": 3.8614332415277155e-05, + "loss": 0.0002, + "step": 358 + }, + { + "epoch": 0.45952, + "grad_norm": 0.0004992458852939308, + "learning_rate": 3.8604752876260885e-05, + "loss": 0.0002, + "step": 359 + }, + { + "epoch": 0.4608, + "grad_norm": 0.0002147257764590904, + "learning_rate": 3.859514167298841e-05, + "loss": 0.0002, + "step": 360 + }, + { + "epoch": 0.4608, + "eval_loss": 1.1968976259231567, + "eval_runtime": 43.4271, + "eval_samples_per_second": 11.56, + "eval_steps_per_second": 1.451, + "step": 360 + }, + { + "epoch": 0.46208, + "grad_norm": 0.0015842154389247298, + "learning_rate": 3.8585498823787707e-05, + "loss": 0.0002, + "step": 361 + }, + { + "epoch": 0.46336, + "grad_norm": 0.003454265184700489, + "learning_rate": 3.8575824347047115e-05, + "loss": 0.0002, + "step": 362 + }, + { + "epoch": 0.46464, + "grad_norm": 0.004604188725352287, + "learning_rate": 3.856611826121526e-05, + "loss": 0.0002, + "step": 363 + }, + { + "epoch": 0.46592, + "grad_norm": 0.0045526353642344475, + "learning_rate": 3.8556380584801075e-05, + "loss": 0.0002, + "step": 364 + }, + { + "epoch": 0.4672, + "grad_norm": 0.0031676136422902346, + "learning_rate": 3.85466113363737e-05, + "loss": 0.0002, + "step": 365 + }, + { + "epoch": 0.46848, + "grad_norm": 0.0005085321608930826, + "learning_rate": 3.853681053456252e-05, + "loss": 0.0002, + "step": 366 + }, + { + "epoch": 0.46976, + "grad_norm": 0.0029830520506948233, + "learning_rate": 3.852697819805705e-05, + "loss": 0.0002, + "step": 367 + }, + { + "epoch": 0.47104, + "grad_norm": 0.0048976195976138115, + "learning_rate": 3.851711434560697e-05, + "loss": 0.0002, + "step": 368 + }, + { + "epoch": 0.47232, + "grad_norm": 0.005438893102109432, + "learning_rate": 3.850721899602204e-05, + "loss": 0.0002, + "step": 369 + }, + { + "epoch": 0.4736, + "grad_norm": 0.0041366140358150005, + "learning_rate": 3.84972921681721e-05, + "loss": 0.0002, + "step": 370 + }, + { + "epoch": 0.47488, + "grad_norm": 0.0002860951062757522, + "learning_rate": 3.8487333880987e-05, + "loss": 0.0002, + "step": 371 + }, + { + "epoch": 0.47616, + "grad_norm": 0.004208084661513567, + "learning_rate": 3.84773441534566e-05, + "loss": 0.0002, + "step": 372 + }, + { + "epoch": 0.47744, + "grad_norm": 0.006803308613598347, + "learning_rate": 3.8467323004630695e-05, + "loss": 0.0002, + "step": 373 + }, + { + "epoch": 0.47872, + "grad_norm": 0.006709063891321421, + "learning_rate": 3.8457270453619e-05, + "loss": 0.0002, + "step": 374 + }, + { + "epoch": 0.48, + "grad_norm": 0.005010607186704874, + "learning_rate": 3.844718651959115e-05, + "loss": 0.0002, + "step": 375 + }, + { + "epoch": 0.48128, + "grad_norm": 0.002244663191959262, + "learning_rate": 3.843707122177656e-05, + "loss": 0.0002, + "step": 376 + }, + { + "epoch": 0.48256, + "grad_norm": 0.001649985439144075, + "learning_rate": 3.842692457946452e-05, + "loss": 0.0002, + "step": 377 + }, + { + "epoch": 0.48384, + "grad_norm": 0.004389249719679356, + "learning_rate": 3.841674661200403e-05, + "loss": 0.0002, + "step": 378 + }, + { + "epoch": 0.48512, + "grad_norm": 0.004841000307351351, + "learning_rate": 3.840653733880388e-05, + "loss": 0.0002, + "step": 379 + }, + { + "epoch": 0.4864, + "grad_norm": 0.003227783599868417, + "learning_rate": 3.839629677933253e-05, + "loss": 0.0002, + "step": 380 + }, + { + "epoch": 0.48768, + "grad_norm": 0.00019390507077332586, + "learning_rate": 3.8386024953118105e-05, + "loss": 0.0002, + "step": 381 + }, + { + "epoch": 0.48896, + "grad_norm": 0.002688514068722725, + "learning_rate": 3.837572187974835e-05, + "loss": 0.0002, + "step": 382 + }, + { + "epoch": 0.49024, + "grad_norm": 0.003430665237829089, + "learning_rate": 3.83653875788706e-05, + "loss": 0.0002, + "step": 383 + }, + { + "epoch": 0.49152, + "grad_norm": 0.0023852745071053505, + "learning_rate": 3.8355022070191745e-05, + "loss": 0.0002, + "step": 384 + }, + { + "epoch": 0.4928, + "grad_norm": 0.00032828684197738767, + "learning_rate": 3.834462537347817e-05, + "loss": 0.0002, + "step": 385 + }, + { + "epoch": 0.49408, + "grad_norm": 0.001501849154010415, + "learning_rate": 3.833419750855577e-05, + "loss": 0.0002, + "step": 386 + }, + { + "epoch": 0.49536, + "grad_norm": 0.002512734616175294, + "learning_rate": 3.832373849530982e-05, + "loss": 0.0002, + "step": 387 + }, + { + "epoch": 0.49664, + "grad_norm": 0.0017667177598923445, + "learning_rate": 3.831324835368505e-05, + "loss": 0.0002, + "step": 388 + }, + { + "epoch": 0.49792, + "grad_norm": 0.00017293778364546597, + "learning_rate": 3.83027271036855e-05, + "loss": 0.0002, + "step": 389 + }, + { + "epoch": 0.4992, + "grad_norm": 0.0009446240146644413, + "learning_rate": 3.829217476537457e-05, + "loss": 0.0002, + "step": 390 + }, + { + "epoch": 0.4992, + "eval_loss": 1.193045735359192, + "eval_runtime": 43.3135, + "eval_samples_per_second": 11.59, + "eval_steps_per_second": 1.455, + "step": 390 + }, + { + "epoch": 0.50048, + "grad_norm": 0.0007563307299278677, + "learning_rate": 3.828159135887493e-05, + "loss": 0.0002, + "step": 391 + }, + { + "epoch": 0.50176, + "grad_norm": 0.000485950440634042, + "learning_rate": 3.827097690436848e-05, + "loss": 0.0002, + "step": 392 + }, + { + "epoch": 0.50304, + "grad_norm": 0.0014980818377807736, + "learning_rate": 3.826033142209636e-05, + "loss": 0.0002, + "step": 393 + }, + { + "epoch": 0.50432, + "grad_norm": 0.002262465190142393, + "learning_rate": 3.824965493235885e-05, + "loss": 0.0002, + "step": 394 + }, + { + "epoch": 0.5056, + "grad_norm": 0.0031683961860835552, + "learning_rate": 3.823894745551536e-05, + "loss": 0.0002, + "step": 395 + }, + { + "epoch": 0.50688, + "grad_norm": 0.0036339478101581335, + "learning_rate": 3.8228209011984404e-05, + "loss": 0.0002, + "step": 396 + }, + { + "epoch": 0.50816, + "grad_norm": 0.003546744817867875, + "learning_rate": 3.821743962224355e-05, + "loss": 0.0002, + "step": 397 + }, + { + "epoch": 0.50944, + "grad_norm": 0.002480078022927046, + "learning_rate": 3.820663930682936e-05, + "loss": 0.0002, + "step": 398 + }, + { + "epoch": 0.51072, + "grad_norm": 0.0004905156674794853, + "learning_rate": 3.819580808633739e-05, + "loss": 0.0002, + "step": 399 + }, + { + "epoch": 0.512, + "grad_norm": 0.0015791412442922592, + "learning_rate": 3.818494598142211e-05, + "loss": 0.0002, + "step": 400 + }, + { + "epoch": 0.51328, + "grad_norm": 0.0024596957955509424, + "learning_rate": 3.81740530127969e-05, + "loss": 0.0002, + "step": 401 + }, + { + "epoch": 0.51456, + "grad_norm": 0.002662543673068285, + "learning_rate": 3.8163129201234e-05, + "loss": 0.0002, + "step": 402 + }, + { + "epoch": 0.51584, + "grad_norm": 0.002260034205392003, + "learning_rate": 3.815217456756444e-05, + "loss": 0.0002, + "step": 403 + }, + { + "epoch": 0.51712, + "grad_norm": 0.0009084303746931255, + "learning_rate": 3.8141189132678045e-05, + "loss": 0.0002, + "step": 404 + }, + { + "epoch": 0.5184, + "grad_norm": 0.0014930436154827476, + "learning_rate": 3.813017291752338e-05, + "loss": 0.0002, + "step": 405 + }, + { + "epoch": 0.51968, + "grad_norm": 0.004289396107196808, + "learning_rate": 3.8119125943107694e-05, + "loss": 0.0002, + "step": 406 + }, + { + "epoch": 0.52096, + "grad_norm": 0.006647216156125069, + "learning_rate": 3.8108048230496896e-05, + "loss": 0.0002, + "step": 407 + }, + { + "epoch": 0.52224, + "grad_norm": 0.008235771209001541, + "learning_rate": 3.8096939800815514e-05, + "loss": 0.0002, + "step": 408 + }, + { + "epoch": 0.52352, + "grad_norm": 0.007829473353922367, + "learning_rate": 3.808580067524665e-05, + "loss": 0.0002, + "step": 409 + }, + { + "epoch": 0.5248, + "grad_norm": 0.005282750353217125, + "learning_rate": 3.807463087503194e-05, + "loss": 0.0002, + "step": 410 + }, + { + "epoch": 0.52608, + "grad_norm": 0.0014312955318018794, + "learning_rate": 3.806343042147151e-05, + "loss": 0.0002, + "step": 411 + }, + { + "epoch": 0.52736, + "grad_norm": 0.0028434335254132748, + "learning_rate": 3.805219933592396e-05, + "loss": 0.0002, + "step": 412 + }, + { + "epoch": 0.52864, + "grad_norm": 0.005742455366998911, + "learning_rate": 3.804093763980627e-05, + "loss": 0.0002, + "step": 413 + }, + { + "epoch": 0.52992, + "grad_norm": 0.006840975023806095, + "learning_rate": 3.8029645354593825e-05, + "loss": 0.0002, + "step": 414 + }, + { + "epoch": 0.5312, + "grad_norm": 0.004969259258359671, + "learning_rate": 3.8018322501820314e-05, + "loss": 0.0002, + "step": 415 + }, + { + "epoch": 0.53248, + "grad_norm": 0.0005858050426468253, + "learning_rate": 3.8006969103077736e-05, + "loss": 0.0002, + "step": 416 + }, + { + "epoch": 0.53376, + "grad_norm": 0.0043167779222130775, + "learning_rate": 3.799558518001635e-05, + "loss": 0.0002, + "step": 417 + }, + { + "epoch": 0.53504, + "grad_norm": 0.006647319067269564, + "learning_rate": 3.7984170754344584e-05, + "loss": 0.0002, + "step": 418 + }, + { + "epoch": 0.53632, + "grad_norm": 0.004266531206667423, + "learning_rate": 3.797272584782906e-05, + "loss": 0.0002, + "step": 419 + }, + { + "epoch": 0.5376, + "grad_norm": 0.00030707393307238817, + "learning_rate": 3.796125048229453e-05, + "loss": 0.0002, + "step": 420 + }, + { + "epoch": 0.5376, + "eval_loss": 1.2100694179534912, + "eval_runtime": 44.2412, + "eval_samples_per_second": 11.347, + "eval_steps_per_second": 1.424, + "step": 420 + }, + { + "epoch": 0.53888, + "grad_norm": 0.004020696971565485, + "learning_rate": 3.794974467962382e-05, + "loss": 0.0002, + "step": 421 + }, + { + "epoch": 0.54016, + "grad_norm": 0.005686239339411259, + "learning_rate": 3.793820846175778e-05, + "loss": 0.0002, + "step": 422 + }, + { + "epoch": 0.54144, + "grad_norm": 0.004134351387619972, + "learning_rate": 3.79266418506953e-05, + "loss": 0.0002, + "step": 423 + }, + { + "epoch": 0.54272, + "grad_norm": 0.0006439160206355155, + "learning_rate": 3.791504486849318e-05, + "loss": 0.0002, + "step": 424 + }, + { + "epoch": 0.544, + "grad_norm": 0.0027023518923670053, + "learning_rate": 3.790341753726618e-05, + "loss": 0.0002, + "step": 425 + }, + { + "epoch": 0.54528, + "grad_norm": 0.0032724770717322826, + "learning_rate": 3.7891759879186924e-05, + "loss": 0.0002, + "step": 426 + }, + { + "epoch": 0.54656, + "grad_norm": 0.0012540655443444848, + "learning_rate": 3.788007191648583e-05, + "loss": 0.0002, + "step": 427 + }, + { + "epoch": 0.54784, + "grad_norm": 0.0018252013251185417, + "learning_rate": 3.786835367145116e-05, + "loss": 0.0002, + "step": 428 + }, + { + "epoch": 0.54912, + "grad_norm": 0.004245402291417122, + "learning_rate": 3.785660516642888e-05, + "loss": 0.0002, + "step": 429 + }, + { + "epoch": 0.5504, + "grad_norm": 0.004345291759818792, + "learning_rate": 3.7844826423822685e-05, + "loss": 0.0002, + "step": 430 + }, + { + "epoch": 0.55168, + "grad_norm": 0.0018071354134008288, + "learning_rate": 3.783301746609391e-05, + "loss": 0.0002, + "step": 431 + }, + { + "epoch": 0.55296, + "grad_norm": 0.00209009344689548, + "learning_rate": 3.782117831576153e-05, + "loss": 0.0002, + "step": 432 + }, + { + "epoch": 0.55424, + "grad_norm": 0.004698239266872406, + "learning_rate": 3.7809308995402104e-05, + "loss": 0.0002, + "step": 433 + }, + { + "epoch": 0.55552, + "grad_norm": 0.004285912495106459, + "learning_rate": 3.779740952764968e-05, + "loss": 0.0002, + "step": 434 + }, + { + "epoch": 0.5568, + "grad_norm": 0.001513406983576715, + "learning_rate": 3.778547993519582e-05, + "loss": 0.0002, + "step": 435 + }, + { + "epoch": 0.55808, + "grad_norm": 0.0010023791110143065, + "learning_rate": 3.777352024078955e-05, + "loss": 0.0002, + "step": 436 + }, + { + "epoch": 0.55936, + "grad_norm": 0.001528212451376021, + "learning_rate": 3.776153046723727e-05, + "loss": 0.0002, + "step": 437 + }, + { + "epoch": 0.56064, + "grad_norm": 0.0007561501697637141, + "learning_rate": 3.774951063740275e-05, + "loss": 0.0002, + "step": 438 + }, + { + "epoch": 0.56192, + "grad_norm": 0.0004925573011860251, + "learning_rate": 3.7737460774207094e-05, + "loss": 0.0002, + "step": 439 + }, + { + "epoch": 0.5632, + "grad_norm": 0.0008101520943455398, + "learning_rate": 3.772538090062863e-05, + "loss": 0.0002, + "step": 440 + }, + { + "epoch": 0.56448, + "grad_norm": 0.0004090173461008817, + "learning_rate": 3.771327103970297e-05, + "loss": 0.0002, + "step": 441 + }, + { + "epoch": 0.56576, + "grad_norm": 0.0022296542301774025, + "learning_rate": 3.770113121452287e-05, + "loss": 0.0002, + "step": 442 + }, + { + "epoch": 0.56704, + "grad_norm": 0.003427501767873764, + "learning_rate": 3.768896144823825e-05, + "loss": 0.0002, + "step": 443 + }, + { + "epoch": 0.56832, + "grad_norm": 0.003229739610105753, + "learning_rate": 3.76767617640561e-05, + "loss": 0.0002, + "step": 444 + }, + { + "epoch": 0.5696, + "grad_norm": 0.0014694056008011103, + "learning_rate": 3.76645321852405e-05, + "loss": 0.0002, + "step": 445 + }, + { + "epoch": 0.57088, + "grad_norm": 0.0008542861323803663, + "learning_rate": 3.7652272735112494e-05, + "loss": 0.0002, + "step": 446 + }, + { + "epoch": 0.57216, + "grad_norm": 0.0027981793973594904, + "learning_rate": 3.763998343705012e-05, + "loss": 0.0002, + "step": 447 + }, + { + "epoch": 0.57344, + "grad_norm": 0.003584227291867137, + "learning_rate": 3.7627664314488324e-05, + "loss": 0.0002, + "step": 448 + }, + { + "epoch": 0.57472, + "grad_norm": 0.002387993037700653, + "learning_rate": 3.761531539091892e-05, + "loss": 0.0002, + "step": 449 + }, + { + "epoch": 0.576, + "grad_norm": 0.00032775092404335737, + "learning_rate": 3.760293668989055e-05, + "loss": 0.0002, + "step": 450 + }, + { + "epoch": 0.576, + "eval_loss": 1.2023624181747437, + "eval_runtime": 43.8775, + "eval_samples_per_second": 11.441, + "eval_steps_per_second": 1.436, + "step": 450 + }, + { + "epoch": 0.57728, + "grad_norm": 0.0023032776080071926, + "learning_rate": 3.7590528235008677e-05, + "loss": 0.0002, + "step": 451 + }, + { + "epoch": 0.57856, + "grad_norm": 0.0027386387810111046, + "learning_rate": 3.757809004993543e-05, + "loss": 0.0002, + "step": 452 + }, + { + "epoch": 0.57984, + "grad_norm": 0.001904963981360197, + "learning_rate": 3.75656221583897e-05, + "loss": 0.0002, + "step": 453 + }, + { + "epoch": 0.58112, + "grad_norm": 0.0004317044804338366, + "learning_rate": 3.7553124584146986e-05, + "loss": 0.0002, + "step": 454 + }, + { + "epoch": 0.5824, + "grad_norm": 0.0009737997897900641, + "learning_rate": 3.754059735103941e-05, + "loss": 0.0002, + "step": 455 + }, + { + "epoch": 0.58368, + "grad_norm": 0.0012084428453817964, + "learning_rate": 3.7528040482955655e-05, + "loss": 0.0002, + "step": 456 + }, + { + "epoch": 0.58496, + "grad_norm": 0.0005159890279173851, + "learning_rate": 3.751545400384089e-05, + "loss": 0.0002, + "step": 457 + }, + { + "epoch": 0.58624, + "grad_norm": 0.0005341696669347584, + "learning_rate": 3.7502837937696776e-05, + "loss": 0.0002, + "step": 458 + }, + { + "epoch": 0.58752, + "grad_norm": 0.0014383515808731318, + "learning_rate": 3.749019230858138e-05, + "loss": 0.0002, + "step": 459 + }, + { + "epoch": 0.5888, + "grad_norm": 0.002283843234181404, + "learning_rate": 3.747751714060915e-05, + "loss": 0.0002, + "step": 460 + }, + { + "epoch": 0.59008, + "grad_norm": 0.0026318333111703396, + "learning_rate": 3.746481245795087e-05, + "loss": 0.0002, + "step": 461 + }, + { + "epoch": 0.59136, + "grad_norm": 0.0022059311158955097, + "learning_rate": 3.745207828483359e-05, + "loss": 0.0002, + "step": 462 + }, + { + "epoch": 0.59264, + "grad_norm": 0.0014173684176057577, + "learning_rate": 3.743931464554061e-05, + "loss": 0.0002, + "step": 463 + }, + { + "epoch": 0.59392, + "grad_norm": 0.00021343116532079875, + "learning_rate": 3.742652156441142e-05, + "loss": 0.0002, + "step": 464 + }, + { + "epoch": 0.5952, + "grad_norm": 0.0014344080118462443, + "learning_rate": 3.741369906584164e-05, + "loss": 0.0002, + "step": 465 + }, + { + "epoch": 0.59648, + "grad_norm": 0.0022634316701442003, + "learning_rate": 3.7400847174283004e-05, + "loss": 0.0002, + "step": 466 + }, + { + "epoch": 0.59776, + "grad_norm": 0.0016528741689398885, + "learning_rate": 3.738796591424328e-05, + "loss": 0.0002, + "step": 467 + }, + { + "epoch": 0.59904, + "grad_norm": 0.00032261331216432154, + "learning_rate": 3.7375055310286266e-05, + "loss": 0.0002, + "step": 468 + }, + { + "epoch": 0.60032, + "grad_norm": 0.001875245594419539, + "learning_rate": 3.736211538703169e-05, + "loss": 0.0002, + "step": 469 + }, + { + "epoch": 0.6016, + "grad_norm": 0.0023898689541965723, + "learning_rate": 3.7349146169155194e-05, + "loss": 0.0002, + "step": 470 + }, + { + "epoch": 0.60288, + "grad_norm": 0.0013310150243341923, + "learning_rate": 3.7336147681388294e-05, + "loss": 0.0002, + "step": 471 + }, + { + "epoch": 0.60416, + "grad_norm": 0.00098335649818182, + "learning_rate": 3.732311994851832e-05, + "loss": 0.0002, + "step": 472 + }, + { + "epoch": 0.60544, + "grad_norm": 0.0034482080955058336, + "learning_rate": 3.731006299538837e-05, + "loss": 0.0002, + "step": 473 + }, + { + "epoch": 0.60672, + "grad_norm": 0.004604153335094452, + "learning_rate": 3.7296976846897266e-05, + "loss": 0.0002, + "step": 474 + }, + { + "epoch": 0.608, + "grad_norm": 0.002799773123115301, + "learning_rate": 3.728386152799949e-05, + "loss": 0.0002, + "step": 475 + }, + { + "epoch": 0.60928, + "grad_norm": 0.0007603497942909598, + "learning_rate": 3.727071706370516e-05, + "loss": 0.0002, + "step": 476 + }, + { + "epoch": 0.61056, + "grad_norm": 0.0037390943616628647, + "learning_rate": 3.725754347907997e-05, + "loss": 0.0002, + "step": 477 + }, + { + "epoch": 0.61184, + "grad_norm": 0.004827678669244051, + "learning_rate": 3.724434079924515e-05, + "loss": 0.0002, + "step": 478 + }, + { + "epoch": 0.61312, + "grad_norm": 0.002908482449129224, + "learning_rate": 3.7231109049377414e-05, + "loss": 0.0002, + "step": 479 + }, + { + "epoch": 0.6144, + "grad_norm": 0.0007244603475555778, + "learning_rate": 3.721784825470889e-05, + "loss": 0.0002, + "step": 480 + }, + { + "epoch": 0.6144, + "eval_loss": 1.1689307689666748, + "eval_runtime": 42.2181, + "eval_samples_per_second": 11.891, + "eval_steps_per_second": 1.492, + "step": 480 + }, + { + "epoch": 0.61568, + "grad_norm": 0.004279144573956728, + "learning_rate": 3.7204558440527136e-05, + "loss": 0.0002, + "step": 481 + }, + { + "epoch": 0.61696, + "grad_norm": 0.0062600066885352135, + "learning_rate": 3.7191239632175e-05, + "loss": 0.0002, + "step": 482 + }, + { + "epoch": 0.61824, + "grad_norm": 0.005994696635752916, + "learning_rate": 3.717789185505064e-05, + "loss": 0.0002, + "step": 483 + }, + { + "epoch": 0.61952, + "grad_norm": 0.0035205462481826544, + "learning_rate": 3.7164515134607475e-05, + "loss": 0.0002, + "step": 484 + }, + { + "epoch": 0.6208, + "grad_norm": 0.00023122667334973812, + "learning_rate": 3.7151109496354086e-05, + "loss": 0.0002, + "step": 485 + }, + { + "epoch": 0.62208, + "grad_norm": 0.0031270990148186684, + "learning_rate": 3.713767496585421e-05, + "loss": 0.0002, + "step": 486 + }, + { + "epoch": 0.62336, + "grad_norm": 0.00399185623973608, + "learning_rate": 3.712421156872668e-05, + "loss": 0.0002, + "step": 487 + }, + { + "epoch": 0.62464, + "grad_norm": 0.0028176584746688604, + "learning_rate": 3.711071933064539e-05, + "loss": 0.0002, + "step": 488 + }, + { + "epoch": 0.62592, + "grad_norm": 0.00030795088969171047, + "learning_rate": 3.7097198277339216e-05, + "loss": 0.0002, + "step": 489 + }, + { + "epoch": 0.6272, + "grad_norm": 0.0023664638865739107, + "learning_rate": 3.708364843459198e-05, + "loss": 0.0002, + "step": 490 + }, + { + "epoch": 0.62848, + "grad_norm": 0.004323100205510855, + "learning_rate": 3.70700698282424e-05, + "loss": 0.0002, + "step": 491 + }, + { + "epoch": 0.62976, + "grad_norm": 0.0050489455461502075, + "learning_rate": 3.705646248418408e-05, + "loss": 0.0002, + "step": 492 + }, + { + "epoch": 0.63104, + "grad_norm": 0.0033712410368025303, + "learning_rate": 3.704282642836539e-05, + "loss": 0.0002, + "step": 493 + }, + { + "epoch": 0.63232, + "grad_norm": 0.0005509121110662818, + "learning_rate": 3.702916168678945e-05, + "loss": 0.0002, + "step": 494 + }, + { + "epoch": 0.6336, + "grad_norm": 0.0014277957379817963, + "learning_rate": 3.70154682855141e-05, + "loss": 0.0002, + "step": 495 + }, + { + "epoch": 0.63488, + "grad_norm": 0.001731383497826755, + "learning_rate": 3.700174625065183e-05, + "loss": 0.0002, + "step": 496 + }, + { + "epoch": 0.63616, + "grad_norm": 0.0005305620143190026, + "learning_rate": 3.698799560836972e-05, + "loss": 0.0002, + "step": 497 + }, + { + "epoch": 0.63744, + "grad_norm": 0.0011219007428735495, + "learning_rate": 3.697421638488941e-05, + "loss": 0.0002, + "step": 498 + }, + { + "epoch": 0.63872, + "grad_norm": 0.002674842020496726, + "learning_rate": 3.696040860648706e-05, + "loss": 0.0002, + "step": 499 + }, + { + "epoch": 0.64, + "grad_norm": 0.003906667232513428, + "learning_rate": 3.694657229949324e-05, + "loss": 0.0002, + "step": 500 + }, + { + "epoch": 0.64128, + "grad_norm": 0.0040111360140144825, + "learning_rate": 3.6932707490292965e-05, + "loss": 0.0002, + "step": 501 + }, + { + "epoch": 0.64256, + "grad_norm": 0.002312349621206522, + "learning_rate": 3.691881420532558e-05, + "loss": 0.0002, + "step": 502 + }, + { + "epoch": 0.64384, + "grad_norm": 0.0004572660254780203, + "learning_rate": 3.690489247108474e-05, + "loss": 0.0002, + "step": 503 + }, + { + "epoch": 0.64512, + "grad_norm": 0.0024236240424215794, + "learning_rate": 3.689094231411834e-05, + "loss": 0.0002, + "step": 504 + }, + { + "epoch": 0.6464, + "grad_norm": 0.002606478286907077, + "learning_rate": 3.6876963761028486e-05, + "loss": 0.0002, + "step": 505 + }, + { + "epoch": 0.64768, + "grad_norm": 0.001922594616189599, + "learning_rate": 3.686295683847144e-05, + "loss": 0.0002, + "step": 506 + }, + { + "epoch": 0.64896, + "grad_norm": 0.0006554339197464287, + "learning_rate": 3.684892157315754e-05, + "loss": 0.0002, + "step": 507 + }, + { + "epoch": 0.65024, + "grad_norm": 0.0007631480693817139, + "learning_rate": 3.683485799185119e-05, + "loss": 0.0002, + "step": 508 + }, + { + "epoch": 0.65152, + "grad_norm": 0.0010840974282473326, + "learning_rate": 3.682076612137079e-05, + "loss": 0.0002, + "step": 509 + }, + { + "epoch": 0.6528, + "grad_norm": 0.000218061642954126, + "learning_rate": 3.680664598858868e-05, + "loss": 0.0002, + "step": 510 + }, + { + "epoch": 0.6528, + "eval_loss": 1.1986668109893799, + "eval_runtime": 43.8576, + "eval_samples_per_second": 11.446, + "eval_steps_per_second": 1.436, + "step": 510 + }, + { + "epoch": 0.65408, + "grad_norm": 0.00179583253338933, + "learning_rate": 3.679249762043109e-05, + "loss": 0.0002, + "step": 511 + }, + { + "epoch": 0.65536, + "grad_norm": 0.0026319276075810194, + "learning_rate": 3.67783210438781e-05, + "loss": 0.0002, + "step": 512 + }, + { + "epoch": 0.65664, + "grad_norm": 0.0023609804920852184, + "learning_rate": 3.6764116285963595e-05, + "loss": 0.0002, + "step": 513 + }, + { + "epoch": 0.65792, + "grad_norm": 0.0012038566637784243, + "learning_rate": 3.6749883373775174e-05, + "loss": 0.0002, + "step": 514 + }, + { + "epoch": 0.6592, + "grad_norm": 0.00014525132428389043, + "learning_rate": 3.6735622334454145e-05, + "loss": 0.0002, + "step": 515 + }, + { + "epoch": 0.66048, + "grad_norm": 0.0007447360549122095, + "learning_rate": 3.6721333195195436e-05, + "loss": 0.0002, + "step": 516 + }, + { + "epoch": 0.66176, + "grad_norm": 0.0010004477808251977, + "learning_rate": 3.6707015983247575e-05, + "loss": 0.0002, + "step": 517 + }, + { + "epoch": 0.66304, + "grad_norm": 0.0005548037588596344, + "learning_rate": 3.6692670725912616e-05, + "loss": 0.0002, + "step": 518 + }, + { + "epoch": 0.66432, + "grad_norm": 0.0002495903172530234, + "learning_rate": 3.667829745054609e-05, + "loss": 0.0002, + "step": 519 + }, + { + "epoch": 0.6656, + "grad_norm": 0.0008480948745273054, + "learning_rate": 3.666389618455696e-05, + "loss": 0.0002, + "step": 520 + }, + { + "epoch": 0.66688, + "grad_norm": 0.00144975446164608, + "learning_rate": 3.6649466955407576e-05, + "loss": 0.0002, + "step": 521 + }, + { + "epoch": 0.66816, + "grad_norm": 0.0022027993109077215, + "learning_rate": 3.663500979061359e-05, + "loss": 0.0002, + "step": 522 + }, + { + "epoch": 0.66944, + "grad_norm": 0.002505333162844181, + "learning_rate": 3.6620524717743944e-05, + "loss": 0.0002, + "step": 523 + }, + { + "epoch": 0.67072, + "grad_norm": 0.0019165552221238613, + "learning_rate": 3.66060117644208e-05, + "loss": 0.0002, + "step": 524 + }, + { + "epoch": 0.672, + "grad_norm": 0.00015419794362969697, + "learning_rate": 3.6591470958319465e-05, + "loss": 0.0002, + "step": 525 + }, + { + "epoch": 0.67328, + "grad_norm": 0.0018403524300083518, + "learning_rate": 3.657690232716839e-05, + "loss": 0.0002, + "step": 526 + }, + { + "epoch": 0.67456, + "grad_norm": 0.0028898706659674644, + "learning_rate": 3.656230589874905e-05, + "loss": 0.0002, + "step": 527 + }, + { + "epoch": 0.67584, + "grad_norm": 0.0030745165422558784, + "learning_rate": 3.6547681700895976e-05, + "loss": 0.0002, + "step": 528 + }, + { + "epoch": 0.67712, + "grad_norm": 0.002408272586762905, + "learning_rate": 3.65330297614966e-05, + "loss": 0.0002, + "step": 529 + }, + { + "epoch": 0.6784, + "grad_norm": 0.0008627455099485815, + "learning_rate": 3.65183501084913e-05, + "loss": 0.0002, + "step": 530 + }, + { + "epoch": 0.67968, + "grad_norm": 0.0015142913907766342, + "learning_rate": 3.650364276987327e-05, + "loss": 0.0002, + "step": 531 + }, + { + "epoch": 0.68096, + "grad_norm": 0.00383846671320498, + "learning_rate": 3.648890777368852e-05, + "loss": 0.0002, + "step": 532 + }, + { + "epoch": 0.68224, + "grad_norm": 0.005133279133588076, + "learning_rate": 3.6474145148035786e-05, + "loss": 0.0002, + "step": 533 + }, + { + "epoch": 0.68352, + "grad_norm": 0.0045053777284920216, + "learning_rate": 3.645935492106651e-05, + "loss": 0.0002, + "step": 534 + }, + { + "epoch": 0.6848, + "grad_norm": 0.0019303944427520037, + "learning_rate": 3.644453712098475e-05, + "loss": 0.0002, + "step": 535 + }, + { + "epoch": 0.68608, + "grad_norm": 0.0006616184837184846, + "learning_rate": 3.6429691776047165e-05, + "loss": 0.0002, + "step": 536 + }, + { + "epoch": 0.68736, + "grad_norm": 0.0017754195723682642, + "learning_rate": 3.6414818914562905e-05, + "loss": 0.0002, + "step": 537 + }, + { + "epoch": 0.68864, + "grad_norm": 0.0016088245902210474, + "learning_rate": 3.639991856489363e-05, + "loss": 0.0002, + "step": 538 + }, + { + "epoch": 0.68992, + "grad_norm": 0.0012247121194377542, + "learning_rate": 3.6384990755453396e-05, + "loss": 0.0002, + "step": 539 + }, + { + "epoch": 0.6912, + "grad_norm": 0.0010904190130531788, + "learning_rate": 3.637003551470863e-05, + "loss": 0.0002, + "step": 540 + }, + { + "epoch": 0.6912, + "eval_loss": 1.180873990058899, + "eval_runtime": 42.9212, + "eval_samples_per_second": 11.696, + "eval_steps_per_second": 1.468, + "step": 540 + }, + { + "epoch": 0.69248, + "grad_norm": 0.001041168114170432, + "learning_rate": 3.635505287117807e-05, + "loss": 0.0002, + "step": 541 + }, + { + "epoch": 0.69376, + "grad_norm": 0.00024401405244134367, + "learning_rate": 3.634004285343271e-05, + "loss": 0.0002, + "step": 542 + }, + { + "epoch": 0.69504, + "grad_norm": 0.0008669750532135367, + "learning_rate": 3.6325005490095746e-05, + "loss": 0.0002, + "step": 543 + }, + { + "epoch": 0.69632, + "grad_norm": 0.0014739528996869922, + "learning_rate": 3.630994080984251e-05, + "loss": 0.0002, + "step": 544 + }, + { + "epoch": 0.6976, + "grad_norm": 0.0015093573601916432, + "learning_rate": 3.629484884140044e-05, + "loss": 0.0002, + "step": 545 + }, + { + "epoch": 0.69888, + "grad_norm": 0.0003642103401944041, + "learning_rate": 3.627972961354899e-05, + "loss": 0.0002, + "step": 546 + }, + { + "epoch": 0.70016, + "grad_norm": 0.001804412342607975, + "learning_rate": 3.626458315511963e-05, + "loss": 0.0002, + "step": 547 + }, + { + "epoch": 0.70144, + "grad_norm": 0.0034682515542954206, + "learning_rate": 3.6249409494995724e-05, + "loss": 0.0002, + "step": 548 + }, + { + "epoch": 0.70272, + "grad_norm": 0.004247341770678759, + "learning_rate": 3.6234208662112524e-05, + "loss": 0.0002, + "step": 549 + }, + { + "epoch": 0.704, + "grad_norm": 0.004289446864277124, + "learning_rate": 3.621898068545711e-05, + "loss": 0.0002, + "step": 550 + }, + { + "epoch": 0.70528, + "grad_norm": 0.0029438326600939035, + "learning_rate": 3.620372559406828e-05, + "loss": 0.0002, + "step": 551 + }, + { + "epoch": 0.70656, + "grad_norm": 0.0008173590758815408, + "learning_rate": 3.618844341703659e-05, + "loss": 0.0002, + "step": 552 + }, + { + "epoch": 0.70784, + "grad_norm": 0.00509173097088933, + "learning_rate": 3.617313418350422e-05, + "loss": 0.0002, + "step": 553 + }, + { + "epoch": 0.70912, + "grad_norm": 0.0070475186221301556, + "learning_rate": 3.6157797922664946e-05, + "loss": 0.0002, + "step": 554 + }, + { + "epoch": 0.7104, + "grad_norm": 0.0052854400128126144, + "learning_rate": 3.614243466376409e-05, + "loss": 0.0002, + "step": 555 + }, + { + "epoch": 0.71168, + "grad_norm": 0.00032702955650165677, + "learning_rate": 3.6127044436098445e-05, + "loss": 0.0002, + "step": 556 + }, + { + "epoch": 0.71296, + "grad_norm": 0.006080043967813253, + "learning_rate": 3.611162726901626e-05, + "loss": 0.0002, + "step": 557 + }, + { + "epoch": 0.71424, + "grad_norm": 0.007932756096124649, + "learning_rate": 3.609618319191712e-05, + "loss": 0.0002, + "step": 558 + }, + { + "epoch": 0.71552, + "grad_norm": 0.006158908829092979, + "learning_rate": 3.608071223425195e-05, + "loss": 0.0002, + "step": 559 + }, + { + "epoch": 0.7168, + "grad_norm": 0.002953893970698118, + "learning_rate": 3.606521442552293e-05, + "loss": 0.0002, + "step": 560 + }, + { + "epoch": 0.71808, + "grad_norm": 0.0007969742291606963, + "learning_rate": 3.604968979528345e-05, + "loss": 0.0002, + "step": 561 + }, + { + "epoch": 0.71936, + "grad_norm": 0.0039662644267082214, + "learning_rate": 3.603413837313801e-05, + "loss": 0.0002, + "step": 562 + }, + { + "epoch": 0.72064, + "grad_norm": 0.005224650260061026, + "learning_rate": 3.601856018874224e-05, + "loss": 0.0002, + "step": 563 + }, + { + "epoch": 0.72192, + "grad_norm": 0.004226877354085445, + "learning_rate": 3.600295527180281e-05, + "loss": 0.0002, + "step": 564 + }, + { + "epoch": 0.7232, + "grad_norm": 0.0011959822149947286, + "learning_rate": 3.5987323652077326e-05, + "loss": 0.0002, + "step": 565 + }, + { + "epoch": 0.72448, + "grad_norm": 0.0025563903618603945, + "learning_rate": 3.597166535937436e-05, + "loss": 0.0002, + "step": 566 + }, + { + "epoch": 0.72576, + "grad_norm": 0.005399333778768778, + "learning_rate": 3.595598042355331e-05, + "loss": 0.0002, + "step": 567 + }, + { + "epoch": 0.72704, + "grad_norm": 0.005194221623241901, + "learning_rate": 3.5940268874524396e-05, + "loss": 0.0002, + "step": 568 + }, + { + "epoch": 0.72832, + "grad_norm": 0.0018591524567455053, + "learning_rate": 3.5924530742248595e-05, + "loss": 0.0002, + "step": 569 + }, + { + "epoch": 0.7296, + "grad_norm": 0.003616099478676915, + "learning_rate": 3.590876605673758e-05, + "loss": 0.0002, + "step": 570 + }, + { + "epoch": 0.7296, + "eval_loss": 1.2382491827011108, + "eval_runtime": 45.2439, + "eval_samples_per_second": 11.095, + "eval_steps_per_second": 1.392, + "step": 570 + }, + { + "epoch": 0.73088, + "grad_norm": 0.007196330465376377, + "learning_rate": 3.589297484805363e-05, + "loss": 0.0002, + "step": 571 + }, + { + "epoch": 0.73216, + "grad_norm": 0.005559357814490795, + "learning_rate": 3.587715714630963e-05, + "loss": 0.0002, + "step": 572 + }, + { + "epoch": 0.73344, + "grad_norm": 0.0007927911938168108, + "learning_rate": 3.5861312981668984e-05, + "loss": 0.0002, + "step": 573 + }, + { + "epoch": 0.73472, + "grad_norm": 0.0043013193644583225, + "learning_rate": 3.5845442384345546e-05, + "loss": 0.0002, + "step": 574 + }, + { + "epoch": 0.736, + "grad_norm": 0.0068841902539134026, + "learning_rate": 3.582954538460359e-05, + "loss": 0.0002, + "step": 575 + }, + { + "epoch": 0.73728, + "grad_norm": 0.005568365100771189, + "learning_rate": 3.581362201275771e-05, + "loss": 0.0002, + "step": 576 + }, + { + "epoch": 0.73856, + "grad_norm": 0.0008979937992990017, + "learning_rate": 3.579767229917284e-05, + "loss": 0.0002, + "step": 577 + }, + { + "epoch": 0.73984, + "grad_norm": 0.005714059807360172, + "learning_rate": 3.5781696274264083e-05, + "loss": 0.0002, + "step": 578 + }, + { + "epoch": 0.74112, + "grad_norm": 0.00888694729655981, + "learning_rate": 3.576569396849678e-05, + "loss": 0.0002, + "step": 579 + }, + { + "epoch": 0.7424, + "grad_norm": 0.005648651625961065, + "learning_rate": 3.574966541238633e-05, + "loss": 0.0002, + "step": 580 + }, + { + "epoch": 0.74368, + "grad_norm": 0.0008465887513011694, + "learning_rate": 3.573361063649823e-05, + "loss": 0.0002, + "step": 581 + }, + { + "epoch": 0.74496, + "grad_norm": 0.00557989114895463, + "learning_rate": 3.571752967144797e-05, + "loss": 0.0002, + "step": 582 + }, + { + "epoch": 0.74624, + "grad_norm": 0.00599457835778594, + "learning_rate": 3.570142254790097e-05, + "loss": 0.0002, + "step": 583 + }, + { + "epoch": 0.74752, + "grad_norm": 0.002908288734033704, + "learning_rate": 3.568528929657252e-05, + "loss": 0.0002, + "step": 584 + }, + { + "epoch": 0.7488, + "grad_norm": 0.002017195103690028, + "learning_rate": 3.566912994822778e-05, + "loss": 0.0002, + "step": 585 + }, + { + "epoch": 0.75008, + "grad_norm": 0.00458879116922617, + "learning_rate": 3.565294453368162e-05, + "loss": 0.0002, + "step": 586 + }, + { + "epoch": 0.75136, + "grad_norm": 0.0023251990787684917, + "learning_rate": 3.563673308379867e-05, + "loss": 0.0002, + "step": 587 + }, + { + "epoch": 0.75264, + "grad_norm": 0.0018551320536062121, + "learning_rate": 3.562049562949316e-05, + "loss": 0.0002, + "step": 588 + }, + { + "epoch": 0.75392, + "grad_norm": 0.004012602381408215, + "learning_rate": 3.560423220172894e-05, + "loss": 0.0002, + "step": 589 + }, + { + "epoch": 0.7552, + "grad_norm": 0.0016248335596174002, + "learning_rate": 3.558794283151938e-05, + "loss": 0.0002, + "step": 590 + }, + { + "epoch": 0.75648, + "grad_norm": 0.0024472202640026808, + "learning_rate": 3.5571627549927316e-05, + "loss": 0.0002, + "step": 591 + }, + { + "epoch": 0.75776, + "grad_norm": 0.003877311712130904, + "learning_rate": 3.5555286388065004e-05, + "loss": 0.0002, + "step": 592 + }, + { + "epoch": 0.75904, + "grad_norm": 0.002112805377691984, + "learning_rate": 3.553891937709404e-05, + "loss": 0.0002, + "step": 593 + }, + { + "epoch": 0.76032, + "grad_norm": 0.0009606420644558966, + "learning_rate": 3.5522526548225326e-05, + "loss": 0.0002, + "step": 594 + }, + { + "epoch": 0.7616, + "grad_norm": 0.00314708286896348, + "learning_rate": 3.5506107932719005e-05, + "loss": 0.0002, + "step": 595 + }, + { + "epoch": 0.76288, + "grad_norm": 0.003082283539697528, + "learning_rate": 3.5489663561884366e-05, + "loss": 0.0002, + "step": 596 + }, + { + "epoch": 0.76416, + "grad_norm": 0.0001470320567023009, + "learning_rate": 3.5473193467079824e-05, + "loss": 0.0002, + "step": 597 + }, + { + "epoch": 0.76544, + "grad_norm": 0.004448519553989172, + "learning_rate": 3.545669767971286e-05, + "loss": 0.0002, + "step": 598 + }, + { + "epoch": 0.76672, + "grad_norm": 0.005195180885493755, + "learning_rate": 3.544017623123993e-05, + "loss": 0.0002, + "step": 599 + }, + { + "epoch": 0.768, + "grad_norm": 0.0006394287920556962, + "learning_rate": 3.542362915316644e-05, + "loss": 0.0002, + "step": 600 + }, + { + "epoch": 0.768, + "eval_loss": 1.164539098739624, + "eval_runtime": 41.5569, + "eval_samples_per_second": 12.08, + "eval_steps_per_second": 1.516, + "step": 600 + }, + { + "epoch": 0.76928, + "grad_norm": 0.004153765272349119, + "learning_rate": 3.540705647704667e-05, + "loss": 0.0002, + "step": 601 + }, + { + "epoch": 0.77056, + "grad_norm": 0.00440931273624301, + "learning_rate": 3.539045823448369e-05, + "loss": 0.0002, + "step": 602 + }, + { + "epoch": 0.77184, + "grad_norm": 0.00035680842120200396, + "learning_rate": 3.537383445712936e-05, + "loss": 0.0002, + "step": 603 + }, + { + "epoch": 0.77312, + "grad_norm": 0.004429604858160019, + "learning_rate": 3.535718517668421e-05, + "loss": 0.0002, + "step": 604 + }, + { + "epoch": 0.7744, + "grad_norm": 0.006617020815610886, + "learning_rate": 3.534051042489739e-05, + "loss": 0.0002, + "step": 605 + }, + { + "epoch": 0.77568, + "grad_norm": 0.005061544477939606, + "learning_rate": 3.532381023356666e-05, + "loss": 0.0002, + "step": 606 + }, + { + "epoch": 0.77696, + "grad_norm": 0.0011133073130622506, + "learning_rate": 3.530708463453827e-05, + "loss": 0.0002, + "step": 607 + }, + { + "epoch": 0.77824, + "grad_norm": 0.002393580274656415, + "learning_rate": 3.5290333659706915e-05, + "loss": 0.0002, + "step": 608 + }, + { + "epoch": 0.77952, + "grad_norm": 0.0032681473530828953, + "learning_rate": 3.527355734101569e-05, + "loss": 0.0002, + "step": 609 + }, + { + "epoch": 0.7808, + "grad_norm": 0.0013635074719786644, + "learning_rate": 3.525675571045602e-05, + "loss": 0.0002, + "step": 610 + }, + { + "epoch": 0.78208, + "grad_norm": 0.0019797049462795258, + "learning_rate": 3.52399288000676e-05, + "loss": 0.0002, + "step": 611 + }, + { + "epoch": 0.78336, + "grad_norm": 0.0038020156789571047, + "learning_rate": 3.522307664193831e-05, + "loss": 0.0002, + "step": 612 + }, + { + "epoch": 0.78464, + "grad_norm": 0.0025022916961461306, + "learning_rate": 3.5206199268204207e-05, + "loss": 0.0002, + "step": 613 + }, + { + "epoch": 0.78592, + "grad_norm": 0.0010878605535253882, + "learning_rate": 3.5189296711049405e-05, + "loss": 0.0002, + "step": 614 + }, + { + "epoch": 0.7872, + "grad_norm": 0.003955469932407141, + "learning_rate": 3.517236900270608e-05, + "loss": 0.0002, + "step": 615 + }, + { + "epoch": 0.78848, + "grad_norm": 0.0026833557058125734, + "learning_rate": 3.515541617545432e-05, + "loss": 0.0002, + "step": 616 + }, + { + "epoch": 0.78976, + "grad_norm": 0.0021759020164608955, + "learning_rate": 3.513843826162214e-05, + "loss": 0.0002, + "step": 617 + }, + { + "epoch": 0.79104, + "grad_norm": 0.005862012039870024, + "learning_rate": 3.512143529358541e-05, + "loss": 0.0002, + "step": 618 + }, + { + "epoch": 0.79232, + "grad_norm": 0.004435022361576557, + "learning_rate": 3.510440730376775e-05, + "loss": 0.0002, + "step": 619 + }, + { + "epoch": 0.7936, + "grad_norm": 0.0007107479032129049, + "learning_rate": 3.508735432464049e-05, + "loss": 0.0002, + "step": 620 + }, + { + "epoch": 0.79488, + "grad_norm": 0.005531948991119862, + "learning_rate": 3.507027638872264e-05, + "loss": 0.0002, + "step": 621 + }, + { + "epoch": 0.79616, + "grad_norm": 0.006097315810620785, + "learning_rate": 3.505317352858078e-05, + "loss": 0.0002, + "step": 622 + }, + { + "epoch": 0.79744, + "grad_norm": 0.0015222890069708228, + "learning_rate": 3.503604577682903e-05, + "loss": 0.0002, + "step": 623 + }, + { + "epoch": 0.79872, + "grad_norm": 0.004580902401357889, + "learning_rate": 3.5018893166128984e-05, + "loss": 0.0002, + "step": 624 + }, + { + "epoch": 0.8, + "grad_norm": 0.006305031478404999, + "learning_rate": 3.500171572918961e-05, + "loss": 0.0002, + "step": 625 + }, + { + "epoch": 0.80128, + "grad_norm": 0.002156647155061364, + "learning_rate": 3.498451349876725e-05, + "loss": 0.0002, + "step": 626 + }, + { + "epoch": 0.80256, + "grad_norm": 0.003837764263153076, + "learning_rate": 3.496728650766552e-05, + "loss": 0.0002, + "step": 627 + }, + { + "epoch": 0.80384, + "grad_norm": 0.005848568864166737, + "learning_rate": 3.495003478873523e-05, + "loss": 0.0002, + "step": 628 + }, + { + "epoch": 0.80512, + "grad_norm": 0.0019309833878651261, + "learning_rate": 3.493275837487437e-05, + "loss": 0.0002, + "step": 629 + }, + { + "epoch": 0.8064, + "grad_norm": 0.0037213312461972237, + "learning_rate": 3.491545729902801e-05, + "loss": 0.0002, + "step": 630 + }, + { + "epoch": 0.8064, + "eval_loss": 1.227569580078125, + "eval_runtime": 45.2126, + "eval_samples_per_second": 11.103, + "eval_steps_per_second": 1.393, + "step": 630 + }, + { + "epoch": 0.80768, + "grad_norm": 0.00538895046338439, + "learning_rate": 3.489813159418826e-05, + "loss": 0.0002, + "step": 631 + }, + { + "epoch": 0.80896, + "grad_norm": 0.0008948746835812926, + "learning_rate": 3.488078129339418e-05, + "loss": 0.0002, + "step": 632 + }, + { + "epoch": 0.81024, + "grad_norm": 0.004895337857306004, + "learning_rate": 3.486340642973174e-05, + "loss": 0.0002, + "step": 633 + }, + { + "epoch": 0.81152, + "grad_norm": 0.004855775274336338, + "learning_rate": 3.484600703633376e-05, + "loss": 0.0002, + "step": 634 + }, + { + "epoch": 0.8128, + "grad_norm": 0.0011731626000255346, + "learning_rate": 3.4828583146379817e-05, + "loss": 0.0002, + "step": 635 + }, + { + "epoch": 0.81408, + "grad_norm": 0.006135343573987484, + "learning_rate": 3.4811134793096216e-05, + "loss": 0.0002, + "step": 636 + }, + { + "epoch": 0.81536, + "grad_norm": 0.003618490183725953, + "learning_rate": 3.479366200975591e-05, + "loss": 0.0002, + "step": 637 + }, + { + "epoch": 0.81664, + "grad_norm": 0.0027814011555165052, + "learning_rate": 3.477616482967844e-05, + "loss": 0.0002, + "step": 638 + }, + { + "epoch": 0.81792, + "grad_norm": 0.006283624563366175, + "learning_rate": 3.475864328622985e-05, + "loss": 0.0002, + "step": 639 + }, + { + "epoch": 0.8192, + "grad_norm": 0.004275933373719454, + "learning_rate": 3.474109741282267e-05, + "loss": 0.0002, + "step": 640 + }, + { + "epoch": 0.82048, + "grad_norm": 0.0015624063089489937, + "learning_rate": 3.472352724291583e-05, + "loss": 0.0002, + "step": 641 + }, + { + "epoch": 0.82176, + "grad_norm": 0.005364539101719856, + "learning_rate": 3.4705932810014555e-05, + "loss": 0.0002, + "step": 642 + }, + { + "epoch": 0.82304, + "grad_norm": 0.00375459436327219, + "learning_rate": 3.468831414767038e-05, + "loss": 0.0002, + "step": 643 + }, + { + "epoch": 0.82432, + "grad_norm": 0.001632419298402965, + "learning_rate": 3.467067128948101e-05, + "loss": 0.0002, + "step": 644 + }, + { + "epoch": 0.8256, + "grad_norm": 0.005220402032136917, + "learning_rate": 3.465300426909032e-05, + "loss": 0.0002, + "step": 645 + }, + { + "epoch": 0.82688, + "grad_norm": 0.0028709019534289837, + "learning_rate": 3.463531312018823e-05, + "loss": 0.0002, + "step": 646 + }, + { + "epoch": 0.82816, + "grad_norm": 0.0024156903382390738, + "learning_rate": 3.4617597876510694e-05, + "loss": 0.0002, + "step": 647 + }, + { + "epoch": 0.82944, + "grad_norm": 0.004406792111694813, + "learning_rate": 3.4599858571839606e-05, + "loss": 0.0002, + "step": 648 + }, + { + "epoch": 0.83072, + "grad_norm": 0.0010315208928659558, + "learning_rate": 3.458209524000275e-05, + "loss": 0.0002, + "step": 649 + }, + { + "epoch": 0.832, + "grad_norm": 0.003976741805672646, + "learning_rate": 3.4564307914873716e-05, + "loss": 0.0002, + "step": 650 + }, + { + "epoch": 0.83328, + "grad_norm": 0.005697767250239849, + "learning_rate": 3.4546496630371854e-05, + "loss": 0.0002, + "step": 651 + }, + { + "epoch": 0.83456, + "grad_norm": 0.0018598579335957766, + "learning_rate": 3.452866142046221e-05, + "loss": 0.0002, + "step": 652 + }, + { + "epoch": 0.83584, + "grad_norm": 0.00341482344083488, + "learning_rate": 3.4510802319155435e-05, + "loss": 0.0002, + "step": 653 + }, + { + "epoch": 0.83712, + "grad_norm": 0.004788935650140047, + "learning_rate": 3.4492919360507754e-05, + "loss": 0.0002, + "step": 654 + }, + { + "epoch": 0.8384, + "grad_norm": 0.0021135262213647366, + "learning_rate": 3.4475012578620895e-05, + "loss": 0.0002, + "step": 655 + }, + { + "epoch": 0.83968, + "grad_norm": 0.0020841804798692465, + "learning_rate": 3.4457082007641996e-05, + "loss": 0.0002, + "step": 656 + }, + { + "epoch": 0.84096, + "grad_norm": 0.004247150383889675, + "learning_rate": 3.4439127681763566e-05, + "loss": 0.0002, + "step": 657 + }, + { + "epoch": 0.84224, + "grad_norm": 0.002661739941686392, + "learning_rate": 3.4421149635223416e-05, + "loss": 0.0002, + "step": 658 + }, + { + "epoch": 0.84352, + "grad_norm": 0.0008635800913907588, + "learning_rate": 3.4403147902304596e-05, + "loss": 0.0002, + "step": 659 + }, + { + "epoch": 0.8448, + "grad_norm": 0.0035655642859637737, + "learning_rate": 3.438512251733532e-05, + "loss": 0.0002, + "step": 660 + }, + { + "epoch": 0.8448, + "eval_loss": 1.2087175846099854, + "eval_runtime": 44.3231, + "eval_samples_per_second": 11.326, + "eval_steps_per_second": 1.421, + "step": 660 + }, + { + "epoch": 0.84608, + "grad_norm": 0.0034810348879545927, + "learning_rate": 3.436707351468889e-05, + "loss": 0.0002, + "step": 661 + }, + { + "epoch": 0.84736, + "grad_norm": 0.0005423646070994437, + "learning_rate": 3.434900092878367e-05, + "loss": 0.0002, + "step": 662 + }, + { + "epoch": 0.84864, + "grad_norm": 0.0021196764428168535, + "learning_rate": 3.433090479408298e-05, + "loss": 0.0002, + "step": 663 + }, + { + "epoch": 0.84992, + "grad_norm": 0.0016890447586774826, + "learning_rate": 3.431278514509505e-05, + "loss": 0.0002, + "step": 664 + }, + { + "epoch": 0.8512, + "grad_norm": 0.0009622541256248951, + "learning_rate": 3.4294642016372954e-05, + "loss": 0.0002, + "step": 665 + }, + { + "epoch": 0.85248, + "grad_norm": 0.003417698899284005, + "learning_rate": 3.4276475442514536e-05, + "loss": 0.0002, + "step": 666 + }, + { + "epoch": 0.85376, + "grad_norm": 0.003611312946304679, + "learning_rate": 3.425828545816235e-05, + "loss": 0.0002, + "step": 667 + }, + { + "epoch": 0.85504, + "grad_norm": 0.0006507485522888601, + "learning_rate": 3.42400720980036e-05, + "loss": 0.0002, + "step": 668 + }, + { + "epoch": 0.85632, + "grad_norm": 0.003617242444306612, + "learning_rate": 3.422183539677005e-05, + "loss": 0.0002, + "step": 669 + }, + { + "epoch": 0.8576, + "grad_norm": 0.004010677337646484, + "learning_rate": 3.420357538923798e-05, + "loss": 0.0002, + "step": 670 + }, + { + "epoch": 0.85888, + "grad_norm": 0.0006595096201635897, + "learning_rate": 3.418529211022812e-05, + "loss": 0.0002, + "step": 671 + }, + { + "epoch": 0.86016, + "grad_norm": 0.004336192272603512, + "learning_rate": 3.416698559460558e-05, + "loss": 0.0002, + "step": 672 + }, + { + "epoch": 0.86144, + "grad_norm": 0.003125009825453162, + "learning_rate": 3.414865587727977e-05, + "loss": 0.0002, + "step": 673 + }, + { + "epoch": 0.86272, + "grad_norm": 0.0010173583868891, + "learning_rate": 3.4130302993204345e-05, + "loss": 0.0002, + "step": 674 + }, + { + "epoch": 0.864, + "grad_norm": 0.003576030256226659, + "learning_rate": 3.411192697737715e-05, + "loss": 0.0002, + "step": 675 + }, + { + "epoch": 0.86528, + "grad_norm": 0.0021791106555610895, + "learning_rate": 3.409352786484012e-05, + "loss": 0.0002, + "step": 676 + }, + { + "epoch": 0.86656, + "grad_norm": 0.001492811250500381, + "learning_rate": 3.407510569067926e-05, + "loss": 0.0002, + "step": 677 + }, + { + "epoch": 0.86784, + "grad_norm": 0.0030872321221977472, + "learning_rate": 3.4056660490024524e-05, + "loss": 0.0002, + "step": 678 + }, + { + "epoch": 0.86912, + "grad_norm": 0.0008694932330399752, + "learning_rate": 3.40381922980498e-05, + "loss": 0.0002, + "step": 679 + }, + { + "epoch": 0.8704, + "grad_norm": 0.0015984508208930492, + "learning_rate": 3.4019701149972806e-05, + "loss": 0.0002, + "step": 680 + }, + { + "epoch": 0.87168, + "grad_norm": 0.002277463674545288, + "learning_rate": 3.400118708105505e-05, + "loss": 0.0002, + "step": 681 + }, + { + "epoch": 0.87296, + "grad_norm": 0.001001443131826818, + "learning_rate": 3.398265012660172e-05, + "loss": 0.0002, + "step": 682 + }, + { + "epoch": 0.87424, + "grad_norm": 0.0014973531942814589, + "learning_rate": 3.396409032196167e-05, + "loss": 0.0002, + "step": 683 + }, + { + "epoch": 0.87552, + "grad_norm": 0.0019171397434547544, + "learning_rate": 3.3945507702527325e-05, + "loss": 0.0002, + "step": 684 + }, + { + "epoch": 0.8768, + "grad_norm": 0.00016306435281876475, + "learning_rate": 3.3926902303734605e-05, + "loss": 0.0002, + "step": 685 + }, + { + "epoch": 0.87808, + "grad_norm": 0.002407992724329233, + "learning_rate": 3.390827416106289e-05, + "loss": 0.0002, + "step": 686 + }, + { + "epoch": 0.87936, + "grad_norm": 0.002446410246193409, + "learning_rate": 3.388962331003491e-05, + "loss": 0.0002, + "step": 687 + }, + { + "epoch": 0.88064, + "grad_norm": 0.000253974343650043, + "learning_rate": 3.38709497862167e-05, + "loss": 0.0002, + "step": 688 + }, + { + "epoch": 0.88192, + "grad_norm": 0.0016051908023655415, + "learning_rate": 3.3852253625217555e-05, + "loss": 0.0002, + "step": 689 + }, + { + "epoch": 0.8832, + "grad_norm": 0.0011209291405975819, + "learning_rate": 3.38335348626899e-05, + "loss": 0.0002, + "step": 690 + }, + { + "epoch": 0.8832, + "eval_loss": 1.1801164150238037, + "eval_runtime": 42.922, + "eval_samples_per_second": 11.696, + "eval_steps_per_second": 1.468, + "step": 690 + }, + { + "epoch": 0.88448, + "grad_norm": 0.0005705386283807456, + "learning_rate": 3.381479353432929e-05, + "loss": 0.0002, + "step": 691 + }, + { + "epoch": 0.88576, + "grad_norm": 0.0013472288846969604, + "learning_rate": 3.37960296758743e-05, + "loss": 0.0002, + "step": 692 + }, + { + "epoch": 0.88704, + "grad_norm": 0.0012993423733860254, + "learning_rate": 3.377724332310647e-05, + "loss": 0.0002, + "step": 693 + }, + { + "epoch": 0.88832, + "grad_norm": 0.00032160914270207286, + "learning_rate": 3.3758434511850224e-05, + "loss": 0.0002, + "step": 694 + }, + { + "epoch": 0.8896, + "grad_norm": 0.0009844450978562236, + "learning_rate": 3.3739603277972835e-05, + "loss": 0.0002, + "step": 695 + }, + { + "epoch": 0.89088, + "grad_norm": 0.0020079065579921007, + "learning_rate": 3.372074965738433e-05, + "loss": 0.0002, + "step": 696 + }, + { + "epoch": 0.89216, + "grad_norm": 0.0014246099162846804, + "learning_rate": 3.37018736860374e-05, + "loss": 0.0002, + "step": 697 + }, + { + "epoch": 0.89344, + "grad_norm": 0.00041524734115228057, + "learning_rate": 3.3682975399927386e-05, + "loss": 0.0002, + "step": 698 + }, + { + "epoch": 0.89472, + "grad_norm": 0.0022344680037349463, + "learning_rate": 3.3664054835092184e-05, + "loss": 0.0002, + "step": 699 + }, + { + "epoch": 0.896, + "grad_norm": 0.0023142280988395214, + "learning_rate": 3.364511202761214e-05, + "loss": 0.0002, + "step": 700 + }, + { + "epoch": 0.89728, + "grad_norm": 0.0005925571895204484, + "learning_rate": 3.3626147013610066e-05, + "loss": 0.0002, + "step": 701 + }, + { + "epoch": 0.89856, + "grad_norm": 0.003085650270804763, + "learning_rate": 3.360715982925107e-05, + "loss": 0.0002, + "step": 702 + }, + { + "epoch": 0.89984, + "grad_norm": 0.002100037643685937, + "learning_rate": 3.358815051074257e-05, + "loss": 0.0002, + "step": 703 + }, + { + "epoch": 0.90112, + "grad_norm": 0.0018144150963053107, + "learning_rate": 3.356911909433419e-05, + "loss": 0.0002, + "step": 704 + }, + { + "epoch": 0.9024, + "grad_norm": 0.004643341060727835, + "learning_rate": 3.355006561631767e-05, + "loss": 0.0002, + "step": 705 + }, + { + "epoch": 0.90368, + "grad_norm": 0.003487049136310816, + "learning_rate": 3.353099011302685e-05, + "loss": 0.0002, + "step": 706 + }, + { + "epoch": 0.90496, + "grad_norm": 0.0005208138027228415, + "learning_rate": 3.3511892620837554e-05, + "loss": 0.0002, + "step": 707 + }, + { + "epoch": 0.90624, + "grad_norm": 0.003496536985039711, + "learning_rate": 3.349277317616754e-05, + "loss": 0.0002, + "step": 708 + }, + { + "epoch": 0.90752, + "grad_norm": 0.002766245976090431, + "learning_rate": 3.347363181547642e-05, + "loss": 0.0002, + "step": 709 + }, + { + "epoch": 0.9088, + "grad_norm": 0.0004007960669696331, + "learning_rate": 3.345446857526561e-05, + "loss": 0.0002, + "step": 710 + }, + { + "epoch": 0.91008, + "grad_norm": 0.0024029084015637636, + "learning_rate": 3.343528349207827e-05, + "loss": 0.0002, + "step": 711 + }, + { + "epoch": 0.91136, + "grad_norm": 0.001649223268032074, + "learning_rate": 3.341607660249916e-05, + "loss": 0.0002, + "step": 712 + }, + { + "epoch": 0.91264, + "grad_norm": 0.0008164893370121717, + "learning_rate": 3.339684794315467e-05, + "loss": 0.0002, + "step": 713 + }, + { + "epoch": 0.91392, + "grad_norm": 0.0026240425650030375, + "learning_rate": 3.3377597550712676e-05, + "loss": 0.0002, + "step": 714 + }, + { + "epoch": 0.9152, + "grad_norm": 0.00138380890712142, + "learning_rate": 3.3358325461882516e-05, + "loss": 0.0002, + "step": 715 + }, + { + "epoch": 0.91648, + "grad_norm": 0.0012182299979031086, + "learning_rate": 3.333903171341489e-05, + "loss": 0.0002, + "step": 716 + }, + { + "epoch": 0.91776, + "grad_norm": 0.0022654475178569555, + "learning_rate": 3.3319716342101804e-05, + "loss": 0.0002, + "step": 717 + }, + { + "epoch": 0.91904, + "grad_norm": 0.0006078749429434538, + "learning_rate": 3.330037938477651e-05, + "loss": 0.0002, + "step": 718 + }, + { + "epoch": 0.92032, + "grad_norm": 0.00262492336332798, + "learning_rate": 3.3281020878313396e-05, + "loss": 0.0002, + "step": 719 + }, + { + "epoch": 0.9216, + "grad_norm": 0.0032508145086467266, + "learning_rate": 3.3261640859627964e-05, + "loss": 0.0002, + "step": 720 + }, + { + "epoch": 0.9216, + "eval_loss": 1.1878145933151245, + "eval_runtime": 43.4753, + "eval_samples_per_second": 11.547, + "eval_steps_per_second": 1.449, + "step": 720 + }, + { + "epoch": 0.92288, + "grad_norm": 0.00020908880105707794, + "learning_rate": 3.324223936567675e-05, + "loss": 0.0002, + "step": 721 + }, + { + "epoch": 0.92416, + "grad_norm": 0.0035989386960864067, + "learning_rate": 3.3222816433457205e-05, + "loss": 0.0002, + "step": 722 + }, + { + "epoch": 0.92544, + "grad_norm": 0.004371747840195894, + "learning_rate": 3.3203372100007694e-05, + "loss": 0.0002, + "step": 723 + }, + { + "epoch": 0.92672, + "grad_norm": 0.0012274390319362283, + "learning_rate": 3.318390640240738e-05, + "loss": 0.0002, + "step": 724 + }, + { + "epoch": 0.928, + "grad_norm": 0.002447142032906413, + "learning_rate": 3.316441937777615e-05, + "loss": 0.0002, + "step": 725 + }, + { + "epoch": 0.92928, + "grad_norm": 0.0030363500118255615, + "learning_rate": 3.314491106327461e-05, + "loss": 0.0002, + "step": 726 + }, + { + "epoch": 0.93056, + "grad_norm": 0.00019563132082112134, + "learning_rate": 3.3125381496103905e-05, + "loss": 0.0002, + "step": 727 + }, + { + "epoch": 0.93184, + "grad_norm": 0.003880007890984416, + "learning_rate": 3.3105830713505745e-05, + "loss": 0.0002, + "step": 728 + }, + { + "epoch": 0.93312, + "grad_norm": 0.0046269530430436134, + "learning_rate": 3.308625875276228e-05, + "loss": 0.0002, + "step": 729 + }, + { + "epoch": 0.9344, + "grad_norm": 0.0020352972205728292, + "learning_rate": 3.306666565119606e-05, + "loss": 0.0002, + "step": 730 + }, + { + "epoch": 0.93568, + "grad_norm": 0.0020175424870103598, + "learning_rate": 3.304705144616994e-05, + "loss": 0.0002, + "step": 731 + }, + { + "epoch": 0.93696, + "grad_norm": 0.0037658843211829662, + "learning_rate": 3.3027416175087015e-05, + "loss": 0.0002, + "step": 732 + }, + { + "epoch": 0.93824, + "grad_norm": 0.0012228352716192603, + "learning_rate": 3.3007759875390554e-05, + "loss": 0.0002, + "step": 733 + }, + { + "epoch": 0.93952, + "grad_norm": 0.0029969573952257633, + "learning_rate": 3.298808258456393e-05, + "loss": 0.0002, + "step": 734 + }, + { + "epoch": 0.9408, + "grad_norm": 0.004848573822528124, + "learning_rate": 3.296838434013055e-05, + "loss": 0.0002, + "step": 735 + }, + { + "epoch": 0.94208, + "grad_norm": 0.0023214390967041254, + "learning_rate": 3.294866517965375e-05, + "loss": 0.0002, + "step": 736 + }, + { + "epoch": 0.94336, + "grad_norm": 0.00205830205231905, + "learning_rate": 3.2928925140736805e-05, + "loss": 0.0002, + "step": 737 + }, + { + "epoch": 0.94464, + "grad_norm": 0.0036876010708510876, + "learning_rate": 3.2909164261022745e-05, + "loss": 0.0002, + "step": 738 + }, + { + "epoch": 0.94592, + "grad_norm": 0.0014606869081035256, + "learning_rate": 3.2889382578194374e-05, + "loss": 0.0002, + "step": 739 + }, + { + "epoch": 0.9472, + "grad_norm": 0.0022556378971785307, + "learning_rate": 3.286958012997416e-05, + "loss": 0.0002, + "step": 740 + }, + { + "epoch": 0.94848, + "grad_norm": 0.00396134564653039, + "learning_rate": 3.284975695412418e-05, + "loss": 0.0002, + "step": 741 + }, + { + "epoch": 0.94976, + "grad_norm": 0.0020983486901968718, + "learning_rate": 3.282991308844602e-05, + "loss": 0.0002, + "step": 742 + }, + { + "epoch": 0.95104, + "grad_norm": 0.0017063944833353162, + "learning_rate": 3.2810048570780725e-05, + "loss": 0.0002, + "step": 743 + }, + { + "epoch": 0.95232, + "grad_norm": 0.0034495610743761063, + "learning_rate": 3.2790163439008735e-05, + "loss": 0.0002, + "step": 744 + }, + { + "epoch": 0.9536, + "grad_norm": 0.00145576277282089, + "learning_rate": 3.277025773104978e-05, + "loss": 0.0002, + "step": 745 + }, + { + "epoch": 0.95488, + "grad_norm": 0.0014225257327780128, + "learning_rate": 3.2750331484862844e-05, + "loss": 0.0002, + "step": 746 + }, + { + "epoch": 0.95616, + "grad_norm": 0.002464696764945984, + "learning_rate": 3.273038473844606e-05, + "loss": 0.0002, + "step": 747 + }, + { + "epoch": 0.95744, + "grad_norm": 0.0012042096350342035, + "learning_rate": 3.271041752983667e-05, + "loss": 0.0002, + "step": 748 + }, + { + "epoch": 0.95872, + "grad_norm": 0.0007343910983763635, + "learning_rate": 3.2690429897110925e-05, + "loss": 0.0002, + "step": 749 + }, + { + "epoch": 0.96, + "grad_norm": 0.0017276835860684514, + "learning_rate": 3.2670421878384044e-05, + "loss": 0.0002, + "step": 750 + }, + { + "epoch": 0.96, + "eval_loss": 1.192034363746643, + "eval_runtime": 43.5448, + "eval_samples_per_second": 11.528, + "eval_steps_per_second": 1.447, + "step": 750 + }, + { + "epoch": 0.96128, + "grad_norm": 0.001222791033796966, + "learning_rate": 3.2650393511810086e-05, + "loss": 0.0002, + "step": 751 + }, + { + "epoch": 0.96256, + "grad_norm": 0.0004405670042615384, + "learning_rate": 3.2630344835581935e-05, + "loss": 0.0002, + "step": 752 + }, + { + "epoch": 0.96384, + "grad_norm": 0.0020185222383588552, + "learning_rate": 3.26102758879312e-05, + "loss": 0.0002, + "step": 753 + }, + { + "epoch": 0.96512, + "grad_norm": 0.0022137605119496584, + "learning_rate": 3.2590186707128156e-05, + "loss": 0.0002, + "step": 754 + }, + { + "epoch": 0.9664, + "grad_norm": 0.0005233174888417125, + "learning_rate": 3.257007733148163e-05, + "loss": 0.0002, + "step": 755 + }, + { + "epoch": 0.96768, + "grad_norm": 0.0011725371005013585, + "learning_rate": 3.254994779933901e-05, + "loss": 0.0002, + "step": 756 + }, + { + "epoch": 0.96896, + "grad_norm": 0.0009187847608700395, + "learning_rate": 3.2529798149086075e-05, + "loss": 0.0002, + "step": 757 + }, + { + "epoch": 0.97024, + "grad_norm": 0.0007130370941013098, + "learning_rate": 3.2509628419146984e-05, + "loss": 0.0002, + "step": 758 + }, + { + "epoch": 0.97152, + "grad_norm": 0.0014700796455144882, + "learning_rate": 3.248943864798419e-05, + "loss": 0.0002, + "step": 759 + }, + { + "epoch": 0.9728, + "grad_norm": 0.0002132820663973689, + "learning_rate": 3.246922887409837e-05, + "loss": 0.0002, + "step": 760 + }, + { + "epoch": 0.97408, + "grad_norm": 0.0012054058024659753, + "learning_rate": 3.2448999136028326e-05, + "loss": 0.0002, + "step": 761 + }, + { + "epoch": 0.97536, + "grad_norm": 0.00087908492423594, + "learning_rate": 3.242874947235095e-05, + "loss": 0.0002, + "step": 762 + }, + { + "epoch": 0.97664, + "grad_norm": 0.00034825917100533843, + "learning_rate": 3.240847992168111e-05, + "loss": 0.0002, + "step": 763 + }, + { + "epoch": 0.97792, + "grad_norm": 0.0009228674462065101, + "learning_rate": 3.238819052267162e-05, + "loss": 0.0002, + "step": 764 + }, + { + "epoch": 0.9792, + "grad_norm": 0.00041521593811921775, + "learning_rate": 3.236788131401313e-05, + "loss": 0.0002, + "step": 765 + }, + { + "epoch": 0.98048, + "grad_norm": 0.0007673377986066043, + "learning_rate": 3.234755233443406e-05, + "loss": 0.0002, + "step": 766 + }, + { + "epoch": 0.98176, + "grad_norm": 0.0011025749845430255, + "learning_rate": 3.232720362270057e-05, + "loss": 0.0002, + "step": 767 + }, + { + "epoch": 0.98304, + "grad_norm": 0.0006265212432481349, + "learning_rate": 3.2306835217616374e-05, + "loss": 0.0002, + "step": 768 + }, + { + "epoch": 0.98432, + "grad_norm": 0.0001603246491868049, + "learning_rate": 3.228644715802282e-05, + "loss": 0.0002, + "step": 769 + }, + { + "epoch": 0.9856, + "grad_norm": 0.0005604405887424946, + "learning_rate": 3.226603948279868e-05, + "loss": 0.0002, + "step": 770 + }, + { + "epoch": 0.98688, + "grad_norm": 0.000603645050432533, + "learning_rate": 3.224561223086017e-05, + "loss": 0.0002, + "step": 771 + }, + { + "epoch": 0.98816, + "grad_norm": 0.0004497454210650176, + "learning_rate": 3.222516544116081e-05, + "loss": 0.0002, + "step": 772 + }, + { + "epoch": 0.98944, + "grad_norm": 0.0003486175846774131, + "learning_rate": 3.220469915269138e-05, + "loss": 0.0002, + "step": 773 + }, + { + "epoch": 0.99072, + "grad_norm": 0.0009738111984916031, + "learning_rate": 3.218421340447986e-05, + "loss": 0.0002, + "step": 774 + }, + { + "epoch": 0.992, + "grad_norm": 0.001073844963684678, + "learning_rate": 3.216370823559133e-05, + "loss": 0.0002, + "step": 775 + }, + { + "epoch": 0.99328, + "grad_norm": 0.0004396456351969391, + "learning_rate": 3.21431836851279e-05, + "loss": 0.0002, + "step": 776 + }, + { + "epoch": 0.99456, + "grad_norm": 0.00045633409172296524, + "learning_rate": 3.2122639792228645e-05, + "loss": 0.0002, + "step": 777 + }, + { + "epoch": 0.99584, + "grad_norm": 0.0005275820149108768, + "learning_rate": 3.210207659606951e-05, + "loss": 0.0002, + "step": 778 + }, + { + "epoch": 0.99712, + "grad_norm": 0.00014961262058932334, + "learning_rate": 3.208149413586329e-05, + "loss": 0.0002, + "step": 779 + }, + { + "epoch": 0.9984, + "grad_norm": 0.0009144467767328024, + "learning_rate": 3.206089245085945e-05, + "loss": 0.0002, + "step": 780 + }, + { + "epoch": 0.9984, + "eval_loss": 1.194998025894165, + "eval_runtime": 43.8434, + "eval_samples_per_second": 11.45, + "eval_steps_per_second": 1.437, + "step": 780 + }, + { + "epoch": 0.99968, + "grad_norm": 0.0014543908182531595, + "learning_rate": 3.2040271580344176e-05, + "loss": 0.0002, + "step": 781 + }, + { + "epoch": 1.0, + "grad_norm": 0.0022573259193450212, + "learning_rate": 3.201963156364022e-05, + "loss": 0.0002, + "step": 782 + }, + { + "epoch": 1.00128, + "grad_norm": 0.0029144599102437496, + "learning_rate": 3.1998972440106833e-05, + "loss": 0.0002, + "step": 783 + }, + { + "epoch": 1.00256, + "grad_norm": 0.0017878921935334802, + "learning_rate": 3.197829424913971e-05, + "loss": 0.0002, + "step": 784 + }, + { + "epoch": 1.00384, + "grad_norm": 0.0008101496496237814, + "learning_rate": 3.195759703017091e-05, + "loss": 0.0002, + "step": 785 + }, + { + "epoch": 1.00512, + "grad_norm": 0.003011012217029929, + "learning_rate": 3.193688082266878e-05, + "loss": 0.0002, + "step": 786 + }, + { + "epoch": 1.0064, + "grad_norm": 0.002561653731390834, + "learning_rate": 3.191614566613785e-05, + "loss": 0.0002, + "step": 787 + }, + { + "epoch": 1.00768, + "grad_norm": 0.0004529696889221668, + "learning_rate": 3.189539160011882e-05, + "loss": 0.0002, + "step": 788 + }, + { + "epoch": 1.00896, + "grad_norm": 0.003300096606835723, + "learning_rate": 3.1874618664188435e-05, + "loss": 0.0002, + "step": 789 + }, + { + "epoch": 1.01024, + "grad_norm": 0.0036370474845170975, + "learning_rate": 3.185382689795943e-05, + "loss": 0.0002, + "step": 790 + }, + { + "epoch": 1.01152, + "grad_norm": 0.0012020476860925555, + "learning_rate": 3.1833016341080424e-05, + "loss": 0.0002, + "step": 791 + }, + { + "epoch": 1.0128, + "grad_norm": 0.001177908037789166, + "learning_rate": 3.18121870332359e-05, + "loss": 0.0002, + "step": 792 + }, + { + "epoch": 1.01408, + "grad_norm": 0.0015154675347730517, + "learning_rate": 3.1791339014146075e-05, + "loss": 0.0002, + "step": 793 + }, + { + "epoch": 1.01536, + "grad_norm": 0.0005656080902554095, + "learning_rate": 3.1770472323566864e-05, + "loss": 0.0002, + "step": 794 + }, + { + "epoch": 1.01664, + "grad_norm": 0.0005200884770601988, + "learning_rate": 3.174958700128977e-05, + "loss": 0.0002, + "step": 795 + }, + { + "epoch": 1.01792, + "grad_norm": 0.0005907294107601047, + "learning_rate": 3.172868308714185e-05, + "loss": 0.0002, + "step": 796 + }, + { + "epoch": 1.0192, + "grad_norm": 0.0002785873075481504, + "learning_rate": 3.170776062098559e-05, + "loss": 0.0002, + "step": 797 + }, + { + "epoch": 1.02048, + "grad_norm": 0.00015250472642946988, + "learning_rate": 3.168681964271886e-05, + "loss": 0.0002, + "step": 798 + }, + { + "epoch": 1.02176, + "grad_norm": 0.00017732252308633178, + "learning_rate": 3.166586019227485e-05, + "loss": 0.0002, + "step": 799 + }, + { + "epoch": 1.02304, + "grad_norm": 0.00015901085862424225, + "learning_rate": 3.164488230962194e-05, + "loss": 0.0002, + "step": 800 + }, + { + "epoch": 1.02432, + "grad_norm": 0.0001894240704132244, + "learning_rate": 3.16238860347637e-05, + "loss": 0.0002, + "step": 801 + }, + { + "epoch": 1.0256, + "grad_norm": 0.00015613554569426924, + "learning_rate": 3.160287140773873e-05, + "loss": 0.0002, + "step": 802 + }, + { + "epoch": 1.02688, + "grad_norm": 0.00070232676807791, + "learning_rate": 3.1581838468620674e-05, + "loss": 0.0002, + "step": 803 + }, + { + "epoch": 1.02816, + "grad_norm": 0.0006290404126048088, + "learning_rate": 3.156078725751805e-05, + "loss": 0.0002, + "step": 804 + }, + { + "epoch": 1.02944, + "grad_norm": 0.00029896237538196146, + "learning_rate": 3.1539717814574244e-05, + "loss": 0.0002, + "step": 805 + }, + { + "epoch": 1.03072, + "grad_norm": 0.0009222381049767137, + "learning_rate": 3.151863017996741e-05, + "loss": 0.0002, + "step": 806 + }, + { + "epoch": 1.032, + "grad_norm": 0.001593570108525455, + "learning_rate": 3.1497524393910396e-05, + "loss": 0.0002, + "step": 807 + }, + { + "epoch": 1.03328, + "grad_norm": 0.000536854553502053, + "learning_rate": 3.147640049665063e-05, + "loss": 0.0002, + "step": 808 + }, + { + "epoch": 1.03456, + "grad_norm": 0.0011051252949982882, + "learning_rate": 3.145525852847013e-05, + "loss": 0.0002, + "step": 809 + }, + { + "epoch": 1.03584, + "grad_norm": 0.0017763072391971946, + "learning_rate": 3.143409852968534e-05, + "loss": 0.0002, + "step": 810 + }, + { + "epoch": 1.03584, + "eval_loss": 1.1909286975860596, + "eval_runtime": 43.7864, + "eval_samples_per_second": 11.465, + "eval_steps_per_second": 1.439, + "step": 810 + }, + { + "epoch": 1.03712, + "grad_norm": 0.0015747741563245654, + "learning_rate": 3.141292054064707e-05, + "loss": 0.0002, + "step": 811 + }, + { + "epoch": 1.0384, + "grad_norm": 0.0007148962467908859, + "learning_rate": 3.139172460174049e-05, + "loss": 0.0002, + "step": 812 + }, + { + "epoch": 1.03968, + "grad_norm": 0.001513791736215353, + "learning_rate": 3.137051075338496e-05, + "loss": 0.0002, + "step": 813 + }, + { + "epoch": 1.04096, + "grad_norm": 0.0031881192699074745, + "learning_rate": 3.134927903603399e-05, + "loss": 0.0002, + "step": 814 + }, + { + "epoch": 1.04224, + "grad_norm": 0.002331627532839775, + "learning_rate": 3.132802949017519e-05, + "loss": 0.0002, + "step": 815 + }, + { + "epoch": 1.04352, + "grad_norm": 0.00039511098293587565, + "learning_rate": 3.130676215633016e-05, + "loss": 0.0002, + "step": 816 + }, + { + "epoch": 1.0448, + "grad_norm": 0.0027486959006637335, + "learning_rate": 3.128547707505439e-05, + "loss": 0.0002, + "step": 817 + }, + { + "epoch": 1.04608, + "grad_norm": 0.002707613864913583, + "learning_rate": 3.126417428693726e-05, + "loss": 0.0002, + "step": 818 + }, + { + "epoch": 1.04736, + "grad_norm": 0.00034304489963687956, + "learning_rate": 3.12428538326019e-05, + "loss": 0.0002, + "step": 819 + }, + { + "epoch": 1.04864, + "grad_norm": 0.0018327307188883424, + "learning_rate": 3.12215157527051e-05, + "loss": 0.0002, + "step": 820 + }, + { + "epoch": 1.04992, + "grad_norm": 0.0018494493560865521, + "learning_rate": 3.1200160087937304e-05, + "loss": 0.0002, + "step": 821 + }, + { + "epoch": 1.0512, + "grad_norm": 0.0004779093724209815, + "learning_rate": 3.117878687902245e-05, + "loss": 0.0002, + "step": 822 + }, + { + "epoch": 1.05248, + "grad_norm": 0.0009879459394142032, + "learning_rate": 3.115739616671796e-05, + "loss": 0.0002, + "step": 823 + }, + { + "epoch": 1.05376, + "grad_norm": 0.0014417411293834448, + "learning_rate": 3.1135987991814635e-05, + "loss": 0.0002, + "step": 824 + }, + { + "epoch": 1.05504, + "grad_norm": 0.00022263139544520527, + "learning_rate": 3.1114562395136545e-05, + "loss": 0.0002, + "step": 825 + }, + { + "epoch": 1.05632, + "grad_norm": 0.002068109344691038, + "learning_rate": 3.109311941754102e-05, + "loss": 0.0002, + "step": 826 + }, + { + "epoch": 1.0576, + "grad_norm": 0.0023081593681126833, + "learning_rate": 3.107165909991851e-05, + "loss": 0.0002, + "step": 827 + }, + { + "epoch": 1.05888, + "grad_norm": 0.00045680778566747904, + "learning_rate": 3.105018148319254e-05, + "loss": 0.0002, + "step": 828 + }, + { + "epoch": 1.06016, + "grad_norm": 0.0015283108223229647, + "learning_rate": 3.102868660831962e-05, + "loss": 0.0002, + "step": 829 + }, + { + "epoch": 1.06144, + "grad_norm": 0.001918208785355091, + "learning_rate": 3.100717451628917e-05, + "loss": 0.0002, + "step": 830 + }, + { + "epoch": 1.06272, + "grad_norm": 0.0006047256174497306, + "learning_rate": 3.098564524812345e-05, + "loss": 0.0002, + "step": 831 + }, + { + "epoch": 1.064, + "grad_norm": 0.0013389823725447059, + "learning_rate": 3.0964098844877464e-05, + "loss": 0.0002, + "step": 832 + }, + { + "epoch": 1.06528, + "grad_norm": 0.002758131129667163, + "learning_rate": 3.0942535347638896e-05, + "loss": 0.0002, + "step": 833 + }, + { + "epoch": 1.06656, + "grad_norm": 0.00223194039426744, + "learning_rate": 3.092095479752803e-05, + "loss": 0.0002, + "step": 834 + }, + { + "epoch": 1.06784, + "grad_norm": 0.0002103773003909737, + "learning_rate": 3.0899357235697665e-05, + "loss": 0.0002, + "step": 835 + }, + { + "epoch": 1.06912, + "grad_norm": 0.0022379762958735228, + "learning_rate": 3.0877742703333034e-05, + "loss": 0.0002, + "step": 836 + }, + { + "epoch": 1.0704, + "grad_norm": 0.0028583749663084745, + "learning_rate": 3.085611124165175e-05, + "loss": 0.0002, + "step": 837 + }, + { + "epoch": 1.07168, + "grad_norm": 0.000792765524238348, + "learning_rate": 3.083446289190369e-05, + "loss": 0.0002, + "step": 838 + }, + { + "epoch": 1.07296, + "grad_norm": 0.001578961149789393, + "learning_rate": 3.081279769537095e-05, + "loss": 0.0002, + "step": 839 + }, + { + "epoch": 1.07424, + "grad_norm": 0.0018366475123912096, + "learning_rate": 3.0791115693367725e-05, + "loss": 0.0002, + "step": 840 + }, + { + "epoch": 1.07424, + "eval_loss": 1.183817982673645, + "eval_runtime": 43.479, + "eval_samples_per_second": 11.546, + "eval_steps_per_second": 1.449, + "step": 840 + }, + { + "epoch": 1.07552, + "grad_norm": 0.00039021187694743276, + "learning_rate": 3.076941692724031e-05, + "loss": 0.0002, + "step": 841 + }, + { + "epoch": 1.0768, + "grad_norm": 0.0007836410659365356, + "learning_rate": 3.0747701438366906e-05, + "loss": 0.0002, + "step": 842 + }, + { + "epoch": 1.07808, + "grad_norm": 0.00045217288425192237, + "learning_rate": 3.072596926815766e-05, + "loss": 0.0002, + "step": 843 + }, + { + "epoch": 1.07936, + "grad_norm": 0.0005364661919884384, + "learning_rate": 3.070422045805448e-05, + "loss": 0.0002, + "step": 844 + }, + { + "epoch": 1.08064, + "grad_norm": 0.0008205175399780273, + "learning_rate": 3.068245504953103e-05, + "loss": 0.0002, + "step": 845 + }, + { + "epoch": 1.08192, + "grad_norm": 0.0001271996443392709, + "learning_rate": 3.066067308409262e-05, + "loss": 0.0002, + "step": 846 + }, + { + "epoch": 1.0832, + "grad_norm": 0.0013580002123489976, + "learning_rate": 3.063887460327616e-05, + "loss": 0.0002, + "step": 847 + }, + { + "epoch": 1.08448, + "grad_norm": 0.002191178034991026, + "learning_rate": 3.061705964865e-05, + "loss": 0.0002, + "step": 848 + }, + { + "epoch": 1.08576, + "grad_norm": 0.0014392256271094084, + "learning_rate": 3.059522826181396e-05, + "loss": 0.0002, + "step": 849 + }, + { + "epoch": 1.08704, + "grad_norm": 0.000954207091126591, + "learning_rate": 3.0573380484399155e-05, + "loss": 0.0002, + "step": 850 + }, + { + "epoch": 1.08832, + "grad_norm": 0.0029570641927421093, + "learning_rate": 3.055151635806797e-05, + "loss": 0.0002, + "step": 851 + }, + { + "epoch": 1.0896, + "grad_norm": 0.002237113891169429, + "learning_rate": 3.0529635924513974e-05, + "loss": 0.0002, + "step": 852 + }, + { + "epoch": 1.09088, + "grad_norm": 0.0007386120851151645, + "learning_rate": 3.0507739225461827e-05, + "loss": 0.0002, + "step": 853 + }, + { + "epoch": 1.09216, + "grad_norm": 0.0026111765764653683, + "learning_rate": 3.0485826302667206e-05, + "loss": 0.0002, + "step": 854 + }, + { + "epoch": 1.09344, + "grad_norm": 0.0016683172434568405, + "learning_rate": 3.046389719791672e-05, + "loss": 0.0002, + "step": 855 + }, + { + "epoch": 1.09472, + "grad_norm": 0.0005721906200051308, + "learning_rate": 3.044195195302784e-05, + "loss": 0.0002, + "step": 856 + }, + { + "epoch": 1.096, + "grad_norm": 0.0019186767749488354, + "learning_rate": 3.041999060984882e-05, + "loss": 0.0002, + "step": 857 + }, + { + "epoch": 1.09728, + "grad_norm": 0.0013161582173779607, + "learning_rate": 3.0398013210258607e-05, + "loss": 0.0002, + "step": 858 + }, + { + "epoch": 1.09856, + "grad_norm": 0.0007004044600762427, + "learning_rate": 3.037601979616677e-05, + "loss": 0.0002, + "step": 859 + }, + { + "epoch": 1.09984, + "grad_norm": 0.0017194673418998718, + "learning_rate": 3.0354010409513418e-05, + "loss": 0.0002, + "step": 860 + }, + { + "epoch": 1.10112, + "grad_norm": 0.0009747726726345718, + "learning_rate": 3.0331985092269107e-05, + "loss": 0.0002, + "step": 861 + }, + { + "epoch": 1.1024, + "grad_norm": 0.0005672164843417704, + "learning_rate": 3.0309943886434796e-05, + "loss": 0.0002, + "step": 862 + }, + { + "epoch": 1.10368, + "grad_norm": 0.0017328979447484016, + "learning_rate": 3.028788683404171e-05, + "loss": 0.0002, + "step": 863 + }, + { + "epoch": 1.10496, + "grad_norm": 0.00160455540753901, + "learning_rate": 3.026581397715132e-05, + "loss": 0.0002, + "step": 864 + }, + { + "epoch": 1.1062400000000001, + "grad_norm": 0.00018998782616108656, + "learning_rate": 3.024372535785522e-05, + "loss": 0.0002, + "step": 865 + }, + { + "epoch": 1.10752, + "grad_norm": 0.0013784171314910054, + "learning_rate": 3.0221621018275067e-05, + "loss": 0.0002, + "step": 866 + }, + { + "epoch": 1.1088, + "grad_norm": 0.0012219077907502651, + "learning_rate": 3.0199501000562498e-05, + "loss": 0.0002, + "step": 867 + }, + { + "epoch": 1.11008, + "grad_norm": 0.00020059949019923806, + "learning_rate": 3.0177365346899045e-05, + "loss": 0.0002, + "step": 868 + }, + { + "epoch": 1.11136, + "grad_norm": 0.001082933391444385, + "learning_rate": 3.015521409949605e-05, + "loss": 0.0002, + "step": 869 + }, + { + "epoch": 1.11264, + "grad_norm": 0.0016314086969941854, + "learning_rate": 3.0133047300594615e-05, + "loss": 0.0002, + "step": 870 + }, + { + "epoch": 1.11264, + "eval_loss": 1.1762357950210571, + "eval_runtime": 42.7757, + "eval_samples_per_second": 11.736, + "eval_steps_per_second": 1.473, + "step": 870 + }, + { + "epoch": 1.11392, + "grad_norm": 0.0017652047099545598, + "learning_rate": 3.011086499246546e-05, + "loss": 0.0002, + "step": 871 + }, + { + "epoch": 1.1152, + "grad_norm": 0.0012991164112463593, + "learning_rate": 3.0088667217408907e-05, + "loss": 0.0002, + "step": 872 + }, + { + "epoch": 1.11648, + "grad_norm": 0.0004405010840855539, + "learning_rate": 3.0066454017754768e-05, + "loss": 0.0002, + "step": 873 + }, + { + "epoch": 1.11776, + "grad_norm": 0.000430526357376948, + "learning_rate": 3.0044225435862268e-05, + "loss": 0.0002, + "step": 874 + }, + { + "epoch": 1.11904, + "grad_norm": 0.00026241704472340643, + "learning_rate": 3.0021981514119966e-05, + "loss": 0.0002, + "step": 875 + }, + { + "epoch": 1.12032, + "grad_norm": 0.0005373109597712755, + "learning_rate": 2.9999722294945665e-05, + "loss": 0.0002, + "step": 876 + }, + { + "epoch": 1.1216, + "grad_norm": 0.0006974454736337066, + "learning_rate": 2.9977447820786348e-05, + "loss": 0.0002, + "step": 877 + }, + { + "epoch": 1.12288, + "grad_norm": 0.0003311955078970641, + "learning_rate": 2.9955158134118085e-05, + "loss": 0.0002, + "step": 878 + }, + { + "epoch": 1.12416, + "grad_norm": 0.0018197010504081845, + "learning_rate": 2.9932853277445958e-05, + "loss": 0.0002, + "step": 879 + }, + { + "epoch": 1.12544, + "grad_norm": 0.001579714473336935, + "learning_rate": 2.9910533293303974e-05, + "loss": 0.0002, + "step": 880 + }, + { + "epoch": 1.12672, + "grad_norm": 0.0002464255376253277, + "learning_rate": 2.9888198224254986e-05, + "loss": 0.0002, + "step": 881 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.001337314024567604, + "learning_rate": 2.986584811289062e-05, + "loss": 0.0002, + "step": 882 + }, + { + "epoch": 1.12928, + "grad_norm": 0.0007345476187765598, + "learning_rate": 2.984348300183118e-05, + "loss": 0.0002, + "step": 883 + }, + { + "epoch": 1.13056, + "grad_norm": 0.0006227174308151007, + "learning_rate": 2.982110293372558e-05, + "loss": 0.0002, + "step": 884 + }, + { + "epoch": 1.13184, + "grad_norm": 0.0008132438524626195, + "learning_rate": 2.9798707951251243e-05, + "loss": 0.0002, + "step": 885 + }, + { + "epoch": 1.13312, + "grad_norm": 0.0004234490043018013, + "learning_rate": 2.9776298097114048e-05, + "loss": 0.0002, + "step": 886 + }, + { + "epoch": 1.1344, + "grad_norm": 0.0015641709323972464, + "learning_rate": 2.9753873414048223e-05, + "loss": 0.0002, + "step": 887 + }, + { + "epoch": 1.13568, + "grad_norm": 0.00038816570304334164, + "learning_rate": 2.9731433944816282e-05, + "loss": 0.0002, + "step": 888 + }, + { + "epoch": 1.13696, + "grad_norm": 0.0018072250531986356, + "learning_rate": 2.9708979732208933e-05, + "loss": 0.0002, + "step": 889 + }, + { + "epoch": 1.13824, + "grad_norm": 0.0019957402255386114, + "learning_rate": 2.9686510819044988e-05, + "loss": 0.0002, + "step": 890 + }, + { + "epoch": 1.13952, + "grad_norm": 0.0002762428193818778, + "learning_rate": 2.966402724817131e-05, + "loss": 0.0002, + "step": 891 + }, + { + "epoch": 1.1408, + "grad_norm": 0.0021149199455976486, + "learning_rate": 2.9641529062462703e-05, + "loss": 0.0002, + "step": 892 + }, + { + "epoch": 1.14208, + "grad_norm": 0.002743937075138092, + "learning_rate": 2.9619016304821837e-05, + "loss": 0.0002, + "step": 893 + }, + { + "epoch": 1.14336, + "grad_norm": 0.00023781094932928681, + "learning_rate": 2.959648901817918e-05, + "loss": 0.0002, + "step": 894 + }, + { + "epoch": 1.1446399999999999, + "grad_norm": 0.002317876322194934, + "learning_rate": 2.95739472454929e-05, + "loss": 0.0002, + "step": 895 + }, + { + "epoch": 1.14592, + "grad_norm": 0.0012434066738933325, + "learning_rate": 2.9551391029748785e-05, + "loss": 0.0002, + "step": 896 + }, + { + "epoch": 1.1472, + "grad_norm": 0.002554555656388402, + "learning_rate": 2.9528820413960172e-05, + "loss": 0.0002, + "step": 897 + }, + { + "epoch": 1.14848, + "grad_norm": 0.004035282880067825, + "learning_rate": 2.9506235441167862e-05, + "loss": 0.0002, + "step": 898 + }, + { + "epoch": 1.1497600000000001, + "grad_norm": 0.0009114958229474723, + "learning_rate": 2.9483636154440024e-05, + "loss": 0.0002, + "step": 899 + }, + { + "epoch": 1.15104, + "grad_norm": 0.003329591127112508, + "learning_rate": 2.9461022596872125e-05, + "loss": 0.0002, + "step": 900 + }, + { + "epoch": 1.15104, + "eval_loss": 1.214892864227295, + "eval_runtime": 44.5411, + "eval_samples_per_second": 11.27, + "eval_steps_per_second": 1.414, + "step": 900 + }, + { + "epoch": 1.15232, + "grad_norm": 0.0036533991806209087, + "learning_rate": 2.943839481158684e-05, + "loss": 0.0002, + "step": 901 + }, + { + "epoch": 1.1536, + "grad_norm": 0.00112155731767416, + "learning_rate": 2.9415752841734003e-05, + "loss": 0.0002, + "step": 902 + }, + { + "epoch": 1.15488, + "grad_norm": 0.005300252232700586, + "learning_rate": 2.9393096730490454e-05, + "loss": 0.0002, + "step": 903 + }, + { + "epoch": 1.15616, + "grad_norm": 0.0034816202241927385, + "learning_rate": 2.9370426521060036e-05, + "loss": 0.0002, + "step": 904 + }, + { + "epoch": 1.15744, + "grad_norm": 0.002600653562694788, + "learning_rate": 2.9347742256673455e-05, + "loss": 0.0002, + "step": 905 + }, + { + "epoch": 1.15872, + "grad_norm": 0.007056380622088909, + "learning_rate": 2.9325043980588233e-05, + "loss": 0.0002, + "step": 906 + }, + { + "epoch": 1.16, + "grad_norm": 0.004707319661974907, + "learning_rate": 2.9302331736088603e-05, + "loss": 0.0002, + "step": 907 + }, + { + "epoch": 1.16128, + "grad_norm": 0.002767306286841631, + "learning_rate": 2.9279605566485437e-05, + "loss": 0.0002, + "step": 908 + }, + { + "epoch": 1.16256, + "grad_norm": 0.008173026144504547, + "learning_rate": 2.925686551511616e-05, + "loss": 0.0002, + "step": 909 + }, + { + "epoch": 1.16384, + "grad_norm": 0.006080700550228357, + "learning_rate": 2.923411162534467e-05, + "loss": 0.0002, + "step": 910 + }, + { + "epoch": 1.16512, + "grad_norm": 0.0016829116502776742, + "learning_rate": 2.9211343940561265e-05, + "loss": 0.0002, + "step": 911 + }, + { + "epoch": 1.1663999999999999, + "grad_norm": 0.006520835217088461, + "learning_rate": 2.918856250418252e-05, + "loss": 0.0002, + "step": 912 + }, + { + "epoch": 1.16768, + "grad_norm": 0.004198993556201458, + "learning_rate": 2.9165767359651275e-05, + "loss": 0.0002, + "step": 913 + }, + { + "epoch": 1.16896, + "grad_norm": 0.0021160051692277193, + "learning_rate": 2.9142958550436457e-05, + "loss": 0.0002, + "step": 914 + }, + { + "epoch": 1.17024, + "grad_norm": 0.005272620357573032, + "learning_rate": 2.9120136120033104e-05, + "loss": 0.0002, + "step": 915 + }, + { + "epoch": 1.1715200000000001, + "grad_norm": 0.0017011539312079549, + "learning_rate": 2.9097300111962197e-05, + "loss": 0.0002, + "step": 916 + }, + { + "epoch": 1.1728, + "grad_norm": 0.0037756350357085466, + "learning_rate": 2.9074450569770628e-05, + "loss": 0.0002, + "step": 917 + }, + { + "epoch": 1.17408, + "grad_norm": 0.004187179729342461, + "learning_rate": 2.9051587537031075e-05, + "loss": 0.0002, + "step": 918 + }, + { + "epoch": 1.17536, + "grad_norm": 0.0008325865492224693, + "learning_rate": 2.9028711057341964e-05, + "loss": 0.0002, + "step": 919 + }, + { + "epoch": 1.17664, + "grad_norm": 0.0047426228411495686, + "learning_rate": 2.9005821174327343e-05, + "loss": 0.0002, + "step": 920 + }, + { + "epoch": 1.17792, + "grad_norm": 0.0034046040382236242, + "learning_rate": 2.8982917931636844e-05, + "loss": 0.0002, + "step": 921 + }, + { + "epoch": 1.1792, + "grad_norm": 0.0014802594669163227, + "learning_rate": 2.896000137294554e-05, + "loss": 0.0002, + "step": 922 + }, + { + "epoch": 1.18048, + "grad_norm": 0.003786710323765874, + "learning_rate": 2.893707154195394e-05, + "loss": 0.0002, + "step": 923 + }, + { + "epoch": 1.18176, + "grad_norm": 0.00018919807916972786, + "learning_rate": 2.891412848238783e-05, + "loss": 0.0002, + "step": 924 + }, + { + "epoch": 1.18304, + "grad_norm": 0.004215307533740997, + "learning_rate": 2.8891172237998236e-05, + "loss": 0.0002, + "step": 925 + }, + { + "epoch": 1.18432, + "grad_norm": 0.002527839969843626, + "learning_rate": 2.8868202852561316e-05, + "loss": 0.0002, + "step": 926 + }, + { + "epoch": 1.1856, + "grad_norm": 0.002203546231612563, + "learning_rate": 2.884522036987831e-05, + "loss": 0.0002, + "step": 927 + }, + { + "epoch": 1.18688, + "grad_norm": 0.003203033236786723, + "learning_rate": 2.8822224833775393e-05, + "loss": 0.0002, + "step": 928 + }, + { + "epoch": 1.1881599999999999, + "grad_norm": 0.0009034183458425105, + "learning_rate": 2.879921628810368e-05, + "loss": 0.0002, + "step": 929 + }, + { + "epoch": 1.18944, + "grad_norm": 0.004178234841674566, + "learning_rate": 2.8776194776739056e-05, + "loss": 0.0002, + "step": 930 + }, + { + "epoch": 1.18944, + "eval_loss": 1.175118088722229, + "eval_runtime": 43.0279, + "eval_samples_per_second": 11.667, + "eval_steps_per_second": 1.464, + "step": 930 + }, + { + "epoch": 1.19072, + "grad_norm": 0.0013845686335116625, + "learning_rate": 2.8753160343582162e-05, + "loss": 0.0002, + "step": 931 + }, + { + "epoch": 1.192, + "grad_norm": 0.004039150197058916, + "learning_rate": 2.8730113032558253e-05, + "loss": 0.0002, + "step": 932 + }, + { + "epoch": 1.1932800000000001, + "grad_norm": 0.005750681739300489, + "learning_rate": 2.870705288761715e-05, + "loss": 0.0002, + "step": 933 + }, + { + "epoch": 1.19456, + "grad_norm": 0.0012514875270426273, + "learning_rate": 2.868397995273316e-05, + "loss": 0.0002, + "step": 934 + }, + { + "epoch": 1.19584, + "grad_norm": 0.005147598218172789, + "learning_rate": 2.8660894271904958e-05, + "loss": 0.0002, + "step": 935 + }, + { + "epoch": 1.19712, + "grad_norm": 0.004961889237165451, + "learning_rate": 2.8637795889155552e-05, + "loss": 0.0002, + "step": 936 + }, + { + "epoch": 1.1984, + "grad_norm": 0.002034895122051239, + "learning_rate": 2.8614684848532147e-05, + "loss": 0.0002, + "step": 937 + }, + { + "epoch": 1.19968, + "grad_norm": 0.007024592719972134, + "learning_rate": 2.8591561194106096e-05, + "loss": 0.0002, + "step": 938 + }, + { + "epoch": 1.20096, + "grad_norm": 0.004905078560113907, + "learning_rate": 2.8568424969972805e-05, + "loss": 0.0002, + "step": 939 + }, + { + "epoch": 1.20224, + "grad_norm": 0.0008516943780705333, + "learning_rate": 2.854527622025165e-05, + "loss": 0.0002, + "step": 940 + }, + { + "epoch": 1.20352, + "grad_norm": 0.0038896326441317797, + "learning_rate": 2.852211498908589e-05, + "loss": 0.0002, + "step": 941 + }, + { + "epoch": 1.2048, + "grad_norm": 0.0018528762739151716, + "learning_rate": 2.8498941320642592e-05, + "loss": 0.0002, + "step": 942 + }, + { + "epoch": 1.20608, + "grad_norm": 0.002043849555775523, + "learning_rate": 2.8475755259112528e-05, + "loss": 0.0002, + "step": 943 + }, + { + "epoch": 1.20736, + "grad_norm": 0.0025435269344598055, + "learning_rate": 2.845255684871012e-05, + "loss": 0.0002, + "step": 944 + }, + { + "epoch": 1.20864, + "grad_norm": 0.0008807358099147677, + "learning_rate": 2.842934613367332e-05, + "loss": 0.0002, + "step": 945 + }, + { + "epoch": 1.2099199999999999, + "grad_norm": 0.003347573336213827, + "learning_rate": 2.840612315826356e-05, + "loss": 0.0002, + "step": 946 + }, + { + "epoch": 1.2112, + "grad_norm": 0.0014326430391520262, + "learning_rate": 2.8382887966765645e-05, + "loss": 0.0002, + "step": 947 + }, + { + "epoch": 1.21248, + "grad_norm": 0.0026219149585813284, + "learning_rate": 2.8359640603487673e-05, + "loss": 0.0002, + "step": 948 + }, + { + "epoch": 1.21376, + "grad_norm": 0.003363543888553977, + "learning_rate": 2.833638111276095e-05, + "loss": 0.0002, + "step": 949 + }, + { + "epoch": 1.2150400000000001, + "grad_norm": 0.00021797107183374465, + "learning_rate": 2.8313109538939918e-05, + "loss": 0.0002, + "step": 950 + }, + { + "epoch": 1.21632, + "grad_norm": 0.0037933571729809046, + "learning_rate": 2.828982592640207e-05, + "loss": 0.0002, + "step": 951 + }, + { + "epoch": 1.2176, + "grad_norm": 0.0028030104003846645, + "learning_rate": 2.826653031954781e-05, + "loss": 0.0002, + "step": 952 + }, + { + "epoch": 1.21888, + "grad_norm": 0.0015061424346640706, + "learning_rate": 2.824322276280048e-05, + "loss": 0.0002, + "step": 953 + }, + { + "epoch": 1.22016, + "grad_norm": 0.002614412922412157, + "learning_rate": 2.821990330060616e-05, + "loss": 0.0002, + "step": 954 + }, + { + "epoch": 1.22144, + "grad_norm": 0.00022668491874355823, + "learning_rate": 2.819657197743365e-05, + "loss": 0.0002, + "step": 955 + }, + { + "epoch": 1.22272, + "grad_norm": 0.0018148926319554448, + "learning_rate": 2.8173228837774365e-05, + "loss": 0.0002, + "step": 956 + }, + { + "epoch": 1.224, + "grad_norm": 0.0004227437893860042, + "learning_rate": 2.814987392614227e-05, + "loss": 0.0002, + "step": 957 + }, + { + "epoch": 1.22528, + "grad_norm": 0.0014729988761246204, + "learning_rate": 2.812650728707375e-05, + "loss": 0.0002, + "step": 958 + }, + { + "epoch": 1.22656, + "grad_norm": 0.0012458571000024676, + "learning_rate": 2.8103128965127574e-05, + "loss": 0.0002, + "step": 959 + }, + { + "epoch": 1.22784, + "grad_norm": 0.00014713928976561874, + "learning_rate": 2.8079739004884783e-05, + "loss": 0.0002, + "step": 960 + }, + { + "epoch": 1.22784, + "eval_loss": 1.1813075542449951, + "eval_runtime": 43.1318, + "eval_samples_per_second": 11.639, + "eval_steps_per_second": 1.461, + "step": 960 + }, + { + "epoch": 1.22912, + "grad_norm": 0.0003031356609426439, + "learning_rate": 2.805633745094861e-05, + "loss": 0.0002, + "step": 961 + }, + { + "epoch": 1.2304, + "grad_norm": 0.00014785327948629856, + "learning_rate": 2.8032924347944394e-05, + "loss": 0.0002, + "step": 962 + }, + { + "epoch": 1.2316799999999999, + "grad_norm": 0.00023444350517820567, + "learning_rate": 2.8009499740519514e-05, + "loss": 0.0002, + "step": 963 + }, + { + "epoch": 1.23296, + "grad_norm": 0.00013383221812546253, + "learning_rate": 2.7986063673343263e-05, + "loss": 0.0002, + "step": 964 + }, + { + "epoch": 1.23424, + "grad_norm": 0.0003675081243272871, + "learning_rate": 2.796261619110681e-05, + "loss": 0.0002, + "step": 965 + }, + { + "epoch": 1.23552, + "grad_norm": 0.00046441424638032913, + "learning_rate": 2.7939157338523074e-05, + "loss": 0.0002, + "step": 966 + }, + { + "epoch": 1.2368000000000001, + "grad_norm": 0.00019773181702475995, + "learning_rate": 2.791568716032666e-05, + "loss": 0.0002, + "step": 967 + }, + { + "epoch": 1.23808, + "grad_norm": 0.00011621037992881611, + "learning_rate": 2.7892205701273784e-05, + "loss": 0.0002, + "step": 968 + }, + { + "epoch": 1.23936, + "grad_norm": 0.001013370230793953, + "learning_rate": 2.7868713006142156e-05, + "loss": 0.0002, + "step": 969 + }, + { + "epoch": 1.24064, + "grad_norm": 0.0011750737903639674, + "learning_rate": 2.784520911973092e-05, + "loss": 0.0002, + "step": 970 + }, + { + "epoch": 1.24192, + "grad_norm": 0.0007235942757688463, + "learning_rate": 2.7821694086860568e-05, + "loss": 0.0002, + "step": 971 + }, + { + "epoch": 1.2432, + "grad_norm": 0.001979984575882554, + "learning_rate": 2.779816795237283e-05, + "loss": 0.0002, + "step": 972 + }, + { + "epoch": 1.24448, + "grad_norm": 0.0006019955035299063, + "learning_rate": 2.777463076113063e-05, + "loss": 0.0002, + "step": 973 + }, + { + "epoch": 1.24576, + "grad_norm": 0.0014207311905920506, + "learning_rate": 2.7751082558017953e-05, + "loss": 0.0002, + "step": 974 + }, + { + "epoch": 1.24704, + "grad_norm": 0.0008990809437818825, + "learning_rate": 2.77275233879398e-05, + "loss": 0.0002, + "step": 975 + }, + { + "epoch": 1.24832, + "grad_norm": 0.0007700049318373203, + "learning_rate": 2.770395329582208e-05, + "loss": 0.0002, + "step": 976 + }, + { + "epoch": 1.2496, + "grad_norm": 0.0007865463849157095, + "learning_rate": 2.7680372326611517e-05, + "loss": 0.0002, + "step": 977 + }, + { + "epoch": 1.25088, + "grad_norm": 0.0001616239023860544, + "learning_rate": 2.7656780525275598e-05, + "loss": 0.0002, + "step": 978 + }, + { + "epoch": 1.25216, + "grad_norm": 0.00020688146469183266, + "learning_rate": 2.763317793680245e-05, + "loss": 0.0002, + "step": 979 + }, + { + "epoch": 1.2534399999999999, + "grad_norm": 0.0004241189162712544, + "learning_rate": 2.7609564606200788e-05, + "loss": 0.0002, + "step": 980 + }, + { + "epoch": 1.25472, + "grad_norm": 0.0008062803535722196, + "learning_rate": 2.7585940578499792e-05, + "loss": 0.0002, + "step": 981 + }, + { + "epoch": 1.256, + "grad_norm": 0.0005234317504800856, + "learning_rate": 2.7562305898749054e-05, + "loss": 0.0002, + "step": 982 + }, + { + "epoch": 1.25728, + "grad_norm": 0.00017792911967262626, + "learning_rate": 2.7538660612018477e-05, + "loss": 0.0002, + "step": 983 + }, + { + "epoch": 1.2585600000000001, + "grad_norm": 0.00015215743042062968, + "learning_rate": 2.7515004763398172e-05, + "loss": 0.0002, + "step": 984 + }, + { + "epoch": 1.25984, + "grad_norm": 0.0001460699422750622, + "learning_rate": 2.749133839799843e-05, + "loss": 0.0002, + "step": 985 + }, + { + "epoch": 1.26112, + "grad_norm": 0.00027044335729442537, + "learning_rate": 2.746766156094955e-05, + "loss": 0.0002, + "step": 986 + }, + { + "epoch": 1.2624, + "grad_norm": 0.0005267596570774913, + "learning_rate": 2.7443974297401842e-05, + "loss": 0.0002, + "step": 987 + }, + { + "epoch": 1.26368, + "grad_norm": 0.000697027484420687, + "learning_rate": 2.742027665252547e-05, + "loss": 0.0002, + "step": 988 + }, + { + "epoch": 1.2649599999999999, + "grad_norm": 0.00040344882290810347, + "learning_rate": 2.739656867151042e-05, + "loss": 0.0002, + "step": 989 + }, + { + "epoch": 1.26624, + "grad_norm": 0.0004076792683918029, + "learning_rate": 2.737285039956635e-05, + "loss": 0.0002, + "step": 990 + }, + { + "epoch": 1.26624, + "eval_loss": 1.1873022317886353, + "eval_runtime": 43.5805, + "eval_samples_per_second": 11.519, + "eval_steps_per_second": 1.446, + "step": 990 + }, + { + "epoch": 1.26752, + "grad_norm": 0.0003270470770075917, + "learning_rate": 2.734912188192258e-05, + "loss": 0.0002, + "step": 991 + }, + { + "epoch": 1.2688, + "grad_norm": 0.0003373113286215812, + "learning_rate": 2.7325383163827947e-05, + "loss": 0.0002, + "step": 992 + }, + { + "epoch": 1.27008, + "grad_norm": 0.00026391519349999726, + "learning_rate": 2.7301634290550762e-05, + "loss": 0.0002, + "step": 993 + }, + { + "epoch": 1.27136, + "grad_norm": 0.000451684114523232, + "learning_rate": 2.727787530737866e-05, + "loss": 0.0002, + "step": 994 + }, + { + "epoch": 1.27264, + "grad_norm": 0.0008735348237678409, + "learning_rate": 2.7254106259618604e-05, + "loss": 0.0002, + "step": 995 + }, + { + "epoch": 1.27392, + "grad_norm": 0.0006068554357625544, + "learning_rate": 2.723032719259671e-05, + "loss": 0.0002, + "step": 996 + }, + { + "epoch": 1.2752, + "grad_norm": 0.0001193479256471619, + "learning_rate": 2.7206538151658222e-05, + "loss": 0.0002, + "step": 997 + }, + { + "epoch": 1.27648, + "grad_norm": 0.0007612605695612729, + "learning_rate": 2.718273918216739e-05, + "loss": 0.0002, + "step": 998 + }, + { + "epoch": 1.27776, + "grad_norm": 0.000665926025249064, + "learning_rate": 2.715893032950742e-05, + "loss": 0.0002, + "step": 999 + }, + { + "epoch": 1.27904, + "grad_norm": 0.0006610708078369498, + "learning_rate": 2.7135111639080335e-05, + "loss": 0.0002, + "step": 1000 + }, + { + "epoch": 1.2803200000000001, + "grad_norm": 0.0011780158383771777, + "learning_rate": 2.7111283156306957e-05, + "loss": 0.0002, + "step": 1001 + }, + { + "epoch": 1.2816, + "grad_norm": 0.0001723014866001904, + "learning_rate": 2.7087444926626717e-05, + "loss": 0.0002, + "step": 1002 + }, + { + "epoch": 1.28288, + "grad_norm": 0.0009809026960283518, + "learning_rate": 2.70635969954977e-05, + "loss": 0.0002, + "step": 1003 + }, + { + "epoch": 1.28416, + "grad_norm": 0.000595583813264966, + "learning_rate": 2.7039739408396456e-05, + "loss": 0.0002, + "step": 1004 + }, + { + "epoch": 1.28544, + "grad_norm": 0.0006542736082337797, + "learning_rate": 2.7015872210817956e-05, + "loss": 0.0002, + "step": 1005 + }, + { + "epoch": 1.2867199999999999, + "grad_norm": 0.0009241512161679566, + "learning_rate": 2.6991995448275506e-05, + "loss": 0.0002, + "step": 1006 + }, + { + "epoch": 1.288, + "grad_norm": 0.00016221008263528347, + "learning_rate": 2.696810916630063e-05, + "loss": 0.0002, + "step": 1007 + }, + { + "epoch": 1.28928, + "grad_norm": 0.0008979769772849977, + "learning_rate": 2.6944213410443026e-05, + "loss": 0.0002, + "step": 1008 + }, + { + "epoch": 1.29056, + "grad_norm": 0.0011557216057553887, + "learning_rate": 2.6920308226270448e-05, + "loss": 0.0002, + "step": 1009 + }, + { + "epoch": 1.29184, + "grad_norm": 0.0005811048904433846, + "learning_rate": 2.6896393659368637e-05, + "loss": 0.0002, + "step": 1010 + }, + { + "epoch": 1.29312, + "grad_norm": 0.000264172675088048, + "learning_rate": 2.6872469755341213e-05, + "loss": 0.0002, + "step": 1011 + }, + { + "epoch": 1.2944, + "grad_norm": 0.00032463358365930617, + "learning_rate": 2.684853655980962e-05, + "loss": 0.0002, + "step": 1012 + }, + { + "epoch": 1.29568, + "grad_norm": 0.0004930752329528332, + "learning_rate": 2.6824594118412998e-05, + "loss": 0.0002, + "step": 1013 + }, + { + "epoch": 1.29696, + "grad_norm": 0.00100604142062366, + "learning_rate": 2.680064247680813e-05, + "loss": 0.0002, + "step": 1014 + }, + { + "epoch": 1.29824, + "grad_norm": 0.000345872831530869, + "learning_rate": 2.6776681680669353e-05, + "loss": 0.0002, + "step": 1015 + }, + { + "epoch": 1.29952, + "grad_norm": 0.0003196639590896666, + "learning_rate": 2.6752711775688452e-05, + "loss": 0.0002, + "step": 1016 + }, + { + "epoch": 1.3008, + "grad_norm": 0.00015214429004117846, + "learning_rate": 2.6728732807574566e-05, + "loss": 0.0002, + "step": 1017 + }, + { + "epoch": 1.3020800000000001, + "grad_norm": 0.0003405305033084005, + "learning_rate": 2.6704744822054155e-05, + "loss": 0.0002, + "step": 1018 + }, + { + "epoch": 1.30336, + "grad_norm": 0.0002859738888218999, + "learning_rate": 2.6680747864870823e-05, + "loss": 0.0002, + "step": 1019 + }, + { + "epoch": 1.30464, + "grad_norm": 0.0010289245983585715, + "learning_rate": 2.665674198178534e-05, + "loss": 0.0002, + "step": 1020 + }, + { + "epoch": 1.30464, + "eval_loss": 1.1872708797454834, + "eval_runtime": 43.5075, + "eval_samples_per_second": 11.538, + "eval_steps_per_second": 1.448, + "step": 1020 + }, + { + "epoch": 1.30592, + "grad_norm": 0.0006891186349093914, + "learning_rate": 2.6632727218575446e-05, + "loss": 0.0002, + "step": 1021 + }, + { + "epoch": 1.3072, + "grad_norm": 0.0004253540828358382, + "learning_rate": 2.6608703621035853e-05, + "loss": 0.0002, + "step": 1022 + }, + { + "epoch": 1.3084799999999999, + "grad_norm": 0.001381613314151764, + "learning_rate": 2.6584671234978094e-05, + "loss": 0.0002, + "step": 1023 + }, + { + "epoch": 1.30976, + "grad_norm": 0.0010794890113174915, + "learning_rate": 2.6560630106230478e-05, + "loss": 0.0002, + "step": 1024 + }, + { + "epoch": 1.31104, + "grad_norm": 0.0005325808888301253, + "learning_rate": 2.6536580280637975e-05, + "loss": 0.0002, + "step": 1025 + }, + { + "epoch": 1.31232, + "grad_norm": 0.0013000130420550704, + "learning_rate": 2.651252180406214e-05, + "loss": 0.0002, + "step": 1026 + }, + { + "epoch": 1.3136, + "grad_norm": 0.00042323453817516565, + "learning_rate": 2.6488454722381037e-05, + "loss": 0.0002, + "step": 1027 + }, + { + "epoch": 1.31488, + "grad_norm": 0.001034266664646566, + "learning_rate": 2.646437908148912e-05, + "loss": 0.0002, + "step": 1028 + }, + { + "epoch": 1.31616, + "grad_norm": 0.001303969300352037, + "learning_rate": 2.6440294927297185e-05, + "loss": 0.0002, + "step": 1029 + }, + { + "epoch": 1.31744, + "grad_norm": 0.00026662839809432626, + "learning_rate": 2.6416202305732248e-05, + "loss": 0.0002, + "step": 1030 + }, + { + "epoch": 1.31872, + "grad_norm": 0.0006337203085422516, + "learning_rate": 2.6392101262737474e-05, + "loss": 0.0002, + "step": 1031 + }, + { + "epoch": 1.32, + "grad_norm": 0.00018651703430805355, + "learning_rate": 2.6367991844272095e-05, + "loss": 0.0002, + "step": 1032 + }, + { + "epoch": 1.32128, + "grad_norm": 0.0011202350724488497, + "learning_rate": 2.6343874096311308e-05, + "loss": 0.0002, + "step": 1033 + }, + { + "epoch": 1.32256, + "grad_norm": 0.001055483939126134, + "learning_rate": 2.631974806484619e-05, + "loss": 0.0002, + "step": 1034 + }, + { + "epoch": 1.3238400000000001, + "grad_norm": 0.00020614058303181082, + "learning_rate": 2.6295613795883624e-05, + "loss": 0.0002, + "step": 1035 + }, + { + "epoch": 1.32512, + "grad_norm": 0.0007595567149110138, + "learning_rate": 2.6271471335446194e-05, + "loss": 0.0002, + "step": 1036 + }, + { + "epoch": 1.3264, + "grad_norm": 0.0004523808602243662, + "learning_rate": 2.624732072957211e-05, + "loss": 0.0002, + "step": 1037 + }, + { + "epoch": 1.32768, + "grad_norm": 0.00021755503257736564, + "learning_rate": 2.62231620243151e-05, + "loss": 0.0002, + "step": 1038 + }, + { + "epoch": 1.32896, + "grad_norm": 0.00024666677927598357, + "learning_rate": 2.6198995265744374e-05, + "loss": 0.0002, + "step": 1039 + }, + { + "epoch": 1.3302399999999999, + "grad_norm": 0.0002588001952972263, + "learning_rate": 2.6174820499944446e-05, + "loss": 0.0002, + "step": 1040 + }, + { + "epoch": 1.33152, + "grad_norm": 0.00016227785090450197, + "learning_rate": 2.615063777301515e-05, + "loss": 0.0002, + "step": 1041 + }, + { + "epoch": 1.3328, + "grad_norm": 0.0006037519196979702, + "learning_rate": 2.6126447131071467e-05, + "loss": 0.0002, + "step": 1042 + }, + { + "epoch": 1.33408, + "grad_norm": 0.0009266502456739545, + "learning_rate": 2.610224862024349e-05, + "loss": 0.0002, + "step": 1043 + }, + { + "epoch": 1.33536, + "grad_norm": 0.00027118143043480814, + "learning_rate": 2.60780422866763e-05, + "loss": 0.0002, + "step": 1044 + }, + { + "epoch": 1.33664, + "grad_norm": 0.0013858125312253833, + "learning_rate": 2.6053828176529924e-05, + "loss": 0.0002, + "step": 1045 + }, + { + "epoch": 1.33792, + "grad_norm": 0.0015585527289658785, + "learning_rate": 2.6029606335979178e-05, + "loss": 0.0002, + "step": 1046 + }, + { + "epoch": 1.3392, + "grad_norm": 0.00019692144996952266, + "learning_rate": 2.600537681121366e-05, + "loss": 0.0002, + "step": 1047 + }, + { + "epoch": 1.34048, + "grad_norm": 0.0013894090661779046, + "learning_rate": 2.59811396484376e-05, + "loss": 0.0002, + "step": 1048 + }, + { + "epoch": 1.34176, + "grad_norm": 0.00041332849650643766, + "learning_rate": 2.595689489386979e-05, + "loss": 0.0002, + "step": 1049 + }, + { + "epoch": 1.34304, + "grad_norm": 0.001263888319954276, + "learning_rate": 2.593264259374352e-05, + "loss": 0.0002, + "step": 1050 + }, + { + "epoch": 1.34304, + "eval_loss": 1.1927013397216797, + "eval_runtime": 43.9413, + "eval_samples_per_second": 11.424, + "eval_steps_per_second": 1.434, + "step": 1050 + }, + { + "epoch": 1.34432, + "grad_norm": 0.001191023038700223, + "learning_rate": 2.5908382794306435e-05, + "loss": 0.0002, + "step": 1051 + }, + { + "epoch": 1.3456000000000001, + "grad_norm": 0.0004447832179721445, + "learning_rate": 2.5884115541820514e-05, + "loss": 0.0002, + "step": 1052 + }, + { + "epoch": 1.34688, + "grad_norm": 0.0015104867052286863, + "learning_rate": 2.585984088256193e-05, + "loss": 0.0002, + "step": 1053 + }, + { + "epoch": 1.34816, + "grad_norm": 0.0002054869692074135, + "learning_rate": 2.5835558862820997e-05, + "loss": 0.0002, + "step": 1054 + }, + { + "epoch": 1.34944, + "grad_norm": 0.0016293632797896862, + "learning_rate": 2.581126952890203e-05, + "loss": 0.0002, + "step": 1055 + }, + { + "epoch": 1.35072, + "grad_norm": 0.0007887612446211278, + "learning_rate": 2.5786972927123333e-05, + "loss": 0.0002, + "step": 1056 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.0021560071036219597, + "learning_rate": 2.576266910381705e-05, + "loss": 0.0002, + "step": 1057 + }, + { + "epoch": 1.35328, + "grad_norm": 0.003101208945736289, + "learning_rate": 2.573835810532909e-05, + "loss": 0.0002, + "step": 1058 + }, + { + "epoch": 1.35456, + "grad_norm": 0.0003672559978440404, + "learning_rate": 2.5714039978019062e-05, + "loss": 0.0002, + "step": 1059 + }, + { + "epoch": 1.35584, + "grad_norm": 0.003368792589753866, + "learning_rate": 2.568971476826015e-05, + "loss": 0.0002, + "step": 1060 + }, + { + "epoch": 1.35712, + "grad_norm": 0.0014343790244311094, + "learning_rate": 2.5665382522439053e-05, + "loss": 0.0002, + "step": 1061 + }, + { + "epoch": 1.3584, + "grad_norm": 0.00228679645806551, + "learning_rate": 2.5641043286955896e-05, + "loss": 0.0002, + "step": 1062 + }, + { + "epoch": 1.35968, + "grad_norm": 0.0023334973957389593, + "learning_rate": 2.5616697108224116e-05, + "loss": 0.0002, + "step": 1063 + }, + { + "epoch": 1.36096, + "grad_norm": 0.001103361020796001, + "learning_rate": 2.5592344032670406e-05, + "loss": 0.0002, + "step": 1064 + }, + { + "epoch": 1.36224, + "grad_norm": 0.0027988275978714228, + "learning_rate": 2.5567984106734597e-05, + "loss": 0.0002, + "step": 1065 + }, + { + "epoch": 1.36352, + "grad_norm": 0.00014616409316658974, + "learning_rate": 2.5543617376869584e-05, + "loss": 0.0002, + "step": 1066 + }, + { + "epoch": 1.3648, + "grad_norm": 0.0036822843831032515, + "learning_rate": 2.551924388954126e-05, + "loss": 0.0002, + "step": 1067 + }, + { + "epoch": 1.36608, + "grad_norm": 0.0036023741122335196, + "learning_rate": 2.549486369122837e-05, + "loss": 0.0002, + "step": 1068 + }, + { + "epoch": 1.3673600000000001, + "grad_norm": 0.00010829493112396449, + "learning_rate": 2.5470476828422482e-05, + "loss": 0.0002, + "step": 1069 + }, + { + "epoch": 1.36864, + "grad_norm": 0.003002248238772154, + "learning_rate": 2.5446083347627865e-05, + "loss": 0.0002, + "step": 1070 + }, + { + "epoch": 1.36992, + "grad_norm": 0.0024013391230255365, + "learning_rate": 2.5421683295361396e-05, + "loss": 0.0002, + "step": 1071 + }, + { + "epoch": 1.3712, + "grad_norm": 0.0012497822754085064, + "learning_rate": 2.5397276718152503e-05, + "loss": 0.0002, + "step": 1072 + }, + { + "epoch": 1.37248, + "grad_norm": 0.0032194824889302254, + "learning_rate": 2.5372863662543047e-05, + "loss": 0.0002, + "step": 1073 + }, + { + "epoch": 1.3737599999999999, + "grad_norm": 0.0007761308224871755, + "learning_rate": 2.534844417508724e-05, + "loss": 0.0002, + "step": 1074 + }, + { + "epoch": 1.37504, + "grad_norm": 0.0025674847420305014, + "learning_rate": 2.5324018302351564e-05, + "loss": 0.0002, + "step": 1075 + }, + { + "epoch": 1.37632, + "grad_norm": 0.0020916294306516647, + "learning_rate": 2.529958609091468e-05, + "loss": 0.0002, + "step": 1076 + }, + { + "epoch": 1.3776, + "grad_norm": 0.0012239001225680113, + "learning_rate": 2.5275147587367337e-05, + "loss": 0.0002, + "step": 1077 + }, + { + "epoch": 1.37888, + "grad_norm": 0.0021880425047129393, + "learning_rate": 2.5250702838312263e-05, + "loss": 0.0002, + "step": 1078 + }, + { + "epoch": 1.38016, + "grad_norm": 0.00032185690361075103, + "learning_rate": 2.5226251890364122e-05, + "loss": 0.0002, + "step": 1079 + }, + { + "epoch": 1.38144, + "grad_norm": 0.0026376349851489067, + "learning_rate": 2.5201794790149383e-05, + "loss": 0.0002, + "step": 1080 + }, + { + "epoch": 1.38144, + "eval_loss": 1.1927109956741333, + "eval_runtime": 43.8681, + "eval_samples_per_second": 11.443, + "eval_steps_per_second": 1.436, + "step": 1080 + }, + { + "epoch": 1.38272, + "grad_norm": 0.0016633147606626153, + "learning_rate": 2.517733158430626e-05, + "loss": 0.0002, + "step": 1081 + }, + { + "epoch": 1.384, + "grad_norm": 0.0013083289377391338, + "learning_rate": 2.5152862319484596e-05, + "loss": 0.0002, + "step": 1082 + }, + { + "epoch": 1.38528, + "grad_norm": 0.002328772097826004, + "learning_rate": 2.5128387042345792e-05, + "loss": 0.0002, + "step": 1083 + }, + { + "epoch": 1.38656, + "grad_norm": 0.00023164039885159582, + "learning_rate": 2.5103905799562723e-05, + "loss": 0.0002, + "step": 1084 + }, + { + "epoch": 1.38784, + "grad_norm": 0.0023417614866048098, + "learning_rate": 2.507941863781963e-05, + "loss": 0.0002, + "step": 1085 + }, + { + "epoch": 1.3891200000000001, + "grad_norm": 0.001744166831485927, + "learning_rate": 2.5054925603812042e-05, + "loss": 0.0002, + "step": 1086 + }, + { + "epoch": 1.3904, + "grad_norm": 0.0015349910827353597, + "learning_rate": 2.503042674424669e-05, + "loss": 0.0002, + "step": 1087 + }, + { + "epoch": 1.39168, + "grad_norm": 0.0033147349022328854, + "learning_rate": 2.500592210584143e-05, + "loss": 0.0002, + "step": 1088 + }, + { + "epoch": 1.39296, + "grad_norm": 0.0007261090795509517, + "learning_rate": 2.4981411735325088e-05, + "loss": 0.0002, + "step": 1089 + }, + { + "epoch": 1.39424, + "grad_norm": 0.003090859390795231, + "learning_rate": 2.495689567943748e-05, + "loss": 0.0002, + "step": 1090 + }, + { + "epoch": 1.3955199999999999, + "grad_norm": 0.002288298448547721, + "learning_rate": 2.493237398492922e-05, + "loss": 0.0002, + "step": 1091 + }, + { + "epoch": 1.3968, + "grad_norm": 0.0014610658399760723, + "learning_rate": 2.4907846698561704e-05, + "loss": 0.0002, + "step": 1092 + }, + { + "epoch": 1.39808, + "grad_norm": 0.002210103441029787, + "learning_rate": 2.488331386710697e-05, + "loss": 0.0002, + "step": 1093 + }, + { + "epoch": 1.39936, + "grad_norm": 0.00047967262798920274, + "learning_rate": 2.4858775537347654e-05, + "loss": 0.0002, + "step": 1094 + }, + { + "epoch": 1.40064, + "grad_norm": 0.0018254382302984595, + "learning_rate": 2.4834231756076833e-05, + "loss": 0.0002, + "step": 1095 + }, + { + "epoch": 1.40192, + "grad_norm": 0.00032902159728109837, + "learning_rate": 2.480968257009804e-05, + "loss": 0.0002, + "step": 1096 + }, + { + "epoch": 1.4032, + "grad_norm": 0.0019898158498108387, + "learning_rate": 2.4785128026225062e-05, + "loss": 0.0002, + "step": 1097 + }, + { + "epoch": 1.40448, + "grad_norm": 0.0008954960503615439, + "learning_rate": 2.4760568171281934e-05, + "loss": 0.0002, + "step": 1098 + }, + { + "epoch": 1.40576, + "grad_norm": 0.0008229295490309596, + "learning_rate": 2.4736003052102816e-05, + "loss": 0.0002, + "step": 1099 + }, + { + "epoch": 1.40704, + "grad_norm": 0.0005774835590273142, + "learning_rate": 2.4711432715531874e-05, + "loss": 0.0002, + "step": 1100 + }, + { + "epoch": 1.40832, + "grad_norm": 0.0005937288515269756, + "learning_rate": 2.4686857208423276e-05, + "loss": 0.0002, + "step": 1101 + }, + { + "epoch": 1.4096, + "grad_norm": 0.00021285265393089503, + "learning_rate": 2.466227657764101e-05, + "loss": 0.0002, + "step": 1102 + }, + { + "epoch": 1.4108800000000001, + "grad_norm": 0.000575678946916014, + "learning_rate": 2.463769087005885e-05, + "loss": 0.0002, + "step": 1103 + }, + { + "epoch": 1.41216, + "grad_norm": 0.0004997372161597013, + "learning_rate": 2.461310013256025e-05, + "loss": 0.0002, + "step": 1104 + }, + { + "epoch": 1.41344, + "grad_norm": 0.00018097950669471174, + "learning_rate": 2.4588504412038253e-05, + "loss": 0.0002, + "step": 1105 + }, + { + "epoch": 1.41472, + "grad_norm": 0.0005268906243145466, + "learning_rate": 2.45639037553954e-05, + "loss": 0.0002, + "step": 1106 + }, + { + "epoch": 1.416, + "grad_norm": 0.00013170884631108493, + "learning_rate": 2.453929820954366e-05, + "loss": 0.0002, + "step": 1107 + }, + { + "epoch": 1.4172799999999999, + "grad_norm": 0.00040802324656397104, + "learning_rate": 2.451468782140431e-05, + "loss": 0.0002, + "step": 1108 + }, + { + "epoch": 1.41856, + "grad_norm": 0.0009418302797712386, + "learning_rate": 2.4490072637907873e-05, + "loss": 0.0002, + "step": 1109 + }, + { + "epoch": 1.41984, + "grad_norm": 0.0011874447809532285, + "learning_rate": 2.4465452705994e-05, + "loss": 0.0002, + "step": 1110 + }, + { + "epoch": 1.41984, + "eval_loss": 1.1833388805389404, + "eval_runtime": 43.5138, + "eval_samples_per_second": 11.537, + "eval_steps_per_second": 1.448, + "step": 1110 + }, + { + "epoch": 1.42112, + "grad_norm": 0.0001316447596764192, + "learning_rate": 2.444082807261142e-05, + "loss": 0.0002, + "step": 1111 + }, + { + "epoch": 1.4224, + "grad_norm": 0.0008458861848339438, + "learning_rate": 2.44161987847178e-05, + "loss": 0.0002, + "step": 1112 + }, + { + "epoch": 1.42368, + "grad_norm": 0.0002578686107881367, + "learning_rate": 2.4391564889279712e-05, + "loss": 0.0002, + "step": 1113 + }, + { + "epoch": 1.42496, + "grad_norm": 0.001641182112507522, + "learning_rate": 2.4366926433272494e-05, + "loss": 0.0002, + "step": 1114 + }, + { + "epoch": 1.42624, + "grad_norm": 0.0012906165793538094, + "learning_rate": 2.434228346368018e-05, + "loss": 0.0002, + "step": 1115 + }, + { + "epoch": 1.42752, + "grad_norm": 0.0002661029575392604, + "learning_rate": 2.4317636027495426e-05, + "loss": 0.0002, + "step": 1116 + }, + { + "epoch": 1.4288, + "grad_norm": 0.0005313612637110054, + "learning_rate": 2.42929841717194e-05, + "loss": 0.0002, + "step": 1117 + }, + { + "epoch": 1.43008, + "grad_norm": 0.0009100149036385119, + "learning_rate": 2.4268327943361688e-05, + "loss": 0.0002, + "step": 1118 + }, + { + "epoch": 1.43136, + "grad_norm": 0.0018073428655043244, + "learning_rate": 2.4243667389440222e-05, + "loss": 0.0002, + "step": 1119 + }, + { + "epoch": 1.4326400000000001, + "grad_norm": 0.0005083298892714083, + "learning_rate": 2.4219002556981184e-05, + "loss": 0.0002, + "step": 1120 + }, + { + "epoch": 1.43392, + "grad_norm": 0.0010776943527162075, + "learning_rate": 2.4194333493018912e-05, + "loss": 0.0002, + "step": 1121 + }, + { + "epoch": 1.4352, + "grad_norm": 0.00042680808110162616, + "learning_rate": 2.4169660244595818e-05, + "loss": 0.0002, + "step": 1122 + }, + { + "epoch": 1.43648, + "grad_norm": 0.0024712400045245886, + "learning_rate": 2.414498285876228e-05, + "loss": 0.0002, + "step": 1123 + }, + { + "epoch": 1.43776, + "grad_norm": 0.0015264041721820831, + "learning_rate": 2.4120301382576593e-05, + "loss": 0.0002, + "step": 1124 + }, + { + "epoch": 1.4390399999999999, + "grad_norm": 0.0009439815767109394, + "learning_rate": 2.4095615863104812e-05, + "loss": 0.0002, + "step": 1125 + }, + { + "epoch": 1.44032, + "grad_norm": 0.0013193822233006358, + "learning_rate": 2.4070926347420746e-05, + "loss": 0.0002, + "step": 1126 + }, + { + "epoch": 1.4416, + "grad_norm": 0.000317655794788152, + "learning_rate": 2.404623288260578e-05, + "loss": 0.0002, + "step": 1127 + }, + { + "epoch": 1.44288, + "grad_norm": 0.0012924104230478406, + "learning_rate": 2.402153551574888e-05, + "loss": 0.0002, + "step": 1128 + }, + { + "epoch": 1.44416, + "grad_norm": 0.0003663348325062543, + "learning_rate": 2.399683429394641e-05, + "loss": 0.0002, + "step": 1129 + }, + { + "epoch": 1.44544, + "grad_norm": 0.0015283627435564995, + "learning_rate": 2.3972129264302103e-05, + "loss": 0.0002, + "step": 1130 + }, + { + "epoch": 1.44672, + "grad_norm": 0.0014445302076637745, + "learning_rate": 2.3947420473926943e-05, + "loss": 0.0002, + "step": 1131 + }, + { + "epoch": 1.448, + "grad_norm": 0.0007018298492766917, + "learning_rate": 2.3922707969939115e-05, + "loss": 0.0002, + "step": 1132 + }, + { + "epoch": 1.44928, + "grad_norm": 0.0018520744051784277, + "learning_rate": 2.3897991799463847e-05, + "loss": 0.0002, + "step": 1133 + }, + { + "epoch": 1.45056, + "grad_norm": 0.00043054608977399766, + "learning_rate": 2.387327200963339e-05, + "loss": 0.0002, + "step": 1134 + }, + { + "epoch": 1.45184, + "grad_norm": 0.0013373054098337889, + "learning_rate": 2.384854864758686e-05, + "loss": 0.0002, + "step": 1135 + }, + { + "epoch": 1.45312, + "grad_norm": 0.00040695464122109115, + "learning_rate": 2.3823821760470237e-05, + "loss": 0.0002, + "step": 1136 + }, + { + "epoch": 1.4544000000000001, + "grad_norm": 0.0019905297085642815, + "learning_rate": 2.3799091395436172e-05, + "loss": 0.0002, + "step": 1137 + }, + { + "epoch": 1.45568, + "grad_norm": 0.00207939138635993, + "learning_rate": 2.3774357599643985e-05, + "loss": 0.0002, + "step": 1138 + }, + { + "epoch": 1.45696, + "grad_norm": 0.00039109544013626873, + "learning_rate": 2.374962042025952e-05, + "loss": 0.0002, + "step": 1139 + }, + { + "epoch": 1.45824, + "grad_norm": 0.0014972800854593515, + "learning_rate": 2.3724879904455072e-05, + "loss": 0.0002, + "step": 1140 + }, + { + "epoch": 1.45824, + "eval_loss": 1.1820496320724487, + "eval_runtime": 43.3596, + "eval_samples_per_second": 11.578, + "eval_steps_per_second": 1.453, + "step": 1140 + }, + { + "epoch": 1.45952, + "grad_norm": 0.00017591100186109543, + "learning_rate": 2.370013609940931e-05, + "loss": 0.0002, + "step": 1141 + }, + { + "epoch": 1.4607999999999999, + "grad_norm": 0.002228839322924614, + "learning_rate": 2.3675389052307165e-05, + "loss": 0.0002, + "step": 1142 + }, + { + "epoch": 1.46208, + "grad_norm": 0.0017025255365297198, + "learning_rate": 2.3650638810339755e-05, + "loss": 0.0002, + "step": 1143 + }, + { + "epoch": 1.46336, + "grad_norm": 0.0016512767178937793, + "learning_rate": 2.3625885420704283e-05, + "loss": 0.0002, + "step": 1144 + }, + { + "epoch": 1.46464, + "grad_norm": 0.0030168932862579823, + "learning_rate": 2.360112893060398e-05, + "loss": 0.0002, + "step": 1145 + }, + { + "epoch": 1.4659200000000001, + "grad_norm": 0.00015002665168140084, + "learning_rate": 2.357636938724795e-05, + "loss": 0.0002, + "step": 1146 + }, + { + "epoch": 1.4672, + "grad_norm": 0.0025957797188311815, + "learning_rate": 2.355160683785115e-05, + "loss": 0.0002, + "step": 1147 + }, + { + "epoch": 1.46848, + "grad_norm": 0.0005568786873482168, + "learning_rate": 2.3526841329634258e-05, + "loss": 0.0002, + "step": 1148 + }, + { + "epoch": 1.46976, + "grad_norm": 0.0029370211996138096, + "learning_rate": 2.3502072909823598e-05, + "loss": 0.0002, + "step": 1149 + }, + { + "epoch": 1.47104, + "grad_norm": 0.001947471289895475, + "learning_rate": 2.3477301625651033e-05, + "loss": 0.0002, + "step": 1150 + }, + { + "epoch": 1.47232, + "grad_norm": 0.0019220055546611547, + "learning_rate": 2.3452527524353913e-05, + "loss": 0.0002, + "step": 1151 + }, + { + "epoch": 1.4736, + "grad_norm": 0.0024576217401772738, + "learning_rate": 2.342775065317494e-05, + "loss": 0.0002, + "step": 1152 + }, + { + "epoch": 1.47488, + "grad_norm": 0.0007621421827934682, + "learning_rate": 2.34029710593621e-05, + "loss": 0.0002, + "step": 1153 + }, + { + "epoch": 1.4761600000000001, + "grad_norm": 0.002741249743849039, + "learning_rate": 2.3378188790168576e-05, + "loss": 0.0002, + "step": 1154 + }, + { + "epoch": 1.47744, + "grad_norm": 0.0012215838069096208, + "learning_rate": 2.335340389285266e-05, + "loss": 0.0002, + "step": 1155 + }, + { + "epoch": 1.47872, + "grad_norm": 0.0014048725133761764, + "learning_rate": 2.3328616414677633e-05, + "loss": 0.0002, + "step": 1156 + }, + { + "epoch": 1.48, + "grad_norm": 0.0012951638782396913, + "learning_rate": 2.330382640291173e-05, + "loss": 0.0002, + "step": 1157 + }, + { + "epoch": 1.48128, + "grad_norm": 0.001137186773121357, + "learning_rate": 2.3279033904827983e-05, + "loss": 0.0002, + "step": 1158 + }, + { + "epoch": 1.4825599999999999, + "grad_norm": 0.0011690363753587008, + "learning_rate": 2.3254238967704184e-05, + "loss": 0.0002, + "step": 1159 + }, + { + "epoch": 1.48384, + "grad_norm": 0.0013390486128628254, + "learning_rate": 2.3229441638822783e-05, + "loss": 0.0002, + "step": 1160 + }, + { + "epoch": 1.48512, + "grad_norm": 0.001537360018119216, + "learning_rate": 2.3204641965470773e-05, + "loss": 0.0002, + "step": 1161 + }, + { + "epoch": 1.4864, + "grad_norm": 0.000685350620187819, + "learning_rate": 2.317983999493963e-05, + "loss": 0.0002, + "step": 1162 + }, + { + "epoch": 1.4876800000000001, + "grad_norm": 0.0017830547876656055, + "learning_rate": 2.3155035774525206e-05, + "loss": 0.0002, + "step": 1163 + }, + { + "epoch": 1.48896, + "grad_norm": 0.0004691873327828944, + "learning_rate": 2.3130229351527645e-05, + "loss": 0.0002, + "step": 1164 + }, + { + "epoch": 1.49024, + "grad_norm": 0.001537265139631927, + "learning_rate": 2.310542077325129e-05, + "loss": 0.0002, + "step": 1165 + }, + { + "epoch": 1.49152, + "grad_norm": 0.0013536819024011493, + "learning_rate": 2.308061008700459e-05, + "loss": 0.0002, + "step": 1166 + }, + { + "epoch": 1.4928, + "grad_norm": 0.00046399055281654, + "learning_rate": 2.3055797340100024e-05, + "loss": 0.0002, + "step": 1167 + }, + { + "epoch": 1.49408, + "grad_norm": 0.0013303180458024144, + "learning_rate": 2.3030982579853996e-05, + "loss": 0.0002, + "step": 1168 + }, + { + "epoch": 1.49536, + "grad_norm": 0.000706115213688463, + "learning_rate": 2.3006165853586747e-05, + "loss": 0.0002, + "step": 1169 + }, + { + "epoch": 1.49664, + "grad_norm": 0.00045828090514987707, + "learning_rate": 2.298134720862227e-05, + "loss": 0.0002, + "step": 1170 + }, + { + "epoch": 1.49664, + "eval_loss": 1.1880303621292114, + "eval_runtime": 43.6913, + "eval_samples_per_second": 11.49, + "eval_steps_per_second": 1.442, + "step": 1170 + }, + { + "epoch": 1.49792, + "grad_norm": 0.0005636084824800491, + "learning_rate": 2.2956526692288197e-05, + "loss": 0.0002, + "step": 1171 + }, + { + "epoch": 1.4992, + "grad_norm": 0.0004983227117918432, + "learning_rate": 2.2931704351915776e-05, + "loss": 0.0002, + "step": 1172 + }, + { + "epoch": 1.50048, + "grad_norm": 0.0012799539836123586, + "learning_rate": 2.2906880234839676e-05, + "loss": 0.0002, + "step": 1173 + }, + { + "epoch": 1.50176, + "grad_norm": 0.0003243404207751155, + "learning_rate": 2.2882054388398006e-05, + "loss": 0.0002, + "step": 1174 + }, + { + "epoch": 1.50304, + "grad_norm": 0.0009844460291787982, + "learning_rate": 2.2857226859932134e-05, + "loss": 0.0002, + "step": 1175 + }, + { + "epoch": 1.5043199999999999, + "grad_norm": 0.00018364368588663638, + "learning_rate": 2.2832397696786653e-05, + "loss": 0.0002, + "step": 1176 + }, + { + "epoch": 1.5056, + "grad_norm": 0.001054965308867395, + "learning_rate": 2.2807566946309273e-05, + "loss": 0.0002, + "step": 1177 + }, + { + "epoch": 1.50688, + "grad_norm": 0.00043028456275351346, + "learning_rate": 2.2782734655850727e-05, + "loss": 0.0002, + "step": 1178 + }, + { + "epoch": 1.50816, + "grad_norm": 0.0005619335570372641, + "learning_rate": 2.275790087276468e-05, + "loss": 0.0002, + "step": 1179 + }, + { + "epoch": 1.5094400000000001, + "grad_norm": 0.0001896164903882891, + "learning_rate": 2.273306564440767e-05, + "loss": 0.0002, + "step": 1180 + }, + { + "epoch": 1.51072, + "grad_norm": 0.000446715101134032, + "learning_rate": 2.2708229018138946e-05, + "loss": 0.0002, + "step": 1181 + }, + { + "epoch": 1.512, + "grad_norm": 0.0004963649553246796, + "learning_rate": 2.268339104132046e-05, + "loss": 0.0002, + "step": 1182 + }, + { + "epoch": 1.51328, + "grad_norm": 0.0008193458197638392, + "learning_rate": 2.2658551761316723e-05, + "loss": 0.0002, + "step": 1183 + }, + { + "epoch": 1.51456, + "grad_norm": 0.0004890802665613592, + "learning_rate": 2.263371122549474e-05, + "loss": 0.0002, + "step": 1184 + }, + { + "epoch": 1.5158399999999999, + "grad_norm": 0.0001344592310488224, + "learning_rate": 2.2608869481223898e-05, + "loss": 0.0002, + "step": 1185 + }, + { + "epoch": 1.51712, + "grad_norm": 0.00012116249126847833, + "learning_rate": 2.2584026575875902e-05, + "loss": 0.0002, + "step": 1186 + }, + { + "epoch": 1.5184, + "grad_norm": 0.0006150749395601451, + "learning_rate": 2.2559182556824667e-05, + "loss": 0.0002, + "step": 1187 + }, + { + "epoch": 1.5196800000000001, + "grad_norm": 0.0006445617182180285, + "learning_rate": 2.2534337471446223e-05, + "loss": 0.0002, + "step": 1188 + }, + { + "epoch": 1.52096, + "grad_norm": 0.00024265323008876294, + "learning_rate": 2.2509491367118642e-05, + "loss": 0.0002, + "step": 1189 + }, + { + "epoch": 1.52224, + "grad_norm": 0.0001686473551671952, + "learning_rate": 2.2484644291221933e-05, + "loss": 0.0002, + "step": 1190 + }, + { + "epoch": 1.52352, + "grad_norm": 0.0006335959769785404, + "learning_rate": 2.245979629113797e-05, + "loss": 0.0002, + "step": 1191 + }, + { + "epoch": 1.5248, + "grad_norm": 0.0006540421163663268, + "learning_rate": 2.2434947414250387e-05, + "loss": 0.0002, + "step": 1192 + }, + { + "epoch": 1.5260799999999999, + "grad_norm": 0.00016801041783764958, + "learning_rate": 2.241009770794447e-05, + "loss": 0.0002, + "step": 1193 + }, + { + "epoch": 1.52736, + "grad_norm": 0.0006320360698737204, + "learning_rate": 2.238524721960711e-05, + "loss": 0.0002, + "step": 1194 + }, + { + "epoch": 1.52864, + "grad_norm": 0.0003093292471021414, + "learning_rate": 2.236039599662667e-05, + "loss": 0.0002, + "step": 1195 + }, + { + "epoch": 1.52992, + "grad_norm": 0.0003343690186738968, + "learning_rate": 2.233554408639294e-05, + "loss": 0.0002, + "step": 1196 + }, + { + "epoch": 1.5312000000000001, + "grad_norm": 0.00028043152997270226, + "learning_rate": 2.2310691536296995e-05, + "loss": 0.0002, + "step": 1197 + }, + { + "epoch": 1.53248, + "grad_norm": 0.000370222725905478, + "learning_rate": 2.2285838393731146e-05, + "loss": 0.0002, + "step": 1198 + }, + { + "epoch": 1.53376, + "grad_norm": 0.00022848798835184425, + "learning_rate": 2.226098470608882e-05, + "loss": 0.0002, + "step": 1199 + }, + { + "epoch": 1.53504, + "grad_norm": 0.0006417376571334898, + "learning_rate": 2.2236130520764493e-05, + "loss": 0.0002, + "step": 1200 + }, + { + "epoch": 1.53504, + "eval_loss": 1.18635094165802, + "eval_runtime": 43.537, + "eval_samples_per_second": 11.53, + "eval_steps_per_second": 1.447, + "step": 1200 + }, + { + "epoch": 1.53632, + "grad_norm": 0.0005315435701049864, + "learning_rate": 2.2211275885153594e-05, + "loss": 0.0002, + "step": 1201 + }, + { + "epoch": 1.5375999999999999, + "grad_norm": 0.0007035636226646602, + "learning_rate": 2.21864208466524e-05, + "loss": 0.0002, + "step": 1202 + }, + { + "epoch": 1.53888, + "grad_norm": 0.0015890513313934207, + "learning_rate": 2.2161565452657964e-05, + "loss": 0.0002, + "step": 1203 + }, + { + "epoch": 1.54016, + "grad_norm": 0.0005806463886983693, + "learning_rate": 2.2136709750568023e-05, + "loss": 0.0002, + "step": 1204 + }, + { + "epoch": 1.5414400000000001, + "grad_norm": 0.0012760835234075785, + "learning_rate": 2.2111853787780864e-05, + "loss": 0.0002, + "step": 1205 + }, + { + "epoch": 1.54272, + "grad_norm": 0.0006499637383967638, + "learning_rate": 2.208699761169533e-05, + "loss": 0.0002, + "step": 1206 + }, + { + "epoch": 1.544, + "grad_norm": 0.0016380108427256346, + "learning_rate": 2.2062141269710627e-05, + "loss": 0.0002, + "step": 1207 + }, + { + "epoch": 1.54528, + "grad_norm": 0.001415404723957181, + "learning_rate": 2.2037284809226295e-05, + "loss": 0.0002, + "step": 1208 + }, + { + "epoch": 1.54656, + "grad_norm": 0.0009337143274024129, + "learning_rate": 2.2012428277642085e-05, + "loss": 0.0002, + "step": 1209 + }, + { + "epoch": 1.5478399999999999, + "grad_norm": 0.0014679753221571445, + "learning_rate": 2.198757172235792e-05, + "loss": 0.0002, + "step": 1210 + }, + { + "epoch": 1.54912, + "grad_norm": 0.00028598069911822677, + "learning_rate": 2.196271519077371e-05, + "loss": 0.0002, + "step": 1211 + }, + { + "epoch": 1.5504, + "grad_norm": 0.0016987015260383487, + "learning_rate": 2.193785873028938e-05, + "loss": 0.0002, + "step": 1212 + }, + { + "epoch": 1.55168, + "grad_norm": 0.0011778499465435743, + "learning_rate": 2.191300238830468e-05, + "loss": 0.0002, + "step": 1213 + }, + { + "epoch": 1.5529600000000001, + "grad_norm": 0.0002532570797484368, + "learning_rate": 2.1888146212219142e-05, + "loss": 0.0002, + "step": 1214 + }, + { + "epoch": 1.55424, + "grad_norm": 0.0004746295453514904, + "learning_rate": 2.1863290249431993e-05, + "loss": 0.0002, + "step": 1215 + }, + { + "epoch": 1.55552, + "grad_norm": 0.00015324894047807902, + "learning_rate": 2.183843454734204e-05, + "loss": 0.0002, + "step": 1216 + }, + { + "epoch": 1.5568, + "grad_norm": 0.00023059602244757116, + "learning_rate": 2.1813579153347604e-05, + "loss": 0.0002, + "step": 1217 + }, + { + "epoch": 1.55808, + "grad_norm": 0.00017352044233120978, + "learning_rate": 2.1788724114846408e-05, + "loss": 0.0002, + "step": 1218 + }, + { + "epoch": 1.5593599999999999, + "grad_norm": 0.0008680567843839526, + "learning_rate": 2.1763869479235512e-05, + "loss": 0.0002, + "step": 1219 + }, + { + "epoch": 1.56064, + "grad_norm": 0.0011944602010771632, + "learning_rate": 2.173901529391119e-05, + "loss": 0.0002, + "step": 1220 + }, + { + "epoch": 1.56192, + "grad_norm": 0.0003440252039581537, + "learning_rate": 2.1714161606268863e-05, + "loss": 0.0002, + "step": 1221 + }, + { + "epoch": 1.5632000000000001, + "grad_norm": 0.001474754768423736, + "learning_rate": 2.168930846370301e-05, + "loss": 0.0002, + "step": 1222 + }, + { + "epoch": 1.56448, + "grad_norm": 0.0006110747344791889, + "learning_rate": 2.1664455913607063e-05, + "loss": 0.0002, + "step": 1223 + }, + { + "epoch": 1.56576, + "grad_norm": 0.00047909640124998987, + "learning_rate": 2.1639604003373332e-05, + "loss": 0.0002, + "step": 1224 + }, + { + "epoch": 1.56704, + "grad_norm": 0.0001748676149873063, + "learning_rate": 2.16147527803929e-05, + "loss": 0.0002, + "step": 1225 + }, + { + "epoch": 1.56832, + "grad_norm": 0.000838764535728842, + "learning_rate": 2.1589902292055534e-05, + "loss": 0.0002, + "step": 1226 + }, + { + "epoch": 1.5695999999999999, + "grad_norm": 0.0002627510402817279, + "learning_rate": 2.1565052585749626e-05, + "loss": 0.0002, + "step": 1227 + }, + { + "epoch": 1.57088, + "grad_norm": 0.0006864861934445798, + "learning_rate": 2.154020370886203e-05, + "loss": 0.0002, + "step": 1228 + }, + { + "epoch": 1.57216, + "grad_norm": 0.00024974255939014256, + "learning_rate": 2.1515355708778072e-05, + "loss": 0.0002, + "step": 1229 + }, + { + "epoch": 1.57344, + "grad_norm": 0.0008880963432602584, + "learning_rate": 2.149050863288137e-05, + "loss": 0.0002, + "step": 1230 + }, + { + "epoch": 1.57344, + "eval_loss": 1.184080719947815, + "eval_runtime": 43.5512, + "eval_samples_per_second": 11.527, + "eval_steps_per_second": 1.447, + "step": 1230 + }, + { + "epoch": 1.5747200000000001, + "grad_norm": 0.0002939118421636522, + "learning_rate": 2.1465662528553787e-05, + "loss": 0.0002, + "step": 1231 + }, + { + "epoch": 1.576, + "grad_norm": 0.001100562745705247, + "learning_rate": 2.1440817443175342e-05, + "loss": 0.0002, + "step": 1232 + }, + { + "epoch": 1.57728, + "grad_norm": 0.0009249002323485911, + "learning_rate": 2.14159734241241e-05, + "loss": 0.0002, + "step": 1233 + }, + { + "epoch": 1.57856, + "grad_norm": 0.0005277720047160983, + "learning_rate": 2.1391130518776104e-05, + "loss": 0.0002, + "step": 1234 + }, + { + "epoch": 1.57984, + "grad_norm": 0.0010325635084882379, + "learning_rate": 2.1366288774505263e-05, + "loss": 0.0002, + "step": 1235 + }, + { + "epoch": 1.5811199999999999, + "grad_norm": 0.00023603586305398494, + "learning_rate": 2.134144823868328e-05, + "loss": 0.0002, + "step": 1236 + }, + { + "epoch": 1.5824, + "grad_norm": 0.0016086595132946968, + "learning_rate": 2.1316608958679547e-05, + "loss": 0.0002, + "step": 1237 + }, + { + "epoch": 1.58368, + "grad_norm": 0.0009198304614983499, + "learning_rate": 2.129177098186106e-05, + "loss": 0.0002, + "step": 1238 + }, + { + "epoch": 1.5849600000000001, + "grad_norm": 0.0010304874740540981, + "learning_rate": 2.1266934355592337e-05, + "loss": 0.0002, + "step": 1239 + }, + { + "epoch": 1.58624, + "grad_norm": 0.0009441455476917326, + "learning_rate": 2.124209912723532e-05, + "loss": 0.0002, + "step": 1240 + }, + { + "epoch": 1.58752, + "grad_norm": 0.0005426053539849818, + "learning_rate": 2.121726534414928e-05, + "loss": 0.0002, + "step": 1241 + }, + { + "epoch": 1.5888, + "grad_norm": 0.00025539376656524837, + "learning_rate": 2.1192433053690743e-05, + "loss": 0.0002, + "step": 1242 + }, + { + "epoch": 1.59008, + "grad_norm": 0.0010780163574963808, + "learning_rate": 2.1167602303213352e-05, + "loss": 0.0002, + "step": 1243 + }, + { + "epoch": 1.5913599999999999, + "grad_norm": 0.00113013107329607, + "learning_rate": 2.114277314006788e-05, + "loss": 0.0002, + "step": 1244 + }, + { + "epoch": 1.5926399999999998, + "grad_norm": 0.00017947869491763413, + "learning_rate": 2.1117945611602e-05, + "loss": 0.0002, + "step": 1245 + }, + { + "epoch": 1.59392, + "grad_norm": 0.001032519736327231, + "learning_rate": 2.109311976516033e-05, + "loss": 0.0002, + "step": 1246 + }, + { + "epoch": 1.5952, + "grad_norm": 0.0004488160484470427, + "learning_rate": 2.1068295648084233e-05, + "loss": 0.0002, + "step": 1247 + }, + { + "epoch": 1.5964800000000001, + "grad_norm": 0.0001314591063419357, + "learning_rate": 2.1043473307711805e-05, + "loss": 0.0002, + "step": 1248 + }, + { + "epoch": 1.59776, + "grad_norm": 0.0003850892826449126, + "learning_rate": 2.1018652791377744e-05, + "loss": 0.0002, + "step": 1249 + }, + { + "epoch": 1.59904, + "grad_norm": 0.00025546064716763794, + "learning_rate": 2.099383414641326e-05, + "loss": 0.0002, + "step": 1250 + }, + { + "epoch": 1.60032, + "grad_norm": 0.0003797194513026625, + "learning_rate": 2.096901742014601e-05, + "loss": 0.0002, + "step": 1251 + }, + { + "epoch": 1.6016, + "grad_norm": 0.00018792178889270872, + "learning_rate": 2.0944202659899978e-05, + "loss": 0.0002, + "step": 1252 + }, + { + "epoch": 1.6028799999999999, + "grad_norm": 0.0005130458157509565, + "learning_rate": 2.0919389912995416e-05, + "loss": 0.0002, + "step": 1253 + }, + { + "epoch": 1.60416, + "grad_norm": 0.00027721276273950934, + "learning_rate": 2.0894579226748722e-05, + "loss": 0.0002, + "step": 1254 + }, + { + "epoch": 1.60544, + "grad_norm": 0.0006829426274634898, + "learning_rate": 2.0869770648472364e-05, + "loss": 0.0002, + "step": 1255 + }, + { + "epoch": 1.6067200000000001, + "grad_norm": 0.0011762769427150488, + "learning_rate": 2.08449642254748e-05, + "loss": 0.0002, + "step": 1256 + }, + { + "epoch": 1.608, + "grad_norm": 0.0001786458888091147, + "learning_rate": 2.0820160005060376e-05, + "loss": 0.0002, + "step": 1257 + }, + { + "epoch": 1.60928, + "grad_norm": 0.001003823708742857, + "learning_rate": 2.0795358034529233e-05, + "loss": 0.0002, + "step": 1258 + }, + { + "epoch": 1.61056, + "grad_norm": 0.0005043644341640174, + "learning_rate": 2.077055836117723e-05, + "loss": 0.0002, + "step": 1259 + }, + { + "epoch": 1.61184, + "grad_norm": 0.0005947434692643583, + "learning_rate": 2.074576103229582e-05, + "loss": 0.0002, + "step": 1260 + }, + { + "epoch": 1.61184, + "eval_loss": 1.1842581033706665, + "eval_runtime": 43.4749, + "eval_samples_per_second": 11.547, + "eval_steps_per_second": 1.449, + "step": 1260 + }, + { + "epoch": 1.6131199999999999, + "grad_norm": 0.0007518928032368422, + "learning_rate": 2.0720966095172033e-05, + "loss": 0.0002, + "step": 1261 + }, + { + "epoch": 1.6143999999999998, + "grad_norm": 0.0005646328208968043, + "learning_rate": 2.0696173597088283e-05, + "loss": 0.0002, + "step": 1262 + }, + { + "epoch": 1.61568, + "grad_norm": 0.0002734081353992224, + "learning_rate": 2.0671383585322372e-05, + "loss": 0.0002, + "step": 1263 + }, + { + "epoch": 1.61696, + "grad_norm": 0.0007313131354749203, + "learning_rate": 2.064659610714735e-05, + "loss": 0.0002, + "step": 1264 + }, + { + "epoch": 1.6182400000000001, + "grad_norm": 0.00136158661916852, + "learning_rate": 2.062181120983143e-05, + "loss": 0.0002, + "step": 1265 + }, + { + "epoch": 1.61952, + "grad_norm": 0.00017255271086469293, + "learning_rate": 2.0597028940637907e-05, + "loss": 0.0002, + "step": 1266 + }, + { + "epoch": 1.6208, + "grad_norm": 0.00156474020332098, + "learning_rate": 2.0572249346825067e-05, + "loss": 0.0002, + "step": 1267 + }, + { + "epoch": 1.62208, + "grad_norm": 0.0006997156888246536, + "learning_rate": 2.0547472475646093e-05, + "loss": 0.0002, + "step": 1268 + }, + { + "epoch": 1.62336, + "grad_norm": 0.0015759170055389404, + "learning_rate": 2.052269837434897e-05, + "loss": 0.0002, + "step": 1269 + }, + { + "epoch": 1.6246399999999999, + "grad_norm": 0.0018613300053402781, + "learning_rate": 2.0497927090176408e-05, + "loss": 0.0002, + "step": 1270 + }, + { + "epoch": 1.62592, + "grad_norm": 0.0004975548945367336, + "learning_rate": 2.0473158670365745e-05, + "loss": 0.0002, + "step": 1271 + }, + { + "epoch": 1.6272, + "grad_norm": 0.0012139310128986835, + "learning_rate": 2.0448393162148852e-05, + "loss": 0.0002, + "step": 1272 + }, + { + "epoch": 1.6284800000000001, + "grad_norm": 0.0007971475715748966, + "learning_rate": 2.0423630612752054e-05, + "loss": 0.0002, + "step": 1273 + }, + { + "epoch": 1.62976, + "grad_norm": 0.0012991585535928607, + "learning_rate": 2.0398871069396023e-05, + "loss": 0.0002, + "step": 1274 + }, + { + "epoch": 1.63104, + "grad_norm": 0.0008941097185015678, + "learning_rate": 2.0374114579295723e-05, + "loss": 0.0002, + "step": 1275 + }, + { + "epoch": 1.63232, + "grad_norm": 0.0021978786680847406, + "learning_rate": 2.0349361189660247e-05, + "loss": 0.0002, + "step": 1276 + }, + { + "epoch": 1.6336, + "grad_norm": 0.0004447043174877763, + "learning_rate": 2.032461094769284e-05, + "loss": 0.0002, + "step": 1277 + }, + { + "epoch": 1.6348799999999999, + "grad_norm": 0.0012047910131514072, + "learning_rate": 2.0299863900590702e-05, + "loss": 0.0002, + "step": 1278 + }, + { + "epoch": 1.6361599999999998, + "grad_norm": 0.00027334975311532617, + "learning_rate": 2.027512009554493e-05, + "loss": 0.0002, + "step": 1279 + }, + { + "epoch": 1.63744, + "grad_norm": 0.0020765531808137894, + "learning_rate": 2.025037957974049e-05, + "loss": 0.0002, + "step": 1280 + }, + { + "epoch": 1.63872, + "grad_norm": 0.0007688076002523303, + "learning_rate": 2.0225642400356018e-05, + "loss": 0.0002, + "step": 1281 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.0015095536364242435, + "learning_rate": 2.0200908604563834e-05, + "loss": 0.0002, + "step": 1282 + }, + { + "epoch": 1.64128, + "grad_norm": 0.000709025131072849, + "learning_rate": 2.0176178239529775e-05, + "loss": 0.0002, + "step": 1283 + }, + { + "epoch": 1.64256, + "grad_norm": 0.001618406968191266, + "learning_rate": 2.0151451352413144e-05, + "loss": 0.0002, + "step": 1284 + }, + { + "epoch": 1.64384, + "grad_norm": 0.0016390075907111168, + "learning_rate": 2.0126727990366625e-05, + "loss": 0.0002, + "step": 1285 + }, + { + "epoch": 1.64512, + "grad_norm": 0.000761967443395406, + "learning_rate": 2.0102008200536155e-05, + "loss": 0.0002, + "step": 1286 + }, + { + "epoch": 1.6463999999999999, + "grad_norm": 0.0018765556160360575, + "learning_rate": 2.0077292030060894e-05, + "loss": 0.0002, + "step": 1287 + }, + { + "epoch": 1.64768, + "grad_norm": 0.00017738196766003966, + "learning_rate": 2.005257952607306e-05, + "loss": 0.0002, + "step": 1288 + }, + { + "epoch": 1.64896, + "grad_norm": 0.001326532568782568, + "learning_rate": 2.0027870735697906e-05, + "loss": 0.0002, + "step": 1289 + }, + { + "epoch": 1.6502400000000002, + "grad_norm": 0.0003637004701886326, + "learning_rate": 2.0003165706053603e-05, + "loss": 0.0002, + "step": 1290 + }, + { + "epoch": 1.6502400000000002, + "eval_loss": 1.1752021312713623, + "eval_runtime": 43.1151, + "eval_samples_per_second": 11.643, + "eval_steps_per_second": 1.461, + "step": 1290 + }, + { + "epoch": 1.65152, + "grad_norm": 0.0008304680814035237, + "learning_rate": 1.9978464484251125e-05, + "loss": 0.0002, + "step": 1291 + }, + { + "epoch": 1.6528, + "grad_norm": 0.00021488082711584866, + "learning_rate": 1.9953767117394225e-05, + "loss": 0.0002, + "step": 1292 + }, + { + "epoch": 1.65408, + "grad_norm": 0.0006303602713160217, + "learning_rate": 1.992907365257926e-05, + "loss": 0.0002, + "step": 1293 + }, + { + "epoch": 1.65536, + "grad_norm": 0.00033922336297109723, + "learning_rate": 1.9904384136895193e-05, + "loss": 0.0002, + "step": 1294 + }, + { + "epoch": 1.65664, + "grad_norm": 0.00041787928785197437, + "learning_rate": 1.9879698617423416e-05, + "loss": 0.0002, + "step": 1295 + }, + { + "epoch": 1.6579199999999998, + "grad_norm": 0.0005025964928790927, + "learning_rate": 1.985501714123772e-05, + "loss": 0.0002, + "step": 1296 + }, + { + "epoch": 1.6592, + "grad_norm": 0.00022155040642246604, + "learning_rate": 1.983033975540419e-05, + "loss": 0.0002, + "step": 1297 + }, + { + "epoch": 1.66048, + "grad_norm": 0.0002778613707050681, + "learning_rate": 1.980566650698109e-05, + "loss": 0.0002, + "step": 1298 + }, + { + "epoch": 1.6617600000000001, + "grad_norm": 0.0003134211292490363, + "learning_rate": 1.9780997443018822e-05, + "loss": 0.0002, + "step": 1299 + }, + { + "epoch": 1.66304, + "grad_norm": 0.0006997901946306229, + "learning_rate": 1.9756332610559787e-05, + "loss": 0.0002, + "step": 1300 + }, + { + "epoch": 1.66432, + "grad_norm": 0.0010301823494955897, + "learning_rate": 1.973167205663832e-05, + "loss": 0.0002, + "step": 1301 + }, + { + "epoch": 1.6656, + "grad_norm": 0.00035824833321385086, + "learning_rate": 1.9707015828280607e-05, + "loss": 0.0002, + "step": 1302 + }, + { + "epoch": 1.66688, + "grad_norm": 0.0009457553969696164, + "learning_rate": 1.9682363972504577e-05, + "loss": 0.0002, + "step": 1303 + }, + { + "epoch": 1.6681599999999999, + "grad_norm": 0.0007050599670037627, + "learning_rate": 1.9657716536319826e-05, + "loss": 0.0002, + "step": 1304 + }, + { + "epoch": 1.66944, + "grad_norm": 0.00086369359632954, + "learning_rate": 1.9633073566727515e-05, + "loss": 0.0002, + "step": 1305 + }, + { + "epoch": 1.67072, + "grad_norm": 0.0013965156394988298, + "learning_rate": 1.9608435110720294e-05, + "loss": 0.0002, + "step": 1306 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.0014690575189888477, + "learning_rate": 1.958380121528221e-05, + "loss": 0.0002, + "step": 1307 + }, + { + "epoch": 1.67328, + "grad_norm": 0.0012357474770396948, + "learning_rate": 1.9559171927388586e-05, + "loss": 0.0002, + "step": 1308 + }, + { + "epoch": 1.67456, + "grad_norm": 0.0019563697278499603, + "learning_rate": 1.9534547294006006e-05, + "loss": 0.0002, + "step": 1309 + }, + { + "epoch": 1.67584, + "grad_norm": 0.00038007416878826916, + "learning_rate": 1.9509927362092133e-05, + "loss": 0.0002, + "step": 1310 + }, + { + "epoch": 1.67712, + "grad_norm": 0.0010130003793165088, + "learning_rate": 1.94853121785957e-05, + "loss": 0.0002, + "step": 1311 + }, + { + "epoch": 1.6784, + "grad_norm": 0.0007123137474991381, + "learning_rate": 1.9460701790456348e-05, + "loss": 0.0002, + "step": 1312 + }, + { + "epoch": 1.6796799999999998, + "grad_norm": 0.0009491070522926748, + "learning_rate": 1.9436096244604607e-05, + "loss": 0.0002, + "step": 1313 + }, + { + "epoch": 1.68096, + "grad_norm": 0.0008381984662264585, + "learning_rate": 1.941149558796176e-05, + "loss": 0.0002, + "step": 1314 + }, + { + "epoch": 1.68224, + "grad_norm": 0.001452930853702128, + "learning_rate": 1.9386899867439756e-05, + "loss": 0.0002, + "step": 1315 + }, + { + "epoch": 1.6835200000000001, + "grad_norm": 0.0004416204756125808, + "learning_rate": 1.9362309129941157e-05, + "loss": 0.0002, + "step": 1316 + }, + { + "epoch": 1.6848, + "grad_norm": 0.0013925581006333232, + "learning_rate": 1.9337723422358992e-05, + "loss": 0.0002, + "step": 1317 + }, + { + "epoch": 1.68608, + "grad_norm": 0.00019246626470703632, + "learning_rate": 1.9313142791576733e-05, + "loss": 0.0002, + "step": 1318 + }, + { + "epoch": 1.68736, + "grad_norm": 0.0008070362964645028, + "learning_rate": 1.9288567284468128e-05, + "loss": 0.0002, + "step": 1319 + }, + { + "epoch": 1.68864, + "grad_norm": 0.0005856645875610411, + "learning_rate": 1.9263996947897193e-05, + "loss": 0.0002, + "step": 1320 + }, + { + "epoch": 1.68864, + "eval_loss": 1.1746774911880493, + "eval_runtime": 43.1038, + "eval_samples_per_second": 11.646, + "eval_steps_per_second": 1.462, + "step": 1320 + }, + { + "epoch": 1.6899199999999999, + "grad_norm": 0.0009006186737678945, + "learning_rate": 1.9239431828718068e-05, + "loss": 0.0002, + "step": 1321 + }, + { + "epoch": 1.6912, + "grad_norm": 0.0007656152010895312, + "learning_rate": 1.9214871973774936e-05, + "loss": 0.0002, + "step": 1322 + }, + { + "epoch": 1.69248, + "grad_norm": 0.0007279204437509179, + "learning_rate": 1.9190317429901965e-05, + "loss": 0.0002, + "step": 1323 + }, + { + "epoch": 1.6937600000000002, + "grad_norm": 0.0014004282420501113, + "learning_rate": 1.9165768243923172e-05, + "loss": 0.0002, + "step": 1324 + }, + { + "epoch": 1.69504, + "grad_norm": 0.0016127376584336162, + "learning_rate": 1.9141224462652355e-05, + "loss": 0.0002, + "step": 1325 + }, + { + "epoch": 1.69632, + "grad_norm": 0.0012736310018226504, + "learning_rate": 1.9116686132893037e-05, + "loss": 0.0002, + "step": 1326 + }, + { + "epoch": 1.6976, + "grad_norm": 0.002573431469500065, + "learning_rate": 1.9092153301438295e-05, + "loss": 0.0002, + "step": 1327 + }, + { + "epoch": 1.69888, + "grad_norm": 0.000511201040353626, + "learning_rate": 1.9067626015070784e-05, + "loss": 0.0002, + "step": 1328 + }, + { + "epoch": 1.70016, + "grad_norm": 0.002701991470530629, + "learning_rate": 1.9043104320562534e-05, + "loss": 0.0002, + "step": 1329 + }, + { + "epoch": 1.7014399999999998, + "grad_norm": 0.00016028250684030354, + "learning_rate": 1.9018588264674918e-05, + "loss": 0.0002, + "step": 1330 + }, + { + "epoch": 1.70272, + "grad_norm": 0.002182245021685958, + "learning_rate": 1.8994077894158583e-05, + "loss": 0.0002, + "step": 1331 + }, + { + "epoch": 1.704, + "grad_norm": 0.0001369698002235964, + "learning_rate": 1.896957325575331e-05, + "loss": 0.0002, + "step": 1332 + }, + { + "epoch": 1.7052800000000001, + "grad_norm": 0.0019207252189517021, + "learning_rate": 1.894507439618796e-05, + "loss": 0.0002, + "step": 1333 + }, + { + "epoch": 1.70656, + "grad_norm": 0.0006946883513592184, + "learning_rate": 1.8920581362180376e-05, + "loss": 0.0002, + "step": 1334 + }, + { + "epoch": 1.70784, + "grad_norm": 0.0023960743565112352, + "learning_rate": 1.8896094200437286e-05, + "loss": 0.0002, + "step": 1335 + }, + { + "epoch": 1.70912, + "grad_norm": 0.0003100070753134787, + "learning_rate": 1.8871612957654214e-05, + "loss": 0.0002, + "step": 1336 + }, + { + "epoch": 1.7104, + "grad_norm": 0.0013245106674730778, + "learning_rate": 1.8847137680515407e-05, + "loss": 0.0002, + "step": 1337 + }, + { + "epoch": 1.7116799999999999, + "grad_norm": 0.00015704419638495892, + "learning_rate": 1.882266841569375e-05, + "loss": 0.0002, + "step": 1338 + }, + { + "epoch": 1.71296, + "grad_norm": 0.0016549899009987712, + "learning_rate": 1.879820520985062e-05, + "loss": 0.0002, + "step": 1339 + }, + { + "epoch": 1.71424, + "grad_norm": 0.000723134318832308, + "learning_rate": 1.8773748109635887e-05, + "loss": 0.0002, + "step": 1340 + }, + { + "epoch": 1.7155200000000002, + "grad_norm": 0.0005959108239039779, + "learning_rate": 1.874929716168775e-05, + "loss": 0.0002, + "step": 1341 + }, + { + "epoch": 1.7168, + "grad_norm": 0.00012082476314390078, + "learning_rate": 1.872485241263267e-05, + "loss": 0.0002, + "step": 1342 + }, + { + "epoch": 1.71808, + "grad_norm": 0.0008101105922833085, + "learning_rate": 1.8700413909085324e-05, + "loss": 0.0002, + "step": 1343 + }, + { + "epoch": 1.71936, + "grad_norm": 0.0002989626955240965, + "learning_rate": 1.8675981697648435e-05, + "loss": 0.0002, + "step": 1344 + }, + { + "epoch": 1.72064, + "grad_norm": 0.00021426856983453035, + "learning_rate": 1.865155582491277e-05, + "loss": 0.0002, + "step": 1345 + }, + { + "epoch": 1.72192, + "grad_norm": 0.00018660198838915676, + "learning_rate": 1.8627136337456956e-05, + "loss": 0.0002, + "step": 1346 + }, + { + "epoch": 1.7231999999999998, + "grad_norm": 0.0006224081735126674, + "learning_rate": 1.8602723281847503e-05, + "loss": 0.0002, + "step": 1347 + }, + { + "epoch": 1.72448, + "grad_norm": 0.0008301949710585177, + "learning_rate": 1.857831670463861e-05, + "loss": 0.0002, + "step": 1348 + }, + { + "epoch": 1.72576, + "grad_norm": 0.00014336922322399914, + "learning_rate": 1.8553916652372144e-05, + "loss": 0.0002, + "step": 1349 + }, + { + "epoch": 1.7270400000000001, + "grad_norm": 0.0011552822543308139, + "learning_rate": 1.8529523171577524e-05, + "loss": 0.0002, + "step": 1350 + }, + { + "epoch": 1.7270400000000001, + "eval_loss": 1.1862709522247314, + "eval_runtime": 43.6506, + "eval_samples_per_second": 11.5, + "eval_steps_per_second": 1.443, + "step": 1350 + }, + { + "epoch": 1.72832, + "grad_norm": 0.0010331370867788792, + "learning_rate": 1.8505136308771633e-05, + "loss": 0.0002, + "step": 1351 + }, + { + "epoch": 1.7296, + "grad_norm": 0.00012964263441972435, + "learning_rate": 1.8480756110458745e-05, + "loss": 0.0002, + "step": 1352 + }, + { + "epoch": 1.73088, + "grad_norm": 0.0005008718580938876, + "learning_rate": 1.8456382623130422e-05, + "loss": 0.0002, + "step": 1353 + }, + { + "epoch": 1.73216, + "grad_norm": 0.0001546658604638651, + "learning_rate": 1.843201589326541e-05, + "loss": 0.0002, + "step": 1354 + }, + { + "epoch": 1.7334399999999999, + "grad_norm": 0.0003648756246548146, + "learning_rate": 1.8407655967329603e-05, + "loss": 0.0002, + "step": 1355 + }, + { + "epoch": 1.73472, + "grad_norm": 0.0002198543370468542, + "learning_rate": 1.8383302891775887e-05, + "loss": 0.0002, + "step": 1356 + }, + { + "epoch": 1.736, + "grad_norm": 0.00030617695301771164, + "learning_rate": 1.835895671304411e-05, + "loss": 0.0002, + "step": 1357 + }, + { + "epoch": 1.7372800000000002, + "grad_norm": 0.0005935181980021298, + "learning_rate": 1.8334617477560956e-05, + "loss": 0.0002, + "step": 1358 + }, + { + "epoch": 1.73856, + "grad_norm": 0.00011263234773650765, + "learning_rate": 1.8310285231739855e-05, + "loss": 0.0002, + "step": 1359 + }, + { + "epoch": 1.73984, + "grad_norm": 0.0007660306873731315, + "learning_rate": 1.828596002198095e-05, + "loss": 0.0002, + "step": 1360 + }, + { + "epoch": 1.74112, + "grad_norm": 0.0001412270503351465, + "learning_rate": 1.826164189467091e-05, + "loss": 0.0002, + "step": 1361 + }, + { + "epoch": 1.7424, + "grad_norm": 0.0006607779650948942, + "learning_rate": 1.823733089618296e-05, + "loss": 0.0002, + "step": 1362 + }, + { + "epoch": 1.74368, + "grad_norm": 0.0003936325665563345, + "learning_rate": 1.8213027072876665e-05, + "loss": 0.0002, + "step": 1363 + }, + { + "epoch": 1.7449599999999998, + "grad_norm": 0.0006705633131787181, + "learning_rate": 1.8188730471097976e-05, + "loss": 0.0002, + "step": 1364 + }, + { + "epoch": 1.74624, + "grad_norm": 0.0003361855342518538, + "learning_rate": 1.8164441137179016e-05, + "loss": 0.0002, + "step": 1365 + }, + { + "epoch": 1.74752, + "grad_norm": 0.0005639787414111197, + "learning_rate": 1.8140159117438073e-05, + "loss": 0.0002, + "step": 1366 + }, + { + "epoch": 1.7488000000000001, + "grad_norm": 0.00037033509579487145, + "learning_rate": 1.8115884458179495e-05, + "loss": 0.0002, + "step": 1367 + }, + { + "epoch": 1.75008, + "grad_norm": 0.0008438365184701979, + "learning_rate": 1.8091617205693574e-05, + "loss": 0.0002, + "step": 1368 + }, + { + "epoch": 1.75136, + "grad_norm": 0.000185017182957381, + "learning_rate": 1.806735740625649e-05, + "loss": 0.0002, + "step": 1369 + }, + { + "epoch": 1.75264, + "grad_norm": 0.0009372641798108816, + "learning_rate": 1.8043105106130213e-05, + "loss": 0.0002, + "step": 1370 + }, + { + "epoch": 1.75392, + "grad_norm": 0.0003428539785090834, + "learning_rate": 1.8018860351562406e-05, + "loss": 0.0002, + "step": 1371 + }, + { + "epoch": 1.7551999999999999, + "grad_norm": 0.001404768554493785, + "learning_rate": 1.7994623188786346e-05, + "loss": 0.0002, + "step": 1372 + }, + { + "epoch": 1.75648, + "grad_norm": 0.0002842540852725506, + "learning_rate": 1.7970393664020824e-05, + "loss": 0.0002, + "step": 1373 + }, + { + "epoch": 1.75776, + "grad_norm": 0.0010203004349023104, + "learning_rate": 1.7946171823470085e-05, + "loss": 0.0002, + "step": 1374 + }, + { + "epoch": 1.7590400000000002, + "grad_norm": 0.0005073043284937739, + "learning_rate": 1.7921957713323695e-05, + "loss": 0.0002, + "step": 1375 + }, + { + "epoch": 1.76032, + "grad_norm": 0.00044870551209896803, + "learning_rate": 1.7897751379756515e-05, + "loss": 0.0002, + "step": 1376 + }, + { + "epoch": 1.7616, + "grad_norm": 0.0001787729124771431, + "learning_rate": 1.787355286892854e-05, + "loss": 0.0002, + "step": 1377 + }, + { + "epoch": 1.76288, + "grad_norm": 0.0007749973447062075, + "learning_rate": 1.7849362226984852e-05, + "loss": 0.0002, + "step": 1378 + }, + { + "epoch": 1.76416, + "grad_norm": 0.0006996009033173323, + "learning_rate": 1.7825179500055556e-05, + "loss": 0.0002, + "step": 1379 + }, + { + "epoch": 1.76544, + "grad_norm": 0.0005011953180655837, + "learning_rate": 1.7801004734255632e-05, + "loss": 0.0002, + "step": 1380 + }, + { + "epoch": 1.76544, + "eval_loss": 1.1896673440933228, + "eval_runtime": 43.8766, + "eval_samples_per_second": 11.441, + "eval_steps_per_second": 1.436, + "step": 1380 + }, + { + "epoch": 1.7667199999999998, + "grad_norm": 0.0009371109772473574, + "learning_rate": 1.7776837975684902e-05, + "loss": 0.0002, + "step": 1381 + }, + { + "epoch": 1.768, + "grad_norm": 0.000150485968333669, + "learning_rate": 1.7752679270427902e-05, + "loss": 0.0002, + "step": 1382 + }, + { + "epoch": 1.76928, + "grad_norm": 0.0002970367786474526, + "learning_rate": 1.7728528664553812e-05, + "loss": 0.0002, + "step": 1383 + }, + { + "epoch": 1.7705600000000001, + "grad_norm": 0.000249633303610608, + "learning_rate": 1.7704386204116385e-05, + "loss": 0.0002, + "step": 1384 + }, + { + "epoch": 1.77184, + "grad_norm": 0.000201058792299591, + "learning_rate": 1.7680251935153813e-05, + "loss": 0.0002, + "step": 1385 + }, + { + "epoch": 1.77312, + "grad_norm": 0.00021982115868013352, + "learning_rate": 1.7656125903688698e-05, + "loss": 0.0002, + "step": 1386 + }, + { + "epoch": 1.7744, + "grad_norm": 0.00044147908920422196, + "learning_rate": 1.763200815572791e-05, + "loss": 0.0002, + "step": 1387 + }, + { + "epoch": 1.77568, + "grad_norm": 0.00031368451891466975, + "learning_rate": 1.7607898737262528e-05, + "loss": 0.0002, + "step": 1388 + }, + { + "epoch": 1.7769599999999999, + "grad_norm": 0.00021405424922704697, + "learning_rate": 1.7583797694267754e-05, + "loss": 0.0002, + "step": 1389 + }, + { + "epoch": 1.77824, + "grad_norm": 0.00014162302250042558, + "learning_rate": 1.7559705072702814e-05, + "loss": 0.0002, + "step": 1390 + }, + { + "epoch": 1.77952, + "grad_norm": 0.000270137214101851, + "learning_rate": 1.7535620918510883e-05, + "loss": 0.0002, + "step": 1391 + }, + { + "epoch": 1.7808000000000002, + "grad_norm": 0.0003177259932272136, + "learning_rate": 1.7511545277618965e-05, + "loss": 0.0002, + "step": 1392 + }, + { + "epoch": 1.78208, + "grad_norm": 0.0005201466847211123, + "learning_rate": 1.7487478195937865e-05, + "loss": 0.0002, + "step": 1393 + }, + { + "epoch": 1.78336, + "grad_norm": 0.0006870387587696314, + "learning_rate": 1.7463419719362034e-05, + "loss": 0.0002, + "step": 1394 + }, + { + "epoch": 1.78464, + "grad_norm": 0.00019761078874580562, + "learning_rate": 1.7439369893769528e-05, + "loss": 0.0002, + "step": 1395 + }, + { + "epoch": 1.78592, + "grad_norm": 0.0012992613483220339, + "learning_rate": 1.7415328765021912e-05, + "loss": 0.0002, + "step": 1396 + }, + { + "epoch": 1.7872, + "grad_norm": 0.0006985355284996331, + "learning_rate": 1.7391296378964156e-05, + "loss": 0.0002, + "step": 1397 + }, + { + "epoch": 1.7884799999999998, + "grad_norm": 0.0004908322589471936, + "learning_rate": 1.7367272781424566e-05, + "loss": 0.0002, + "step": 1398 + }, + { + "epoch": 1.78976, + "grad_norm": 0.00016040352056734264, + "learning_rate": 1.7343258018214673e-05, + "loss": 0.0002, + "step": 1399 + }, + { + "epoch": 1.79104, + "grad_norm": 0.00038101038080640137, + "learning_rate": 1.731925213512918e-05, + "loss": 0.0002, + "step": 1400 + }, + { + "epoch": 1.7923200000000001, + "grad_norm": 0.00041198916733264923, + "learning_rate": 1.7295255177945858e-05, + "loss": 0.0002, + "step": 1401 + }, + { + "epoch": 1.7936, + "grad_norm": 0.0008410747977904975, + "learning_rate": 1.7271267192425436e-05, + "loss": 0.0002, + "step": 1402 + }, + { + "epoch": 1.79488, + "grad_norm": 0.00016792569658719003, + "learning_rate": 1.7247288224311557e-05, + "loss": 0.0002, + "step": 1403 + }, + { + "epoch": 1.79616, + "grad_norm": 0.00025911914417520165, + "learning_rate": 1.722331831933065e-05, + "loss": 0.0002, + "step": 1404 + }, + { + "epoch": 1.79744, + "grad_norm": 0.0005258122109808028, + "learning_rate": 1.719935752319187e-05, + "loss": 0.0002, + "step": 1405 + }, + { + "epoch": 1.7987199999999999, + "grad_norm": 0.000921487167943269, + "learning_rate": 1.7175405881587015e-05, + "loss": 0.0002, + "step": 1406 + }, + { + "epoch": 1.8, + "grad_norm": 0.00017789006233215332, + "learning_rate": 1.7151463440190387e-05, + "loss": 0.0002, + "step": 1407 + }, + { + "epoch": 1.80128, + "grad_norm": 0.0008785719983279705, + "learning_rate": 1.7127530244658796e-05, + "loss": 0.0002, + "step": 1408 + }, + { + "epoch": 1.8025600000000002, + "grad_norm": 0.0005776337930001318, + "learning_rate": 1.7103606340631365e-05, + "loss": 0.0002, + "step": 1409 + }, + { + "epoch": 1.80384, + "grad_norm": 0.00021976965945214033, + "learning_rate": 1.7079691773729558e-05, + "loss": 0.0002, + "step": 1410 + }, + { + "epoch": 1.80384, + "eval_loss": 1.1821080446243286, + "eval_runtime": 43.496, + "eval_samples_per_second": 11.541, + "eval_steps_per_second": 1.448, + "step": 1410 + }, + { + "epoch": 1.80512, + "grad_norm": 0.00015776725194882601, + "learning_rate": 1.7055786589556983e-05, + "loss": 0.0002, + "step": 1411 + }, + { + "epoch": 1.8064, + "grad_norm": 0.0003731239412445575, + "learning_rate": 1.703189083369938e-05, + "loss": 0.0002, + "step": 1412 + }, + { + "epoch": 1.80768, + "grad_norm": 0.0003114558639936149, + "learning_rate": 1.7008004551724503e-05, + "loss": 0.0002, + "step": 1413 + }, + { + "epoch": 1.80896, + "grad_norm": 0.0005815111217088997, + "learning_rate": 1.6984127789182046e-05, + "loss": 0.0002, + "step": 1414 + }, + { + "epoch": 1.8102399999999998, + "grad_norm": 0.00024505917099304497, + "learning_rate": 1.6960260591603553e-05, + "loss": 0.0002, + "step": 1415 + }, + { + "epoch": 1.81152, + "grad_norm": 0.0004912762087769806, + "learning_rate": 1.6936403004502303e-05, + "loss": 0.0002, + "step": 1416 + }, + { + "epoch": 1.8128, + "grad_norm": 0.0006611712742596865, + "learning_rate": 1.6912555073373292e-05, + "loss": 0.0002, + "step": 1417 + }, + { + "epoch": 1.8140800000000001, + "grad_norm": 0.0011671832762658596, + "learning_rate": 1.688871684369306e-05, + "loss": 0.0002, + "step": 1418 + }, + { + "epoch": 1.81536, + "grad_norm": 0.0006492765387520194, + "learning_rate": 1.6864888360919664e-05, + "loss": 0.0002, + "step": 1419 + }, + { + "epoch": 1.81664, + "grad_norm": 0.0014940252294763923, + "learning_rate": 1.6841069670492584e-05, + "loss": 0.0002, + "step": 1420 + }, + { + "epoch": 1.81792, + "grad_norm": 0.00017851243319455534, + "learning_rate": 1.681726081783261e-05, + "loss": 0.0002, + "step": 1421 + }, + { + "epoch": 1.8192, + "grad_norm": 0.0008618085994385183, + "learning_rate": 1.6793461848341787e-05, + "loss": 0.0002, + "step": 1422 + }, + { + "epoch": 1.8204799999999999, + "grad_norm": 0.0009502249886281788, + "learning_rate": 1.67696728074033e-05, + "loss": 0.0002, + "step": 1423 + }, + { + "epoch": 1.82176, + "grad_norm": 0.0014715096913278103, + "learning_rate": 1.6745893740381402e-05, + "loss": 0.0002, + "step": 1424 + }, + { + "epoch": 1.82304, + "grad_norm": 0.0006264635012485087, + "learning_rate": 1.6722124692621348e-05, + "loss": 0.0002, + "step": 1425 + }, + { + "epoch": 1.8243200000000002, + "grad_norm": 0.0013680956326425076, + "learning_rate": 1.6698365709449243e-05, + "loss": 0.0002, + "step": 1426 + }, + { + "epoch": 1.8256000000000001, + "grad_norm": 0.0005238053272478282, + "learning_rate": 1.6674616836172055e-05, + "loss": 0.0002, + "step": 1427 + }, + { + "epoch": 1.82688, + "grad_norm": 0.001148419687524438, + "learning_rate": 1.6650878118077426e-05, + "loss": 0.0002, + "step": 1428 + }, + { + "epoch": 1.82816, + "grad_norm": 0.0004914168966934085, + "learning_rate": 1.6627149600433656e-05, + "loss": 0.0002, + "step": 1429 + }, + { + "epoch": 1.82944, + "grad_norm": 0.0009969781385734677, + "learning_rate": 1.660343132848959e-05, + "loss": 0.0002, + "step": 1430 + }, + { + "epoch": 1.83072, + "grad_norm": 0.0003050584637094289, + "learning_rate": 1.657972334747453e-05, + "loss": 0.0002, + "step": 1431 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.0008236157591454685, + "learning_rate": 1.6556025702598163e-05, + "loss": 0.0002, + "step": 1432 + }, + { + "epoch": 1.83328, + "grad_norm": 0.00024275497708003968, + "learning_rate": 1.6532338439050453e-05, + "loss": 0.0002, + "step": 1433 + }, + { + "epoch": 1.83456, + "grad_norm": 0.0009991588303819299, + "learning_rate": 1.650866160200158e-05, + "loss": 0.0002, + "step": 1434 + }, + { + "epoch": 1.8358400000000001, + "grad_norm": 0.0002861864340957254, + "learning_rate": 1.648499523660183e-05, + "loss": 0.0002, + "step": 1435 + }, + { + "epoch": 1.83712, + "grad_norm": 0.0007088854908943176, + "learning_rate": 1.6461339387981535e-05, + "loss": 0.0002, + "step": 1436 + }, + { + "epoch": 1.8384, + "grad_norm": 0.0005638033035211265, + "learning_rate": 1.643769410125095e-05, + "loss": 0.0002, + "step": 1437 + }, + { + "epoch": 1.83968, + "grad_norm": 0.00013802683679386973, + "learning_rate": 1.6414059421500207e-05, + "loss": 0.0002, + "step": 1438 + }, + { + "epoch": 1.84096, + "grad_norm": 0.00027752117603085935, + "learning_rate": 1.6390435393799214e-05, + "loss": 0.0002, + "step": 1439 + }, + { + "epoch": 1.8422399999999999, + "grad_norm": 0.0008538371766917408, + "learning_rate": 1.6366822063197556e-05, + "loss": 0.0002, + "step": 1440 + }, + { + "epoch": 1.8422399999999999, + "eval_loss": 1.1808280944824219, + "eval_runtime": 43.3456, + "eval_samples_per_second": 11.581, + "eval_steps_per_second": 1.453, + "step": 1440 + }, + { + "epoch": 1.84352, + "grad_norm": 0.00022036586597096175, + "learning_rate": 1.6343219474724404e-05, + "loss": 0.0002, + "step": 1441 + }, + { + "epoch": 1.8448, + "grad_norm": 0.0010862440103664994, + "learning_rate": 1.6319627673388495e-05, + "loss": 0.0002, + "step": 1442 + }, + { + "epoch": 1.8460800000000002, + "grad_norm": 0.00029390587587840855, + "learning_rate": 1.6296046704177927e-05, + "loss": 0.0002, + "step": 1443 + }, + { + "epoch": 1.8473600000000001, + "grad_norm": 0.0015011231880635023, + "learning_rate": 1.627247661206021e-05, + "loss": 0.0002, + "step": 1444 + }, + { + "epoch": 1.84864, + "grad_norm": 0.0004352701944299042, + "learning_rate": 1.6248917441982045e-05, + "loss": 0.0002, + "step": 1445 + }, + { + "epoch": 1.84992, + "grad_norm": 0.0007848394452594221, + "learning_rate": 1.6225369238869374e-05, + "loss": 0.0002, + "step": 1446 + }, + { + "epoch": 1.8512, + "grad_norm": 0.00015477949636988342, + "learning_rate": 1.6201832047627174e-05, + "loss": 0.0002, + "step": 1447 + }, + { + "epoch": 1.85248, + "grad_norm": 0.0010525579564273357, + "learning_rate": 1.6178305913139438e-05, + "loss": 0.0002, + "step": 1448 + }, + { + "epoch": 1.8537599999999999, + "grad_norm": 0.00019572787277866155, + "learning_rate": 1.6154790880269083e-05, + "loss": 0.0002, + "step": 1449 + }, + { + "epoch": 1.85504, + "grad_norm": 0.001078969449736178, + "learning_rate": 1.613128699385785e-05, + "loss": 0.0002, + "step": 1450 + }, + { + "epoch": 1.85632, + "grad_norm": 0.0001666359748924151, + "learning_rate": 1.6107794298726222e-05, + "loss": 0.0002, + "step": 1451 + }, + { + "epoch": 1.8576000000000001, + "grad_norm": 0.0004069862188771367, + "learning_rate": 1.6084312839673346e-05, + "loss": 0.0002, + "step": 1452 + }, + { + "epoch": 1.85888, + "grad_norm": 0.0008127239998430014, + "learning_rate": 1.6060842661476935e-05, + "loss": 0.0002, + "step": 1453 + }, + { + "epoch": 1.86016, + "grad_norm": 0.0008960436680354178, + "learning_rate": 1.6037383808893195e-05, + "loss": 0.0002, + "step": 1454 + }, + { + "epoch": 1.86144, + "grad_norm": 0.00027456111274659634, + "learning_rate": 1.6013936326656736e-05, + "loss": 0.0002, + "step": 1455 + }, + { + "epoch": 1.86272, + "grad_norm": 0.0003418760607019067, + "learning_rate": 1.5990500259480492e-05, + "loss": 0.0002, + "step": 1456 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.0004984101396985352, + "learning_rate": 1.5967075652055616e-05, + "loss": 0.0002, + "step": 1457 + }, + { + "epoch": 1.86528, + "grad_norm": 0.0002873337361961603, + "learning_rate": 1.59436625490514e-05, + "loss": 0.0002, + "step": 1458 + }, + { + "epoch": 1.86656, + "grad_norm": 0.0005379713838919997, + "learning_rate": 1.592026099511523e-05, + "loss": 0.0002, + "step": 1459 + }, + { + "epoch": 1.86784, + "grad_norm": 0.0003407693584449589, + "learning_rate": 1.589687103487243e-05, + "loss": 0.0002, + "step": 1460 + }, + { + "epoch": 1.8691200000000001, + "grad_norm": 0.0007091562729328871, + "learning_rate": 1.587349271292626e-05, + "loss": 0.0002, + "step": 1461 + }, + { + "epoch": 1.8704, + "grad_norm": 0.0007454613223671913, + "learning_rate": 1.5850126073857733e-05, + "loss": 0.0002, + "step": 1462 + }, + { + "epoch": 1.87168, + "grad_norm": 0.00040651875315234065, + "learning_rate": 1.5826771162225637e-05, + "loss": 0.0002, + "step": 1463 + }, + { + "epoch": 1.87296, + "grad_norm": 0.0006714593037031591, + "learning_rate": 1.580342802256636e-05, + "loss": 0.0002, + "step": 1464 + }, + { + "epoch": 1.87424, + "grad_norm": 0.0006067923968657851, + "learning_rate": 1.5780096699393845e-05, + "loss": 0.0002, + "step": 1465 + }, + { + "epoch": 1.8755199999999999, + "grad_norm": 0.0007768379291519523, + "learning_rate": 1.5756777237199527e-05, + "loss": 0.0002, + "step": 1466 + }, + { + "epoch": 1.8768, + "grad_norm": 0.0004379746678750962, + "learning_rate": 1.573346968045219e-05, + "loss": 0.0002, + "step": 1467 + }, + { + "epoch": 1.87808, + "grad_norm": 0.00031985618988983333, + "learning_rate": 1.5710174073597944e-05, + "loss": 0.0002, + "step": 1468 + }, + { + "epoch": 1.8793600000000001, + "grad_norm": 0.0005030058673582971, + "learning_rate": 1.5686890461060084e-05, + "loss": 0.0002, + "step": 1469 + }, + { + "epoch": 1.88064, + "grad_norm": 0.0002572862431406975, + "learning_rate": 1.5663618887239053e-05, + "loss": 0.0002, + "step": 1470 + }, + { + "epoch": 1.88064, + "eval_loss": 1.1787371635437012, + "eval_runtime": 43.1586, + "eval_samples_per_second": 11.632, + "eval_steps_per_second": 1.46, + "step": 1470 + }, + { + "epoch": 1.88192, + "grad_norm": 0.0005245308275334537, + "learning_rate": 1.564035939651234e-05, + "loss": 0.0002, + "step": 1471 + }, + { + "epoch": 1.8832, + "grad_norm": 0.00030732108280062675, + "learning_rate": 1.5617112033234357e-05, + "loss": 0.0002, + "step": 1472 + }, + { + "epoch": 1.88448, + "grad_norm": 0.0007221931009553373, + "learning_rate": 1.559387684173644e-05, + "loss": 0.0002, + "step": 1473 + }, + { + "epoch": 1.8857599999999999, + "grad_norm": 0.0006306172581389546, + "learning_rate": 1.5570653866326685e-05, + "loss": 0.0002, + "step": 1474 + }, + { + "epoch": 1.88704, + "grad_norm": 0.0006269289879128337, + "learning_rate": 1.5547443151289887e-05, + "loss": 0.0002, + "step": 1475 + }, + { + "epoch": 1.88832, + "grad_norm": 0.0005559783894568682, + "learning_rate": 1.552424474088748e-05, + "loss": 0.0002, + "step": 1476 + }, + { + "epoch": 1.8896, + "grad_norm": 0.00046742561971768737, + "learning_rate": 1.5501058679357413e-05, + "loss": 0.0002, + "step": 1477 + }, + { + "epoch": 1.8908800000000001, + "grad_norm": 0.00011757310858229175, + "learning_rate": 1.5477885010914116e-05, + "loss": 0.0002, + "step": 1478 + }, + { + "epoch": 1.89216, + "grad_norm": 0.0009887436171993613, + "learning_rate": 1.5454723779748352e-05, + "loss": 0.0002, + "step": 1479 + }, + { + "epoch": 1.89344, + "grad_norm": 0.00039360520895570517, + "learning_rate": 1.5431575030027204e-05, + "loss": 0.0002, + "step": 1480 + }, + { + "epoch": 1.89472, + "grad_norm": 0.0007880927878431976, + "learning_rate": 1.540843880589391e-05, + "loss": 0.0002, + "step": 1481 + }, + { + "epoch": 1.896, + "grad_norm": 0.0004839149478357285, + "learning_rate": 1.5385315151467856e-05, + "loss": 0.0002, + "step": 1482 + }, + { + "epoch": 1.8972799999999999, + "grad_norm": 0.0007827369263395667, + "learning_rate": 1.5362204110844454e-05, + "loss": 0.0002, + "step": 1483 + }, + { + "epoch": 1.89856, + "grad_norm": 0.0004319619038142264, + "learning_rate": 1.533910572809504e-05, + "loss": 0.0002, + "step": 1484 + }, + { + "epoch": 1.89984, + "grad_norm": 0.00044077992788515985, + "learning_rate": 1.531602004726685e-05, + "loss": 0.0002, + "step": 1485 + }, + { + "epoch": 1.9011200000000001, + "grad_norm": 0.0003705875133164227, + "learning_rate": 1.529294711238286e-05, + "loss": 0.0002, + "step": 1486 + }, + { + "epoch": 1.9024, + "grad_norm": 0.0005771559081040323, + "learning_rate": 1.5269886967441756e-05, + "loss": 0.0002, + "step": 1487 + }, + { + "epoch": 1.90368, + "grad_norm": 0.00013823727203998715, + "learning_rate": 1.5246839656417845e-05, + "loss": 0.0002, + "step": 1488 + }, + { + "epoch": 1.90496, + "grad_norm": 0.0001308949722442776, + "learning_rate": 1.5223805223260946e-05, + "loss": 0.0002, + "step": 1489 + }, + { + "epoch": 1.90624, + "grad_norm": 0.00029769743559882045, + "learning_rate": 1.5200783711896327e-05, + "loss": 0.0002, + "step": 1490 + }, + { + "epoch": 1.9075199999999999, + "grad_norm": 0.0002313807635800913, + "learning_rate": 1.5177775166224606e-05, + "loss": 0.0002, + "step": 1491 + }, + { + "epoch": 1.9088, + "grad_norm": 0.00012953733676113188, + "learning_rate": 1.5154779630121703e-05, + "loss": 0.0002, + "step": 1492 + }, + { + "epoch": 1.91008, + "grad_norm": 0.0003203785454388708, + "learning_rate": 1.5131797147438692e-05, + "loss": 0.0002, + "step": 1493 + }, + { + "epoch": 1.91136, + "grad_norm": 0.00015288084978237748, + "learning_rate": 1.5108827762001772e-05, + "loss": 0.0002, + "step": 1494 + }, + { + "epoch": 1.9126400000000001, + "grad_norm": 0.00027298854547552764, + "learning_rate": 1.5085871517612178e-05, + "loss": 0.0002, + "step": 1495 + }, + { + "epoch": 1.91392, + "grad_norm": 0.0003144654037896544, + "learning_rate": 1.5062928458046066e-05, + "loss": 0.0002, + "step": 1496 + }, + { + "epoch": 1.9152, + "grad_norm": 0.0005439766100607812, + "learning_rate": 1.5039998627054462e-05, + "loss": 0.0002, + "step": 1497 + }, + { + "epoch": 1.91648, + "grad_norm": 0.00028937109163962305, + "learning_rate": 1.5017082068363168e-05, + "loss": 0.0002, + "step": 1498 + }, + { + "epoch": 1.91776, + "grad_norm": 0.00034966770908795297, + "learning_rate": 1.4994178825672663e-05, + "loss": 0.0002, + "step": 1499 + }, + { + "epoch": 1.9190399999999999, + "grad_norm": 0.000162362091941759, + "learning_rate": 1.4971288942658046e-05, + "loss": 0.0002, + "step": 1500 + }, + { + "epoch": 1.9190399999999999, + "eval_loss": 1.18230402469635, + "eval_runtime": 43.4353, + "eval_samples_per_second": 11.557, + "eval_steps_per_second": 1.45, + "step": 1500 + }, + { + "epoch": 1.92032, + "grad_norm": 0.00014286025543697178, + "learning_rate": 1.4948412462968929e-05, + "loss": 0.0002, + "step": 1501 + }, + { + "epoch": 1.9216, + "grad_norm": 0.00026032765163108706, + "learning_rate": 1.4925549430229378e-05, + "loss": 0.0002, + "step": 1502 + }, + { + "epoch": 1.9228800000000001, + "grad_norm": 0.000396853982238099, + "learning_rate": 1.49026998880378e-05, + "loss": 0.0002, + "step": 1503 + }, + { + "epoch": 1.92416, + "grad_norm": 0.00029547963640652597, + "learning_rate": 1.4879863879966903e-05, + "loss": 0.0002, + "step": 1504 + }, + { + "epoch": 1.92544, + "grad_norm": 0.0006995249423198402, + "learning_rate": 1.4857041449563552e-05, + "loss": 0.0002, + "step": 1505 + }, + { + "epoch": 1.92672, + "grad_norm": 0.00016520809731446207, + "learning_rate": 1.4834232640348733e-05, + "loss": 0.0002, + "step": 1506 + }, + { + "epoch": 1.928, + "grad_norm": 0.00048660230822861195, + "learning_rate": 1.4811437495817486e-05, + "loss": 0.0002, + "step": 1507 + }, + { + "epoch": 1.9292799999999999, + "grad_norm": 0.0006820852868258953, + "learning_rate": 1.4788656059438742e-05, + "loss": 0.0002, + "step": 1508 + }, + { + "epoch": 1.93056, + "grad_norm": 0.0003663533425424248, + "learning_rate": 1.4765888374655335e-05, + "loss": 0.0002, + "step": 1509 + }, + { + "epoch": 1.93184, + "grad_norm": 0.0011120438575744629, + "learning_rate": 1.4743134484883847e-05, + "loss": 0.0002, + "step": 1510 + }, + { + "epoch": 1.93312, + "grad_norm": 0.00039282278157770634, + "learning_rate": 1.4720394433514572e-05, + "loss": 0.0002, + "step": 1511 + }, + { + "epoch": 1.9344000000000001, + "grad_norm": 0.0013644680147990584, + "learning_rate": 1.4697668263911408e-05, + "loss": 0.0002, + "step": 1512 + }, + { + "epoch": 1.93568, + "grad_norm": 0.0001902208023238927, + "learning_rate": 1.4674956019411775e-05, + "loss": 0.0002, + "step": 1513 + }, + { + "epoch": 1.93696, + "grad_norm": 0.0017309424001723528, + "learning_rate": 1.4652257743326557e-05, + "loss": 0.0002, + "step": 1514 + }, + { + "epoch": 1.93824, + "grad_norm": 0.00018405367154628038, + "learning_rate": 1.4629573478939976e-05, + "loss": 0.0002, + "step": 1515 + }, + { + "epoch": 1.93952, + "grad_norm": 0.0017507137963548303, + "learning_rate": 1.4606903269509554e-05, + "loss": 0.0002, + "step": 1516 + }, + { + "epoch": 1.9407999999999999, + "grad_norm": 0.00046362378634512424, + "learning_rate": 1.4584247158266008e-05, + "loss": 0.0002, + "step": 1517 + }, + { + "epoch": 1.94208, + "grad_norm": 0.0012449919013306499, + "learning_rate": 1.456160518841316e-05, + "loss": 0.0002, + "step": 1518 + }, + { + "epoch": 1.94336, + "grad_norm": 0.00016421821783296764, + "learning_rate": 1.453897740312788e-05, + "loss": 0.0002, + "step": 1519 + }, + { + "epoch": 1.9446400000000001, + "grad_norm": 0.0012402897700667381, + "learning_rate": 1.451636384555998e-05, + "loss": 0.0002, + "step": 1520 + }, + { + "epoch": 1.94592, + "grad_norm": 0.00031017547007650137, + "learning_rate": 1.449376455883214e-05, + "loss": 0.0002, + "step": 1521 + }, + { + "epoch": 1.9472, + "grad_norm": 0.0014095036312937737, + "learning_rate": 1.4471179586039833e-05, + "loss": 0.0002, + "step": 1522 + }, + { + "epoch": 1.94848, + "grad_norm": 0.000539817672688514, + "learning_rate": 1.444860897025122e-05, + "loss": 0.0002, + "step": 1523 + }, + { + "epoch": 1.94976, + "grad_norm": 0.001391338068060577, + "learning_rate": 1.4426052754507108e-05, + "loss": 0.0002, + "step": 1524 + }, + { + "epoch": 1.9510399999999999, + "grad_norm": 0.0005815787590108812, + "learning_rate": 1.4403510981820823e-05, + "loss": 0.0002, + "step": 1525 + }, + { + "epoch": 1.95232, + "grad_norm": 0.000504453491885215, + "learning_rate": 1.438098369517817e-05, + "loss": 0.0002, + "step": 1526 + }, + { + "epoch": 1.9536, + "grad_norm": 0.0015071576926857233, + "learning_rate": 1.4358470937537306e-05, + "loss": 0.0002, + "step": 1527 + }, + { + "epoch": 1.95488, + "grad_norm": 0.0005614665569737554, + "learning_rate": 1.4335972751828693e-05, + "loss": 0.0002, + "step": 1528 + }, + { + "epoch": 1.9561600000000001, + "grad_norm": 0.0013540390646085143, + "learning_rate": 1.4313489180955017e-05, + "loss": 0.0002, + "step": 1529 + }, + { + "epoch": 1.95744, + "grad_norm": 0.0001240475830854848, + "learning_rate": 1.4291020267791073e-05, + "loss": 0.0002, + "step": 1530 + }, + { + "epoch": 1.95744, + "eval_loss": 1.1908667087554932, + "eval_runtime": 43.9335, + "eval_samples_per_second": 11.426, + "eval_steps_per_second": 1.434, + "step": 1530 + }, + { + "epoch": 1.95872, + "grad_norm": 0.0014987153699621558, + "learning_rate": 1.4268566055183723e-05, + "loss": 0.0002, + "step": 1531 + }, + { + "epoch": 1.96, + "grad_norm": 0.00023755997244734317, + "learning_rate": 1.4246126585951779e-05, + "loss": 0.0002, + "step": 1532 + }, + { + "epoch": 1.96128, + "grad_norm": 0.001222156686708331, + "learning_rate": 1.422370190288596e-05, + "loss": 0.0002, + "step": 1533 + }, + { + "epoch": 1.9625599999999999, + "grad_norm": 0.0004482627846300602, + "learning_rate": 1.420129204874877e-05, + "loss": 0.0002, + "step": 1534 + }, + { + "epoch": 1.96384, + "grad_norm": 0.0009276245255023241, + "learning_rate": 1.4178897066274427e-05, + "loss": 0.0002, + "step": 1535 + }, + { + "epoch": 1.96512, + "grad_norm": 0.00022700500267092139, + "learning_rate": 1.4156516998168824e-05, + "loss": 0.0002, + "step": 1536 + }, + { + "epoch": 1.9664000000000001, + "grad_norm": 0.0006968076340854168, + "learning_rate": 1.4134151887109381e-05, + "loss": 0.0002, + "step": 1537 + }, + { + "epoch": 1.96768, + "grad_norm": 0.00012518244329839945, + "learning_rate": 1.4111801775745022e-05, + "loss": 0.0002, + "step": 1538 + }, + { + "epoch": 1.96896, + "grad_norm": 0.0006591297569684684, + "learning_rate": 1.4089466706696034e-05, + "loss": 0.0002, + "step": 1539 + }, + { + "epoch": 1.97024, + "grad_norm": 0.00018697626364883035, + "learning_rate": 1.4067146722554048e-05, + "loss": 0.0002, + "step": 1540 + }, + { + "epoch": 1.97152, + "grad_norm": 0.00033598518348298967, + "learning_rate": 1.4044841865881926e-05, + "loss": 0.0002, + "step": 1541 + }, + { + "epoch": 1.9727999999999999, + "grad_norm": 0.000304957153275609, + "learning_rate": 1.4022552179213658e-05, + "loss": 0.0002, + "step": 1542 + }, + { + "epoch": 1.9740799999999998, + "grad_norm": 0.0005245900247246027, + "learning_rate": 1.400027770505434e-05, + "loss": 0.0002, + "step": 1543 + }, + { + "epoch": 1.97536, + "grad_norm": 0.00034147215774282813, + "learning_rate": 1.3978018485880037e-05, + "loss": 0.0002, + "step": 1544 + }, + { + "epoch": 1.97664, + "grad_norm": 0.0003990849363617599, + "learning_rate": 1.3955774564137736e-05, + "loss": 0.0002, + "step": 1545 + }, + { + "epoch": 1.9779200000000001, + "grad_norm": 0.0005777077749371529, + "learning_rate": 1.3933545982245236e-05, + "loss": 0.0002, + "step": 1546 + }, + { + "epoch": 1.9792, + "grad_norm": 0.0005894573987461627, + "learning_rate": 1.3911332782591093e-05, + "loss": 0.0002, + "step": 1547 + }, + { + "epoch": 1.98048, + "grad_norm": 0.0003304107813164592, + "learning_rate": 1.3889135007534549e-05, + "loss": 0.0002, + "step": 1548 + }, + { + "epoch": 1.98176, + "grad_norm": 0.0006092807743698359, + "learning_rate": 1.3866952699405394e-05, + "loss": 0.0002, + "step": 1549 + }, + { + "epoch": 1.98304, + "grad_norm": 0.0005102291470393538, + "learning_rate": 1.3844785900503946e-05, + "loss": 0.0002, + "step": 1550 + }, + { + "epoch": 1.9843199999999999, + "grad_norm": 0.0012099050218239427, + "learning_rate": 1.382263465310096e-05, + "loss": 0.0002, + "step": 1551 + }, + { + "epoch": 1.9856, + "grad_norm": 0.00048186135245487094, + "learning_rate": 1.3800498999437504e-05, + "loss": 0.0002, + "step": 1552 + }, + { + "epoch": 1.98688, + "grad_norm": 0.00017949033644981682, + "learning_rate": 1.377837898172494e-05, + "loss": 0.0002, + "step": 1553 + }, + { + "epoch": 1.9881600000000001, + "grad_norm": 0.0003641147050075233, + "learning_rate": 1.375627464214479e-05, + "loss": 0.0002, + "step": 1554 + }, + { + "epoch": 1.98944, + "grad_norm": 0.00032777784508652985, + "learning_rate": 1.3734186022848688e-05, + "loss": 0.0002, + "step": 1555 + }, + { + "epoch": 1.99072, + "grad_norm": 0.0004519484646152705, + "learning_rate": 1.3712113165958302e-05, + "loss": 0.0002, + "step": 1556 + }, + { + "epoch": 1.992, + "grad_norm": 0.0003106287040282041, + "learning_rate": 1.3690056113565215e-05, + "loss": 0.0002, + "step": 1557 + }, + { + "epoch": 1.99328, + "grad_norm": 0.0003117345040664077, + "learning_rate": 1.3668014907730895e-05, + "loss": 0.0002, + "step": 1558 + }, + { + "epoch": 1.9945599999999999, + "grad_norm": 0.00021027673210483044, + "learning_rate": 1.3645989590486586e-05, + "loss": 0.0002, + "step": 1559 + }, + { + "epoch": 1.9958399999999998, + "grad_norm": 0.0006823004223406315, + "learning_rate": 1.3623980203833233e-05, + "loss": 0.0002, + "step": 1560 + }, + { + "epoch": 1.9958399999999998, + "eval_loss": 1.1802235841751099, + "eval_runtime": 43.2595, + "eval_samples_per_second": 11.604, + "eval_steps_per_second": 1.456, + "step": 1560 + }, + { + "epoch": 1.99712, + "grad_norm": 0.00023292088008020073, + "learning_rate": 1.3601986789741397e-05, + "loss": 0.0002, + "step": 1561 + }, + { + "epoch": 1.9984, + "grad_norm": 0.0006186995888128877, + "learning_rate": 1.3580009390151185e-05, + "loss": 0.0002, + "step": 1562 + }, + { + "epoch": 1.9996800000000001, + "grad_norm": 0.0003250852460041642, + "learning_rate": 1.3558048046972169e-05, + "loss": 0.0002, + "step": 1563 + }, + { + "epoch": 2.0, + "grad_norm": 0.001266122329980135, + "learning_rate": 1.3536102802083288e-05, + "loss": 0.0002, + "step": 1564 + }, + { + "epoch": 2.00128, + "grad_norm": 0.0008162333979271352, + "learning_rate": 1.3514173697332804e-05, + "loss": 0.0002, + "step": 1565 + }, + { + "epoch": 2.00256, + "grad_norm": 0.0008856783388182521, + "learning_rate": 1.349226077453817e-05, + "loss": 0.0002, + "step": 1566 + }, + { + "epoch": 2.00384, + "grad_norm": 0.0009964342461898923, + "learning_rate": 1.3470364075486027e-05, + "loss": 0.0002, + "step": 1567 + }, + { + "epoch": 2.00512, + "grad_norm": 0.00029980260296724737, + "learning_rate": 1.3448483641932037e-05, + "loss": 0.0002, + "step": 1568 + }, + { + "epoch": 2.0064, + "grad_norm": 0.00043958862079307437, + "learning_rate": 1.3426619515600854e-05, + "loss": 0.0002, + "step": 1569 + }, + { + "epoch": 2.00768, + "grad_norm": 0.0012856496032327414, + "learning_rate": 1.3404771738186047e-05, + "loss": 0.0002, + "step": 1570 + }, + { + "epoch": 2.00896, + "grad_norm": 0.0005549627239815891, + "learning_rate": 1.3382940351350003e-05, + "loss": 0.0002, + "step": 1571 + }, + { + "epoch": 2.01024, + "grad_norm": 0.0012308419682085514, + "learning_rate": 1.336112539672385e-05, + "loss": 0.0002, + "step": 1572 + }, + { + "epoch": 2.01152, + "grad_norm": 0.0009879851713776588, + "learning_rate": 1.333932691590738e-05, + "loss": 0.0002, + "step": 1573 + }, + { + "epoch": 2.0128, + "grad_norm": 0.0006125083309598267, + "learning_rate": 1.331754495046898e-05, + "loss": 0.0002, + "step": 1574 + }, + { + "epoch": 2.01408, + "grad_norm": 0.0011476647341623902, + "learning_rate": 1.3295779541945535e-05, + "loss": 0.0002, + "step": 1575 + }, + { + "epoch": 2.01536, + "grad_norm": 0.00018577600712887943, + "learning_rate": 1.3274030731842349e-05, + "loss": 0.0002, + "step": 1576 + }, + { + "epoch": 2.01664, + "grad_norm": 0.0008417818462476134, + "learning_rate": 1.3252298561633093e-05, + "loss": 0.0002, + "step": 1577 + }, + { + "epoch": 2.01792, + "grad_norm": 0.00107707513961941, + "learning_rate": 1.3230583072759693e-05, + "loss": 0.0002, + "step": 1578 + }, + { + "epoch": 2.0192, + "grad_norm": 0.000682896061334759, + "learning_rate": 1.3208884306632274e-05, + "loss": 0.0002, + "step": 1579 + }, + { + "epoch": 2.02048, + "grad_norm": 0.0016809995286166668, + "learning_rate": 1.3187202304629066e-05, + "loss": 0.0002, + "step": 1580 + }, + { + "epoch": 2.02176, + "grad_norm": 0.0006951438263058662, + "learning_rate": 1.3165537108096311e-05, + "loss": 0.0002, + "step": 1581 + }, + { + "epoch": 2.02304, + "grad_norm": 0.001414105063304305, + "learning_rate": 1.3143888758348256e-05, + "loss": 0.0002, + "step": 1582 + }, + { + "epoch": 2.02432, + "grad_norm": 0.00011542163701960817, + "learning_rate": 1.3122257296666966e-05, + "loss": 0.0002, + "step": 1583 + }, + { + "epoch": 2.0256, + "grad_norm": 0.0013241195119917393, + "learning_rate": 1.3100642764302344e-05, + "loss": 0.0002, + "step": 1584 + }, + { + "epoch": 2.02688, + "grad_norm": 0.0009679512586444616, + "learning_rate": 1.3079045202471977e-05, + "loss": 0.0002, + "step": 1585 + }, + { + "epoch": 2.02816, + "grad_norm": 0.002106900094076991, + "learning_rate": 1.3057464652361106e-05, + "loss": 0.0002, + "step": 1586 + }, + { + "epoch": 2.02944, + "grad_norm": 0.00037472325493581593, + "learning_rate": 1.3035901155122547e-05, + "loss": 0.0002, + "step": 1587 + }, + { + "epoch": 2.03072, + "grad_norm": 0.0017979259137064219, + "learning_rate": 1.3014354751876557e-05, + "loss": 0.0002, + "step": 1588 + }, + { + "epoch": 2.032, + "grad_norm": 0.0005142168374732137, + "learning_rate": 1.2992825483710837e-05, + "loss": 0.0002, + "step": 1589 + }, + { + "epoch": 2.03328, + "grad_norm": 0.0016031867126002908, + "learning_rate": 1.2971313391680384e-05, + "loss": 0.0002, + "step": 1590 + }, + { + "epoch": 2.03328, + "eval_loss": 1.1785191297531128, + "eval_runtime": 43.1988, + "eval_samples_per_second": 11.621, + "eval_steps_per_second": 1.458, + "step": 1590 + }, + { + "epoch": 2.03456, + "grad_norm": 0.0003188488190062344, + "learning_rate": 1.2949818516807468e-05, + "loss": 0.0002, + "step": 1591 + }, + { + "epoch": 2.03584, + "grad_norm": 0.000818319502286613, + "learning_rate": 1.2928340900081495e-05, + "loss": 0.0002, + "step": 1592 + }, + { + "epoch": 2.03712, + "grad_norm": 0.0006152302958071232, + "learning_rate": 1.2906880582458983e-05, + "loss": 0.0002, + "step": 1593 + }, + { + "epoch": 2.0384, + "grad_norm": 0.0004554502957034856, + "learning_rate": 1.2885437604863458e-05, + "loss": 0.0002, + "step": 1594 + }, + { + "epoch": 2.03968, + "grad_norm": 0.0007213074713945389, + "learning_rate": 1.2864012008185371e-05, + "loss": 0.0002, + "step": 1595 + }, + { + "epoch": 2.04096, + "grad_norm": 0.0003609023697208613, + "learning_rate": 1.284260383328204e-05, + "loss": 0.0002, + "step": 1596 + }, + { + "epoch": 2.04224, + "grad_norm": 0.000855445337947458, + "learning_rate": 1.2821213120977558e-05, + "loss": 0.0002, + "step": 1597 + }, + { + "epoch": 2.04352, + "grad_norm": 0.00029147861641831696, + "learning_rate": 1.2799839912062708e-05, + "loss": 0.0002, + "step": 1598 + }, + { + "epoch": 2.0448, + "grad_norm": 0.0007133941981010139, + "learning_rate": 1.2778484247294909e-05, + "loss": 0.0002, + "step": 1599 + }, + { + "epoch": 2.04608, + "grad_norm": 0.000201601127628237, + "learning_rate": 1.2757146167398103e-05, + "loss": 0.0002, + "step": 1600 + }, + { + "epoch": 2.04736, + "grad_norm": 0.00014098279643803835, + "learning_rate": 1.2735825713062738e-05, + "loss": 0.0002, + "step": 1601 + }, + { + "epoch": 2.04864, + "grad_norm": 0.0002673087583389133, + "learning_rate": 1.271452292494561e-05, + "loss": 0.0002, + "step": 1602 + }, + { + "epoch": 2.04992, + "grad_norm": 0.00015182622883003205, + "learning_rate": 1.2693237843669852e-05, + "loss": 0.0002, + "step": 1603 + }, + { + "epoch": 2.0512, + "grad_norm": 0.00028255832148715854, + "learning_rate": 1.267197050982481e-05, + "loss": 0.0002, + "step": 1604 + }, + { + "epoch": 2.05248, + "grad_norm": 0.00029869540594518185, + "learning_rate": 1.265072096396601e-05, + "loss": 0.0002, + "step": 1605 + }, + { + "epoch": 2.05376, + "grad_norm": 0.0001590411993674934, + "learning_rate": 1.2629489246615052e-05, + "loss": 0.0002, + "step": 1606 + }, + { + "epoch": 2.05504, + "grad_norm": 0.00010362736065872014, + "learning_rate": 1.2608275398259516e-05, + "loss": 0.0002, + "step": 1607 + }, + { + "epoch": 2.05632, + "grad_norm": 0.00034905390930362046, + "learning_rate": 1.2587079459352931e-05, + "loss": 0.0002, + "step": 1608 + }, + { + "epoch": 2.0576, + "grad_norm": 0.0002614253608044237, + "learning_rate": 1.2565901470314677e-05, + "loss": 0.0002, + "step": 1609 + }, + { + "epoch": 2.05888, + "grad_norm": 0.00016580188821535558, + "learning_rate": 1.2544741471529873e-05, + "loss": 0.0002, + "step": 1610 + }, + { + "epoch": 2.06016, + "grad_norm": 0.0003083675110246986, + "learning_rate": 1.2523599503349368e-05, + "loss": 0.0002, + "step": 1611 + }, + { + "epoch": 2.06144, + "grad_norm": 0.00018635000742506236, + "learning_rate": 1.250247560608961e-05, + "loss": 0.0002, + "step": 1612 + }, + { + "epoch": 2.06272, + "grad_norm": 0.0003100994508713484, + "learning_rate": 1.2481369820032594e-05, + "loss": 0.0002, + "step": 1613 + }, + { + "epoch": 2.064, + "grad_norm": 0.00014046489377506077, + "learning_rate": 1.2460282185425767e-05, + "loss": 0.0002, + "step": 1614 + }, + { + "epoch": 2.06528, + "grad_norm": 0.00012373473145999014, + "learning_rate": 1.2439212742481957e-05, + "loss": 0.0002, + "step": 1615 + }, + { + "epoch": 2.06656, + "grad_norm": 0.0002941812272183597, + "learning_rate": 1.2418161531379336e-05, + "loss": 0.0002, + "step": 1616 + }, + { + "epoch": 2.06784, + "grad_norm": 0.0004038626793771982, + "learning_rate": 1.2397128592261271e-05, + "loss": 0.0002, + "step": 1617 + }, + { + "epoch": 2.06912, + "grad_norm": 0.00012880339636467397, + "learning_rate": 1.2376113965236311e-05, + "loss": 0.0002, + "step": 1618 + }, + { + "epoch": 2.0704, + "grad_norm": 0.00032092208857648075, + "learning_rate": 1.2355117690378057e-05, + "loss": 0.0002, + "step": 1619 + }, + { + "epoch": 2.07168, + "grad_norm": 0.00014724551874678582, + "learning_rate": 1.2334139807725158e-05, + "loss": 0.0002, + "step": 1620 + }, + { + "epoch": 2.07168, + "eval_loss": 1.181175708770752, + "eval_runtime": 43.3984, + "eval_samples_per_second": 11.567, + "eval_steps_per_second": 1.452, + "step": 1620 + }, + { + "epoch": 2.07296, + "grad_norm": 0.00035675789695233107, + "learning_rate": 1.2313180357281146e-05, + "loss": 0.0002, + "step": 1621 + }, + { + "epoch": 2.07424, + "grad_norm": 0.0004117561038583517, + "learning_rate": 1.2292239379014417e-05, + "loss": 0.0002, + "step": 1622 + }, + { + "epoch": 2.07552, + "grad_norm": 0.00019115378381684422, + "learning_rate": 1.2271316912858155e-05, + "loss": 0.0002, + "step": 1623 + }, + { + "epoch": 2.0768, + "grad_norm": 0.0001364218769595027, + "learning_rate": 1.225041299871023e-05, + "loss": 0.0002, + "step": 1624 + }, + { + "epoch": 2.07808, + "grad_norm": 0.00017668337386567146, + "learning_rate": 1.2229527676433147e-05, + "loss": 0.0002, + "step": 1625 + }, + { + "epoch": 2.07936, + "grad_norm": 0.00018027937039732933, + "learning_rate": 1.2208660985853934e-05, + "loss": 0.0002, + "step": 1626 + }, + { + "epoch": 2.08064, + "grad_norm": 0.0001851502020144835, + "learning_rate": 1.2187812966764105e-05, + "loss": 0.0002, + "step": 1627 + }, + { + "epoch": 2.08192, + "grad_norm": 0.0002945510495919734, + "learning_rate": 1.2166983658919587e-05, + "loss": 0.0002, + "step": 1628 + }, + { + "epoch": 2.0832, + "grad_norm": 0.00013975801994092762, + "learning_rate": 1.2146173102040577e-05, + "loss": 0.0002, + "step": 1629 + }, + { + "epoch": 2.08448, + "grad_norm": 0.0004168010491412133, + "learning_rate": 1.2125381335811564e-05, + "loss": 0.0002, + "step": 1630 + }, + { + "epoch": 2.08576, + "grad_norm": 0.00022464545327238739, + "learning_rate": 1.210460839988118e-05, + "loss": 0.0002, + "step": 1631 + }, + { + "epoch": 2.08704, + "grad_norm": 0.00011976376117672771, + "learning_rate": 1.2083854333862157e-05, + "loss": 0.0002, + "step": 1632 + }, + { + "epoch": 2.08832, + "grad_norm": 0.0001563438563607633, + "learning_rate": 1.2063119177331237e-05, + "loss": 0.0002, + "step": 1633 + }, + { + "epoch": 2.0896, + "grad_norm": 0.00014362685033120215, + "learning_rate": 1.204240296982909e-05, + "loss": 0.0002, + "step": 1634 + }, + { + "epoch": 2.09088, + "grad_norm": 0.0001592675835127011, + "learning_rate": 1.2021705750860295e-05, + "loss": 0.0002, + "step": 1635 + }, + { + "epoch": 2.09216, + "grad_norm": 0.00012455842806957662, + "learning_rate": 1.2001027559893172e-05, + "loss": 0.0002, + "step": 1636 + }, + { + "epoch": 2.09344, + "grad_norm": 0.00014256317808758467, + "learning_rate": 1.198036843635979e-05, + "loss": 0.0002, + "step": 1637 + }, + { + "epoch": 2.09472, + "grad_norm": 0.0006191166467033327, + "learning_rate": 1.1959728419655828e-05, + "loss": 0.0002, + "step": 1638 + }, + { + "epoch": 2.096, + "grad_norm": 0.000110753440822009, + "learning_rate": 1.1939107549140557e-05, + "loss": 0.0002, + "step": 1639 + }, + { + "epoch": 2.09728, + "grad_norm": 0.0005446019931696355, + "learning_rate": 1.1918505864136728e-05, + "loss": 0.0002, + "step": 1640 + }, + { + "epoch": 2.09856, + "grad_norm": 0.0005599053110927343, + "learning_rate": 1.1897923403930491e-05, + "loss": 0.0002, + "step": 1641 + }, + { + "epoch": 2.09984, + "grad_norm": 0.000916913675609976, + "learning_rate": 1.1877360207771363e-05, + "loss": 0.0002, + "step": 1642 + }, + { + "epoch": 2.10112, + "grad_norm": 0.0005046268925070763, + "learning_rate": 1.1856816314872108e-05, + "loss": 0.0002, + "step": 1643 + }, + { + "epoch": 2.1024, + "grad_norm": 0.0007644724100828171, + "learning_rate": 1.1836291764408672e-05, + "loss": 0.0002, + "step": 1644 + }, + { + "epoch": 2.1036799999999998, + "grad_norm": 0.0008394183241762221, + "learning_rate": 1.181578659552014e-05, + "loss": 0.0002, + "step": 1645 + }, + { + "epoch": 2.10496, + "grad_norm": 0.0007981343660503626, + "learning_rate": 1.1795300847308621e-05, + "loss": 0.0002, + "step": 1646 + }, + { + "epoch": 2.10624, + "grad_norm": 0.00044534794869832695, + "learning_rate": 1.1774834558839198e-05, + "loss": 0.0002, + "step": 1647 + }, + { + "epoch": 2.10752, + "grad_norm": 0.00014556580572389066, + "learning_rate": 1.1754387769139835e-05, + "loss": 0.0002, + "step": 1648 + }, + { + "epoch": 2.1088, + "grad_norm": 0.0006214206805452704, + "learning_rate": 1.1733960517201316e-05, + "loss": 0.0002, + "step": 1649 + }, + { + "epoch": 2.11008, + "grad_norm": 0.0003172321885358542, + "learning_rate": 1.1713552841977185e-05, + "loss": 0.0002, + "step": 1650 + }, + { + "epoch": 2.11008, + "eval_loss": 1.1766486167907715, + "eval_runtime": 43.0906, + "eval_samples_per_second": 11.65, + "eval_steps_per_second": 1.462, + "step": 1650 + }, + { + "epoch": 2.11136, + "grad_norm": 0.0007667154422961175, + "learning_rate": 1.1693164782383628e-05, + "loss": 0.0002, + "step": 1651 + }, + { + "epoch": 2.11264, + "grad_norm": 0.0004064768727403134, + "learning_rate": 1.1672796377299444e-05, + "loss": 0.0002, + "step": 1652 + }, + { + "epoch": 2.11392, + "grad_norm": 0.000624951149802655, + "learning_rate": 1.1652447665565941e-05, + "loss": 0.0002, + "step": 1653 + }, + { + "epoch": 2.1152, + "grad_norm": 0.0003828768967650831, + "learning_rate": 1.1632118685986875e-05, + "loss": 0.0002, + "step": 1654 + }, + { + "epoch": 2.11648, + "grad_norm": 0.0006233001477085054, + "learning_rate": 1.1611809477328388e-05, + "loss": 0.0002, + "step": 1655 + }, + { + "epoch": 2.11776, + "grad_norm": 0.00017231212405022234, + "learning_rate": 1.1591520078318898e-05, + "loss": 0.0002, + "step": 1656 + }, + { + "epoch": 2.11904, + "grad_norm": 0.00027037342078983784, + "learning_rate": 1.1571250527649061e-05, + "loss": 0.0002, + "step": 1657 + }, + { + "epoch": 2.12032, + "grad_norm": 0.00028422908508218825, + "learning_rate": 1.1551000863971677e-05, + "loss": 0.0002, + "step": 1658 + }, + { + "epoch": 2.1216, + "grad_norm": 0.00034367659827694297, + "learning_rate": 1.1530771125901641e-05, + "loss": 0.0002, + "step": 1659 + }, + { + "epoch": 2.12288, + "grad_norm": 0.00011658842413453385, + "learning_rate": 1.1510561352015817e-05, + "loss": 0.0002, + "step": 1660 + }, + { + "epoch": 2.12416, + "grad_norm": 0.00011873227049363777, + "learning_rate": 1.1490371580853021e-05, + "loss": 0.0002, + "step": 1661 + }, + { + "epoch": 2.12544, + "grad_norm": 0.00020507187582552433, + "learning_rate": 1.1470201850913935e-05, + "loss": 0.0002, + "step": 1662 + }, + { + "epoch": 2.12672, + "grad_norm": 0.00015456117398571223, + "learning_rate": 1.1450052200660997e-05, + "loss": 0.0002, + "step": 1663 + }, + { + "epoch": 2.128, + "grad_norm": 0.00029631159850396216, + "learning_rate": 1.1429922668518367e-05, + "loss": 0.0002, + "step": 1664 + }, + { + "epoch": 2.12928, + "grad_norm": 0.0005491878837347031, + "learning_rate": 1.1409813292871848e-05, + "loss": 0.0002, + "step": 1665 + }, + { + "epoch": 2.13056, + "grad_norm": 0.0009095377754420042, + "learning_rate": 1.1389724112068803e-05, + "loss": 0.0002, + "step": 1666 + }, + { + "epoch": 2.13184, + "grad_norm": 0.00017942706472240388, + "learning_rate": 1.1369655164418076e-05, + "loss": 0.0002, + "step": 1667 + }, + { + "epoch": 2.13312, + "grad_norm": 0.0008740307530388236, + "learning_rate": 1.1349606488189923e-05, + "loss": 0.0002, + "step": 1668 + }, + { + "epoch": 2.1344, + "grad_norm": 0.00012318171502556652, + "learning_rate": 1.1329578121615961e-05, + "loss": 0.0002, + "step": 1669 + }, + { + "epoch": 2.13568, + "grad_norm": 0.0005075764493085444, + "learning_rate": 1.130957010288907e-05, + "loss": 0.0002, + "step": 1670 + }, + { + "epoch": 2.13696, + "grad_norm": 0.0003700858214870095, + "learning_rate": 1.1289582470163336e-05, + "loss": 0.0002, + "step": 1671 + }, + { + "epoch": 2.13824, + "grad_norm": 0.0010242791613563895, + "learning_rate": 1.1269615261553946e-05, + "loss": 0.0002, + "step": 1672 + }, + { + "epoch": 2.13952, + "grad_norm": 0.00033670556149445474, + "learning_rate": 1.1249668515137162e-05, + "loss": 0.0002, + "step": 1673 + }, + { + "epoch": 2.1408, + "grad_norm": 0.0005608523497357965, + "learning_rate": 1.122974226895023e-05, + "loss": 0.0002, + "step": 1674 + }, + { + "epoch": 2.14208, + "grad_norm": 0.00029023917159065604, + "learning_rate": 1.1209836560991274e-05, + "loss": 0.0002, + "step": 1675 + }, + { + "epoch": 2.14336, + "grad_norm": 0.00019341774168424308, + "learning_rate": 1.1189951429219276e-05, + "loss": 0.0002, + "step": 1676 + }, + { + "epoch": 2.14464, + "grad_norm": 0.00017067784210667014, + "learning_rate": 1.1170086911553981e-05, + "loss": 0.0002, + "step": 1677 + }, + { + "epoch": 2.14592, + "grad_norm": 0.00016009536921046674, + "learning_rate": 1.1150243045875824e-05, + "loss": 0.0002, + "step": 1678 + }, + { + "epoch": 2.1471999999999998, + "grad_norm": 0.00041830449481494725, + "learning_rate": 1.1130419870025848e-05, + "loss": 0.0002, + "step": 1679 + }, + { + "epoch": 2.14848, + "grad_norm": 0.0001825682702474296, + "learning_rate": 1.111061742180563e-05, + "loss": 0.0002, + "step": 1680 + }, + { + "epoch": 2.14848, + "eval_loss": 1.178200125694275, + "eval_runtime": 43.3215, + "eval_samples_per_second": 11.588, + "eval_steps_per_second": 1.454, + "step": 1680 + }, + { + "epoch": 2.14976, + "grad_norm": 0.00045886740554124117, + "learning_rate": 1.1090835738977264e-05, + "loss": 0.0002, + "step": 1681 + }, + { + "epoch": 2.15104, + "grad_norm": 0.00017417485651094466, + "learning_rate": 1.10710748592632e-05, + "loss": 0.0002, + "step": 1682 + }, + { + "epoch": 2.15232, + "grad_norm": 0.0005541493883356452, + "learning_rate": 1.105133482034625e-05, + "loss": 0.0002, + "step": 1683 + }, + { + "epoch": 2.1536, + "grad_norm": 0.00013444297655951232, + "learning_rate": 1.1031615659869456e-05, + "loss": 0.0002, + "step": 1684 + }, + { + "epoch": 2.15488, + "grad_norm": 0.0001350865204585716, + "learning_rate": 1.1011917415436071e-05, + "loss": 0.0002, + "step": 1685 + }, + { + "epoch": 2.15616, + "grad_norm": 0.00012103471817681566, + "learning_rate": 1.0992240124609452e-05, + "loss": 0.0002, + "step": 1686 + }, + { + "epoch": 2.15744, + "grad_norm": 0.0001908185367938131, + "learning_rate": 1.0972583824912993e-05, + "loss": 0.0002, + "step": 1687 + }, + { + "epoch": 2.15872, + "grad_norm": 0.0002593412937130779, + "learning_rate": 1.0952948553830063e-05, + "loss": 0.0002, + "step": 1688 + }, + { + "epoch": 2.16, + "grad_norm": 0.0004132689500693232, + "learning_rate": 1.093333434880394e-05, + "loss": 0.0002, + "step": 1689 + }, + { + "epoch": 2.16128, + "grad_norm": 0.000668655673507601, + "learning_rate": 1.0913741247237724e-05, + "loss": 0.0002, + "step": 1690 + }, + { + "epoch": 2.16256, + "grad_norm": 0.0007550397422164679, + "learning_rate": 1.0894169286494263e-05, + "loss": 0.0002, + "step": 1691 + }, + { + "epoch": 2.16384, + "grad_norm": 0.0006890227668918669, + "learning_rate": 1.0874618503896099e-05, + "loss": 0.0002, + "step": 1692 + }, + { + "epoch": 2.16512, + "grad_norm": 0.0010281478753313422, + "learning_rate": 1.08550889367254e-05, + "loss": 0.0002, + "step": 1693 + }, + { + "epoch": 2.1664, + "grad_norm": 0.0003120306064374745, + "learning_rate": 1.083558062222385e-05, + "loss": 0.0002, + "step": 1694 + }, + { + "epoch": 2.16768, + "grad_norm": 0.0009151713456958532, + "learning_rate": 1.0816093597592627e-05, + "loss": 0.0002, + "step": 1695 + }, + { + "epoch": 2.16896, + "grad_norm": 0.0002048113674391061, + "learning_rate": 1.0796627899992313e-05, + "loss": 0.0002, + "step": 1696 + }, + { + "epoch": 2.17024, + "grad_norm": 0.0007615440990775824, + "learning_rate": 1.0777183566542799e-05, + "loss": 0.0002, + "step": 1697 + }, + { + "epoch": 2.17152, + "grad_norm": 0.00036416234797798097, + "learning_rate": 1.075776063432326e-05, + "loss": 0.0002, + "step": 1698 + }, + { + "epoch": 2.1728, + "grad_norm": 0.0003339081595186144, + "learning_rate": 1.0738359140372033e-05, + "loss": 0.0002, + "step": 1699 + }, + { + "epoch": 2.17408, + "grad_norm": 0.00048636324936524034, + "learning_rate": 1.0718979121686613e-05, + "loss": 0.0002, + "step": 1700 + }, + { + "epoch": 2.17536, + "grad_norm": 0.00012778060045093298, + "learning_rate": 1.0699620615223499e-05, + "loss": 0.0002, + "step": 1701 + }, + { + "epoch": 2.17664, + "grad_norm": 0.0005591086810454726, + "learning_rate": 1.0680283657898205e-05, + "loss": 0.0002, + "step": 1702 + }, + { + "epoch": 2.17792, + "grad_norm": 0.00011497444211272523, + "learning_rate": 1.066096828658512e-05, + "loss": 0.0002, + "step": 1703 + }, + { + "epoch": 2.1792, + "grad_norm": 0.00012892419181298465, + "learning_rate": 1.0641674538117491e-05, + "loss": 0.0002, + "step": 1704 + }, + { + "epoch": 2.18048, + "grad_norm": 0.0007491412106901407, + "learning_rate": 1.0622402449287335e-05, + "loss": 0.0002, + "step": 1705 + }, + { + "epoch": 2.18176, + "grad_norm": 0.0006464109173975885, + "learning_rate": 1.0603152056845339e-05, + "loss": 0.0002, + "step": 1706 + }, + { + "epoch": 2.18304, + "grad_norm": 0.00023252949176821858, + "learning_rate": 1.0583923397500841e-05, + "loss": 0.0002, + "step": 1707 + }, + { + "epoch": 2.18432, + "grad_norm": 0.0005042919074185193, + "learning_rate": 1.056471650792174e-05, + "loss": 0.0002, + "step": 1708 + }, + { + "epoch": 2.1856, + "grad_norm": 0.0007390492246486247, + "learning_rate": 1.0545531424734387e-05, + "loss": 0.0002, + "step": 1709 + }, + { + "epoch": 2.18688, + "grad_norm": 0.0006836414686404169, + "learning_rate": 1.0526368184523585e-05, + "loss": 0.0002, + "step": 1710 + }, + { + "epoch": 2.18688, + "eval_loss": 1.1836718320846558, + "eval_runtime": 43.5042, + "eval_samples_per_second": 11.539, + "eval_steps_per_second": 1.448, + "step": 1710 + }, + { + "epoch": 2.18816, + "grad_norm": 0.0005014064372517169, + "learning_rate": 1.0507226823832469e-05, + "loss": 0.0002, + "step": 1711 + }, + { + "epoch": 2.18944, + "grad_norm": 0.00038360522012226284, + "learning_rate": 1.0488107379162453e-05, + "loss": 0.0002, + "step": 1712 + }, + { + "epoch": 2.19072, + "grad_norm": 0.0002872354525607079, + "learning_rate": 1.046900988697316e-05, + "loss": 0.0002, + "step": 1713 + }, + { + "epoch": 2.192, + "grad_norm": 0.00022756788530386984, + "learning_rate": 1.0449934383682328e-05, + "loss": 0.0002, + "step": 1714 + }, + { + "epoch": 2.19328, + "grad_norm": 0.000334077951265499, + "learning_rate": 1.0430880905665814e-05, + "loss": 0.0002, + "step": 1715 + }, + { + "epoch": 2.19456, + "grad_norm": 0.00034818786662071943, + "learning_rate": 1.0411849489257428e-05, + "loss": 0.0002, + "step": 1716 + }, + { + "epoch": 2.19584, + "grad_norm": 0.000506854965351522, + "learning_rate": 1.0392840170748934e-05, + "loss": 0.0002, + "step": 1717 + }, + { + "epoch": 2.19712, + "grad_norm": 0.0004816302680410445, + "learning_rate": 1.0373852986389934e-05, + "loss": 0.0002, + "step": 1718 + }, + { + "epoch": 2.1984, + "grad_norm": 0.0006793339271098375, + "learning_rate": 1.0354887972387858e-05, + "loss": 0.0002, + "step": 1719 + }, + { + "epoch": 2.19968, + "grad_norm": 0.00035194880911149085, + "learning_rate": 1.0335945164907827e-05, + "loss": 0.0002, + "step": 1720 + }, + { + "epoch": 2.20096, + "grad_norm": 0.00023243727628141642, + "learning_rate": 1.0317024600072618e-05, + "loss": 0.0002, + "step": 1721 + }, + { + "epoch": 2.20224, + "grad_norm": 0.0008212224929593503, + "learning_rate": 1.0298126313962604e-05, + "loss": 0.0002, + "step": 1722 + }, + { + "epoch": 2.20352, + "grad_norm": 0.00029432986048050225, + "learning_rate": 1.0279250342615676e-05, + "loss": 0.0002, + "step": 1723 + }, + { + "epoch": 2.2048, + "grad_norm": 0.00025335565442219377, + "learning_rate": 1.0260396722027168e-05, + "loss": 0.0002, + "step": 1724 + }, + { + "epoch": 2.20608, + "grad_norm": 0.0008946171146817505, + "learning_rate": 1.0241565488149781e-05, + "loss": 0.0002, + "step": 1725 + }, + { + "epoch": 2.20736, + "grad_norm": 0.0006439693388529122, + "learning_rate": 1.0222756676893536e-05, + "loss": 0.0002, + "step": 1726 + }, + { + "epoch": 2.20864, + "grad_norm": 0.000815407547634095, + "learning_rate": 1.0203970324125708e-05, + "loss": 0.0002, + "step": 1727 + }, + { + "epoch": 2.20992, + "grad_norm": 0.0002080225822282955, + "learning_rate": 1.0185206465670712e-05, + "loss": 0.0002, + "step": 1728 + }, + { + "epoch": 2.2112, + "grad_norm": 0.0007100445800460875, + "learning_rate": 1.01664651373101e-05, + "loss": 0.0002, + "step": 1729 + }, + { + "epoch": 2.2124800000000002, + "grad_norm": 0.000299918872769922, + "learning_rate": 1.0147746374782445e-05, + "loss": 0.0002, + "step": 1730 + }, + { + "epoch": 2.21376, + "grad_norm": 0.0006271378370001912, + "learning_rate": 1.0129050213783299e-05, + "loss": 0.0002, + "step": 1731 + }, + { + "epoch": 2.21504, + "grad_norm": 0.0003948029479943216, + "learning_rate": 1.0110376689965099e-05, + "loss": 0.0002, + "step": 1732 + }, + { + "epoch": 2.21632, + "grad_norm": 0.00014382618246600032, + "learning_rate": 1.0091725838937112e-05, + "loss": 0.0002, + "step": 1733 + }, + { + "epoch": 2.2176, + "grad_norm": 0.0008930534822866321, + "learning_rate": 1.0073097696265398e-05, + "loss": 0.0002, + "step": 1734 + }, + { + "epoch": 2.21888, + "grad_norm": 0.0003659480717033148, + "learning_rate": 1.0054492297472684e-05, + "loss": 0.0002, + "step": 1735 + }, + { + "epoch": 2.22016, + "grad_norm": 0.00012835237430408597, + "learning_rate": 1.0035909678038344e-05, + "loss": 0.0002, + "step": 1736 + }, + { + "epoch": 2.22144, + "grad_norm": 0.0002881828404497355, + "learning_rate": 1.0017349873398295e-05, + "loss": 0.0002, + "step": 1737 + }, + { + "epoch": 2.22272, + "grad_norm": 0.00011149987403769046, + "learning_rate": 9.998812918944963e-06, + "loss": 0.0002, + "step": 1738 + }, + { + "epoch": 2.224, + "grad_norm": 0.00010650986223481596, + "learning_rate": 9.980298850027201e-06, + "loss": 0.0002, + "step": 1739 + }, + { + "epoch": 2.22528, + "grad_norm": 0.0003959020250476897, + "learning_rate": 9.961807701950207e-06, + "loss": 0.0002, + "step": 1740 + }, + { + "epoch": 2.22528, + "eval_loss": 1.1779879331588745, + "eval_runtime": 43.2666, + "eval_samples_per_second": 11.602, + "eval_steps_per_second": 1.456, + "step": 1740 + }, + { + "epoch": 2.22656, + "grad_norm": 0.0005422495305538177, + "learning_rate": 9.943339509975481e-06, + "loss": 0.0002, + "step": 1741 + }, + { + "epoch": 2.22784, + "grad_norm": 0.0004364894703030586, + "learning_rate": 9.924894309320754e-06, + "loss": 0.0002, + "step": 1742 + }, + { + "epoch": 2.22912, + "grad_norm": 0.0008182907477021217, + "learning_rate": 9.906472135159886e-06, + "loss": 0.0002, + "step": 1743 + }, + { + "epoch": 2.2304, + "grad_norm": 0.0003204997337888926, + "learning_rate": 9.888073022622858e-06, + "loss": 0.0002, + "step": 1744 + }, + { + "epoch": 2.23168, + "grad_norm": 0.0007177082588896155, + "learning_rate": 9.869697006795657e-06, + "loss": 0.0002, + "step": 1745 + }, + { + "epoch": 2.23296, + "grad_norm": 0.00014134106459096074, + "learning_rate": 9.851344122720236e-06, + "loss": 0.0002, + "step": 1746 + }, + { + "epoch": 2.23424, + "grad_norm": 0.0009650987340137362, + "learning_rate": 9.833014405394423e-06, + "loss": 0.0002, + "step": 1747 + }, + { + "epoch": 2.23552, + "grad_norm": 0.00016414733545389026, + "learning_rate": 9.81470788977188e-06, + "loss": 0.0002, + "step": 1748 + }, + { + "epoch": 2.2368, + "grad_norm": 0.0002189509687013924, + "learning_rate": 9.79642461076203e-06, + "loss": 0.0002, + "step": 1749 + }, + { + "epoch": 2.23808, + "grad_norm": 0.001173550495877862, + "learning_rate": 9.778164603229959e-06, + "loss": 0.0002, + "step": 1750 + }, + { + "epoch": 2.23936, + "grad_norm": 0.0001450089766876772, + "learning_rate": 9.75992790199641e-06, + "loss": 0.0002, + "step": 1751 + }, + { + "epoch": 2.24064, + "grad_norm": 0.001029281527735293, + "learning_rate": 9.74171454183765e-06, + "loss": 0.0002, + "step": 1752 + }, + { + "epoch": 2.24192, + "grad_norm": 0.0006216371548362076, + "learning_rate": 9.72352455748547e-06, + "loss": 0.0002, + "step": 1753 + }, + { + "epoch": 2.2432, + "grad_norm": 0.00053853151621297, + "learning_rate": 9.705357983627057e-06, + "loss": 0.0002, + "step": 1754 + }, + { + "epoch": 2.24448, + "grad_norm": 0.0008826253470033407, + "learning_rate": 9.687214854904959e-06, + "loss": 0.0002, + "step": 1755 + }, + { + "epoch": 2.24576, + "grad_norm": 0.00010876509622903541, + "learning_rate": 9.66909520591703e-06, + "loss": 0.0002, + "step": 1756 + }, + { + "epoch": 2.24704, + "grad_norm": 0.0006186162354424596, + "learning_rate": 9.650999071216338e-06, + "loss": 0.0002, + "step": 1757 + }, + { + "epoch": 2.24832, + "grad_norm": 0.00030843057902529836, + "learning_rate": 9.632926485311119e-06, + "loss": 0.0002, + "step": 1758 + }, + { + "epoch": 2.2496, + "grad_norm": 0.0003102632472291589, + "learning_rate": 9.614877482664688e-06, + "loss": 0.0002, + "step": 1759 + }, + { + "epoch": 2.25088, + "grad_norm": 0.0004681867139879614, + "learning_rate": 9.596852097695404e-06, + "loss": 0.0002, + "step": 1760 + }, + { + "epoch": 2.25216, + "grad_norm": 0.0002742313372436911, + "learning_rate": 9.578850364776588e-06, + "loss": 0.0002, + "step": 1761 + }, + { + "epoch": 2.25344, + "grad_norm": 0.0005104625597596169, + "learning_rate": 9.560872318236437e-06, + "loss": 0.0002, + "step": 1762 + }, + { + "epoch": 2.25472, + "grad_norm": 0.0002644178457558155, + "learning_rate": 9.542917992358008e-06, + "loss": 0.0002, + "step": 1763 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 0.00029693092801608145, + "learning_rate": 9.524987421379105e-06, + "loss": 0.0002, + "step": 1764 + }, + { + "epoch": 2.25728, + "grad_norm": 0.00042807578574866056, + "learning_rate": 9.507080639492248e-06, + "loss": 0.0002, + "step": 1765 + }, + { + "epoch": 2.25856, + "grad_norm": 0.0006727955769747496, + "learning_rate": 9.489197680844575e-06, + "loss": 0.0002, + "step": 1766 + }, + { + "epoch": 2.25984, + "grad_norm": 0.0006015878752805293, + "learning_rate": 9.471338579537794e-06, + "loss": 0.0002, + "step": 1767 + }, + { + "epoch": 2.26112, + "grad_norm": 0.0007681053830310702, + "learning_rate": 9.45350336962815e-06, + "loss": 0.0002, + "step": 1768 + }, + { + "epoch": 2.2624, + "grad_norm": 0.00019292229262646288, + "learning_rate": 9.435692085126286e-06, + "loss": 0.0002, + "step": 1769 + }, + { + "epoch": 2.26368, + "grad_norm": 0.00048759227502159774, + "learning_rate": 9.417904759997257e-06, + "loss": 0.0002, + "step": 1770 + }, + { + "epoch": 2.26368, + "eval_loss": 1.1767559051513672, + "eval_runtime": 43.1357, + "eval_samples_per_second": 11.638, + "eval_steps_per_second": 1.461, + "step": 1770 + }, + { + "epoch": 2.26496, + "grad_norm": 0.0008960186969488859, + "learning_rate": 9.400141428160397e-06, + "loss": 0.0002, + "step": 1771 + }, + { + "epoch": 2.26624, + "grad_norm": 0.00046001203008927405, + "learning_rate": 9.382402123489312e-06, + "loss": 0.0002, + "step": 1772 + }, + { + "epoch": 2.26752, + "grad_norm": 0.0006008041673339903, + "learning_rate": 9.364686879811781e-06, + "loss": 0.0002, + "step": 1773 + }, + { + "epoch": 2.2688, + "grad_norm": 0.0003917776921298355, + "learning_rate": 9.346995730909692e-06, + "loss": 0.0002, + "step": 1774 + }, + { + "epoch": 2.27008, + "grad_norm": 0.00011662714678095654, + "learning_rate": 9.329328710518992e-06, + "loss": 0.0002, + "step": 1775 + }, + { + "epoch": 2.27136, + "grad_norm": 0.000828468007966876, + "learning_rate": 9.311685852329622e-06, + "loss": 0.0002, + "step": 1776 + }, + { + "epoch": 2.27264, + "grad_norm": 0.0005161363515071571, + "learning_rate": 9.294067189985446e-06, + "loss": 0.0002, + "step": 1777 + }, + { + "epoch": 2.27392, + "grad_norm": 0.000594046083278954, + "learning_rate": 9.276472757084176e-06, + "loss": 0.0002, + "step": 1778 + }, + { + "epoch": 2.2752, + "grad_norm": 0.0010922584915533662, + "learning_rate": 9.258902587177328e-06, + "loss": 0.0002, + "step": 1779 + }, + { + "epoch": 2.27648, + "grad_norm": 0.0004009727854281664, + "learning_rate": 9.241356713770157e-06, + "loss": 0.0002, + "step": 1780 + }, + { + "epoch": 2.27776, + "grad_norm": 0.0010642922716215253, + "learning_rate": 9.22383517032157e-06, + "loss": 0.0002, + "step": 1781 + }, + { + "epoch": 2.27904, + "grad_norm": 0.00014497771917376667, + "learning_rate": 9.206337990244096e-06, + "loss": 0.0002, + "step": 1782 + }, + { + "epoch": 2.28032, + "grad_norm": 0.00046217776252888143, + "learning_rate": 9.188865206903788e-06, + "loss": 0.0002, + "step": 1783 + }, + { + "epoch": 2.2816, + "grad_norm": 0.0007038525654934347, + "learning_rate": 9.171416853620184e-06, + "loss": 0.0002, + "step": 1784 + }, + { + "epoch": 2.28288, + "grad_norm": 0.0004476465692277998, + "learning_rate": 9.153992963666247e-06, + "loss": 0.0002, + "step": 1785 + }, + { + "epoch": 2.28416, + "grad_norm": 0.00021213979925960302, + "learning_rate": 9.136593570268261e-06, + "loss": 0.0002, + "step": 1786 + }, + { + "epoch": 2.28544, + "grad_norm": 0.00044914140016771853, + "learning_rate": 9.119218706605823e-06, + "loss": 0.0002, + "step": 1787 + }, + { + "epoch": 2.28672, + "grad_norm": 0.0005167127237655222, + "learning_rate": 9.10186840581174e-06, + "loss": 0.0002, + "step": 1788 + }, + { + "epoch": 2.288, + "grad_norm": 0.00013319379650056362, + "learning_rate": 9.084542700971993e-06, + "loss": 0.0002, + "step": 1789 + }, + { + "epoch": 2.2892799999999998, + "grad_norm": 0.0002639666781760752, + "learning_rate": 9.067241625125635e-06, + "loss": 0.0002, + "step": 1790 + }, + { + "epoch": 2.29056, + "grad_norm": 0.0002221256581833586, + "learning_rate": 9.049965211264773e-06, + "loss": 0.0002, + "step": 1791 + }, + { + "epoch": 2.29184, + "grad_norm": 0.0003682387759909034, + "learning_rate": 9.03271349233449e-06, + "loss": 0.0002, + "step": 1792 + }, + { + "epoch": 2.29312, + "grad_norm": 0.00022319938580039889, + "learning_rate": 9.015486501232751e-06, + "loss": 0.0002, + "step": 1793 + }, + { + "epoch": 2.2944, + "grad_norm": 0.00020367535762488842, + "learning_rate": 8.998284270810392e-06, + "loss": 0.0002, + "step": 1794 + }, + { + "epoch": 2.29568, + "grad_norm": 0.00025371191441081464, + "learning_rate": 8.981106833871025e-06, + "loss": 0.0002, + "step": 1795 + }, + { + "epoch": 2.29696, + "grad_norm": 0.00015242303197737783, + "learning_rate": 8.963954223170972e-06, + "loss": 0.0002, + "step": 1796 + }, + { + "epoch": 2.29824, + "grad_norm": 0.0005362336523830891, + "learning_rate": 8.94682647141923e-06, + "loss": 0.0002, + "step": 1797 + }, + { + "epoch": 2.2995200000000002, + "grad_norm": 0.0002387440181337297, + "learning_rate": 8.929723611277366e-06, + "loss": 0.0002, + "step": 1798 + }, + { + "epoch": 2.3008, + "grad_norm": 0.0001875937741715461, + "learning_rate": 8.91264567535952e-06, + "loss": 0.0002, + "step": 1799 + }, + { + "epoch": 2.30208, + "grad_norm": 0.0002941049460787326, + "learning_rate": 8.89559269623226e-06, + "loss": 0.0002, + "step": 1800 + }, + { + "epoch": 2.30208, + "eval_loss": 1.1789199113845825, + "eval_runtime": 43.3392, + "eval_samples_per_second": 11.583, + "eval_steps_per_second": 1.454, + "step": 1800 + }, + { + "epoch": 2.30336, + "grad_norm": 0.00032305315835401416, + "learning_rate": 8.878564706414597e-06, + "loss": 0.0002, + "step": 1801 + }, + { + "epoch": 2.30464, + "grad_norm": 0.00010795753041747957, + "learning_rate": 8.861561738377861e-06, + "loss": 0.0002, + "step": 1802 + }, + { + "epoch": 2.30592, + "grad_norm": 0.00043354270746931434, + "learning_rate": 8.844583824545688e-06, + "loss": 0.0002, + "step": 1803 + }, + { + "epoch": 2.3072, + "grad_norm": 0.00028120525530539453, + "learning_rate": 8.827630997293933e-06, + "loss": 0.0002, + "step": 1804 + }, + { + "epoch": 2.30848, + "grad_norm": 0.0008327866671606898, + "learning_rate": 8.810703288950596e-06, + "loss": 0.0002, + "step": 1805 + }, + { + "epoch": 2.30976, + "grad_norm": 0.0002845035633072257, + "learning_rate": 8.7938007317958e-06, + "loss": 0.0002, + "step": 1806 + }, + { + "epoch": 2.31104, + "grad_norm": 0.000504064722917974, + "learning_rate": 8.7769233580617e-06, + "loss": 0.0002, + "step": 1807 + }, + { + "epoch": 2.31232, + "grad_norm": 0.0002677366719581187, + "learning_rate": 8.76007119993241e-06, + "loss": 0.0002, + "step": 1808 + }, + { + "epoch": 2.3136, + "grad_norm": 0.00022501955390907824, + "learning_rate": 8.74324428954398e-06, + "loss": 0.0002, + "step": 1809 + }, + { + "epoch": 2.31488, + "grad_norm": 0.0006158575415611267, + "learning_rate": 8.72644265898431e-06, + "loss": 0.0002, + "step": 1810 + }, + { + "epoch": 2.31616, + "grad_norm": 0.00011165841715410352, + "learning_rate": 8.70966634029309e-06, + "loss": 0.0002, + "step": 1811 + }, + { + "epoch": 2.31744, + "grad_norm": 0.0008404940599575639, + "learning_rate": 8.692915365461739e-06, + "loss": 0.0002, + "step": 1812 + }, + { + "epoch": 2.31872, + "grad_norm": 0.0002906533482018858, + "learning_rate": 8.67618976643334e-06, + "loss": 0.0002, + "step": 1813 + }, + { + "epoch": 2.32, + "grad_norm": 0.0006887954659759998, + "learning_rate": 8.659489575102617e-06, + "loss": 0.0002, + "step": 1814 + }, + { + "epoch": 2.32128, + "grad_norm": 0.00022589090804103762, + "learning_rate": 8.642814823315804e-06, + "loss": 0.0002, + "step": 1815 + }, + { + "epoch": 2.32256, + "grad_norm": 0.0006809495971538126, + "learning_rate": 8.626165542870651e-06, + "loss": 0.0002, + "step": 1816 + }, + { + "epoch": 2.32384, + "grad_norm": 0.0008959157858043909, + "learning_rate": 8.60954176551631e-06, + "loss": 0.0002, + "step": 1817 + }, + { + "epoch": 2.32512, + "grad_norm": 0.0008075773948803544, + "learning_rate": 8.592943522953339e-06, + "loss": 0.0002, + "step": 1818 + }, + { + "epoch": 2.3264, + "grad_norm": 0.00017589032358955592, + "learning_rate": 8.576370846833567e-06, + "loss": 0.0002, + "step": 1819 + }, + { + "epoch": 2.32768, + "grad_norm": 0.00101184262894094, + "learning_rate": 8.559823768760078e-06, + "loss": 0.0002, + "step": 1820 + }, + { + "epoch": 2.32896, + "grad_norm": 0.00037295141373761, + "learning_rate": 8.543302320287151e-06, + "loss": 0.0002, + "step": 1821 + }, + { + "epoch": 2.33024, + "grad_norm": 0.000557410647161305, + "learning_rate": 8.52680653292018e-06, + "loss": 0.0002, + "step": 1822 + }, + { + "epoch": 2.33152, + "grad_norm": 0.0007255637901835144, + "learning_rate": 8.510336438115643e-06, + "loss": 0.0002, + "step": 1823 + }, + { + "epoch": 2.3327999999999998, + "grad_norm": 0.0003043143660761416, + "learning_rate": 8.493892067281e-06, + "loss": 0.0002, + "step": 1824 + }, + { + "epoch": 2.33408, + "grad_norm": 0.0008695904980413616, + "learning_rate": 8.477473451774671e-06, + "loss": 0.0002, + "step": 1825 + }, + { + "epoch": 2.33536, + "grad_norm": 0.0004969065776094794, + "learning_rate": 8.461080622905966e-06, + "loss": 0.0002, + "step": 1826 + }, + { + "epoch": 2.33664, + "grad_norm": 0.000555870239622891, + "learning_rate": 8.444713611935005e-06, + "loss": 0.0002, + "step": 1827 + }, + { + "epoch": 2.33792, + "grad_norm": 0.0005065601435489953, + "learning_rate": 8.42837245007269e-06, + "loss": 0.0002, + "step": 1828 + }, + { + "epoch": 2.3392, + "grad_norm": 0.00013234195648692548, + "learning_rate": 8.412057168480623e-06, + "loss": 0.0002, + "step": 1829 + }, + { + "epoch": 2.34048, + "grad_norm": 0.0008337848703376949, + "learning_rate": 8.395767798271065e-06, + "loss": 0.0002, + "step": 1830 + }, + { + "epoch": 2.34048, + "eval_loss": 1.1797373294830322, + "eval_runtime": 43.3554, + "eval_samples_per_second": 11.579, + "eval_steps_per_second": 1.453, + "step": 1830 + }, + { + "epoch": 2.34176, + "grad_norm": 0.00016388027870561928, + "learning_rate": 8.379504370506847e-06, + "loss": 0.0002, + "step": 1831 + }, + { + "epoch": 2.3430400000000002, + "grad_norm": 0.0010740353027358651, + "learning_rate": 8.363266916201331e-06, + "loss": 0.0002, + "step": 1832 + }, + { + "epoch": 2.34432, + "grad_norm": 0.00013404090714175254, + "learning_rate": 8.34705546631838e-06, + "loss": 0.0002, + "step": 1833 + }, + { + "epoch": 2.3456, + "grad_norm": 0.0005119265988469124, + "learning_rate": 8.330870051772226e-06, + "loss": 0.0002, + "step": 1834 + }, + { + "epoch": 2.34688, + "grad_norm": 0.000775667664129287, + "learning_rate": 8.314710703427487e-06, + "loss": 0.0002, + "step": 1835 + }, + { + "epoch": 2.34816, + "grad_norm": 0.00010712215589592233, + "learning_rate": 8.29857745209904e-06, + "loss": 0.0002, + "step": 1836 + }, + { + "epoch": 2.34944, + "grad_norm": 0.0007510327268391848, + "learning_rate": 8.282470328552037e-06, + "loss": 0.0002, + "step": 1837 + }, + { + "epoch": 2.35072, + "grad_norm": 0.0007226445595733821, + "learning_rate": 8.266389363501775e-06, + "loss": 0.0002, + "step": 1838 + }, + { + "epoch": 2.352, + "grad_norm": 0.0006508962833322585, + "learning_rate": 8.250334587613677e-06, + "loss": 0.0002, + "step": 1839 + }, + { + "epoch": 2.35328, + "grad_norm": 0.0007670673076063395, + "learning_rate": 8.23430603150323e-06, + "loss": 0.0002, + "step": 1840 + }, + { + "epoch": 2.35456, + "grad_norm": 0.0002744818921200931, + "learning_rate": 8.21830372573592e-06, + "loss": 0.0002, + "step": 1841 + }, + { + "epoch": 2.35584, + "grad_norm": 0.0005801216466352344, + "learning_rate": 8.202327700827171e-06, + "loss": 0.0002, + "step": 1842 + }, + { + "epoch": 2.35712, + "grad_norm": 0.0003591123386286199, + "learning_rate": 8.18637798724229e-06, + "loss": 0.0002, + "step": 1843 + }, + { + "epoch": 2.3584, + "grad_norm": 0.00020390120334923267, + "learning_rate": 8.170454615396412e-06, + "loss": 0.0002, + "step": 1844 + }, + { + "epoch": 2.35968, + "grad_norm": 0.0007962146191857755, + "learning_rate": 8.154557615654455e-06, + "loss": 0.0002, + "step": 1845 + }, + { + "epoch": 2.36096, + "grad_norm": 0.0001696029503364116, + "learning_rate": 8.138687018331016e-06, + "loss": 0.0002, + "step": 1846 + }, + { + "epoch": 2.36224, + "grad_norm": 0.0004607769660651684, + "learning_rate": 8.122842853690368e-06, + "loss": 0.0002, + "step": 1847 + }, + { + "epoch": 2.36352, + "grad_norm": 0.0005677232402376831, + "learning_rate": 8.107025151946376e-06, + "loss": 0.0002, + "step": 1848 + }, + { + "epoch": 2.3648, + "grad_norm": 0.00037837939453311265, + "learning_rate": 8.091233943262426e-06, + "loss": 0.0002, + "step": 1849 + }, + { + "epoch": 2.36608, + "grad_norm": 0.0009469679207541049, + "learning_rate": 8.075469257751409e-06, + "loss": 0.0002, + "step": 1850 + }, + { + "epoch": 2.36736, + "grad_norm": 0.0005874616326764226, + "learning_rate": 8.059731125475605e-06, + "loss": 0.0002, + "step": 1851 + }, + { + "epoch": 2.36864, + "grad_norm": 0.0008352144504897296, + "learning_rate": 8.044019576446698e-06, + "loss": 0.0002, + "step": 1852 + }, + { + "epoch": 2.36992, + "grad_norm": 0.00018226521206088364, + "learning_rate": 8.02833464062565e-06, + "loss": 0.0002, + "step": 1853 + }, + { + "epoch": 2.3712, + "grad_norm": 0.00012786927982233465, + "learning_rate": 8.012676347922676e-06, + "loss": 0.0002, + "step": 1854 + }, + { + "epoch": 2.37248, + "grad_norm": 0.0005658522713929415, + "learning_rate": 7.997044728197196e-06, + "loss": 0.0002, + "step": 1855 + }, + { + "epoch": 2.37376, + "grad_norm": 0.00010807412036228925, + "learning_rate": 7.981439811257758e-06, + "loss": 0.0002, + "step": 1856 + }, + { + "epoch": 2.37504, + "grad_norm": 0.0005290827830322087, + "learning_rate": 7.965861626861999e-06, + "loss": 0.0002, + "step": 1857 + }, + { + "epoch": 2.3763199999999998, + "grad_norm": 0.00011951862688874826, + "learning_rate": 7.950310204716562e-06, + "loss": 0.0002, + "step": 1858 + }, + { + "epoch": 2.3776, + "grad_norm": 0.00024504747125320137, + "learning_rate": 7.934785574477069e-06, + "loss": 0.0002, + "step": 1859 + }, + { + "epoch": 2.37888, + "grad_norm": 0.00025921399355866015, + "learning_rate": 7.919287765748052e-06, + "loss": 0.0002, + "step": 1860 + }, + { + "epoch": 2.37888, + "eval_loss": 1.1839673519134521, + "eval_runtime": 43.5719, + "eval_samples_per_second": 11.521, + "eval_steps_per_second": 1.446, + "step": 1860 + }, + { + "epoch": 2.38016, + "grad_norm": 0.0006775157526135445, + "learning_rate": 7.903816808082883e-06, + "loss": 0.0002, + "step": 1861 + }, + { + "epoch": 2.38144, + "grad_norm": 0.0004763702745549381, + "learning_rate": 7.888372730983744e-06, + "loss": 0.0002, + "step": 1862 + }, + { + "epoch": 2.38272, + "grad_norm": 0.00022730005730409175, + "learning_rate": 7.872955563901554e-06, + "loss": 0.0002, + "step": 1863 + }, + { + "epoch": 2.384, + "grad_norm": 0.0005117187974974513, + "learning_rate": 7.85756533623592e-06, + "loss": 0.0002, + "step": 1864 + }, + { + "epoch": 2.38528, + "grad_norm": 0.00012259108189027756, + "learning_rate": 7.842202077335063e-06, + "loss": 0.0002, + "step": 1865 + }, + { + "epoch": 2.3865600000000002, + "grad_norm": 0.0010777805000543594, + "learning_rate": 7.826865816495784e-06, + "loss": 0.0002, + "step": 1866 + }, + { + "epoch": 2.38784, + "grad_norm": 0.0001915521133923903, + "learning_rate": 7.811556582963415e-06, + "loss": 0.0002, + "step": 1867 + }, + { + "epoch": 2.38912, + "grad_norm": 0.0004820209287572652, + "learning_rate": 7.796274405931725e-06, + "loss": 0.0002, + "step": 1868 + }, + { + "epoch": 2.3904, + "grad_norm": 0.0007654948276467621, + "learning_rate": 7.781019314542904e-06, + "loss": 0.0002, + "step": 1869 + }, + { + "epoch": 2.39168, + "grad_norm": 0.0003211789298802614, + "learning_rate": 7.765791337887477e-06, + "loss": 0.0002, + "step": 1870 + }, + { + "epoch": 2.39296, + "grad_norm": 0.00039891450433060527, + "learning_rate": 7.750590505004278e-06, + "loss": 0.0002, + "step": 1871 + }, + { + "epoch": 2.39424, + "grad_norm": 0.0004278056148905307, + "learning_rate": 7.735416844880377e-06, + "loss": 0.0002, + "step": 1872 + }, + { + "epoch": 2.39552, + "grad_norm": 0.00041603570571169257, + "learning_rate": 7.720270386451012e-06, + "loss": 0.0002, + "step": 1873 + }, + { + "epoch": 2.3968, + "grad_norm": 0.0001487014233134687, + "learning_rate": 7.705151158599568e-06, + "loss": 0.0002, + "step": 1874 + }, + { + "epoch": 2.39808, + "grad_norm": 0.0007464838563464582, + "learning_rate": 7.690059190157493e-06, + "loss": 0.0002, + "step": 1875 + }, + { + "epoch": 2.39936, + "grad_norm": 0.0002430266613373533, + "learning_rate": 7.67499450990426e-06, + "loss": 0.0002, + "step": 1876 + }, + { + "epoch": 2.40064, + "grad_norm": 0.00044445652747526765, + "learning_rate": 7.659957146567292e-06, + "loss": 0.0002, + "step": 1877 + }, + { + "epoch": 2.40192, + "grad_norm": 0.000578554579988122, + "learning_rate": 7.644947128821931e-06, + "loss": 0.0002, + "step": 1878 + }, + { + "epoch": 2.4032, + "grad_norm": 0.00038803924690000713, + "learning_rate": 7.629964485291377e-06, + "loss": 0.0002, + "step": 1879 + }, + { + "epoch": 2.40448, + "grad_norm": 0.0005333941080607474, + "learning_rate": 7.615009244546611e-06, + "loss": 0.0002, + "step": 1880 + }, + { + "epoch": 2.40576, + "grad_norm": 0.00010084795212605968, + "learning_rate": 7.600081435106375e-06, + "loss": 0.0002, + "step": 1881 + }, + { + "epoch": 2.40704, + "grad_norm": 0.00016836215218063444, + "learning_rate": 7.585181085437101e-06, + "loss": 0.0002, + "step": 1882 + }, + { + "epoch": 2.40832, + "grad_norm": 0.0006275451160036027, + "learning_rate": 7.570308223952843e-06, + "loss": 0.0002, + "step": 1883 + }, + { + "epoch": 2.4096, + "grad_norm": 0.000125435835798271, + "learning_rate": 7.55546287901525e-06, + "loss": 0.0002, + "step": 1884 + }, + { + "epoch": 2.41088, + "grad_norm": 0.00022631444153375924, + "learning_rate": 7.540645078933491e-06, + "loss": 0.0002, + "step": 1885 + }, + { + "epoch": 2.41216, + "grad_norm": 0.00045561956358142197, + "learning_rate": 7.525854851964214e-06, + "loss": 0.0002, + "step": 1886 + }, + { + "epoch": 2.41344, + "grad_norm": 0.00031555627356283367, + "learning_rate": 7.511092226311482e-06, + "loss": 0.0002, + "step": 1887 + }, + { + "epoch": 2.41472, + "grad_norm": 0.0005395089974626899, + "learning_rate": 7.4963572301267364e-06, + "loss": 0.0002, + "step": 1888 + }, + { + "epoch": 2.416, + "grad_norm": 0.0007261098944582045, + "learning_rate": 7.481649891508706e-06, + "loss": 0.0002, + "step": 1889 + }, + { + "epoch": 2.41728, + "grad_norm": 0.00010826917423401028, + "learning_rate": 7.466970238503399e-06, + "loss": 0.0002, + "step": 1890 + }, + { + "epoch": 2.41728, + "eval_loss": 1.1747924089431763, + "eval_runtime": 43.0535, + "eval_samples_per_second": 11.66, + "eval_steps_per_second": 1.463, + "step": 1890 + }, + { + "epoch": 2.41856, + "grad_norm": 0.0011698183370754123, + "learning_rate": 7.45231829910403e-06, + "loss": 0.0002, + "step": 1891 + }, + { + "epoch": 2.4198399999999998, + "grad_norm": 0.0003063059411942959, + "learning_rate": 7.437694101250949e-06, + "loss": 0.0002, + "step": 1892 + }, + { + "epoch": 2.42112, + "grad_norm": 0.00037533161230385303, + "learning_rate": 7.423097672831616e-06, + "loss": 0.0002, + "step": 1893 + }, + { + "epoch": 2.4224, + "grad_norm": 0.0013914547162130475, + "learning_rate": 7.4085290416805385e-06, + "loss": 0.0002, + "step": 1894 + }, + { + "epoch": 2.42368, + "grad_norm": 0.0003663707757368684, + "learning_rate": 7.393988235579208e-06, + "loss": 0.0002, + "step": 1895 + }, + { + "epoch": 2.42496, + "grad_norm": 0.0010497913463041186, + "learning_rate": 7.379475282256059e-06, + "loss": 0.0002, + "step": 1896 + }, + { + "epoch": 2.42624, + "grad_norm": 0.00026213075034320354, + "learning_rate": 7.364990209386413e-06, + "loss": 0.0002, + "step": 1897 + }, + { + "epoch": 2.42752, + "grad_norm": 0.000615556025877595, + "learning_rate": 7.350533044592434e-06, + "loss": 0.0002, + "step": 1898 + }, + { + "epoch": 2.4288, + "grad_norm": 0.0003180488711223006, + "learning_rate": 7.33610381544305e-06, + "loss": 0.0002, + "step": 1899 + }, + { + "epoch": 2.4300800000000002, + "grad_norm": 0.00014440002269111574, + "learning_rate": 7.321702549453922e-06, + "loss": 0.0002, + "step": 1900 + }, + { + "epoch": 2.43136, + "grad_norm": 0.0003721504472196102, + "learning_rate": 7.307329274087394e-06, + "loss": 0.0002, + "step": 1901 + }, + { + "epoch": 2.43264, + "grad_norm": 0.00012881176371593028, + "learning_rate": 7.2929840167524295e-06, + "loss": 0.0002, + "step": 1902 + }, + { + "epoch": 2.43392, + "grad_norm": 0.00012962603068444878, + "learning_rate": 7.278666804804571e-06, + "loss": 0.0002, + "step": 1903 + }, + { + "epoch": 2.4352, + "grad_norm": 0.0001382453046971932, + "learning_rate": 7.264377665545861e-06, + "loss": 0.0002, + "step": 1904 + }, + { + "epoch": 2.43648, + "grad_norm": 0.00035853637382388115, + "learning_rate": 7.2501166262248265e-06, + "loss": 0.0002, + "step": 1905 + }, + { + "epoch": 2.43776, + "grad_norm": 0.00015113272820599377, + "learning_rate": 7.235883714036411e-06, + "loss": 0.0002, + "step": 1906 + }, + { + "epoch": 2.43904, + "grad_norm": 0.0004522831295616925, + "learning_rate": 7.221678956121903e-06, + "loss": 0.0002, + "step": 1907 + }, + { + "epoch": 2.44032, + "grad_norm": 0.00030768956639803946, + "learning_rate": 7.207502379568916e-06, + "loss": 0.0002, + "step": 1908 + }, + { + "epoch": 2.4416, + "grad_norm": 0.0003462491149548441, + "learning_rate": 7.1933540114113266e-06, + "loss": 0.0002, + "step": 1909 + }, + { + "epoch": 2.44288, + "grad_norm": 0.0003502328472677618, + "learning_rate": 7.179233878629217e-06, + "loss": 0.0002, + "step": 1910 + }, + { + "epoch": 2.44416, + "grad_norm": 0.00020817614858970046, + "learning_rate": 7.165142008148818e-06, + "loss": 0.0002, + "step": 1911 + }, + { + "epoch": 2.44544, + "grad_norm": 0.000591093732509762, + "learning_rate": 7.1510784268424635e-06, + "loss": 0.0002, + "step": 1912 + }, + { + "epoch": 2.44672, + "grad_norm": 0.000337381410645321, + "learning_rate": 7.137043161528567e-06, + "loss": 0.0002, + "step": 1913 + }, + { + "epoch": 2.448, + "grad_norm": 0.00018345269199926406, + "learning_rate": 7.123036238971517e-06, + "loss": 0.0002, + "step": 1914 + }, + { + "epoch": 2.44928, + "grad_norm": 0.0006533708074130118, + "learning_rate": 7.109057685881668e-06, + "loss": 0.0002, + "step": 1915 + }, + { + "epoch": 2.45056, + "grad_norm": 0.00033750629518181086, + "learning_rate": 7.095107528915265e-06, + "loss": 0.0002, + "step": 1916 + }, + { + "epoch": 2.45184, + "grad_norm": 0.0001718109124340117, + "learning_rate": 7.081185794674425e-06, + "loss": 0.0002, + "step": 1917 + }, + { + "epoch": 2.45312, + "grad_norm": 0.0006910200463607907, + "learning_rate": 7.0672925097070446e-06, + "loss": 0.0002, + "step": 1918 + }, + { + "epoch": 2.4544, + "grad_norm": 0.0003057036083191633, + "learning_rate": 7.053427700506767e-06, + "loss": 0.0002, + "step": 1919 + }, + { + "epoch": 2.45568, + "grad_norm": 0.0002737011236604303, + "learning_rate": 7.03959139351295e-06, + "loss": 0.0002, + "step": 1920 + }, + { + "epoch": 2.45568, + "eval_loss": 1.1830275058746338, + "eval_runtime": 43.4832, + "eval_samples_per_second": 11.545, + "eval_steps_per_second": 1.449, + "step": 1920 + }, + { + "epoch": 2.45696, + "grad_norm": 0.0003738821833394468, + "learning_rate": 7.02578361511059e-06, + "loss": 0.0002, + "step": 1921 + }, + { + "epoch": 2.45824, + "grad_norm": 0.00014257086149882525, + "learning_rate": 7.0120043916302885e-06, + "loss": 0.0002, + "step": 1922 + }, + { + "epoch": 2.45952, + "grad_norm": 0.00015353388153016567, + "learning_rate": 6.998253749348178e-06, + "loss": 0.0002, + "step": 1923 + }, + { + "epoch": 2.4608, + "grad_norm": 0.0002355447504669428, + "learning_rate": 6.984531714485905e-06, + "loss": 0.0002, + "step": 1924 + }, + { + "epoch": 2.46208, + "grad_norm": 0.00034142841468565166, + "learning_rate": 6.970838313210557e-06, + "loss": 0.0002, + "step": 1925 + }, + { + "epoch": 2.4633599999999998, + "grad_norm": 0.0005062466952949762, + "learning_rate": 6.957173571634617e-06, + "loss": 0.0002, + "step": 1926 + }, + { + "epoch": 2.46464, + "grad_norm": 0.0003805873275268823, + "learning_rate": 6.943537515815919e-06, + "loss": 0.0002, + "step": 1927 + }, + { + "epoch": 2.46592, + "grad_norm": 0.00024458515690639615, + "learning_rate": 6.929930171757595e-06, + "loss": 0.0002, + "step": 1928 + }, + { + "epoch": 2.4672, + "grad_norm": 0.0004816846631001681, + "learning_rate": 6.916351565408028e-06, + "loss": 0.0002, + "step": 1929 + }, + { + "epoch": 2.46848, + "grad_norm": 0.00022859709861222655, + "learning_rate": 6.902801722660793e-06, + "loss": 0.0002, + "step": 1930 + }, + { + "epoch": 2.46976, + "grad_norm": 0.0002492612402420491, + "learning_rate": 6.88928066935461e-06, + "loss": 0.0002, + "step": 1931 + }, + { + "epoch": 2.47104, + "grad_norm": 0.00032544921850785613, + "learning_rate": 6.875788431273322e-06, + "loss": 0.0002, + "step": 1932 + }, + { + "epoch": 2.47232, + "grad_norm": 0.0005566601175814867, + "learning_rate": 6.862325034145796e-06, + "loss": 0.0002, + "step": 1933 + }, + { + "epoch": 2.4736000000000002, + "grad_norm": 0.00034675264032557607, + "learning_rate": 6.848890503645924e-06, + "loss": 0.0002, + "step": 1934 + }, + { + "epoch": 2.47488, + "grad_norm": 0.0004174388595856726, + "learning_rate": 6.83548486539253e-06, + "loss": 0.0002, + "step": 1935 + }, + { + "epoch": 2.47616, + "grad_norm": 0.00031951870187185705, + "learning_rate": 6.822108144949359e-06, + "loss": 0.0002, + "step": 1936 + }, + { + "epoch": 2.47744, + "grad_norm": 0.0002985461615025997, + "learning_rate": 6.808760367825009e-06, + "loss": 0.0002, + "step": 1937 + }, + { + "epoch": 2.47872, + "grad_norm": 0.00015834596706554294, + "learning_rate": 6.79544155947287e-06, + "loss": 0.0002, + "step": 1938 + }, + { + "epoch": 2.48, + "grad_norm": 0.0006239261128939688, + "learning_rate": 6.782151745291108e-06, + "loss": 0.0002, + "step": 1939 + }, + { + "epoch": 2.48128, + "grad_norm": 0.00012638726911973208, + "learning_rate": 6.768890950622596e-06, + "loss": 0.0002, + "step": 1940 + }, + { + "epoch": 2.48256, + "grad_norm": 0.0003220559738110751, + "learning_rate": 6.755659200754856e-06, + "loss": 0.0002, + "step": 1941 + }, + { + "epoch": 2.48384, + "grad_norm": 0.00022451272525358945, + "learning_rate": 6.742456520920035e-06, + "loss": 0.0002, + "step": 1942 + }, + { + "epoch": 2.48512, + "grad_norm": 0.00015170875121839345, + "learning_rate": 6.729282936294846e-06, + "loss": 0.0002, + "step": 1943 + }, + { + "epoch": 2.4864, + "grad_norm": 0.0003331229672767222, + "learning_rate": 6.716138472000517e-06, + "loss": 0.0002, + "step": 1944 + }, + { + "epoch": 2.48768, + "grad_norm": 0.0001102744645322673, + "learning_rate": 6.703023153102738e-06, + "loss": 0.0002, + "step": 1945 + }, + { + "epoch": 2.48896, + "grad_norm": 0.00012537582369986922, + "learning_rate": 6.6899370046116265e-06, + "loss": 0.0002, + "step": 1946 + }, + { + "epoch": 2.49024, + "grad_norm": 0.0002181735326303169, + "learning_rate": 6.67688005148168e-06, + "loss": 0.0002, + "step": 1947 + }, + { + "epoch": 2.49152, + "grad_norm": 0.0004947243141941726, + "learning_rate": 6.6638523186117084e-06, + "loss": 0.0002, + "step": 1948 + }, + { + "epoch": 2.4928, + "grad_norm": 0.00017499204841442406, + "learning_rate": 6.650853830844817e-06, + "loss": 0.0002, + "step": 1949 + }, + { + "epoch": 2.49408, + "grad_norm": 0.0003295360365882516, + "learning_rate": 6.637884612968315e-06, + "loss": 0.0002, + "step": 1950 + }, + { + "epoch": 2.49408, + "eval_loss": 1.1815475225448608, + "eval_runtime": 43.3933, + "eval_samples_per_second": 11.569, + "eval_steps_per_second": 1.452, + "step": 1950 + }, + { + "epoch": 2.49536, + "grad_norm": 0.0003929525555577129, + "learning_rate": 6.624944689713739e-06, + "loss": 0.0002, + "step": 1951 + }, + { + "epoch": 2.49664, + "grad_norm": 0.00013083942758385092, + "learning_rate": 6.6120340857567226e-06, + "loss": 0.0002, + "step": 1952 + }, + { + "epoch": 2.49792, + "grad_norm": 0.0006896118284203112, + "learning_rate": 6.599152825717001e-06, + "loss": 0.0002, + "step": 1953 + }, + { + "epoch": 2.4992, + "grad_norm": 0.0003455237310845405, + "learning_rate": 6.586300934158363e-06, + "loss": 0.0002, + "step": 1954 + }, + { + "epoch": 2.50048, + "grad_norm": 0.0005406014970503747, + "learning_rate": 6.573478435588581e-06, + "loss": 0.0002, + "step": 1955 + }, + { + "epoch": 2.50176, + "grad_norm": 0.00023619551211595535, + "learning_rate": 6.5606853544593914e-06, + "loss": 0.0002, + "step": 1956 + }, + { + "epoch": 2.50304, + "grad_norm": 0.0004901570500805974, + "learning_rate": 6.547921715166411e-06, + "loss": 0.0002, + "step": 1957 + }, + { + "epoch": 2.50432, + "grad_norm": 0.00059032830176875, + "learning_rate": 6.535187542049132e-06, + "loss": 0.0002, + "step": 1958 + }, + { + "epoch": 2.5056000000000003, + "grad_norm": 0.00040494927088730037, + "learning_rate": 6.522482859390853e-06, + "loss": 0.0002, + "step": 1959 + }, + { + "epoch": 2.5068799999999998, + "grad_norm": 0.00031261437106877565, + "learning_rate": 6.509807691418626e-06, + "loss": 0.0002, + "step": 1960 + }, + { + "epoch": 2.50816, + "grad_norm": 0.00013683347788173705, + "learning_rate": 6.497162062303229e-06, + "loss": 0.0002, + "step": 1961 + }, + { + "epoch": 2.50944, + "grad_norm": 0.0001638557150727138, + "learning_rate": 6.484545996159114e-06, + "loss": 0.0002, + "step": 1962 + }, + { + "epoch": 2.51072, + "grad_norm": 0.0002301942149642855, + "learning_rate": 6.47195951704435e-06, + "loss": 0.0002, + "step": 1963 + }, + { + "epoch": 2.512, + "grad_norm": 0.0001310958032263443, + "learning_rate": 6.459402648960594e-06, + "loss": 0.0002, + "step": 1964 + }, + { + "epoch": 2.51328, + "grad_norm": 0.0005266906809993088, + "learning_rate": 6.446875415853016e-06, + "loss": 0.0002, + "step": 1965 + }, + { + "epoch": 2.51456, + "grad_norm": 0.0002377618511673063, + "learning_rate": 6.434377841610306e-06, + "loss": 0.0002, + "step": 1966 + }, + { + "epoch": 2.51584, + "grad_norm": 0.0003869717475026846, + "learning_rate": 6.421909950064575e-06, + "loss": 0.0002, + "step": 1967 + }, + { + "epoch": 2.5171200000000002, + "grad_norm": 0.0007507234113290906, + "learning_rate": 6.409471764991335e-06, + "loss": 0.0002, + "step": 1968 + }, + { + "epoch": 2.5183999999999997, + "grad_norm": 0.00019987816631328315, + "learning_rate": 6.397063310109448e-06, + "loss": 0.0002, + "step": 1969 + }, + { + "epoch": 2.51968, + "grad_norm": 0.0004650322371162474, + "learning_rate": 6.384684609081084e-06, + "loss": 0.0002, + "step": 1970 + }, + { + "epoch": 2.52096, + "grad_norm": 0.0006089420639909804, + "learning_rate": 6.372335685511685e-06, + "loss": 0.0002, + "step": 1971 + }, + { + "epoch": 2.52224, + "grad_norm": 0.0001694520324235782, + "learning_rate": 6.360016562949884e-06, + "loss": 0.0002, + "step": 1972 + }, + { + "epoch": 2.52352, + "grad_norm": 0.0004142711404711008, + "learning_rate": 6.3477272648875084e-06, + "loss": 0.0002, + "step": 1973 + }, + { + "epoch": 2.5248, + "grad_norm": 0.000749512983020395, + "learning_rate": 6.335467814759505e-06, + "loss": 0.0002, + "step": 1974 + }, + { + "epoch": 2.52608, + "grad_norm": 0.0002557408297434449, + "learning_rate": 6.3232382359439e-06, + "loss": 0.0002, + "step": 1975 + }, + { + "epoch": 2.52736, + "grad_norm": 0.00038810624391771853, + "learning_rate": 6.311038551761755e-06, + "loss": 0.0002, + "step": 1976 + }, + { + "epoch": 2.52864, + "grad_norm": 0.0002354103489778936, + "learning_rate": 6.298868785477132e-06, + "loss": 0.0002, + "step": 1977 + }, + { + "epoch": 2.5299199999999997, + "grad_norm": 0.00012855166278313845, + "learning_rate": 6.286728960297036e-06, + "loss": 0.0002, + "step": 1978 + }, + { + "epoch": 2.5312, + "grad_norm": 0.00019848457304760814, + "learning_rate": 6.274619099371373e-06, + "loss": 0.0002, + "step": 1979 + }, + { + "epoch": 2.53248, + "grad_norm": 0.00019530925783328712, + "learning_rate": 6.262539225792914e-06, + "loss": 0.0002, + "step": 1980 + }, + { + "epoch": 2.53248, + "eval_loss": 1.1816483736038208, + "eval_runtime": 43.6059, + "eval_samples_per_second": 11.512, + "eval_steps_per_second": 1.445, + "step": 1980 + }, + { + "epoch": 2.53376, + "grad_norm": 0.00015873344091232866, + "learning_rate": 6.250489362597252e-06, + "loss": 0.0002, + "step": 1981 + }, + { + "epoch": 2.53504, + "grad_norm": 0.00015742145478725433, + "learning_rate": 6.238469532762737e-06, + "loss": 0.0002, + "step": 1982 + }, + { + "epoch": 2.53632, + "grad_norm": 0.00015187107783276588, + "learning_rate": 6.22647975921046e-06, + "loss": 0.0002, + "step": 1983 + }, + { + "epoch": 2.5376, + "grad_norm": 0.00018921127775684, + "learning_rate": 6.214520064804183e-06, + "loss": 0.0002, + "step": 1984 + }, + { + "epoch": 2.53888, + "grad_norm": 8.872054604580626e-05, + "learning_rate": 6.20259047235033e-06, + "loss": 0.0002, + "step": 1985 + }, + { + "epoch": 2.54016, + "grad_norm": 0.0001130819073296152, + "learning_rate": 6.190691004597901e-06, + "loss": 0.0002, + "step": 1986 + }, + { + "epoch": 2.54144, + "grad_norm": 0.0002832153986673802, + "learning_rate": 6.178821684238467e-06, + "loss": 0.0002, + "step": 1987 + }, + { + "epoch": 2.54272, + "grad_norm": 0.00026950897881761193, + "learning_rate": 6.166982533906092e-06, + "loss": 0.0002, + "step": 1988 + }, + { + "epoch": 2.544, + "grad_norm": 0.0001317868591286242, + "learning_rate": 6.155173576177321e-06, + "loss": 0.0002, + "step": 1989 + }, + { + "epoch": 2.54528, + "grad_norm": 0.000146243633935228, + "learning_rate": 6.143394833571127e-06, + "loss": 0.0002, + "step": 1990 + }, + { + "epoch": 2.54656, + "grad_norm": 0.00030357291689142585, + "learning_rate": 6.131646328548846e-06, + "loss": 0.0002, + "step": 1991 + }, + { + "epoch": 2.54784, + "grad_norm": 0.00015373178757727146, + "learning_rate": 6.11992808351417e-06, + "loss": 0.0002, + "step": 1992 + }, + { + "epoch": 2.5491200000000003, + "grad_norm": 0.0001959829533006996, + "learning_rate": 6.108240120813085e-06, + "loss": 0.0002, + "step": 1993 + }, + { + "epoch": 2.5504, + "grad_norm": 0.00013423572818282992, + "learning_rate": 6.096582462733818e-06, + "loss": 0.0002, + "step": 1994 + }, + { + "epoch": 2.55168, + "grad_norm": 0.0005714610451832414, + "learning_rate": 6.08495513150682e-06, + "loss": 0.0002, + "step": 1995 + }, + { + "epoch": 2.55296, + "grad_norm": 0.00018218479817733169, + "learning_rate": 6.073358149304708e-06, + "loss": 0.0002, + "step": 1996 + }, + { + "epoch": 2.55424, + "grad_norm": 0.0009697470813989639, + "learning_rate": 6.061791538242224e-06, + "loss": 0.0002, + "step": 1997 + }, + { + "epoch": 2.55552, + "grad_norm": 0.00021223712246865034, + "learning_rate": 6.05025532037619e-06, + "loss": 0.0002, + "step": 1998 + }, + { + "epoch": 2.5568, + "grad_norm": 0.0006042190361768007, + "learning_rate": 6.038749517705469e-06, + "loss": 0.0002, + "step": 1999 + }, + { + "epoch": 2.55808, + "grad_norm": 0.0008896634681150317, + "learning_rate": 6.027274152170941e-06, + "loss": 0.0002, + "step": 2000 + }, + { + "epoch": 2.55936, + "grad_norm": 0.00023286855139303952, + "learning_rate": 6.01582924565542e-06, + "loss": 0.0002, + "step": 2001 + }, + { + "epoch": 2.5606400000000002, + "grad_norm": 0.00044719918514601886, + "learning_rate": 6.0044148199836595e-06, + "loss": 0.0002, + "step": 2002 + }, + { + "epoch": 2.5619199999999998, + "grad_norm": 0.0006025312468409538, + "learning_rate": 5.993030896922266e-06, + "loss": 0.0002, + "step": 2003 + }, + { + "epoch": 2.5632, + "grad_norm": 0.00023673917166888714, + "learning_rate": 5.981677498179692e-06, + "loss": 0.0002, + "step": 2004 + }, + { + "epoch": 2.56448, + "grad_norm": 0.00039530108915641904, + "learning_rate": 5.970354645406189e-06, + "loss": 0.0002, + "step": 2005 + }, + { + "epoch": 2.56576, + "grad_norm": 0.0005158588173799217, + "learning_rate": 5.959062360193738e-06, + "loss": 0.0002, + "step": 2006 + }, + { + "epoch": 2.56704, + "grad_norm": 0.00020971003687009215, + "learning_rate": 5.947800664076047e-06, + "loss": 0.0002, + "step": 2007 + }, + { + "epoch": 2.56832, + "grad_norm": 0.0007004168583080173, + "learning_rate": 5.9365695785284906e-06, + "loss": 0.0002, + "step": 2008 + }, + { + "epoch": 2.5696, + "grad_norm": 0.0006223475793376565, + "learning_rate": 5.925369124968066e-06, + "loss": 0.0002, + "step": 2009 + }, + { + "epoch": 2.57088, + "grad_norm": 0.00013660028344020247, + "learning_rate": 5.9141993247533525e-06, + "loss": 0.0002, + "step": 2010 + }, + { + "epoch": 2.57088, + "eval_loss": 1.1760672330856323, + "eval_runtime": 43.1068, + "eval_samples_per_second": 11.645, + "eval_steps_per_second": 1.461, + "step": 2010 + }, + { + "epoch": 2.5721600000000002, + "grad_norm": 0.0007261909195221961, + "learning_rate": 5.903060199184487e-06, + "loss": 0.0002, + "step": 2011 + }, + { + "epoch": 2.5734399999999997, + "grad_norm": 0.000268265837803483, + "learning_rate": 5.891951769503107e-06, + "loss": 0.0002, + "step": 2012 + }, + { + "epoch": 2.57472, + "grad_norm": 0.0002705653023440391, + "learning_rate": 5.880874056892308e-06, + "loss": 0.0002, + "step": 2013 + }, + { + "epoch": 2.576, + "grad_norm": 0.00012832919310312718, + "learning_rate": 5.869827082476625e-06, + "loss": 0.0002, + "step": 2014 + }, + { + "epoch": 2.57728, + "grad_norm": 0.0006553751300089061, + "learning_rate": 5.858810867321955e-06, + "loss": 0.0002, + "step": 2015 + }, + { + "epoch": 2.57856, + "grad_norm": 0.00020814413437619805, + "learning_rate": 5.847825432435566e-06, + "loss": 0.0002, + "step": 2016 + }, + { + "epoch": 2.57984, + "grad_norm": 0.0006322242552414536, + "learning_rate": 5.836870798766008e-06, + "loss": 0.0002, + "step": 2017 + }, + { + "epoch": 2.58112, + "grad_norm": 0.0002930402697529644, + "learning_rate": 5.8259469872031006e-06, + "loss": 0.0002, + "step": 2018 + }, + { + "epoch": 2.5824, + "grad_norm": 0.0005195149569772184, + "learning_rate": 5.815054018577894e-06, + "loss": 0.0002, + "step": 2019 + }, + { + "epoch": 2.58368, + "grad_norm": 0.0004457169270608574, + "learning_rate": 5.8041919136626144e-06, + "loss": 0.0002, + "step": 2020 + }, + { + "epoch": 2.58496, + "grad_norm": 9.912128007272258e-05, + "learning_rate": 5.793360693170646e-06, + "loss": 0.0002, + "step": 2021 + }, + { + "epoch": 2.58624, + "grad_norm": 0.0005960392882116139, + "learning_rate": 5.782560377756459e-06, + "loss": 0.0002, + "step": 2022 + }, + { + "epoch": 2.58752, + "grad_norm": 0.0007441943162120879, + "learning_rate": 5.771790988015601e-06, + "loss": 0.0002, + "step": 2023 + }, + { + "epoch": 2.5888, + "grad_norm": 0.0006059888401068747, + "learning_rate": 5.761052544484652e-06, + "loss": 0.0002, + "step": 2024 + }, + { + "epoch": 2.59008, + "grad_norm": 0.000811684294603765, + "learning_rate": 5.750345067641161e-06, + "loss": 0.0002, + "step": 2025 + }, + { + "epoch": 2.59136, + "grad_norm": 0.00020101373957004398, + "learning_rate": 5.739668577903646e-06, + "loss": 0.0002, + "step": 2026 + }, + { + "epoch": 2.59264, + "grad_norm": 0.0002765175304375589, + "learning_rate": 5.729023095631524e-06, + "loss": 0.0002, + "step": 2027 + }, + { + "epoch": 2.59392, + "grad_norm": 0.0007142999093048275, + "learning_rate": 5.71840864112508e-06, + "loss": 0.0002, + "step": 2028 + }, + { + "epoch": 2.5952, + "grad_norm": 0.0005172202945686877, + "learning_rate": 5.7078252346254395e-06, + "loss": 0.0002, + "step": 2029 + }, + { + "epoch": 2.59648, + "grad_norm": 0.0005064026918262243, + "learning_rate": 5.6972728963145055e-06, + "loss": 0.0002, + "step": 2030 + }, + { + "epoch": 2.59776, + "grad_norm": 0.0007822649786248803, + "learning_rate": 5.686751646314961e-06, + "loss": 0.0002, + "step": 2031 + }, + { + "epoch": 2.59904, + "grad_norm": 0.00016675966617185622, + "learning_rate": 5.676261504690184e-06, + "loss": 0.0002, + "step": 2032 + }, + { + "epoch": 2.60032, + "grad_norm": 0.0004352393443696201, + "learning_rate": 5.665802491444239e-06, + "loss": 0.0002, + "step": 2033 + }, + { + "epoch": 2.6016, + "grad_norm": 0.00018516472482588142, + "learning_rate": 5.65537462652183e-06, + "loss": 0.0002, + "step": 2034 + }, + { + "epoch": 2.60288, + "grad_norm": 0.00020108661556150764, + "learning_rate": 5.64497792980826e-06, + "loss": 0.0002, + "step": 2035 + }, + { + "epoch": 2.6041600000000003, + "grad_norm": 0.0005044660647399724, + "learning_rate": 5.6346124211294074e-06, + "loss": 0.0002, + "step": 2036 + }, + { + "epoch": 2.6054399999999998, + "grad_norm": 0.0001886051904875785, + "learning_rate": 5.624278120251657e-06, + "loss": 0.0002, + "step": 2037 + }, + { + "epoch": 2.60672, + "grad_norm": 0.0007088842103257775, + "learning_rate": 5.613975046881899e-06, + "loss": 0.0002, + "step": 2038 + }, + { + "epoch": 2.608, + "grad_norm": 0.0003815259551629424, + "learning_rate": 5.603703220667473e-06, + "loss": 0.0002, + "step": 2039 + }, + { + "epoch": 2.60928, + "grad_norm": 0.0004345429188106209, + "learning_rate": 5.593462661196119e-06, + "loss": 0.0002, + "step": 2040 + }, + { + "epoch": 2.60928, + "eval_loss": 1.1763349771499634, + "eval_runtime": 43.2109, + "eval_samples_per_second": 11.617, + "eval_steps_per_second": 1.458, + "step": 2040 + }, + { + "epoch": 2.61056, + "grad_norm": 0.0006033392855897546, + "learning_rate": 5.583253387995969e-06, + "loss": 0.0002, + "step": 2041 + }, + { + "epoch": 2.61184, + "grad_norm": 0.00014597387053072453, + "learning_rate": 5.573075420535488e-06, + "loss": 0.0002, + "step": 2042 + }, + { + "epoch": 2.61312, + "grad_norm": 0.00015920022269710898, + "learning_rate": 5.562928778223439e-06, + "loss": 0.0002, + "step": 2043 + }, + { + "epoch": 2.6144, + "grad_norm": 0.00037969640106894076, + "learning_rate": 5.552813480408856e-06, + "loss": 0.0002, + "step": 2044 + }, + { + "epoch": 2.6156800000000002, + "grad_norm": 0.0005262352642603219, + "learning_rate": 5.542729546380997e-06, + "loss": 0.0002, + "step": 2045 + }, + { + "epoch": 2.6169599999999997, + "grad_norm": 0.0007127207354642451, + "learning_rate": 5.532676995369314e-06, + "loss": 0.0002, + "step": 2046 + }, + { + "epoch": 2.61824, + "grad_norm": 0.00018934982654172927, + "learning_rate": 5.522655846543408e-06, + "loss": 0.0002, + "step": 2047 + }, + { + "epoch": 2.61952, + "grad_norm": 0.0003394143714103848, + "learning_rate": 5.512666119013005e-06, + "loss": 0.0002, + "step": 2048 + }, + { + "epoch": 2.6208, + "grad_norm": 0.00015175512817222625, + "learning_rate": 5.5027078318279035e-06, + "loss": 0.0002, + "step": 2049 + }, + { + "epoch": 2.62208, + "grad_norm": 0.00025432585971429944, + "learning_rate": 5.492781003977965e-06, + "loss": 0.0002, + "step": 2050 + }, + { + "epoch": 2.62336, + "grad_norm": 0.0002585637557785958, + "learning_rate": 5.482885654393039e-06, + "loss": 0.0002, + "step": 2051 + }, + { + "epoch": 2.62464, + "grad_norm": 0.000490740523673594, + "learning_rate": 5.473021801942955e-06, + "loss": 0.0002, + "step": 2052 + }, + { + "epoch": 2.62592, + "grad_norm": 0.0001708458730718121, + "learning_rate": 5.463189465437484e-06, + "loss": 0.0002, + "step": 2053 + }, + { + "epoch": 2.6272, + "grad_norm": 0.0001390179677400738, + "learning_rate": 5.4533886636262955e-06, + "loss": 0.0002, + "step": 2054 + }, + { + "epoch": 2.62848, + "grad_norm": 0.0006850110366940498, + "learning_rate": 5.443619415198929e-06, + "loss": 0.0002, + "step": 2055 + }, + { + "epoch": 2.62976, + "grad_norm": 9.433117520529777e-05, + "learning_rate": 5.433881738784739e-06, + "loss": 0.0002, + "step": 2056 + }, + { + "epoch": 2.63104, + "grad_norm": 0.0007274181698448956, + "learning_rate": 5.424175652952887e-06, + "loss": 0.0002, + "step": 2057 + }, + { + "epoch": 2.63232, + "grad_norm": 0.00011057154915761203, + "learning_rate": 5.414501176212295e-06, + "loss": 0.0002, + "step": 2058 + }, + { + "epoch": 2.6336, + "grad_norm": 0.00011248882947256789, + "learning_rate": 5.4048583270115966e-06, + "loss": 0.0002, + "step": 2059 + }, + { + "epoch": 2.63488, + "grad_norm": 0.0005161191220395267, + "learning_rate": 5.395247123739119e-06, + "loss": 0.0002, + "step": 2060 + }, + { + "epoch": 2.63616, + "grad_norm": 0.0005980413407087326, + "learning_rate": 5.385667584722849e-06, + "loss": 0.0002, + "step": 2061 + }, + { + "epoch": 2.63744, + "grad_norm": 0.00018621369963511825, + "learning_rate": 5.376119728230387e-06, + "loss": 0.0002, + "step": 2062 + }, + { + "epoch": 2.63872, + "grad_norm": 0.0007834093994461, + "learning_rate": 5.3666035724689135e-06, + "loss": 0.0002, + "step": 2063 + }, + { + "epoch": 2.64, + "grad_norm": 0.0007524779066443443, + "learning_rate": 5.357119135585153e-06, + "loss": 0.0002, + "step": 2064 + }, + { + "epoch": 2.64128, + "grad_norm": 0.00033790417364798486, + "learning_rate": 5.347666435665363e-06, + "loss": 0.0002, + "step": 2065 + }, + { + "epoch": 2.64256, + "grad_norm": 0.0007513008895330131, + "learning_rate": 5.338245490735263e-06, + "loss": 0.0002, + "step": 2066 + }, + { + "epoch": 2.64384, + "grad_norm": 0.0007832477567717433, + "learning_rate": 5.328856318760026e-06, + "loss": 0.0002, + "step": 2067 + }, + { + "epoch": 2.64512, + "grad_norm": 0.00011210219963686541, + "learning_rate": 5.319498937644228e-06, + "loss": 0.0002, + "step": 2068 + }, + { + "epoch": 2.6464, + "grad_norm": 0.0006179630290716887, + "learning_rate": 5.31017336523183e-06, + "loss": 0.0002, + "step": 2069 + }, + { + "epoch": 2.6476800000000003, + "grad_norm": 0.000690981512889266, + "learning_rate": 5.300879619306135e-06, + "loss": 0.0002, + "step": 2070 + }, + { + "epoch": 2.6476800000000003, + "eval_loss": 1.1787551641464233, + "eval_runtime": 43.2975, + "eval_samples_per_second": 11.594, + "eval_steps_per_second": 1.455, + "step": 2070 + }, + { + "epoch": 2.6489599999999998, + "grad_norm": 8.601046283729374e-05, + "learning_rate": 5.29161771758975e-06, + "loss": 0.0002, + "step": 2071 + }, + { + "epoch": 2.65024, + "grad_norm": 0.0004467715334612876, + "learning_rate": 5.282387677744559e-06, + "loss": 0.0002, + "step": 2072 + }, + { + "epoch": 2.65152, + "grad_norm": 0.00033754404284991324, + "learning_rate": 5.273189517371689e-06, + "loss": 0.0002, + "step": 2073 + }, + { + "epoch": 2.6528, + "grad_norm": 9.469128417549655e-05, + "learning_rate": 5.264023254011476e-06, + "loss": 0.0002, + "step": 2074 + }, + { + "epoch": 2.65408, + "grad_norm": 0.00046040205052122474, + "learning_rate": 5.254888905143425e-06, + "loss": 0.0002, + "step": 2075 + }, + { + "epoch": 2.65536, + "grad_norm": 0.00017660969751887023, + "learning_rate": 5.245786488186183e-06, + "loss": 0.0002, + "step": 2076 + }, + { + "epoch": 2.65664, + "grad_norm": 0.0002686119987629354, + "learning_rate": 5.236716020497515e-06, + "loss": 0.0002, + "step": 2077 + }, + { + "epoch": 2.65792, + "grad_norm": 0.0005117706023156643, + "learning_rate": 5.227677519374243e-06, + "loss": 0.0002, + "step": 2078 + }, + { + "epoch": 2.6592000000000002, + "grad_norm": 0.00013892765855416656, + "learning_rate": 5.2186710020522435e-06, + "loss": 0.0002, + "step": 2079 + }, + { + "epoch": 2.6604799999999997, + "grad_norm": 0.0001570197200635448, + "learning_rate": 5.209696485706404e-06, + "loss": 0.0002, + "step": 2080 + }, + { + "epoch": 2.66176, + "grad_norm": 0.0002135578397428617, + "learning_rate": 5.200753987450575e-06, + "loss": 0.0002, + "step": 2081 + }, + { + "epoch": 2.66304, + "grad_norm": 0.0005491600022651255, + "learning_rate": 5.1918435243375646e-06, + "loss": 0.0002, + "step": 2082 + }, + { + "epoch": 2.66432, + "grad_norm": 0.00010938257037196308, + "learning_rate": 5.182965113359075e-06, + "loss": 0.0002, + "step": 2083 + }, + { + "epoch": 2.6656, + "grad_norm": 0.00019497588800732046, + "learning_rate": 5.174118771445711e-06, + "loss": 0.0002, + "step": 2084 + }, + { + "epoch": 2.66688, + "grad_norm": 0.0001784377236617729, + "learning_rate": 5.165304515466897e-06, + "loss": 0.0002, + "step": 2085 + }, + { + "epoch": 2.66816, + "grad_norm": 0.0004209493927191943, + "learning_rate": 5.156522362230895e-06, + "loss": 0.0002, + "step": 2086 + }, + { + "epoch": 2.66944, + "grad_norm": 0.00045996581320650876, + "learning_rate": 5.147772328484731e-06, + "loss": 0.0002, + "step": 2087 + }, + { + "epoch": 2.67072, + "grad_norm": 0.00036037570680491626, + "learning_rate": 5.139054430914192e-06, + "loss": 0.0002, + "step": 2088 + }, + { + "epoch": 2.672, + "grad_norm": 0.00041035652975551784, + "learning_rate": 5.13036868614378e-06, + "loss": 0.0002, + "step": 2089 + }, + { + "epoch": 2.67328, + "grad_norm": 0.00014587903569918126, + "learning_rate": 5.121715110736679e-06, + "loss": 0.0002, + "step": 2090 + }, + { + "epoch": 2.67456, + "grad_norm": 0.00012942550529260188, + "learning_rate": 5.113093721194738e-06, + "loss": 0.0002, + "step": 2091 + }, + { + "epoch": 2.67584, + "grad_norm": 0.00011897664808202535, + "learning_rate": 5.104504533958419e-06, + "loss": 0.0002, + "step": 2092 + }, + { + "epoch": 2.67712, + "grad_norm": 0.00014001535600982606, + "learning_rate": 5.095947565406784e-06, + "loss": 0.0002, + "step": 2093 + }, + { + "epoch": 2.6784, + "grad_norm": 0.0005521393613889813, + "learning_rate": 5.087422831857449e-06, + "loss": 0.0002, + "step": 2094 + }, + { + "epoch": 2.67968, + "grad_norm": 0.0001651291677262634, + "learning_rate": 5.0789303495665664e-06, + "loss": 0.0002, + "step": 2095 + }, + { + "epoch": 2.68096, + "grad_norm": 0.0004139514931011945, + "learning_rate": 5.070470134728789e-06, + "loss": 0.0002, + "step": 2096 + }, + { + "epoch": 2.68224, + "grad_norm": 0.0003664352698251605, + "learning_rate": 5.062042203477227e-06, + "loss": 0.0002, + "step": 2097 + }, + { + "epoch": 2.68352, + "grad_norm": 9.784977009985596e-05, + "learning_rate": 5.053646571883431e-06, + "loss": 0.0002, + "step": 2098 + }, + { + "epoch": 2.6848, + "grad_norm": 0.0004598863306455314, + "learning_rate": 5.045283255957371e-06, + "loss": 0.0002, + "step": 2099 + }, + { + "epoch": 2.68608, + "grad_norm": 0.00018233667651657015, + "learning_rate": 5.036952271647375e-06, + "loss": 0.0002, + "step": 2100 + }, + { + "epoch": 2.68608, + "eval_loss": 1.1784363985061646, + "eval_runtime": 43.2782, + "eval_samples_per_second": 11.599, + "eval_steps_per_second": 1.456, + "step": 2100 + }, + { + "epoch": 2.68736, + "grad_norm": 0.0004271143116056919, + "learning_rate": 5.02865363484013e-06, + "loss": 0.0002, + "step": 2101 + }, + { + "epoch": 2.68864, + "grad_norm": 0.00019682984566316009, + "learning_rate": 5.020387361360624e-06, + "loss": 0.0002, + "step": 2102 + }, + { + "epoch": 2.68992, + "grad_norm": 0.00015684337995480746, + "learning_rate": 5.01215346697215e-06, + "loss": 0.0002, + "step": 2103 + }, + { + "epoch": 2.6912000000000003, + "grad_norm": 0.00022414051636587828, + "learning_rate": 5.003951967376241e-06, + "loss": 0.0002, + "step": 2104 + }, + { + "epoch": 2.6924799999999998, + "grad_norm": 0.0005924166762270033, + "learning_rate": 4.995782878212654e-06, + "loss": 0.0002, + "step": 2105 + }, + { + "epoch": 2.69376, + "grad_norm": 0.00012100968888262287, + "learning_rate": 4.9876462150593515e-06, + "loss": 0.0002, + "step": 2106 + }, + { + "epoch": 2.69504, + "grad_norm": 0.0006111165857873857, + "learning_rate": 4.979541993432454e-06, + "loss": 0.0002, + "step": 2107 + }, + { + "epoch": 2.69632, + "grad_norm": 0.00018479368009138852, + "learning_rate": 4.971470228786226e-06, + "loss": 0.0002, + "step": 2108 + }, + { + "epoch": 2.6976, + "grad_norm": 0.00019101858197245747, + "learning_rate": 4.963430936513024e-06, + "loss": 0.0002, + "step": 2109 + }, + { + "epoch": 2.69888, + "grad_norm": 0.0001369758101645857, + "learning_rate": 4.955424131943296e-06, + "loss": 0.0002, + "step": 2110 + }, + { + "epoch": 2.70016, + "grad_norm": 0.0006212744629010558, + "learning_rate": 4.947449830345536e-06, + "loss": 0.0002, + "step": 2111 + }, + { + "epoch": 2.70144, + "grad_norm": 0.00013725864118896425, + "learning_rate": 4.939508046926245e-06, + "loss": 0.0002, + "step": 2112 + }, + { + "epoch": 2.7027200000000002, + "grad_norm": 0.0003938393492717296, + "learning_rate": 4.931598796829925e-06, + "loss": 0.0002, + "step": 2113 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 0.0003750926407519728, + "learning_rate": 4.923722095139039e-06, + "loss": 0.0002, + "step": 2114 + }, + { + "epoch": 2.70528, + "grad_norm": 0.00019795184198301286, + "learning_rate": 4.915877956873979e-06, + "loss": 0.0002, + "step": 2115 + }, + { + "epoch": 2.70656, + "grad_norm": 0.0003393131191842258, + "learning_rate": 4.9080663969930395e-06, + "loss": 0.0002, + "step": 2116 + }, + { + "epoch": 2.70784, + "grad_norm": 0.0005614679539576173, + "learning_rate": 4.900287430392386e-06, + "loss": 0.0002, + "step": 2117 + }, + { + "epoch": 2.70912, + "grad_norm": 0.0001567447034176439, + "learning_rate": 4.892541071906042e-06, + "loss": 0.0002, + "step": 2118 + }, + { + "epoch": 2.7104, + "grad_norm": 0.00024158720043487847, + "learning_rate": 4.884827336305843e-06, + "loss": 0.0002, + "step": 2119 + }, + { + "epoch": 2.71168, + "grad_norm": 0.00022995199833530933, + "learning_rate": 4.8771462383014155e-06, + "loss": 0.0002, + "step": 2120 + }, + { + "epoch": 2.71296, + "grad_norm": 0.00029130213079042733, + "learning_rate": 4.86949779254014e-06, + "loss": 0.0002, + "step": 2121 + }, + { + "epoch": 2.71424, + "grad_norm": 0.0005213206168264151, + "learning_rate": 4.861882013607146e-06, + "loss": 0.0002, + "step": 2122 + }, + { + "epoch": 2.71552, + "grad_norm": 0.0003036532725673169, + "learning_rate": 4.854298916025262e-06, + "loss": 0.0002, + "step": 2123 + }, + { + "epoch": 2.7168, + "grad_norm": 0.0004486961697693914, + "learning_rate": 4.846748514254989e-06, + "loss": 0.0002, + "step": 2124 + }, + { + "epoch": 2.71808, + "grad_norm": 0.0008269405807368457, + "learning_rate": 4.839230822694487e-06, + "loss": 0.0002, + "step": 2125 + }, + { + "epoch": 2.71936, + "grad_norm": 0.00011923944111913443, + "learning_rate": 4.831745855679545e-06, + "loss": 0.0002, + "step": 2126 + }, + { + "epoch": 2.72064, + "grad_norm": 0.0005785307730548084, + "learning_rate": 4.824293627483531e-06, + "loss": 0.0002, + "step": 2127 + }, + { + "epoch": 2.72192, + "grad_norm": 0.000523311726283282, + "learning_rate": 4.816874152317396e-06, + "loss": 0.0002, + "step": 2128 + }, + { + "epoch": 2.7232, + "grad_norm": 0.0002992335648741573, + "learning_rate": 4.809487444329629e-06, + "loss": 0.0002, + "step": 2129 + }, + { + "epoch": 2.72448, + "grad_norm": 0.00010770787775982171, + "learning_rate": 4.802133517606236e-06, + "loss": 0.0002, + "step": 2130 + }, + { + "epoch": 2.72448, + "eval_loss": 1.1832425594329834, + "eval_runtime": 43.5517, + "eval_samples_per_second": 11.527, + "eval_steps_per_second": 1.447, + "step": 2130 + }, + { + "epoch": 2.72576, + "grad_norm": 0.0007023107027634978, + "learning_rate": 4.794812386170704e-06, + "loss": 0.0002, + "step": 2131 + }, + { + "epoch": 2.72704, + "grad_norm": 0.0002617916907183826, + "learning_rate": 4.787524063983993e-06, + "loss": 0.0002, + "step": 2132 + }, + { + "epoch": 2.72832, + "grad_norm": 0.00022266076121013612, + "learning_rate": 4.780268564944484e-06, + "loss": 0.0002, + "step": 2133 + }, + { + "epoch": 2.7296, + "grad_norm": 0.00012528970546554774, + "learning_rate": 4.773045902887979e-06, + "loss": 0.0002, + "step": 2134 + }, + { + "epoch": 2.73088, + "grad_norm": 0.00026173575315624475, + "learning_rate": 4.765856091587654e-06, + "loss": 0.0002, + "step": 2135 + }, + { + "epoch": 2.73216, + "grad_norm": 0.00034735462395474315, + "learning_rate": 4.758699144754043e-06, + "loss": 0.0002, + "step": 2136 + }, + { + "epoch": 2.73344, + "grad_norm": 0.00010721830039983615, + "learning_rate": 4.751575076035009e-06, + "loss": 0.0002, + "step": 2137 + }, + { + "epoch": 2.7347200000000003, + "grad_norm": 0.0005374618922360241, + "learning_rate": 4.744483899015722e-06, + "loss": 0.0002, + "step": 2138 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 0.00016130245057865977, + "learning_rate": 4.7374256272186205e-06, + "loss": 0.0002, + "step": 2139 + }, + { + "epoch": 2.73728, + "grad_norm": 0.00034650624729692936, + "learning_rate": 4.730400274103404e-06, + "loss": 0.0002, + "step": 2140 + }, + { + "epoch": 2.73856, + "grad_norm": 0.00032838908373378217, + "learning_rate": 4.723407853066994e-06, + "loss": 0.0002, + "step": 2141 + }, + { + "epoch": 2.73984, + "grad_norm": 9.886346379062161e-05, + "learning_rate": 4.716448377443515e-06, + "loss": 0.0002, + "step": 2142 + }, + { + "epoch": 2.74112, + "grad_norm": 0.0004354401316959411, + "learning_rate": 4.709521860504261e-06, + "loss": 0.0002, + "step": 2143 + }, + { + "epoch": 2.7424, + "grad_norm": 0.00011024817649740726, + "learning_rate": 4.702628315457683e-06, + "loss": 0.0002, + "step": 2144 + }, + { + "epoch": 2.74368, + "grad_norm": 0.00012188892287667841, + "learning_rate": 4.695767755449353e-06, + "loss": 0.0002, + "step": 2145 + }, + { + "epoch": 2.74496, + "grad_norm": 0.0003627574478741735, + "learning_rate": 4.68894019356194e-06, + "loss": 0.0002, + "step": 2146 + }, + { + "epoch": 2.7462400000000002, + "grad_norm": 0.0003473378310445696, + "learning_rate": 4.682145642815193e-06, + "loss": 0.0002, + "step": 2147 + }, + { + "epoch": 2.7475199999999997, + "grad_norm": 0.00028617039788514376, + "learning_rate": 4.675384116165903e-06, + "loss": 0.0002, + "step": 2148 + }, + { + "epoch": 2.7488, + "grad_norm": 0.0002804455580189824, + "learning_rate": 4.668655626507898e-06, + "loss": 0.0002, + "step": 2149 + }, + { + "epoch": 2.75008, + "grad_norm": 0.00034440791932865977, + "learning_rate": 4.661960186671997e-06, + "loss": 0.0002, + "step": 2150 + }, + { + "epoch": 2.75136, + "grad_norm": 0.00016433031123597175, + "learning_rate": 4.655297809425995e-06, + "loss": 0.0002, + "step": 2151 + }, + { + "epoch": 2.75264, + "grad_norm": 0.00031462355400435627, + "learning_rate": 4.648668507474642e-06, + "loss": 0.0002, + "step": 2152 + }, + { + "epoch": 2.75392, + "grad_norm": 0.0002593990648165345, + "learning_rate": 4.642072293459616e-06, + "loss": 0.0002, + "step": 2153 + }, + { + "epoch": 2.7552, + "grad_norm": 0.0001353595725959167, + "learning_rate": 4.635509179959497e-06, + "loss": 0.0002, + "step": 2154 + }, + { + "epoch": 2.75648, + "grad_norm": 0.00020742540073115379, + "learning_rate": 4.628979179489745e-06, + "loss": 0.0002, + "step": 2155 + }, + { + "epoch": 2.75776, + "grad_norm": 0.00020888213475700468, + "learning_rate": 4.62248230450267e-06, + "loss": 0.0002, + "step": 2156 + }, + { + "epoch": 2.75904, + "grad_norm": 0.0002596872509457171, + "learning_rate": 4.616018567387425e-06, + "loss": 0.0002, + "step": 2157 + }, + { + "epoch": 2.76032, + "grad_norm": 0.00011134509259136394, + "learning_rate": 4.609587980469958e-06, + "loss": 0.0002, + "step": 2158 + }, + { + "epoch": 2.7616, + "grad_norm": 0.0002468661987222731, + "learning_rate": 4.603190556013013e-06, + "loss": 0.0002, + "step": 2159 + }, + { + "epoch": 2.76288, + "grad_norm": 0.00023062645050231367, + "learning_rate": 4.596826306216087e-06, + "loss": 0.0002, + "step": 2160 + }, + { + "epoch": 2.76288, + "eval_loss": 1.179226279258728, + "eval_runtime": 43.5134, + "eval_samples_per_second": 11.537, + "eval_steps_per_second": 1.448, + "step": 2160 + }, + { + "epoch": 2.76416, + "grad_norm": 0.00036494573578238487, + "learning_rate": 4.590495243215422e-06, + "loss": 0.0002, + "step": 2161 + }, + { + "epoch": 2.76544, + "grad_norm": 0.00019542551308404654, + "learning_rate": 4.584197379083969e-06, + "loss": 0.0002, + "step": 2162 + }, + { + "epoch": 2.76672, + "grad_norm": 0.00019057006284128875, + "learning_rate": 4.577932725831367e-06, + "loss": 0.0002, + "step": 2163 + }, + { + "epoch": 2.768, + "grad_norm": 0.00048471824266016483, + "learning_rate": 4.571701295403939e-06, + "loss": 0.0002, + "step": 2164 + }, + { + "epoch": 2.76928, + "grad_norm": 0.00010409307287773117, + "learning_rate": 4.565503099684641e-06, + "loss": 0.0002, + "step": 2165 + }, + { + "epoch": 2.77056, + "grad_norm": 0.00016285658057313412, + "learning_rate": 4.559338150493056e-06, + "loss": 0.0002, + "step": 2166 + }, + { + "epoch": 2.77184, + "grad_norm": 0.00011712354171322659, + "learning_rate": 4.553206459585366e-06, + "loss": 0.0002, + "step": 2167 + }, + { + "epoch": 2.77312, + "grad_norm": 0.0002646244829520583, + "learning_rate": 4.547108038654336e-06, + "loss": 0.0002, + "step": 2168 + }, + { + "epoch": 2.7744, + "grad_norm": 0.0004395518044475466, + "learning_rate": 4.541042899329287e-06, + "loss": 0.0002, + "step": 2169 + }, + { + "epoch": 2.77568, + "grad_norm": 0.00015305139822885394, + "learning_rate": 4.535011053176065e-06, + "loss": 0.0002, + "step": 2170 + }, + { + "epoch": 2.77696, + "grad_norm": 0.00021129405649844557, + "learning_rate": 4.52901251169704e-06, + "loss": 0.0002, + "step": 2171 + }, + { + "epoch": 2.7782400000000003, + "grad_norm": 0.0003616804606281221, + "learning_rate": 4.523047286331064e-06, + "loss": 0.0002, + "step": 2172 + }, + { + "epoch": 2.7795199999999998, + "grad_norm": 0.00016815500566735864, + "learning_rate": 4.517115388453462e-06, + "loss": 0.0002, + "step": 2173 + }, + { + "epoch": 2.7808, + "grad_norm": 0.00023923354456201196, + "learning_rate": 4.511216829376001e-06, + "loss": 0.0002, + "step": 2174 + }, + { + "epoch": 2.78208, + "grad_norm": 0.00020838297496084124, + "learning_rate": 4.505351620346878e-06, + "loss": 0.0002, + "step": 2175 + }, + { + "epoch": 2.78336, + "grad_norm": 0.00019640865502879024, + "learning_rate": 4.49951977255069e-06, + "loss": 0.0002, + "step": 2176 + }, + { + "epoch": 2.78464, + "grad_norm": 0.0001642953575355932, + "learning_rate": 4.493721297108412e-06, + "loss": 0.0002, + "step": 2177 + }, + { + "epoch": 2.78592, + "grad_norm": 0.0002643915649969131, + "learning_rate": 4.487956205077392e-06, + "loss": 0.0002, + "step": 2178 + }, + { + "epoch": 2.7872, + "grad_norm": 0.00023071031318977475, + "learning_rate": 4.482224507451309e-06, + "loss": 0.0002, + "step": 2179 + }, + { + "epoch": 2.78848, + "grad_norm": 0.0002969163761008531, + "learning_rate": 4.476526215160157e-06, + "loss": 0.0002, + "step": 2180 + }, + { + "epoch": 2.7897600000000002, + "grad_norm": 0.00031866165227256715, + "learning_rate": 4.470861339070243e-06, + "loss": 0.0002, + "step": 2181 + }, + { + "epoch": 2.7910399999999997, + "grad_norm": 0.0005999641725793481, + "learning_rate": 4.465229889984131e-06, + "loss": 0.0002, + "step": 2182 + }, + { + "epoch": 2.79232, + "grad_norm": 0.0003952770493924618, + "learning_rate": 4.459631878640661e-06, + "loss": 0.0002, + "step": 2183 + }, + { + "epoch": 2.7936, + "grad_norm": 0.0008258552988991141, + "learning_rate": 4.454067315714897e-06, + "loss": 0.0002, + "step": 2184 + }, + { + "epoch": 2.79488, + "grad_norm": 0.00019526865798979998, + "learning_rate": 4.448536211818127e-06, + "loss": 0.0002, + "step": 2185 + }, + { + "epoch": 2.79616, + "grad_norm": 0.0006488322978839278, + "learning_rate": 4.443038577497828e-06, + "loss": 0.0002, + "step": 2186 + }, + { + "epoch": 2.79744, + "grad_norm": 0.000724302779417485, + "learning_rate": 4.437574423237652e-06, + "loss": 0.0002, + "step": 2187 + }, + { + "epoch": 2.79872, + "grad_norm": 0.00013176088395994157, + "learning_rate": 4.432143759457415e-06, + "loss": 0.0002, + "step": 2188 + }, + { + "epoch": 2.8, + "grad_norm": 0.0008997769327834249, + "learning_rate": 4.42674659651306e-06, + "loss": 0.0002, + "step": 2189 + }, + { + "epoch": 2.80128, + "grad_norm": 0.0004246947355568409, + "learning_rate": 4.4213829446966495e-06, + "loss": 0.0002, + "step": 2190 + }, + { + "epoch": 2.80128, + "eval_loss": 1.1821544170379639, + "eval_runtime": 43.5085, + "eval_samples_per_second": 11.538, + "eval_steps_per_second": 1.448, + "step": 2190 + }, + { + "epoch": 2.80256, + "grad_norm": 0.00038387291715480387, + "learning_rate": 4.416052814236344e-06, + "loss": 0.0002, + "step": 2191 + }, + { + "epoch": 2.80384, + "grad_norm": 0.00042236316949129105, + "learning_rate": 4.410756215296375e-06, + "loss": 0.0002, + "step": 2192 + }, + { + "epoch": 2.80512, + "grad_norm": 0.00043121661292389035, + "learning_rate": 4.405493157977039e-06, + "loss": 0.0002, + "step": 2193 + }, + { + "epoch": 2.8064, + "grad_norm": 0.00012004785821773112, + "learning_rate": 4.400263652314666e-06, + "loss": 0.0002, + "step": 2194 + }, + { + "epoch": 2.80768, + "grad_norm": 0.0002733418659772724, + "learning_rate": 4.395067708281608e-06, + "loss": 0.0002, + "step": 2195 + }, + { + "epoch": 2.80896, + "grad_norm": 0.00019762851297855377, + "learning_rate": 4.389905335786212e-06, + "loss": 0.0002, + "step": 2196 + }, + { + "epoch": 2.81024, + "grad_norm": 0.00014854299661237746, + "learning_rate": 4.3847765446728054e-06, + "loss": 0.0002, + "step": 2197 + }, + { + "epoch": 2.81152, + "grad_norm": 0.0003144988731946796, + "learning_rate": 4.3796813447216906e-06, + "loss": 0.0002, + "step": 2198 + }, + { + "epoch": 2.8128, + "grad_norm": 0.0001517291384516284, + "learning_rate": 4.374619745649099e-06, + "loss": 0.0002, + "step": 2199 + }, + { + "epoch": 2.81408, + "grad_norm": 0.0002452972694300115, + "learning_rate": 4.369591757107193e-06, + "loss": 0.0002, + "step": 2200 + }, + { + "epoch": 2.81536, + "grad_norm": 0.0003286633000243455, + "learning_rate": 4.3645973886840434e-06, + "loss": 0.0002, + "step": 2201 + }, + { + "epoch": 2.81664, + "grad_norm": 0.0004903704393655062, + "learning_rate": 4.359636649903608e-06, + "loss": 0.0002, + "step": 2202 + }, + { + "epoch": 2.81792, + "grad_norm": 0.00029815713060088456, + "learning_rate": 4.354709550225714e-06, + "loss": 0.0002, + "step": 2203 + }, + { + "epoch": 2.8192, + "grad_norm": 0.00015331384201999754, + "learning_rate": 4.34981609904604e-06, + "loss": 0.0002, + "step": 2204 + }, + { + "epoch": 2.82048, + "grad_norm": 0.000682519341353327, + "learning_rate": 4.344956305696104e-06, + "loss": 0.0002, + "step": 2205 + }, + { + "epoch": 2.8217600000000003, + "grad_norm": 0.0003466587222646922, + "learning_rate": 4.3401301794432325e-06, + "loss": 0.0002, + "step": 2206 + }, + { + "epoch": 2.8230399999999998, + "grad_norm": 0.0006293237674981356, + "learning_rate": 4.335337729490559e-06, + "loss": 0.0002, + "step": 2207 + }, + { + "epoch": 2.82432, + "grad_norm": 0.0004514796892181039, + "learning_rate": 4.330578964976992e-06, + "loss": 0.0002, + "step": 2208 + }, + { + "epoch": 2.8256, + "grad_norm": 9.345506259705871e-05, + "learning_rate": 4.32585389497721e-06, + "loss": 0.0002, + "step": 2209 + }, + { + "epoch": 2.82688, + "grad_norm": 0.0005342678050510585, + "learning_rate": 4.321162528501635e-06, + "loss": 0.0002, + "step": 2210 + }, + { + "epoch": 2.82816, + "grad_norm": 0.00011190603981958702, + "learning_rate": 4.316504874496412e-06, + "loss": 0.0002, + "step": 2211 + }, + { + "epoch": 2.82944, + "grad_norm": 0.00010925935202976689, + "learning_rate": 4.3118809418434106e-06, + "loss": 0.0002, + "step": 2212 + }, + { + "epoch": 2.83072, + "grad_norm": 0.00012517714640125632, + "learning_rate": 4.307290739360186e-06, + "loss": 0.0002, + "step": 2213 + }, + { + "epoch": 2.832, + "grad_norm": 0.00015721505042165518, + "learning_rate": 4.302734275799981e-06, + "loss": 0.0002, + "step": 2214 + }, + { + "epoch": 2.8332800000000002, + "grad_norm": 0.00013470579870045185, + "learning_rate": 4.2982115598516915e-06, + "loss": 0.0002, + "step": 2215 + }, + { + "epoch": 2.8345599999999997, + "grad_norm": 0.0003435101534705609, + "learning_rate": 4.293722600139862e-06, + "loss": 0.0002, + "step": 2216 + }, + { + "epoch": 2.83584, + "grad_norm": 0.00033612651168368757, + "learning_rate": 4.289267405224666e-06, + "loss": 0.0002, + "step": 2217 + }, + { + "epoch": 2.83712, + "grad_norm": 0.0004498395719565451, + "learning_rate": 4.284845983601892e-06, + "loss": 0.0002, + "step": 2218 + }, + { + "epoch": 2.8384, + "grad_norm": 0.00033877856913022697, + "learning_rate": 4.280458343702924e-06, + "loss": 0.0002, + "step": 2219 + }, + { + "epoch": 2.83968, + "grad_norm": 0.00011193512909812853, + "learning_rate": 4.2761044938947214e-06, + "loss": 0.0002, + "step": 2220 + }, + { + "epoch": 2.83968, + "eval_loss": 1.1815558671951294, + "eval_runtime": 43.5068, + "eval_samples_per_second": 11.538, + "eval_steps_per_second": 1.448, + "step": 2220 + }, + { + "epoch": 2.84096, + "grad_norm": 0.0005381443188525736, + "learning_rate": 4.271784442479811e-06, + "loss": 0.0002, + "step": 2221 + }, + { + "epoch": 2.84224, + "grad_norm": 0.000222622329602018, + "learning_rate": 4.2674981976962724e-06, + "loss": 0.0002, + "step": 2222 + }, + { + "epoch": 2.84352, + "grad_norm": 0.0003087735385634005, + "learning_rate": 4.263245767717712e-06, + "loss": 0.0002, + "step": 2223 + }, + { + "epoch": 2.8448, + "grad_norm": 0.00019644832354970276, + "learning_rate": 4.259027160653258e-06, + "loss": 0.0002, + "step": 2224 + }, + { + "epoch": 2.84608, + "grad_norm": 0.00019042703206650913, + "learning_rate": 4.254842384547535e-06, + "loss": 0.0002, + "step": 2225 + }, + { + "epoch": 2.84736, + "grad_norm": 0.0001009497937047854, + "learning_rate": 4.250691447380659e-06, + "loss": 0.0002, + "step": 2226 + }, + { + "epoch": 2.84864, + "grad_norm": 0.0005751587450504303, + "learning_rate": 4.2465743570682134e-06, + "loss": 0.0002, + "step": 2227 + }, + { + "epoch": 2.84992, + "grad_norm": 0.0003292016335763037, + "learning_rate": 4.242491121461242e-06, + "loss": 0.0002, + "step": 2228 + }, + { + "epoch": 2.8512, + "grad_norm": 0.0004692550573963672, + "learning_rate": 4.23844174834623e-06, + "loss": 0.0002, + "step": 2229 + }, + { + "epoch": 2.85248, + "grad_norm": 0.000195763394003734, + "learning_rate": 4.234426245445078e-06, + "loss": 0.0002, + "step": 2230 + }, + { + "epoch": 2.85376, + "grad_norm": 0.00020026469428557903, + "learning_rate": 4.230444620415114e-06, + "loss": 0.0002, + "step": 2231 + }, + { + "epoch": 2.85504, + "grad_norm": 0.0003763276035897434, + "learning_rate": 4.226496880849052e-06, + "loss": 0.0002, + "step": 2232 + }, + { + "epoch": 2.85632, + "grad_norm": 0.00046748522436246276, + "learning_rate": 4.222583034274991e-06, + "loss": 0.0002, + "step": 2233 + }, + { + "epoch": 2.8576, + "grad_norm": 0.0001498359051765874, + "learning_rate": 4.218703088156401e-06, + "loss": 0.0002, + "step": 2234 + }, + { + "epoch": 2.85888, + "grad_norm": 0.0005568740307353437, + "learning_rate": 4.214857049892103e-06, + "loss": 0.0002, + "step": 2235 + }, + { + "epoch": 2.86016, + "grad_norm": 0.0005128760822117329, + "learning_rate": 4.211044926816259e-06, + "loss": 0.0002, + "step": 2236 + }, + { + "epoch": 2.86144, + "grad_norm": 0.0001381375186610967, + "learning_rate": 4.207266726198356e-06, + "loss": 0.0002, + "step": 2237 + }, + { + "epoch": 2.86272, + "grad_norm": 0.00025074410950765014, + "learning_rate": 4.203522455243192e-06, + "loss": 0.0002, + "step": 2238 + }, + { + "epoch": 2.864, + "grad_norm": 0.0009641392389312387, + "learning_rate": 4.199812121090866e-06, + "loss": 0.0002, + "step": 2239 + }, + { + "epoch": 2.8652800000000003, + "grad_norm": 0.00031504526850767434, + "learning_rate": 4.196135730816762e-06, + "loss": 0.0002, + "step": 2240 + }, + { + "epoch": 2.8665599999999998, + "grad_norm": 0.00048518256517127156, + "learning_rate": 4.19249329143153e-06, + "loss": 0.0002, + "step": 2241 + }, + { + "epoch": 2.86784, + "grad_norm": 0.0004062238149344921, + "learning_rate": 4.18888480988108e-06, + "loss": 0.0002, + "step": 2242 + }, + { + "epoch": 2.86912, + "grad_norm": 0.0003679286455735564, + "learning_rate": 4.185310293046573e-06, + "loss": 0.0002, + "step": 2243 + }, + { + "epoch": 2.8704, + "grad_norm": 0.00010955840116366744, + "learning_rate": 4.1817697477443895e-06, + "loss": 0.0002, + "step": 2244 + }, + { + "epoch": 2.87168, + "grad_norm": 0.00031511441920883954, + "learning_rate": 4.178263180726138e-06, + "loss": 0.0002, + "step": 2245 + }, + { + "epoch": 2.87296, + "grad_norm": 0.0006907782517373562, + "learning_rate": 4.1747905986786295e-06, + "loss": 0.0002, + "step": 2246 + }, + { + "epoch": 2.87424, + "grad_norm": 0.00010518222552491352, + "learning_rate": 4.171352008223863e-06, + "loss": 0.0002, + "step": 2247 + }, + { + "epoch": 2.87552, + "grad_norm": 0.0002229425881523639, + "learning_rate": 4.167947415919027e-06, + "loss": 0.0002, + "step": 2248 + }, + { + "epoch": 2.8768000000000002, + "grad_norm": 0.0005626695929095149, + "learning_rate": 4.1645768282564715e-06, + "loss": 0.0002, + "step": 2249 + }, + { + "epoch": 2.8780799999999997, + "grad_norm": 0.0001849379186751321, + "learning_rate": 4.1612402516637e-06, + "loss": 0.0002, + "step": 2250 + }, + { + "epoch": 2.8780799999999997, + "eval_loss": 1.1795929670333862, + "eval_runtime": 43.3675, + "eval_samples_per_second": 11.575, + "eval_steps_per_second": 1.453, + "step": 2250 + }, + { + "epoch": 2.87936, + "grad_norm": 0.00012294574116822332, + "learning_rate": 4.157937692503361e-06, + "loss": 0.0002, + "step": 2251 + }, + { + "epoch": 2.88064, + "grad_norm": 0.00041280241566710174, + "learning_rate": 4.154669157073239e-06, + "loss": 0.0002, + "step": 2252 + }, + { + "epoch": 2.88192, + "grad_norm": 0.00011976512178080156, + "learning_rate": 4.15143465160623e-06, + "loss": 0.0002, + "step": 2253 + }, + { + "epoch": 2.8832, + "grad_norm": 0.0003270368033554405, + "learning_rate": 4.148234182270339e-06, + "loss": 0.0002, + "step": 2254 + }, + { + "epoch": 2.88448, + "grad_norm": 0.0001754332333803177, + "learning_rate": 4.145067755168667e-06, + "loss": 0.0002, + "step": 2255 + }, + { + "epoch": 2.88576, + "grad_norm": 0.00020488614973146468, + "learning_rate": 4.141935376339401e-06, + "loss": 0.0002, + "step": 2256 + }, + { + "epoch": 2.88704, + "grad_norm": 0.0003626338439062238, + "learning_rate": 4.138837051755794e-06, + "loss": 0.0002, + "step": 2257 + }, + { + "epoch": 2.88832, + "grad_norm": 0.0001282071170862764, + "learning_rate": 4.135772787326164e-06, + "loss": 0.0002, + "step": 2258 + }, + { + "epoch": 2.8895999999999997, + "grad_norm": 0.000176410234416835, + "learning_rate": 4.132742588893878e-06, + "loss": 0.0002, + "step": 2259 + }, + { + "epoch": 2.89088, + "grad_norm": 9.673011663835496e-05, + "learning_rate": 4.129746462237341e-06, + "loss": 0.0002, + "step": 2260 + }, + { + "epoch": 2.89216, + "grad_norm": 0.0004994468181394041, + "learning_rate": 4.126784413069985e-06, + "loss": 0.0002, + "step": 2261 + }, + { + "epoch": 2.89344, + "grad_norm": 0.0002942743303719908, + "learning_rate": 4.123856447040254e-06, + "loss": 0.0002, + "step": 2262 + }, + { + "epoch": 2.89472, + "grad_norm": 0.0006747826701030135, + "learning_rate": 4.120962569731607e-06, + "loss": 0.0002, + "step": 2263 + }, + { + "epoch": 2.896, + "grad_norm": 0.0002655930002219975, + "learning_rate": 4.118102786662489e-06, + "loss": 0.0002, + "step": 2264 + }, + { + "epoch": 2.89728, + "grad_norm": 0.00012897778651677072, + "learning_rate": 4.115277103286335e-06, + "loss": 0.0002, + "step": 2265 + }, + { + "epoch": 2.89856, + "grad_norm": 0.0007293322705663741, + "learning_rate": 4.11248552499155e-06, + "loss": 0.0002, + "step": 2266 + }, + { + "epoch": 2.89984, + "grad_norm": 0.0006220631767064333, + "learning_rate": 4.1097280571015044e-06, + "loss": 0.0002, + "step": 2267 + }, + { + "epoch": 2.90112, + "grad_norm": 0.00011390577856218442, + "learning_rate": 4.107004704874524e-06, + "loss": 0.0002, + "step": 2268 + }, + { + "epoch": 2.9024, + "grad_norm": 0.0009089137311093509, + "learning_rate": 4.104315473503875e-06, + "loss": 0.0002, + "step": 2269 + }, + { + "epoch": 2.90368, + "grad_norm": 0.0007660266710445285, + "learning_rate": 4.101660368117756e-06, + "loss": 0.0002, + "step": 2270 + }, + { + "epoch": 2.90496, + "grad_norm": 0.00010220236435998231, + "learning_rate": 4.099039393779296e-06, + "loss": 0.0002, + "step": 2271 + }, + { + "epoch": 2.90624, + "grad_norm": 0.0007261789869517088, + "learning_rate": 4.096452555486533e-06, + "loss": 0.0002, + "step": 2272 + }, + { + "epoch": 2.90752, + "grad_norm": 0.0006142670172266662, + "learning_rate": 4.0938998581724085e-06, + "loss": 0.0002, + "step": 2273 + }, + { + "epoch": 2.9088000000000003, + "grad_norm": 0.00012708887516055256, + "learning_rate": 4.0913813067047645e-06, + "loss": 0.0002, + "step": 2274 + }, + { + "epoch": 2.91008, + "grad_norm": 0.00042775258771143854, + "learning_rate": 4.088896905886322e-06, + "loss": 0.0002, + "step": 2275 + }, + { + "epoch": 2.91136, + "grad_norm": 0.0002019358944380656, + "learning_rate": 4.086446660454687e-06, + "loss": 0.0002, + "step": 2276 + }, + { + "epoch": 2.91264, + "grad_norm": 0.0001991699100472033, + "learning_rate": 4.084030575082325e-06, + "loss": 0.0002, + "step": 2277 + }, + { + "epoch": 2.91392, + "grad_norm": 0.0001914306340040639, + "learning_rate": 4.081648654376567e-06, + "loss": 0.0002, + "step": 2278 + }, + { + "epoch": 2.9152, + "grad_norm": 0.00037433000397868454, + "learning_rate": 4.07930090287959e-06, + "loss": 0.0002, + "step": 2279 + }, + { + "epoch": 2.91648, + "grad_norm": 0.00021021462453063577, + "learning_rate": 4.076987325068415e-06, + "loss": 0.0002, + "step": 2280 + }, + { + "epoch": 2.91648, + "eval_loss": 1.180129051208496, + "eval_runtime": 43.4225, + "eval_samples_per_second": 11.561, + "eval_steps_per_second": 1.451, + "step": 2280 + }, + { + "epoch": 2.91776, + "grad_norm": 0.00023146902094595134, + "learning_rate": 4.074707925354891e-06, + "loss": 0.0002, + "step": 2281 + }, + { + "epoch": 2.91904, + "grad_norm": 0.0002690425608307123, + "learning_rate": 4.072462708085697e-06, + "loss": 0.0002, + "step": 2282 + }, + { + "epoch": 2.9203200000000002, + "grad_norm": 0.00017321344057563692, + "learning_rate": 4.07025167754233e-06, + "loss": 0.0002, + "step": 2283 + }, + { + "epoch": 2.9215999999999998, + "grad_norm": 0.0005131001817062497, + "learning_rate": 4.068074837941084e-06, + "loss": 0.0002, + "step": 2284 + }, + { + "epoch": 2.92288, + "grad_norm": 0.0001270236971322447, + "learning_rate": 4.065932193433064e-06, + "loss": 0.0002, + "step": 2285 + }, + { + "epoch": 2.92416, + "grad_norm": 0.00018355673819314688, + "learning_rate": 4.063823748104164e-06, + "loss": 0.0002, + "step": 2286 + }, + { + "epoch": 2.92544, + "grad_norm": 0.00037108006654307246, + "learning_rate": 4.0617495059750585e-06, + "loss": 0.0002, + "step": 2287 + }, + { + "epoch": 2.92672, + "grad_norm": 0.0005037841037847102, + "learning_rate": 4.059709471001203e-06, + "loss": 0.0002, + "step": 2288 + }, + { + "epoch": 2.928, + "grad_norm": 0.00011262594489380717, + "learning_rate": 4.057703647072821e-06, + "loss": 0.0002, + "step": 2289 + }, + { + "epoch": 2.92928, + "grad_norm": 0.0007866703672334552, + "learning_rate": 4.055732038014895e-06, + "loss": 0.0002, + "step": 2290 + }, + { + "epoch": 2.93056, + "grad_norm": 0.00035812563146464527, + "learning_rate": 4.053794647587166e-06, + "loss": 0.0002, + "step": 2291 + }, + { + "epoch": 2.9318400000000002, + "grad_norm": 0.00012094627163605765, + "learning_rate": 4.051891479484118e-06, + "loss": 0.0002, + "step": 2292 + }, + { + "epoch": 2.9331199999999997, + "grad_norm": 0.00018673752492759377, + "learning_rate": 4.050022537334979e-06, + "loss": 0.0002, + "step": 2293 + }, + { + "epoch": 2.9344, + "grad_norm": 0.0005387719720602036, + "learning_rate": 4.048187824703708e-06, + "loss": 0.0002, + "step": 2294 + }, + { + "epoch": 2.93568, + "grad_norm": 0.000565039983484894, + "learning_rate": 4.046387345088988e-06, + "loss": 0.0002, + "step": 2295 + }, + { + "epoch": 2.93696, + "grad_norm": 0.0003313857887405902, + "learning_rate": 4.044621101924227e-06, + "loss": 0.0002, + "step": 2296 + }, + { + "epoch": 2.93824, + "grad_norm": 0.00014667944924440235, + "learning_rate": 4.042889098577545e-06, + "loss": 0.0002, + "step": 2297 + }, + { + "epoch": 2.93952, + "grad_norm": 0.0001700992143014446, + "learning_rate": 4.041191338351762e-06, + "loss": 0.0002, + "step": 2298 + }, + { + "epoch": 2.9408, + "grad_norm": 0.000125421880511567, + "learning_rate": 4.03952782448441e-06, + "loss": 0.0002, + "step": 2299 + }, + { + "epoch": 2.94208, + "grad_norm": 0.0002756211906671524, + "learning_rate": 4.037898560147704e-06, + "loss": 0.0002, + "step": 2300 + }, + { + "epoch": 2.94336, + "grad_norm": 0.00016799187869764864, + "learning_rate": 4.036303548448556e-06, + "loss": 0.0002, + "step": 2301 + }, + { + "epoch": 2.94464, + "grad_norm": 0.0005784616805613041, + "learning_rate": 4.034742792428553e-06, + "loss": 0.0002, + "step": 2302 + }, + { + "epoch": 2.94592, + "grad_norm": 0.00011791891301982105, + "learning_rate": 4.033216295063964e-06, + "loss": 0.0002, + "step": 2303 + }, + { + "epoch": 2.9472, + "grad_norm": 0.00036018676473759115, + "learning_rate": 4.031724059265726e-06, + "loss": 0.0002, + "step": 2304 + }, + { + "epoch": 2.94848, + "grad_norm": 0.00035507127176970243, + "learning_rate": 4.0302660878794435e-06, + "loss": 0.0002, + "step": 2305 + }, + { + "epoch": 2.94976, + "grad_norm": 0.0001190779876196757, + "learning_rate": 4.0288423836853805e-06, + "loss": 0.0002, + "step": 2306 + }, + { + "epoch": 2.95104, + "grad_norm": 0.0005441626999527216, + "learning_rate": 4.02745294939845e-06, + "loss": 0.0002, + "step": 2307 + }, + { + "epoch": 2.9523200000000003, + "grad_norm": 0.0001164350105682388, + "learning_rate": 4.026097787668224e-06, + "loss": 0.0002, + "step": 2308 + }, + { + "epoch": 2.9536, + "grad_norm": 0.0001243723090738058, + "learning_rate": 4.0247769010789095e-06, + "loss": 0.0002, + "step": 2309 + }, + { + "epoch": 2.95488, + "grad_norm": 0.00012079760199412704, + "learning_rate": 4.023490292149359e-06, + "loss": 0.0002, + "step": 2310 + }, + { + "epoch": 2.95488, + "eval_loss": 1.1803983449935913, + "eval_runtime": 43.384, + "eval_samples_per_second": 11.571, + "eval_steps_per_second": 1.452, + "step": 2310 + }, + { + "epoch": 2.95616, + "grad_norm": 0.00018000841373577714, + "learning_rate": 4.022237963333059e-06, + "loss": 0.0002, + "step": 2311 + }, + { + "epoch": 2.95744, + "grad_norm": 0.00032567320158705115, + "learning_rate": 4.021019917018121e-06, + "loss": 0.0002, + "step": 2312 + }, + { + "epoch": 2.95872, + "grad_norm": 0.0001935055770445615, + "learning_rate": 4.01983615552729e-06, + "loss": 0.0002, + "step": 2313 + }, + { + "epoch": 2.96, + "grad_norm": 0.00019914534641429782, + "learning_rate": 4.0186866811179235e-06, + "loss": 0.0002, + "step": 2314 + }, + { + "epoch": 2.96128, + "grad_norm": 0.0004246193275321275, + "learning_rate": 4.017571495982e-06, + "loss": 0.0002, + "step": 2315 + }, + { + "epoch": 2.96256, + "grad_norm": 0.0001828927779570222, + "learning_rate": 4.016490602246111e-06, + "loss": 0.0002, + "step": 2316 + }, + { + "epoch": 2.9638400000000003, + "grad_norm": 0.00032234840909950435, + "learning_rate": 4.015444001971456e-06, + "loss": 0.0002, + "step": 2317 + }, + { + "epoch": 2.9651199999999998, + "grad_norm": 0.0006089527742005885, + "learning_rate": 4.014431697153837e-06, + "loss": 0.0002, + "step": 2318 + }, + { + "epoch": 2.9664, + "grad_norm": 0.00029877014458179474, + "learning_rate": 4.013453689723657e-06, + "loss": 0.0002, + "step": 2319 + }, + { + "epoch": 2.96768, + "grad_norm": 0.0003506283101160079, + "learning_rate": 4.01250998154592e-06, + "loss": 0.0002, + "step": 2320 + }, + { + "epoch": 2.96896, + "grad_norm": 0.0006300249951891601, + "learning_rate": 4.011600574420216e-06, + "loss": 0.0002, + "step": 2321 + }, + { + "epoch": 2.97024, + "grad_norm": 0.0005357094341889024, + "learning_rate": 4.010725470080733e-06, + "loss": 0.0002, + "step": 2322 + }, + { + "epoch": 2.97152, + "grad_norm": 0.00011800980428233743, + "learning_rate": 4.009884670196239e-06, + "loss": 0.0002, + "step": 2323 + }, + { + "epoch": 2.9728, + "grad_norm": 0.0004599698877427727, + "learning_rate": 4.009078176370089e-06, + "loss": 0.0002, + "step": 2324 + }, + { + "epoch": 2.97408, + "grad_norm": 0.0005400791415013373, + "learning_rate": 4.008305990140219e-06, + "loss": 0.0002, + "step": 2325 + }, + { + "epoch": 2.9753600000000002, + "grad_norm": 0.00037052438710816205, + "learning_rate": 4.00756811297914e-06, + "loss": 0.0002, + "step": 2326 + }, + { + "epoch": 2.9766399999999997, + "grad_norm": 0.00015403024735860527, + "learning_rate": 4.006864546293941e-06, + "loss": 0.0002, + "step": 2327 + }, + { + "epoch": 2.97792, + "grad_norm": 0.0002792374580167234, + "learning_rate": 4.00619529142628e-06, + "loss": 0.0002, + "step": 2328 + }, + { + "epoch": 2.9792, + "grad_norm": 0.0011096937814727426, + "learning_rate": 4.005560349652384e-06, + "loss": 0.0002, + "step": 2329 + }, + { + "epoch": 2.98048, + "grad_norm": 0.00017472790204919875, + "learning_rate": 4.00495972218305e-06, + "loss": 0.0002, + "step": 2330 + }, + { + "epoch": 2.98176, + "grad_norm": 0.0001532040478195995, + "learning_rate": 4.004393410163635e-06, + "loss": 0.0002, + "step": 2331 + }, + { + "epoch": 2.98304, + "grad_norm": 0.000590935698710382, + "learning_rate": 4.003861414674068e-06, + "loss": 0.0002, + "step": 2332 + }, + { + "epoch": 2.98432, + "grad_norm": 0.0006752461194992065, + "learning_rate": 4.003363736728825e-06, + "loss": 0.0002, + "step": 2333 + }, + { + "epoch": 2.9856, + "grad_norm": 0.00026696111308410764, + "learning_rate": 4.002900377276953e-06, + "loss": 0.0002, + "step": 2334 + }, + { + "epoch": 2.98688, + "grad_norm": 0.00044541782699525356, + "learning_rate": 4.002471337202048e-06, + "loss": 0.0002, + "step": 2335 + }, + { + "epoch": 2.98816, + "grad_norm": 0.0009002209990285337, + "learning_rate": 4.002076617322264e-06, + "loss": 0.0002, + "step": 2336 + }, + { + "epoch": 2.98944, + "grad_norm": 0.00013183237751945853, + "learning_rate": 4.001716218390306e-06, + "loss": 0.0002, + "step": 2337 + }, + { + "epoch": 2.99072, + "grad_norm": 0.00016939207853283733, + "learning_rate": 4.001390141093436e-06, + "loss": 0.0002, + "step": 2338 + }, + { + "epoch": 2.992, + "grad_norm": 0.0005103266448713839, + "learning_rate": 4.0010983860534634e-06, + "loss": 0.0002, + "step": 2339 + }, + { + "epoch": 2.99328, + "grad_norm": 0.0006204844103194773, + "learning_rate": 4.000840953826744e-06, + "loss": 0.0002, + "step": 2340 + }, + { + "epoch": 2.99328, + "eval_loss": 1.179864525794983, + "eval_runtime": 43.3601, + "eval_samples_per_second": 11.577, + "eval_steps_per_second": 1.453, + "step": 2340 + }, + { + "epoch": 2.99456, + "grad_norm": 0.00016618035442661494, + "learning_rate": 4.000617844904188e-06, + "loss": 0.0002, + "step": 2341 + }, + { + "epoch": 2.99584, + "grad_norm": 0.0002050574403256178, + "learning_rate": 4.0004290597112505e-06, + "loss": 0.0002, + "step": 2342 + }, + { + "epoch": 2.99712, + "grad_norm": 0.0006428991910070181, + "learning_rate": 4.0002745986079325e-06, + "loss": 0.0002, + "step": 2343 + }, + { + "epoch": 2.9984, + "grad_norm": 0.0003186491085216403, + "learning_rate": 4.0001544618887825e-06, + "loss": 0.0002, + "step": 2344 + }, + { + "epoch": 2.99968, + "grad_norm": 0.0002871889155358076, + "learning_rate": 4.000068649782895e-06, + "loss": 0.0002, + "step": 2345 + }, + { + "epoch": 3.0, + "grad_norm": 0.0007445301744155586, + "learning_rate": 4.000017162453906e-06, + "loss": 0.0002, + "step": 2346 + } + ], + "logging_steps": 1, + "max_steps": 2346, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0606903825807704e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}