{
  "best_metric": 0.7249829173088074,
  "best_model_checkpoint": "saves/starcoder2-7b/lora/sft/checkpoint-5000",
  "epoch": 0.7980845969672785,
  "eval_steps": 100,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 0.628385603427887,
      "learning_rate": 4.999999126897802e-05,
      "loss": 1.2582,
      "step": 5
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.0855119228363037,
      "learning_rate": 4.999996507591817e-05,
      "loss": 0.801,
      "step": 10
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.5689586400985718,
      "learning_rate": 4.9999921420838745e-05,
      "loss": 1.067,
      "step": 15
    },
    {
      "epoch": 0.0,
      "grad_norm": 2.0851330757141113,
      "learning_rate": 4.999986030377024e-05,
      "loss": 1.2953,
      "step": 20
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.397479772567749,
      "learning_rate": 4.999978172475535e-05,
      "loss": 0.9826,
      "step": 25
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.344118595123291,
      "learning_rate": 4.9999685683848954e-05,
      "loss": 0.9485,
      "step": 30
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.158163070678711,
      "learning_rate": 4.9999596278606616e-05,
      "loss": 0.8103,
      "step": 35
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.602233648300171,
      "learning_rate": 4.999946880647276e-05,
      "loss": 0.8648,
      "step": 40
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.557242751121521,
      "learning_rate": 4.999932387266596e-05,
      "loss": 1.0198,
      "step": 45
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.36068856716156,
      "learning_rate": 4.999916147728746e-05,
      "loss": 0.9367,
      "step": 50
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.3263639211654663,
      "learning_rate": 4.999898162045068e-05,
      "loss": 0.9695,
      "step": 55
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.333601474761963,
      "learning_rate": 4.999878430228126e-05,
      "loss": 1.1509,
      "step": 60
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.4753800630569458,
      "learning_rate": 4.999856952291702e-05,
      "loss": 1.1461,
      "step": 65
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.5096240043640137,
      "learning_rate": 4.9998337282507965e-05,
      "loss": 1.1722,
      "step": 70
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.189892053604126,
      "learning_rate": 4.999808758121633e-05,
      "loss": 1.1834,
      "step": 75
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.9292634725570679,
      "learning_rate": 4.999782041921651e-05,
      "loss": 0.9498,
      "step": 80
    },
    {
      "epoch": 0.01,
      "grad_norm": 2.1775777339935303,
      "learning_rate": 4.9997535796695134e-05,
      "loss": 0.9346,
      "step": 85
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.6854296922683716,
      "learning_rate": 4.999723371385099e-05,
      "loss": 1.119,
      "step": 90
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.4571490287780762,
      "learning_rate": 4.999691417089507e-05,
      "loss": 0.8671,
      "step": 95
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.277044653892517,
      "learning_rate": 4.999657716805059e-05,
      "loss": 1.2469,
      "step": 100
    },
    {
      "epoch": 0.02,
      "eval_loss": 0.8478816747665405,
      "eval_runtime": 96.2736,
      "eval_samples_per_second": 7.24,
      "eval_steps_per_second": 7.24,
      "step": 100
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.6687743067741394,
      "learning_rate": 4.9996222705552933e-05,
      "loss": 0.735,
      "step": 105
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.3488354682922363,
      "learning_rate": 4.9995850783649665e-05,
      "loss": 0.8344,
      "step": 110
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.1043323278427124,
      "learning_rate": 4.9995461402600593e-05,
      "loss": 0.8254,
      "step": 115
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.9382895827293396,
      "learning_rate": 4.9995054562677684e-05,
      "loss": 0.9179,
      "step": 120
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.2824612855911255,
      "learning_rate": 4.9994630264165107e-05,
      "loss": 0.8663,
      "step": 125
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.0491925477981567,
      "learning_rate": 4.999418850735923e-05,
      "loss": 0.9247,
      "step": 130
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.3642233610153198,
      "learning_rate": 4.99937292925686e-05,
      "loss": 0.8253,
      "step": 135
    },
    {
      "epoch": 0.02,
      "grad_norm": 3.747757911682129,
      "learning_rate": 4.9993252620113976e-05,
      "loss": 1.0245,
      "step": 140
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.299494981765747,
      "learning_rate": 4.999275849032832e-05,
      "loss": 0.8723,
      "step": 145
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.7195830345153809,
      "learning_rate": 4.999224690355675e-05,
      "loss": 1.0524,
      "step": 150
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.9922987222671509,
      "learning_rate": 4.9991717860156616e-05,
      "loss": 0.9502,
      "step": 155
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.0577458143234253,
      "learning_rate": 4.9991171360497437e-05,
      "loss": 1.0115,
      "step": 160
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.0001195669174194,
      "learning_rate": 4.999060740496093e-05,
      "loss": 1.1999,
      "step": 165
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.2456804513931274,
      "learning_rate": 4.999002599394102e-05,
      "loss": 0.8882,
      "step": 170
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.0445325374603271,
      "learning_rate": 4.9989427127843814e-05,
      "loss": 1.0615,
      "step": 175
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.2410887479782104,
      "learning_rate": 4.9988810807087584e-05,
      "loss": 1.1068,
      "step": 180
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.8935971260070801,
      "learning_rate": 4.998817703210285e-05,
      "loss": 0.6683,
      "step": 185
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.1614488363265991,
      "learning_rate": 4.9987525803332265e-05,
      "loss": 0.7446,
      "step": 190
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.9392004013061523,
      "learning_rate": 4.998685712123072e-05,
      "loss": 0.7397,
      "step": 195
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.0314444303512573,
      "learning_rate": 4.9986170986265266e-05,
      "loss": 1.3584,
      "step": 200
    },
    {
      "epoch": 0.03,
      "eval_loss": 0.8368077278137207,
      "eval_runtime": 96.5262,
      "eval_samples_per_second": 7.221,
      "eval_steps_per_second": 7.221,
      "step": 200
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.8964811563491821,
      "learning_rate": 4.998546739891516e-05,
      "loss": 0.9546,
      "step": 205
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.0679796934127808,
      "learning_rate": 4.998474635967185e-05,
      "loss": 0.864,
      "step": 210
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.2340985536575317,
      "learning_rate": 4.998400786903896e-05,
      "loss": 0.885,
      "step": 215
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.7219617366790771,
      "learning_rate": 4.9983251927532315e-05,
      "loss": 1.1069,
      "step": 220
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.1480705738067627,
      "learning_rate": 4.9982478535679924e-05,
      "loss": 1.0416,
      "step": 225
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.515589714050293,
      "learning_rate": 4.9981687694021996e-05,
      "loss": 1.1844,
      "step": 230
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.6687963008880615,
      "learning_rate": 4.998087940311091e-05,
      "loss": 0.8664,
      "step": 235
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.9256645441055298,
      "learning_rate": 4.998005366351125e-05,
      "loss": 1.0125,
      "step": 240
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.2500052452087402,
      "learning_rate": 4.997921047579978e-05,
      "loss": 1.1374,
      "step": 245
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.0543216466903687,
      "learning_rate": 4.9978349840565434e-05,
      "loss": 0.8502,
      "step": 250
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.3009012937545776,
      "learning_rate": 4.997747175840937e-05,
      "loss": 1.0357,
      "step": 255
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.8456661105155945,
      "learning_rate": 4.997657622994491e-05,
      "loss": 0.6883,
      "step": 260
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.5856515765190125,
      "learning_rate": 4.9975663255797555e-05,
      "loss": 0.7656,
      "step": 265
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.973818302154541,
      "learning_rate": 4.997473283660501e-05,
      "loss": 0.823,
      "step": 270
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.9960187673568726,
      "learning_rate": 4.997378497301715e-05,
      "loss": 0.8726,
      "step": 275
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.2900679111480713,
      "learning_rate": 4.997281966569604e-05,
      "loss": 0.9781,
      "step": 280
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.828894853591919,
      "learning_rate": 4.9971836915315926e-05,
      "loss": 0.8932,
      "step": 285
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.239621877670288,
      "learning_rate": 4.9970836722563256e-05,
      "loss": 1.2022,
      "step": 290
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.0117149353027344,
      "learning_rate": 4.996981908813664e-05,
      "loss": 0.8032,
      "step": 295
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.8861119747161865,
      "learning_rate": 4.996878401274687e-05,
      "loss": 1.0651,
      "step": 300
    },
    {
      "epoch": 0.05,
      "eval_loss": 0.8281473517417908,
      "eval_runtime": 96.5283,
      "eval_samples_per_second": 7.221,
      "eval_steps_per_second": 7.221,
      "step": 300
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.8583046197891235,
      "learning_rate": 4.996773149711693e-05,
      "loss": 0.8784,
      "step": 305
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.5717499256134033,
      "learning_rate": 4.9966661541981984e-05,
      "loss": 0.8395,
      "step": 310
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.982342004776001,
      "learning_rate": 4.9965574148089376e-05,
      "loss": 0.9869,
      "step": 315
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.9000777006149292,
      "learning_rate": 4.9964469316198633e-05,
      "loss": 0.8435,
      "step": 320
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.8733209371566772,
      "learning_rate": 4.9963347047081464e-05,
      "loss": 0.7281,
      "step": 325
    },
    {
      "epoch": 0.05,
      "grad_norm": 3.323739767074585,
      "learning_rate": 4.9962207341521746e-05,
      "loss": 1.1013,
      "step": 330
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.7102876901626587,
      "learning_rate": 4.996105020031554e-05,
      "loss": 0.8276,
      "step": 335
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.9196123480796814,
      "learning_rate": 4.995987562427109e-05,
      "loss": 0.8274,
      "step": 340
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.210099458694458,
      "learning_rate": 4.995868361420883e-05,
      "loss": 1.3257,
      "step": 345
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.8923581838607788,
      "learning_rate": 4.9957474170961335e-05,
      "loss": 0.6815,
      "step": 350
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.9576735496520996,
      "learning_rate": 4.9956247295373396e-05,
      "loss": 1.23,
      "step": 355
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.3774089813232422,
      "learning_rate": 4.995500298830196e-05,
      "loss": 1.0556,
      "step": 360
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.1523677110671997,
      "learning_rate": 4.995374125061614e-05,
      "loss": 1.1787,
      "step": 365
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.8310608863830566,
      "learning_rate": 4.9952462083197246e-05,
      "loss": 0.8525,
      "step": 370
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.9814196825027466,
      "learning_rate": 4.9951165486938765e-05,
      "loss": 0.8522,
      "step": 375
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.9878122210502625,
      "learning_rate": 4.994985146274633e-05,
      "loss": 0.6618,
      "step": 380
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.2652586698532104,
      "learning_rate": 4.994852001153777e-05,
      "loss": 1.0489,
      "step": 385
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.2940975427627563,
      "learning_rate": 4.994717113424307e-05,
      "loss": 1.104,
      "step": 390
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.9636249542236328,
      "learning_rate": 4.99458048318044e-05,
      "loss": 0.9228,
      "step": 395
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.8122813105583191,
      "learning_rate": 4.994442110517611e-05,
      "loss": 0.9209,
      "step": 400
    },
    {
      "epoch": 0.06,
      "eval_loss": 0.8184689879417419,
      "eval_runtime": 96.4572,
      "eval_samples_per_second": 7.226,
      "eval_steps_per_second": 7.226,
      "step": 400
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.8742052912712097,
      "learning_rate": 4.99430199553247e-05,
      "loss": 0.9608,
      "step": 405
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.5679522752761841,
      "learning_rate": 4.9941601383228835e-05,
      "loss": 0.5963,
      "step": 410
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.0234627723693848,
      "learning_rate": 4.994016538987938e-05,
      "loss": 0.8642,
      "step": 415
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.8581897616386414,
      "learning_rate": 4.993871197627934e-05,
      "loss": 0.8993,
      "step": 420
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.4666485786437988,
      "learning_rate": 4.9937241143443904e-05,
      "loss": 0.8565,
      "step": 425
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.1166578531265259,
      "learning_rate": 4.993575289240041e-05,
      "loss": 0.881,
      "step": 430
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.303992748260498,
      "learning_rate": 4.9934247224188393e-05,
      "loss": 0.9962,
      "step": 435
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.9011989235877991,
      "learning_rate": 4.993272413985952e-05,
      "loss": 0.9316,
      "step": 440
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.8321458101272583,
      "learning_rate": 4.993118364047764e-05,
      "loss": 0.7889,
      "step": 445
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.7780352234840393,
      "learning_rate": 4.992962572711877e-05,
      "loss": 0.8287,
      "step": 450
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.9090210199356079,
      "learning_rate": 4.992805040087108e-05,
      "loss": 0.7018,
      "step": 455
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.8694137334823608,
      "learning_rate": 4.9926457662834906e-05,
      "loss": 0.8484,
      "step": 460
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.6327371001243591,
      "learning_rate": 4.992484751412274e-05,
      "loss": 0.716,
      "step": 465
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.200668215751648,
      "learning_rate": 4.9923219955859254e-05,
      "loss": 0.9525,
      "step": 470
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.8530198931694031,
      "learning_rate": 4.9921574989181266e-05,
      "loss": 0.744,
      "step": 475
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.168479323387146,
      "learning_rate": 4.991991261523775e-05,
      "loss": 0.729,
      "step": 480
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.9499714970588684,
      "learning_rate": 4.9918232835189834e-05,
      "loss": 0.7725,
      "step": 485
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.8434467911720276,
      "learning_rate": 4.991653565021084e-05,
      "loss": 1.1558,
      "step": 490
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.7665804624557495,
      "learning_rate": 4.99148210614862e-05,
      "loss": 1.0208,
      "step": 495
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.5782546401023865,
      "learning_rate": 4.991308907021353e-05,
      "loss": 0.8306,
      "step": 500
    },
    {
      "epoch": 0.08,
      "eval_loss": 0.8132078051567078,
      "eval_runtime": 96.433,
      "eval_samples_per_second": 7.228,
      "eval_steps_per_second": 7.228,
      "step": 500
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.0821778774261475,
      "learning_rate": 4.9911339677602584e-05,
      "loss": 0.9503,
      "step": 505
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.5409029126167297,
      "learning_rate": 4.99095728848753e-05,
      "loss": 0.8586,
      "step": 510
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.9011789560317993,
      "learning_rate": 4.990778869326575e-05,
      "loss": 0.7981,
      "step": 515
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.0092263221740723,
      "learning_rate": 4.990598710402013e-05,
      "loss": 1.0174,
      "step": 520
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.4362307786941528,
      "learning_rate": 4.9904168118396844e-05,
      "loss": 0.8373,
      "step": 525
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.1772639751434326,
      "learning_rate": 4.9902331737666414e-05,
      "loss": 0.9599,
      "step": 530
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.9610542058944702,
      "learning_rate": 4.990047796311151e-05,
      "loss": 0.6895,
      "step": 535
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.9922348260879517,
      "learning_rate": 4.989860679602698e-05,
      "loss": 0.7315,
      "step": 540
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.2409151792526245,
      "learning_rate": 4.9896718237719785e-05,
      "loss": 0.8574,
      "step": 545
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.016333818435669,
      "learning_rate": 4.9894812289509046e-05,
      "loss": 1.1248,
      "step": 550
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.9131489396095276,
      "learning_rate": 4.989288895272604e-05,
      "loss": 0.9847,
      "step": 555
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.215469479560852,
      "learning_rate": 4.989094822871419e-05,
      "loss": 0.912,
      "step": 560
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.0536105632781982,
      "learning_rate": 4.988899011882903e-05,
      "loss": 0.8425,
      "step": 565
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.9705311059951782,
      "learning_rate": 4.988701462443829e-05,
      "loss": 0.9385,
      "step": 570
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.2488442659378052,
      "learning_rate": 4.98850217469218e-05,
      "loss": 0.7865,
      "step": 575
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.7318600416183472,
      "learning_rate": 4.988301148767157e-05,
      "loss": 0.8231,
      "step": 580
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.8247858881950378,
      "learning_rate": 4.9880983848091704e-05,
      "loss": 0.8553,
      "step": 585
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.858172595500946,
      "learning_rate": 4.987893882959849e-05,
      "loss": 1.3952,
      "step": 590
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.2286418676376343,
      "learning_rate": 4.987687643362033e-05,
      "loss": 0.837,
      "step": 595
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.034350872039795,
      "learning_rate": 4.9874796661597765e-05,
      "loss": 0.9175,
      "step": 600
    },
    {
      "epoch": 0.1,
      "eval_loss": 0.8063747882843018,
      "eval_runtime": 96.4224,
      "eval_samples_per_second": 7.229,
      "eval_steps_per_second": 7.229,
      "step": 600
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.7192366123199463,
      "learning_rate": 4.987269951498348e-05,
      "loss": 0.8563,
      "step": 605
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.2645854949951172,
      "learning_rate": 4.98705849952423e-05,
      "loss": 0.6663,
      "step": 610
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.0610381364822388,
      "learning_rate": 4.9868453103851176e-05,
      "loss": 0.8452,
      "step": 615
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.8550002574920654,
      "learning_rate": 4.986630384229919e-05,
      "loss": 0.8894,
      "step": 620
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.7490519285202026,
      "learning_rate": 4.986413721208757e-05,
      "loss": 0.9106,
      "step": 625
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.557860255241394,
      "learning_rate": 4.986195321472965e-05,
      "loss": 0.685,
      "step": 630
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.7450752258300781,
      "learning_rate": 4.9859751851750934e-05,
      "loss": 0.8472,
      "step": 635
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.176376461982727,
      "learning_rate": 4.985753312468903e-05,
      "loss": 1.0197,
      "step": 640
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.0625300407409668,
      "learning_rate": 4.985529703509367e-05,
      "loss": 0.9685,
      "step": 645
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.8808372616767883,
      "learning_rate": 4.985304358452672e-05,
      "loss": 0.8612,
      "step": 650
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.8110201954841614,
      "learning_rate": 4.985077277456218e-05,
      "loss": 0.8401,
      "step": 655
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.9364888072013855,
      "learning_rate": 4.984848460678618e-05,
      "loss": 0.6197,
      "step": 660
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.0113518238067627,
      "learning_rate": 4.984617908279694e-05,
      "loss": 0.9889,
      "step": 665
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.1148868799209595,
      "learning_rate": 4.984385620420485e-05,
      "loss": 0.9558,
      "step": 670
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.9506175518035889,
      "learning_rate": 4.984151597263238e-05,
      "loss": 0.7323,
      "step": 675
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.0044193267822266,
      "learning_rate": 4.983915838971415e-05,
      "loss": 0.7504,
      "step": 680
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.2674214839935303,
      "learning_rate": 4.9836783457096875e-05,
      "loss": 1.032,
      "step": 685
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.4945333003997803,
      "learning_rate": 4.983439117643942e-05,
      "loss": 1.0359,
      "step": 690
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.9860715866088867,
      "learning_rate": 4.9831981549412744e-05,
      "loss": 1.1152,
      "step": 695
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.8287227153778076,
      "learning_rate": 4.982955457769992e-05,
      "loss": 0.8157,
      "step": 700
    },
    {
      "epoch": 0.11,
      "eval_loss": 0.8022791743278503,
      "eval_runtime": 96.5324,
      "eval_samples_per_second": 7.22,
      "eval_steps_per_second": 7.22,
      "step": 700
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.9216273427009583,
      "learning_rate": 4.9827110262996144e-05,
      "loss": 0.8395,
      "step": 705
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.7642357349395752,
      "learning_rate": 4.982464860700874e-05,
      "loss": 0.8817,
      "step": 710
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.8851175308227539,
      "learning_rate": 4.982216961145711e-05,
      "loss": 0.8558,
      "step": 715
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.44226109981536865,
      "learning_rate": 4.98196732780728e-05,
      "loss": 0.882,
      "step": 720
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8005027174949646,
      "learning_rate": 4.981715960859945e-05,
      "loss": 0.8835,
      "step": 725
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.7451304793357849,
      "learning_rate": 4.981462860479281e-05,
      "loss": 0.8551,
      "step": 730
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.1069347858428955,
      "learning_rate": 4.9812080268420745e-05,
      "loss": 0.999,
      "step": 735
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8892244100570679,
      "learning_rate": 4.980951460126322e-05,
      "loss": 1.012,
      "step": 740
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8935977816581726,
      "learning_rate": 4.9806931605112305e-05,
      "loss": 0.9911,
      "step": 745
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8456961512565613,
      "learning_rate": 4.9804331281772176e-05,
      "loss": 0.7595,
      "step": 750
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.78443443775177,
      "learning_rate": 4.980171363305911e-05,
      "loss": 0.8308,
      "step": 755
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.0028038024902344,
      "learning_rate": 4.979907866080149e-05,
      "loss": 0.9637,
      "step": 760
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.1801577806472778,
      "learning_rate": 4.9796426366839786e-05,
      "loss": 0.6159,
      "step": 765
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8370681405067444,
      "learning_rate": 4.979375675302659e-05,
      "loss": 0.9276,
      "step": 770
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8605382442474365,
      "learning_rate": 4.979106982122658e-05,
      "loss": 1.1077,
      "step": 775
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.7788259387016296,
      "learning_rate": 4.978836557331652e-05,
      "loss": 0.8172,
      "step": 780
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.4312686920166016,
      "learning_rate": 4.978564401118528e-05,
      "loss": 0.8759,
      "step": 785
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.9109662175178528,
      "learning_rate": 4.978290513673381e-05,
      "loss": 0.947,
      "step": 790
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.1819065809249878,
      "learning_rate": 4.9780148951875195e-05,
      "loss": 0.7364,
      "step": 795
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.9400575160980225,
      "learning_rate": 4.977737545853455e-05,
      "loss": 0.9469,
      "step": 800
    },
    {
      "epoch": 0.13,
      "eval_loss": 0.7995806932449341,
      "eval_runtime": 96.5877,
      "eval_samples_per_second": 7.216,
      "eval_steps_per_second": 7.216,
      "step": 800
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.693812370300293,
      "learning_rate": 4.9774584658649126e-05,
      "loss": 0.9433,
      "step": 805
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.0892895460128784,
      "learning_rate": 4.9771776554168234e-05,
      "loss": 0.7027,
      "step": 810
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.9118362665176392,
      "learning_rate": 4.976895114705329e-05,
      "loss": 0.9468,
      "step": 815
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.8032681345939636,
      "learning_rate": 4.976610843927779e-05,
      "loss": 0.7927,
      "step": 820
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.168225646018982,
      "learning_rate": 4.976324843282732e-05,
      "loss": 0.9673,
      "step": 825
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.077602744102478,
      "learning_rate": 4.976037112969953e-05,
      "loss": 0.9156,
      "step": 830
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.8643108606338501,
      "learning_rate": 4.9757476531904165e-05,
      "loss": 0.6999,
      "step": 835
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.933397650718689,
      "learning_rate": 4.975456464146306e-05,
      "loss": 0.8828,
      "step": 840
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.7036295533180237,
      "learning_rate": 4.975163546041011e-05,
      "loss": 0.8709,
      "step": 845
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.5974694490432739,
      "learning_rate": 4.974868899079128e-05,
      "loss": 0.7594,
      "step": 850
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.7244943380355835,
      "learning_rate": 4.974572523466465e-05,
      "loss": 0.8714,
      "step": 855
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.5783522725105286,
      "learning_rate": 4.9742744194100345e-05,
      "loss": 0.8941,
      "step": 860
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.7480617761611938,
      "learning_rate": 4.973974587118055e-05,
      "loss": 0.9798,
      "step": 865
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.7548874020576477,
      "learning_rate": 4.973673026799956e-05,
      "loss": 0.7767,
      "step": 870
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.7075071930885315,
      "learning_rate": 4.97336973866637e-05,
      "loss": 0.7779,
      "step": 875
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.7042987942695618,
      "learning_rate": 4.97306472292914e-05,
      "loss": 0.8249,
      "step": 880
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.0242459774017334,
      "learning_rate": 4.972757979801313e-05,
      "loss": 0.9223,
      "step": 885
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.6138095259666443,
      "learning_rate": 4.9724495094971436e-05,
      "loss": 0.9842,
      "step": 890
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.7905042767524719,
      "learning_rate": 4.9721393122320925e-05,
      "loss": 0.8738,
      "step": 895
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.9658048748970032,
      "learning_rate": 4.9718273882228265e-05,
      "loss": 0.8872,
      "step": 900
    },
    {
      "epoch": 0.14,
      "eval_loss": 0.7954564690589905,
      "eval_runtime": 96.643,
      "eval_samples_per_second": 7.212,
      "eval_steps_per_second": 7.212,
      "step": 900
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.8425014019012451,
      "learning_rate": 4.97151373768722e-05,
      "loss": 0.778,
      "step": 905
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.5527231693267822,
      "learning_rate": 4.971198360844351e-05,
      "loss": 0.8332,
      "step": 910
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7870334386825562,
      "learning_rate": 4.9708812579145056e-05,
      "loss": 0.9265,
      "step": 915
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.9935321807861328,
      "learning_rate": 4.970562429119173e-05,
      "loss": 0.7243,
      "step": 920
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.9546892046928406,
      "learning_rate": 4.970241874681051e-05,
      "loss": 0.9908,
      "step": 925
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7340118885040283,
      "learning_rate": 4.969919594824039e-05,
      "loss": 0.7932,
      "step": 930
    },
    {
      "epoch": 0.15,
      "grad_norm": 5.1686015129089355,
      "learning_rate": 4.9695955897732453e-05,
      "loss": 0.9842,
      "step": 935
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.9721456170082092,
      "learning_rate": 4.9692698597549815e-05,
      "loss": 0.9271,
      "step": 940
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.6477334499359131,
      "learning_rate": 4.9689424049967623e-05,
      "loss": 0.934,
      "step": 945
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.0759055614471436,
      "learning_rate": 4.968613225727311e-05,
      "loss": 1.0465,
      "step": 950
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7222158908843994,
      "learning_rate": 4.968282322176552e-05,
      "loss": 0.7732,
      "step": 955
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.8591343760490417,
      "learning_rate": 4.9679496945756155e-05,
      "loss": 0.9062,
      "step": 960
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.8495111465454102,
      "learning_rate": 4.967615343156837e-05,
      "loss": 0.8861,
      "step": 965
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.6847331523895264,
      "learning_rate": 4.967279268153753e-05,
      "loss": 0.8001,
      "step": 970
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.690113365650177,
      "learning_rate": 4.9669414698011074e-05,
      "loss": 0.7378,
      "step": 975
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.8349626064300537,
      "learning_rate": 4.9666019483348456e-05,
      "loss": 0.7193,
      "step": 980
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.6444108486175537,
      "learning_rate": 4.966260703992116e-05,
      "loss": 0.8729,
      "step": 985
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.9515655040740967,
      "learning_rate": 4.965917737011274e-05,
      "loss": 0.7532,
      "step": 990
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.8138986229896545,
      "learning_rate": 4.965573047631873e-05,
      "loss": 1.0124,
      "step": 995
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.0182080268859863,
      "learning_rate": 4.9652266360946745e-05,
      "loss": 0.8842,
      "step": 1000
    },
    {
      "epoch": 0.16,
      "eval_loss": 0.7912728190422058,
      "eval_runtime": 96.5004,
      "eval_samples_per_second": 7.223,
      "eval_steps_per_second": 7.223,
      "step": 1000
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.9665297269821167,
      "learning_rate": 4.96487850264164e-05,
      "loss": 1.0155,
      "step": 1005
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.1356585025787354,
      "learning_rate": 4.964528647515933e-05,
      "loss": 0.8705,
      "step": 1010
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.5548833608627319,
      "learning_rate": 4.9641770709619234e-05,
      "loss": 0.9634,
      "step": 1015
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.8028444647789001,
      "learning_rate": 4.9638237732251794e-05,
      "loss": 0.8722,
      "step": 1020
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.934234082698822,
      "learning_rate": 4.9634687545524724e-05,
      "loss": 0.9731,
      "step": 1025
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.7293463349342346,
      "learning_rate": 4.963112015191778e-05,
      "loss": 1.0237,
      "step": 1030
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.6442769169807434,
      "learning_rate": 4.962753555392271e-05,
      "loss": 1.1331,
      "step": 1035
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.7877534031867981,
      "learning_rate": 4.962393375404331e-05,
      "loss": 1.0737,
      "step": 1040
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.5739997625350952,
      "learning_rate": 4.9620314754795343e-05,
      "loss": 0.8836,
      "step": 1045
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.7318402528762817,
      "learning_rate": 4.9616678558706634e-05,
      "loss": 0.9981,
      "step": 1050
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.5463365316390991,
      "learning_rate": 4.961302516831699e-05,
      "loss": 0.7336,
      "step": 1055
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.7839176654815674,
      "learning_rate": 4.960935458617824e-05,
      "loss": 1.025,
      "step": 1060
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.7076404690742493,
      "learning_rate": 4.9605666814854225e-05,
      "loss": 0.833,
      "step": 1065
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.732940673828125,
      "learning_rate": 4.960196185692077e-05,
      "loss": 0.5103,
      "step": 1070
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.7256388068199158,
      "learning_rate": 4.959823971496574e-05,
      "loss": 0.8617,
      "step": 1075
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.1714242696762085,
      "learning_rate": 4.959450039158898e-05,
      "loss": 1.0345,
      "step": 1080
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.5849193930625916,
      "learning_rate": 4.9590743889402325e-05,
      "loss": 0.729,
      "step": 1085
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.6283109784126282,
      "learning_rate": 4.958697021102963e-05,
      "loss": 0.8527,
      "step": 1090
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.6387770175933838,
      "learning_rate": 4.9583179359106746e-05,
      "loss": 0.7411,
      "step": 1095
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.5853758454322815,
      "learning_rate": 4.957937133628151e-05,
      "loss": 0.7909,
      "step": 1100
    },
    {
      "epoch": 0.18,
      "eval_loss": 0.7863278985023499,
      "eval_runtime": 96.3784,
      "eval_samples_per_second": 7.232,
      "eval_steps_per_second": 7.232,
      "step": 1100
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.9301708936691284,
      "learning_rate": 4.9575546145213755e-05,
      "loss": 0.7149,
      "step": 1105
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.125088095664978,
      "learning_rate": 4.9571703788575314e-05,
      "loss": 0.8034,
      "step": 1110
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.0697988271713257,
      "learning_rate": 4.956784426905e-05,
      "loss": 0.8874,
      "step": 1115
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.7094873189926147,
      "learning_rate": 4.956396758933361e-05,
      "loss": 0.6612,
      "step": 1120
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.8048680424690247,
      "learning_rate": 4.956007375213393e-05,
      "loss": 0.9558,
      "step": 1125
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.8820949196815491,
      "learning_rate": 4.9556162760170756e-05,
      "loss": 0.9442,
      "step": 1130
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.7214958071708679,
      "learning_rate": 4.955223461617583e-05,
      "loss": 0.8392,
      "step": 1135
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.8364250063896179,
      "learning_rate": 4.954828932289288e-05,
      "loss": 0.9834,
      "step": 1140
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.8735854625701904,
      "learning_rate": 4.954432688307764e-05,
      "loss": 0.8817,
      "step": 1145
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.810013473033905,
      "learning_rate": 4.9540347299497805e-05,
      "loss": 0.7723,
      "step": 1150
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.8791002035140991,
      "learning_rate": 4.953635057493302e-05,
      "loss": 0.706,
      "step": 1155
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.7556783556938171,
      "learning_rate": 4.953233671217493e-05,
      "loss": 0.8145,
      "step": 1160
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.3251086473464966,
      "learning_rate": 4.952830571402716e-05,
      "loss": 0.8413,
      "step": 1165
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.8531173467636108,
      "learning_rate": 4.952425758330527e-05,
      "loss": 0.8236,
      "step": 1170
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.0738744735717773,
      "learning_rate": 4.952019232283681e-05,
      "loss": 0.8357,
      "step": 1175
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.7908213138580322,
      "learning_rate": 4.9516109935461306e-05,
      "loss": 0.6165,
      "step": 1180
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.9802565574645996,
      "learning_rate": 4.951201042403021e-05,
      "loss": 0.7203,
      "step": 1185
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.7866708636283875,
      "learning_rate": 4.9507893791406974e-05,
      "loss": 0.8479,
      "step": 1190
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.6721138954162598,
      "learning_rate": 4.950376004046698e-05,
      "loss": 0.8871,
      "step": 1195
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.1981366872787476,
      "learning_rate": 4.9499609174097574e-05,
      "loss": 0.8196,
      "step": 1200
    },
    {
      "epoch": 0.19,
      "eval_loss": 0.7843652367591858,
      "eval_runtime": 96.5411,
      "eval_samples_per_second": 7.22,
      "eval_steps_per_second": 7.22,
      "step": 1200
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.7013841867446899,
      "learning_rate": 4.9495441195198064e-05,
      "loss": 1.0009,
      "step": 1205
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.8476290702819824,
      "learning_rate": 4.949125610667972e-05,
      "loss": 0.5127,
      "step": 1210
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.7680797576904297,
      "learning_rate": 4.9487053911465735e-05,
      "loss": 0.7003,
      "step": 1215
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.9771925806999207,
      "learning_rate": 4.948283461249127e-05,
      "loss": 1.1135,
      "step": 1220
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.4247405529022217,
      "learning_rate": 4.947859821270342e-05,
      "loss": 0.8253,
      "step": 1225
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.184887409210205,
      "learning_rate": 4.947434471506125e-05,
      "loss": 1.1208,
      "step": 1230
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.7579745054244995,
      "learning_rate": 4.9470074122535745e-05,
      "loss": 1.1363,
      "step": 1235
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.8529625535011292,
      "learning_rate": 4.9465786438109826e-05,
      "loss": 0.8699,
      "step": 1240
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.810576319694519,
      "learning_rate": 4.9461481664778374e-05,
      "loss": 1.0166,
      "step": 1245
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.8605110049247742,
      "learning_rate": 4.9457159805548187e-05,
      "loss": 0.9427,
      "step": 1250
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.59971684217453,
      "learning_rate": 4.945282086343801e-05,
      "loss": 0.6536,
      "step": 1255
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.0233818292617798,
      "learning_rate": 4.9448464841478506e-05,
      "loss": 0.9505,
      "step": 1260
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.8945149779319763,
      "learning_rate": 4.9444091742712293e-05,
      "loss": 0.8416,
      "step": 1265
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.702805757522583,
      "learning_rate": 4.9439701570193886e-05,
      "loss": 0.9419,
      "step": 1270
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.7464181184768677,
      "learning_rate": 4.9435294326989745e-05,
      "loss": 0.7972,
      "step": 1275
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.1765002012252808,
      "learning_rate": 4.943175624360097e-05,
      "loss": 0.9914,
      "step": 1280
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.6549853682518005,
      "learning_rate": 4.9427318280928034e-05,
      "loss": 0.8924,
      "step": 1285
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.5978650450706482,
      "learning_rate": 4.942286325621888e-05,
      "loss": 0.6224,
      "step": 1290
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.7752617597579956,
      "learning_rate": 4.941839117258523e-05,
      "loss": 0.8666,
      "step": 1295
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.6919072866439819,
      "learning_rate": 4.941390203315078e-05,
      "loss": 0.9341,
      "step": 1300
    },
    {
      "epoch": 0.21,
      "eval_loss": 0.7824844717979431,
      "eval_runtime": 96.8874,
      "eval_samples_per_second": 7.194,
      "eval_steps_per_second": 7.194,
      "step": 1300
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.7222729325294495,
      "learning_rate": 4.94093958410511e-05,
      "loss": 0.9925,
      "step": 1305
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.9575716853141785,
      "learning_rate": 4.9404872599433686e-05,
      "loss": 0.8623,
      "step": 1310
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.7721400260925293,
      "learning_rate": 4.940033231145793e-05,
      "loss": 1.0061,
      "step": 1315
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.7019990682601929,
      "learning_rate": 4.9395774980295165e-05,
      "loss": 0.8697,
      "step": 1320
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.7828916907310486,
      "learning_rate": 4.939120060912858e-05,
      "loss": 1.0066,
      "step": 1325
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.0238871574401855,
      "learning_rate": 4.93866092011533e-05,
      "loss": 1.0285,
      "step": 1330
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.48669734597206116,
      "learning_rate": 4.938200075957634e-05,
      "loss": 0.7454,
      "step": 1335
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.8834619522094727,
      "learning_rate": 4.93773752876166e-05,
      "loss": 0.9998,
      "step": 1340
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.6462609767913818,
      "learning_rate": 4.9372732788504905e-05,
      "loss": 0.7278,
      "step": 1345
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.7309257388114929,
      "learning_rate": 4.936807326548395e-05,
      "loss": 0.7301,
      "step": 1350
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.8515027165412903,
      "learning_rate": 4.936339672180833e-05,
      "loss": 0.8307,
      "step": 1355
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.913206934928894,
      "learning_rate": 4.935870316074451e-05,
      "loss": 0.9467,
      "step": 1360
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.6705841422080994,
      "learning_rate": 4.935399258557088e-05,
      "loss": 0.7124,
      "step": 1365
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.676695704460144,
      "learning_rate": 4.934926499957767e-05,
      "loss": 0.9318,
      "step": 1370
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.0529104471206665,
      "learning_rate": 4.934452040606703e-05,
      "loss": 1.0307,
      "step": 1375
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.7150225639343262,
      "learning_rate": 4.933975880835296e-05,
      "loss": 0.8718,
      "step": 1380
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.7180047035217285,
      "learning_rate": 4.933498020976135e-05,
      "loss": 0.7515,
      "step": 1385
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.0961759090423584,
      "learning_rate": 4.933018461362997e-05,
      "loss": 0.8797,
      "step": 1390
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.830609142780304,
      "learning_rate": 4.9325372023308446e-05,
      "loss": 0.6927,
      "step": 1395
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.5277318358421326,
      "learning_rate": 4.9320542442158305e-05,
      "loss": 0.8801,
      "step": 1400
    },
    {
      "epoch": 0.22,
      "eval_loss": 0.7787255644798279,
      "eval_runtime": 96.8812,
      "eval_samples_per_second": 7.194,
      "eval_steps_per_second": 7.194,
      "step": 1400
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.3845161199569702,
      "learning_rate": 4.931569587355289e-05,
      "loss": 0.8782,
      "step": 1405
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.8579941987991333,
      "learning_rate": 4.9310832320877476e-05,
      "loss": 0.713,
      "step": 1410
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.2643532454967499,
      "learning_rate": 4.930595178752914e-05,
      "loss": 0.9781,
      "step": 1415
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.4968445897102356,
      "learning_rate": 4.930105427691685e-05,
      "loss": 0.93,
      "step": 1420
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.9254417419433594,
      "learning_rate": 4.929613979246144e-05,
      "loss": 0.6353,
      "step": 1425
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.9814417958259583,
      "learning_rate": 4.9291208337595574e-05,
      "loss": 0.9672,
      "step": 1430
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.7159338593482971,
      "learning_rate": 4.928625991576379e-05,
      "loss": 0.9482,
      "step": 1435
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.623866617679596,
      "learning_rate": 4.9281294530422476e-05,
      "loss": 0.623,
      "step": 1440
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.8750379681587219,
      "learning_rate": 4.927631218503985e-05,
      "loss": 0.772,
      "step": 1445
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.5593128800392151,
      "learning_rate": 4.9271312883096e-05,
      "loss": 0.6579,
      "step": 1450
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.6411569714546204,
      "learning_rate": 4.9266296628082834e-05,
      "loss": 0.9239,
      "step": 1455
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.9317705631256104,
      "learning_rate": 4.9261263423504135e-05,
      "loss": 0.9315,
      "step": 1460
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.8312699198722839,
      "learning_rate": 4.9256213272875486e-05,
      "loss": 0.7334,
      "step": 1465
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.6170663833618164,
      "learning_rate": 4.925114617972433e-05,
      "loss": 0.8603,
      "step": 1470
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7176920771598816,
      "learning_rate": 4.924606214758995e-05,
      "loss": 0.8738,
      "step": 1475
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.8957033157348633,
      "learning_rate": 4.924096118002343e-05,
      "loss": 0.8861,
      "step": 1480
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.5490685701370239,
      "learning_rate": 4.923584328058772e-05,
      "loss": 0.712,
      "step": 1485
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7401763796806335,
      "learning_rate": 4.923070845285757e-05,
      "loss": 0.8118,
      "step": 1490
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7380841374397278,
      "learning_rate": 4.922555670041957e-05,
      "loss": 0.8476,
      "step": 1495
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.0009427070617676,
      "learning_rate": 4.922038802687212e-05,
      "loss": 0.9109,
      "step": 1500
    },
    {
      "epoch": 0.24,
      "eval_loss": 0.777683675289154,
      "eval_runtime": 96.9147,
      "eval_samples_per_second": 7.192,
      "eval_steps_per_second": 7.192,
      "step": 1500
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7970065474510193,
      "learning_rate": 4.921520243582545e-05,
      "loss": 0.616,
      "step": 1505
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.6530303955078125,
      "learning_rate": 4.92099999309016e-05,
      "loss": 0.9223,
      "step": 1510
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.48044708371162415,
      "learning_rate": 4.9204780515734406e-05,
      "loss": 0.6762,
      "step": 1515
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7560244798660278,
      "learning_rate": 4.919954419396956e-05,
      "loss": 0.8726,
      "step": 1520
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.8580659031867981,
      "learning_rate": 4.919429096926453e-05,
      "loss": 0.7654,
      "step": 1525
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.1246473789215088,
      "learning_rate": 4.918902084528859e-05,
      "loss": 0.9123,
      "step": 1530
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.0745307207107544,
      "learning_rate": 4.918373382572283e-05,
      "loss": 0.79,
      "step": 1535
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.9591856598854065,
      "learning_rate": 4.917842991426014e-05,
      "loss": 1.1778,
      "step": 1540
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.0233389139175415,
      "learning_rate": 4.91731091146052e-05,
      "loss": 0.8827,
      "step": 1545
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.648965060710907,
      "learning_rate": 4.91677714304745e-05,
      "loss": 0.8634,
      "step": 1550
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.6523327231407166,
      "learning_rate": 4.91624168655963e-05,
      "loss": 0.9916,
      "step": 1555
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.8029198050498962,
      "learning_rate": 4.915704542371068e-05,
      "loss": 0.7867,
      "step": 1560
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.6397082805633545,
      "learning_rate": 4.915165710856948e-05,
      "loss": 0.7738,
      "step": 1565
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.5862845778465271,
      "learning_rate": 4.914625192393636e-05,
      "loss": 0.7026,
      "step": 1570
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.5333505868911743,
      "learning_rate": 4.914082987358673e-05,
      "loss": 0.8623,
      "step": 1575
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.5689602494239807,
      "learning_rate": 4.913539096130779e-05,
      "loss": 0.7619,
      "step": 1580
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.7333836555480957,
      "learning_rate": 4.912993519089853e-05,
      "loss": 0.8116,
      "step": 1585
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.7610496282577515,
      "learning_rate": 4.91244625661697e-05,
      "loss": 0.74,
      "step": 1590
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.6331669092178345,
      "learning_rate": 4.9118973090943835e-05,
      "loss": 1.0445,
      "step": 1595
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.7263479828834534,
      "learning_rate": 4.911346676905521e-05,
      "loss": 0.8964,
      "step": 1600
    },
    {
      "epoch": 0.26,
      "eval_loss": 0.7759388089179993,
      "eval_runtime": 96.8818,
      "eval_samples_per_second": 7.194,
      "eval_steps_per_second": 7.194,
      "step": 1600
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.6523721814155579,
      "learning_rate": 4.910794360434993e-05,
      "loss": 1.0127,
      "step": 1605
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.055384874343872,
      "learning_rate": 4.9102403600685796e-05,
      "loss": 0.9855,
      "step": 1610
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.7640814185142517,
      "learning_rate": 4.9096846761932414e-05,
      "loss": 0.7963,
      "step": 1615
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.5843799710273743,
      "learning_rate": 4.9091273091971124e-05,
      "loss": 0.8854,
      "step": 1620
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.9825207591056824,
      "learning_rate": 4.9085682594695036e-05,
      "loss": 0.8086,
      "step": 1625
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.9490563869476318,
      "learning_rate": 4.908007527400901e-05,
      "loss": 0.6838,
      "step": 1630
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.9472922682762146,
      "learning_rate": 4.907445113382966e-05,
      "loss": 0.8732,
      "step": 1635
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.6690593957901001,
      "learning_rate": 4.9068810178085344e-05,
      "loss": 0.8551,
      "step": 1640
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.7245538830757141,
      "learning_rate": 4.906315241071616e-05,
      "loss": 0.7639,
      "step": 1645
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.8342815041542053,
      "learning_rate": 4.905747783567397e-05,
      "loss": 0.9417,
      "step": 1650
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.6241989135742188,
      "learning_rate": 4.9051786456922354e-05,
      "loss": 0.9394,
      "step": 1655
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.5671687126159668,
      "learning_rate": 4.904607827843663e-05,
      "loss": 0.6381,
      "step": 1660
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.795868456363678,
      "learning_rate": 4.9040353304203864e-05,
      "loss": 0.7676,
      "step": 1665
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.9995182156562805,
      "learning_rate": 4.9034611538222844e-05,
      "loss": 1.0327,
      "step": 1670
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.7473803758621216,
      "learning_rate": 4.902885298450409e-05,
      "loss": 0.8835,
      "step": 1675
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.5757468938827515,
      "learning_rate": 4.902307764706984e-05,
      "loss": 0.7548,
      "step": 1680
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.8357987403869629,
      "learning_rate": 4.901728552995407e-05,
      "loss": 0.9184,
      "step": 1685
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.6664137244224548,
      "learning_rate": 4.901147663720247e-05,
      "loss": 0.9872,
      "step": 1690
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.861997663974762,
      "learning_rate": 4.900565097287243e-05,
      "loss": 0.8541,
      "step": 1695
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.7566475868225098,
      "learning_rate": 4.8999808541033086e-05,
      "loss": 0.9265,
      "step": 1700
    },
    {
      "epoch": 0.27,
      "eval_loss": 0.7741928696632385,
      "eval_runtime": 96.9038,
      "eval_samples_per_second": 7.193,
      "eval_steps_per_second": 7.193,
      "step": 1700
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.45475611090660095,
      "learning_rate": 4.8993949345765266e-05,
      "loss": 0.7186,
      "step": 1705
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.8672823905944824,
      "learning_rate": 4.8988073391161515e-05,
      "loss": 0.919,
      "step": 1710
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.7782495617866516,
      "learning_rate": 4.8982180681326074e-05,
      "loss": 0.6618,
      "step": 1715
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.6640329957008362,
      "learning_rate": 4.897627122037489e-05,
      "loss": 0.6662,
      "step": 1720
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.8019454479217529,
      "learning_rate": 4.897034501243561e-05,
      "loss": 0.9459,
      "step": 1725
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.8336368799209595,
      "learning_rate": 4.896440206164761e-05,
      "loss": 0.8058,
      "step": 1730
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.6316781044006348,
      "learning_rate": 4.8958442372161906e-05,
      "loss": 0.9132,
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.7768308520317078, | |
| "learning_rate": 4.895246594814124e-05, | |
| "loss": 0.7512, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.9891632795333862, | |
| "learning_rate": 4.894647279376002e-05, | |
| "loss": 0.843, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.6162430047988892, | |
| "learning_rate": 4.894046291320439e-05, | |
| "loss": 0.8233, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.6184887290000916, | |
| "learning_rate": 4.893443631067211e-05, | |
| "loss": 0.7428, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.7117312550544739, | |
| "learning_rate": 4.892839299037267e-05, | |
| "loss": 0.8707, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.7165163159370422, | |
| "learning_rate": 4.892233295652721e-05, | |
| "loss": 1.0485, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.8377657532691956, | |
| "learning_rate": 4.891625621336855e-05, | |
| "loss": 0.7368, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.6349939703941345, | |
| "learning_rate": 4.89101627651412e-05, | |
| "loss": 0.7357, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.969137191772461, | |
| "learning_rate": 4.890405261610131e-05, | |
| "loss": 0.7605, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.5980018377304077, | |
| "learning_rate": 4.889792577051671e-05, | |
| "loss": 0.9253, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.681398332118988, | |
| "learning_rate": 4.889178223266688e-05, | |
| "loss": 0.7235, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.6999421715736389, | |
| "learning_rate": 4.888562200684299e-05, | |
| "loss": 0.8521, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.7693730592727661, | |
| "learning_rate": 4.887944509734783e-05, | |
| "loss": 0.8632, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "eval_loss": 0.76987224817276, | |
| "eval_runtime": 96.9052, | |
| "eval_samples_per_second": 7.193, | |
| "eval_steps_per_second": 7.193, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.7641138434410095, | |
| "learning_rate": 4.8873251508495865e-05, | |
| "loss": 0.7074, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.732545018196106, | |
| "learning_rate": 4.886704124461321e-05, | |
| "loss": 0.6901, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.0327179431915283, | |
| "learning_rate": 4.88608143100376e-05, | |
| "loss": 0.8256, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.7066757082939148, | |
| "learning_rate": 4.885457070911845e-05, | |
| "loss": 0.6635, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.809877336025238, | |
| "learning_rate": 4.8848310446216806e-05, | |
| "loss": 0.795, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.738153338432312, | |
| "learning_rate": 4.8842033525705335e-05, | |
| "loss": 0.9089, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.754896879196167, | |
| "learning_rate": 4.883573995196836e-05, | |
| "loss": 0.7103, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.0111182928085327, | |
| "learning_rate": 4.8829429729401826e-05, | |
| "loss": 1.046, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.6233395934104919, | |
| "learning_rate": 4.8823102862413306e-05, | |
| "loss": 0.761, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.3443419933319092, | |
| "learning_rate": 4.8816759355422e-05, | |
| "loss": 0.8436, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.6685923337936401, | |
| "learning_rate": 4.8810399212858736e-05, | |
| "loss": 0.8956, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.0405924320220947, | |
| "learning_rate": 4.880402243916596e-05, | |
| "loss": 1.1458, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.8413107991218567, | |
| "learning_rate": 4.879762903879772e-05, | |
| "loss": 0.8133, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.7151504158973694, | |
| "learning_rate": 4.8791219016219705e-05, | |
| "loss": 0.9207, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.6887856125831604, | |
| "learning_rate": 4.878479237590918e-05, | |
| "loss": 0.8185, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.5687748193740845, | |
| "learning_rate": 4.877834912235506e-05, | |
| "loss": 0.9035, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.9966350793838501, | |
| "learning_rate": 4.877188926005782e-05, | |
| "loss": 0.7764, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.0459462404251099, | |
| "learning_rate": 4.8765412793529574e-05, | |
| "loss": 0.6658, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.8338847160339355, | |
| "learning_rate": 4.8758919727293995e-05, | |
| "loss": 0.7363, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.7602768540382385, | |
| "learning_rate": 4.875241006588638e-05, | |
| "loss": 1.0081, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_loss": 0.7692809700965881, | |
| "eval_runtime": 96.4899, | |
| "eval_samples_per_second": 7.224, | |
| "eval_steps_per_second": 7.224, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.5455746054649353, | |
| "learning_rate": 4.874588381385362e-05, | |
| "loss": 0.7855, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.8574795126914978, | |
| "learning_rate": 4.8739340975754165e-05, | |
| "loss": 1.068, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.0321904420852661, | |
| "learning_rate": 4.873278155615808e-05, | |
| "loss": 0.8239, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.2484744787216187, | |
| "learning_rate": 4.8726205559646996e-05, | |
| "loss": 0.9307, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.7140147686004639, | |
| "learning_rate": 4.871961299081412e-05, | |
| "loss": 0.9876, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.8003590106964111, | |
| "learning_rate": 4.871300385426426e-05, | |
| "loss": 0.8615, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.7282931208610535, | |
| "learning_rate": 4.870637815461376e-05, | |
| "loss": 0.8734, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.6800629496574402, | |
| "learning_rate": 4.869973589649055e-05, | |
| "loss": 0.7718, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.8813210129737854, | |
| "learning_rate": 4.869307708453413e-05, | |
| "loss": 0.7943, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.6612805724143982, | |
| "learning_rate": 4.868640172339557e-05, | |
| "loss": 0.6807, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.653191328048706, | |
| "learning_rate": 4.867970981773748e-05, | |
| "loss": 0.8948, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.7479822635650635, | |
| "learning_rate": 4.8673001372234025e-05, | |
| "loss": 0.8583, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": NaN, | |
| "learning_rate": 4.8667622710291026e-05, | |
| "loss": 0.7443, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.5788535475730896, | |
| "learning_rate": 4.866088450488172e-05, | |
| "loss": 0.7249, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.7408040165901184, | |
| "learning_rate": 4.86541297727762e-05, | |
| "loss": 0.7115, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6549968719482422, | |
| "learning_rate": 4.864735851869251e-05, | |
| "loss": 0.9095, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.4595119059085846, | |
| "learning_rate": 4.864057074736026e-05, | |
| "loss": 1.2808, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.5746715068817139, | |
| "learning_rate": 4.863376646352058e-05, | |
| "loss": 0.8139, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6972643136978149, | |
| "learning_rate": 4.862694567192614e-05, | |
| "loss": 0.9797, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6935243010520935, | |
| "learning_rate": 4.8620108377341124e-05, | |
| "loss": 0.7651, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 0.766412615776062, | |
| "eval_runtime": 96.4555, | |
| "eval_samples_per_second": 7.226, | |
| "eval_steps_per_second": 7.226, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.9983006715774536, | |
| "learning_rate": 4.861325458454128e-05, | |
| "loss": 0.8256, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6732650995254517, | |
| "learning_rate": 4.860638429831384e-05, | |
| "loss": 0.8136, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6780042052268982, | |
| "learning_rate": 4.859949752345758e-05, | |
| "loss": 0.8911, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.9892123937606812, | |
| "learning_rate": 4.8592594264782794e-05, | |
| "loss": 0.7907, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.9327254295349121, | |
| "learning_rate": 4.8585674527111266e-05, | |
| "loss": 0.8712, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.0295612812042236, | |
| "learning_rate": 4.857873831527632e-05, | |
| "loss": 0.9188, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 3.3071186542510986, | |
| "learning_rate": 4.8571785634122766e-05, | |
| "loss": 0.8801, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.9625150561332703, | |
| "learning_rate": 4.856481648850694e-05, | |
| "loss": 0.8333, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.6674854159355164, | |
| "learning_rate": 4.855783088329664e-05, | |
| "loss": 1.0388, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.5447000861167908, | |
| "learning_rate": 4.8550828823371196e-05, | |
| "loss": 0.7893, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.9970148801803589, | |
| "learning_rate": 4.854381031362142e-05, | |
| "loss": 0.8198, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.7657136917114258, | |
| "learning_rate": 4.853677535894961e-05, | |
| "loss": 0.5977, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.4694065451622009, | |
| "learning_rate": 4.852972396426956e-05, | |
| "loss": 0.5965, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.8955700993537903, | |
| "learning_rate": 4.852265613450653e-05, | |
| "loss": 0.6938, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.9884099960327148, | |
| "learning_rate": 4.851557187459727e-05, | |
| "loss": 0.8946, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.6793637871742249, | |
| "learning_rate": 4.850847118949002e-05, | |
| "loss": 0.841, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.7438017725944519, | |
| "learning_rate": 4.850135408414447e-05, | |
| "loss": 0.8843, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.7632609009742737, | |
| "learning_rate": 4.849422056353178e-05, | |
| "loss": 0.8263, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.7281492352485657, | |
| "learning_rate": 4.84870706326346e-05, | |
| "loss": 0.8989, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.6480591893196106, | |
| "learning_rate": 4.847990429644702e-05, | |
| "loss": 1.0037, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "eval_loss": 0.7653521299362183, | |
| "eval_runtime": 96.4452, | |
| "eval_samples_per_second": 7.227, | |
| "eval_steps_per_second": 7.227, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.5578673481941223, | |
| "learning_rate": 4.8472721559974584e-05, | |
| "loss": 0.911, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.5615595579147339, | |
| "learning_rate": 4.846552242823433e-05, | |
| "loss": 0.6938, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.588246762752533, | |
| "learning_rate": 4.845830690625469e-05, | |
| "loss": 0.7898, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.8140611052513123, | |
| "learning_rate": 4.8451074999075595e-05, | |
| "loss": 0.7702, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.9400056600570679, | |
| "learning_rate": 4.8443826711748385e-05, | |
| "loss": 0.7959, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.7187873721122742, | |
| "learning_rate": 4.8436562049335874e-05, | |
| "loss": 0.7223, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.7627830505371094, | |
| "learning_rate": 4.8429281016912275e-05, | |
| "loss": 0.793, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.6755004525184631, | |
| "learning_rate": 4.842198361956328e-05, | |
| "loss": 0.7665, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.6032254695892334, | |
| "learning_rate": 4.8414669862385966e-05, | |
| "loss": 0.7952, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.8377916216850281, | |
| "learning_rate": 4.840733975048887e-05, | |
| "loss": 1.0016, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.7361429929733276, | |
| "learning_rate": 4.839999328899194e-05, | |
| "loss": 0.8773, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.8006517887115479, | |
| "learning_rate": 4.8392630483026546e-05, | |
| "loss": 0.9334, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.9716467261314392, | |
| "learning_rate": 4.8385251337735473e-05, | |
| "loss": 1.0359, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.6826418042182922, | |
| "learning_rate": 4.8377855858272925e-05, | |
| "loss": 0.6841, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.4519975781440735, | |
| "learning_rate": 4.8370444049804494e-05, | |
| "loss": 0.8326, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.677891731262207, | |
| "learning_rate": 4.836301591750721e-05, | |
| "loss": 1.0841, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.5161852836608887, | |
| "learning_rate": 4.835557146656948e-05, | |
| "loss": 0.8701, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.6586780548095703, | |
| "learning_rate": 4.834811070219112e-05, | |
| "loss": 0.8261, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.48046165704727173, | |
| "learning_rate": 4.834063362958333e-05, | |
| "loss": 0.6375, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.0315968990325928, | |
| "learning_rate": 4.833314025396872e-05, | |
| "loss": 0.8768, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "eval_loss": 0.7641988396644592, | |
| "eval_runtime": 96.3923, | |
| "eval_samples_per_second": 7.231, | |
| "eval_steps_per_second": 7.231, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.7704123258590698, | |
| "learning_rate": 4.8325630580581263e-05, | |
| "loss": 0.8849, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.087425708770752, | |
| "learning_rate": 4.831810461466634e-05, | |
| "loss": 0.9828, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.4766077995300293, | |
| "learning_rate": 4.83105623614807e-05, | |
| "loss": 0.7103, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.6079148054122925, | |
| "learning_rate": 4.830300382629247e-05, | |
| "loss": 0.7253, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.6767585873603821, | |
| "learning_rate": 4.829542901438115e-05, | |
| "loss": 0.7852, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.7065784335136414, | |
| "learning_rate": 4.8287837931037585e-05, | |
| "loss": 0.8047, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.8305274248123169, | |
| "learning_rate": 4.828023058156404e-05, | |
| "loss": 0.7912, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.8435990810394287, | |
| "learning_rate": 4.827260697127409e-05, | |
| "loss": 0.826, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.8484389185905457, | |
| "learning_rate": 4.8264967105492705e-05, | |
| "loss": 0.706, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.7461299300193787, | |
| "learning_rate": 4.825731098955617e-05, | |
| "loss": 0.763, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.7928741574287415, | |
| "learning_rate": 4.824963862881216e-05, | |
| "loss": 0.8125, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.7152695059776306, | |
| "learning_rate": 4.824195002861968e-05, | |
| "loss": 1.129, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.8594226241111755, | |
| "learning_rate": 4.8234245194349056e-05, | |
| "loss": 0.8873, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.9760085940361023, | |
| "learning_rate": 4.822652413138199e-05, | |
| "loss": 0.9713, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.7297483682632446, | |
| "learning_rate": 4.8218786845111505e-05, | |
| "loss": 0.6953, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.8251492381095886, | |
| "learning_rate": 4.8211033340941956e-05, | |
| "loss": 0.7649, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.742917537689209, | |
| "learning_rate": 4.820326362428901e-05, | |
| "loss": 0.9756, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.7784115076065063, | |
| "learning_rate": 4.819547770057969e-05, | |
| "loss": 0.6937, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.782772183418274, | |
| "learning_rate": 4.8187675575252314e-05, | |
| "loss": 0.9062, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.7802585363388062, | |
| "learning_rate": 4.8179857253756514e-05, | |
| "loss": 0.8052, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "eval_loss": 0.7618402242660522, | |
| "eval_runtime": 96.4079, | |
| "eval_samples_per_second": 7.23, | |
| "eval_steps_per_second": 7.23, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.799985945224762, | |
| "learning_rate": 4.8172022741553255e-05, | |
| "loss": 0.9046, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.026978850364685, | |
| "learning_rate": 4.816417204411481e-05, | |
| "loss": 0.7195, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.8067365884780884, | |
| "learning_rate": 4.8156305166924734e-05, | |
| "loss": 0.8193, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.247164249420166, | |
| "learning_rate": 4.81484221154779e-05, | |
| "loss": 0.6138, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.8662647604942322, | |
| "learning_rate": 4.814052289528047e-05, | |
| "loss": 0.7763, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.9020537734031677, | |
| "learning_rate": 4.813260751184992e-05, | |
| "loss": 0.9236, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.6113781929016113, | |
| "learning_rate": 4.812467597071499e-05, | |
| "loss": 0.8753, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.6988622546195984, | |
| "learning_rate": 4.811672827741572e-05, | |
| "loss": 0.6747, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.9095928072929382, | |
| "learning_rate": 4.810876443750344e-05, | |
| "loss": 1.0578, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.643699049949646, | |
| "learning_rate": 4.8100784456540724e-05, | |
| "loss": 0.8177, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.7084022760391235, | |
| "learning_rate": 4.809278834010146e-05, | |
| "loss": 0.9345, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.5328305959701538, | |
| "learning_rate": 4.808477609377078e-05, | |
| "loss": 0.6781, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.8238436579704285, | |
| "learning_rate": 4.80767477231451e-05, | |
| "loss": 0.7306, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.0184216499328613, | |
| "learning_rate": 4.806870323383208e-05, | |
| "loss": 1.0288, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.8620426654815674, | |
| "learning_rate": 4.806064263145066e-05, | |
| "loss": 0.7925, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.6541377305984497, | |
| "learning_rate": 4.805256592163102e-05, | |
| "loss": 0.8629, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.8664489984512329, | |
| "learning_rate": 4.8044473110014594e-05, | |
| "loss": 0.8184, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.7283564209938049, | |
| "learning_rate": 4.803636420225406e-05, | |
| "loss": 0.9444, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.7168800234794617, | |
| "learning_rate": 4.802823920401335e-05, | |
| "loss": 0.8118, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.8198531866073608, | |
| "learning_rate": 4.802009812096762e-05, | |
| "loss": 0.7271, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "eval_loss": 0.7595117688179016, | |
| "eval_runtime": 96.4847, | |
| "eval_samples_per_second": 7.224, | |
| "eval_steps_per_second": 7.224, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.5693966150283813, | |
| "learning_rate": 4.801194095880327e-05, | |
| "loss": 0.7801, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.7175332307815552, | |
| "learning_rate": 4.800376772321793e-05, | |
| "loss": 0.7873, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.7779633402824402, | |
| "learning_rate": 4.799557841992046e-05, | |
| "loss": 0.894, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.7832231521606445, | |
| "learning_rate": 4.798737305463092e-05, | |
| "loss": 0.8035, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.5115272998809814, | |
| "learning_rate": 4.797915163308064e-05, | |
| "loss": 0.8885, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.9534878730773926, | |
| "learning_rate": 4.79709141610121e-05, | |
| "loss": 0.8175, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.7053850889205933, | |
| "learning_rate": 4.796266064417905e-05, | |
| "loss": 0.6971, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.236257791519165, | |
| "learning_rate": 4.795439108834641e-05, | |
| "loss": 1.0832, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.6936543583869934, | |
| "learning_rate": 4.794610549929031e-05, | |
| "loss": 0.858, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.8064691424369812, | |
| "learning_rate": 4.793780388279809e-05, | |
| "loss": 0.6951, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.7180449962615967, | |
| "learning_rate": 4.792948624466827e-05, | |
| "loss": 0.6779, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.6903377175331116, | |
| "learning_rate": 4.792115259071058e-05, | |
| "loss": 0.8281, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.9112733006477356, | |
| "learning_rate": 4.791280292674591e-05, | |
| "loss": 0.938, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.8657469153404236, | |
| "learning_rate": 4.790443725860636e-05, | |
| "loss": 0.8063, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.9260883927345276, | |
| "learning_rate": 4.7896055592135194e-05, | |
| "loss": 1.0093, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.7651245594024658, | |
| "learning_rate": 4.788765793318685e-05, | |
| "loss": 0.6686, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.6063816547393799, | |
| "learning_rate": 4.7879244287626945e-05, | |
| "loss": 0.8516, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.9127621650695801, | |
| "learning_rate": 4.787081466133225e-05, | |
| "loss": 0.7992, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.061246633529663, | |
| "learning_rate": 4.7862369060190716e-05, | |
| "loss": 0.8232, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.7100695967674255, | |
| "learning_rate": 4.785390749010143e-05, | |
| "loss": 0.9615, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 0.7581596970558167, | |
| "eval_runtime": 96.5797, | |
| "eval_samples_per_second": 7.217, | |
| "eval_steps_per_second": 7.217, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 16.361513137817383, | |
| "learning_rate": 4.784542995697464e-05, | |
| "loss": 0.7725, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.7746205925941467, | |
| "learning_rate": 4.7836936466731764e-05, | |
| "loss": 0.8464, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.7703484892845154, | |
| "learning_rate": 4.7828427025305345e-05, | |
| "loss": 0.8596, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.7838412523269653, | |
| "learning_rate": 4.7819901638639066e-05, | |
| "loss": 0.666, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.5832842588424683, | |
| "learning_rate": 4.781136031268776e-05, | |
| "loss": 0.4995, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.798271894454956, | |
| "learning_rate": 4.780280305341739e-05, | |
| "loss": 1.0017, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.463828444480896, | |
| "learning_rate": 4.779422986680503e-05, | |
| "loss": 0.5894, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.761908233165741, | |
| "learning_rate": 4.7785640758838916e-05, | |
| "loss": 0.9198, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.8427887558937073, | |
| "learning_rate": 4.777703573551837e-05, | |
| "loss": 0.8572, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.6188894510269165, | |
| "learning_rate": 4.776841480285384e-05, | |
| "loss": 0.9102, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.7198623418807983, | |
| "learning_rate": 4.775977796686691e-05, | |
| "loss": 0.8472, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.0144587755203247, | |
| "learning_rate": 4.775112523359023e-05, | |
| "loss": 0.7059, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.9784219861030579, | |
| "learning_rate": 4.77424566090676e-05, | |
| "loss": 0.7417, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.5349156856536865, | |
| "learning_rate": 4.773377209935387e-05, | |
| "loss": 0.7287, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.7715370655059814, | |
| "learning_rate": 4.772507171051502e-05, | |
| "loss": 0.8393, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.8483054637908936, | |
| "learning_rate": 4.771635544862813e-05, | |
| "loss": 0.8938, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.8196272253990173, | |
| "learning_rate": 4.770762331978132e-05, | |
| "loss": 0.8321, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.6155353784561157, | |
| "learning_rate": 4.769887533007384e-05, | |
| "loss": 0.9291, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.8897277116775513, | |
| "learning_rate": 4.769011148561601e-05, | |
| "loss": 0.7098, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.2256160974502563, | |
| "learning_rate": 4.768133179252921e-05, | |
| "loss": 0.8284, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "eval_loss": 0.7554901838302612, | |
| "eval_runtime": 96.5279, | |
| "eval_samples_per_second": 7.221, | |
| "eval_steps_per_second": 7.221, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.6943432688713074, | |
| "learning_rate": 4.767253625694588e-05, | |
| "loss": 0.8785, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.6707726120948792, | |
| "learning_rate": 4.7663724885009556e-05, | |
| "loss": 0.7949, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.5595915913581848, | |
| "learning_rate": 4.765489768287481e-05, | |
| "loss": 0.8796, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.9889727234840393, | |
| "learning_rate": 4.7646054656707306e-05, | |
| "loss": 1.0676, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.8624396324157715, | |
| "learning_rate": 4.763719581268371e-05, | |
| "loss": 0.709, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.7466241121292114, | |
| "learning_rate": 4.7628321156991767e-05, | |
| "loss": 0.8084, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.6439360976219177, | |
| "learning_rate": 4.761943069583027e-05, | |
| "loss": 0.8831, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.9999917149543762, | |
| "learning_rate": 4.761052443540904e-05, | |
| "loss": 0.6372, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.688369870185852, | |
| "learning_rate": 4.760160238194894e-05, | |
| "loss": 0.7938, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.6920734643936157, | |
| "learning_rate": 4.759266454168186e-05, | |
| "loss": 0.7378, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.7592100501060486, | |
| "learning_rate": 4.758371092085073e-05, | |
| "loss": 1.097, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.9243403077125549, | |
| "learning_rate": 4.757474152570946e-05, | |
| "loss": 1.0404, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.8212980031967163, | |
| "learning_rate": 4.756575636252304e-05, | |
| "loss": 0.6179, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.6905696392059326, | |
| "learning_rate": 4.755675543756744e-05, | |
| "loss": 0.8398, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.8420882821083069, | |
| "learning_rate": 4.754773875712961e-05, | |
| "loss": 0.7552, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.6216087341308594, | |
| "learning_rate": 4.7538706327507575e-05, | |
| "loss": 0.8345, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.7430551648139954, | |
| "learning_rate": 4.75296581550103e-05, | |
| "loss": 0.8277, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.7866222262382507, | |
| "learning_rate": 4.752059424595778e-05, | |
| "loss": 0.9178, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.6548468470573425, | |
| "learning_rate": 4.7511514606680985e-05, | |
| "loss": 0.745, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.6956586837768555, | |
| "learning_rate": 4.750241924352187e-05, | |
| "loss": 0.8631, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "eval_loss": 0.7539612650871277, | |
| "eval_runtime": 96.4433, | |
| "eval_samples_per_second": 7.227, | |
| "eval_steps_per_second": 7.227, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.6508235335350037, | |
| "learning_rate": 4.7493308162833394e-05, | |
| "loss": 0.9936, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.8658422827720642, | |
| "learning_rate": 4.7484181370979475e-05, | |
| "loss": 0.8, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.9571516513824463, | |
| "learning_rate": 4.747503887433501e-05, | |
| "loss": 0.7028, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.7693742513656616, | |
| "learning_rate": 4.7465880679285866e-05, | |
| "loss": 0.7194, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.34340238571167, | |
| "learning_rate": 4.745670679222888e-05, | |
| "loss": 1.0445, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 2.71327805519104, | |
| "learning_rate": 4.7447517219571834e-05, | |
| "loss": 0.8088, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.9449920058250427, | |
| "learning_rate": 4.743831196773349e-05, | |
| "loss": 0.7939, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.8091790676116943, | |
| "learning_rate": 4.742909104314353e-05, | |
| "loss": 0.7816, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.5790795087814331, | |
| "learning_rate": 4.741985445224263e-05, | |
| "loss": 0.8778, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.1936956644058228, | |
| "learning_rate": 4.741060220148236e-05, | |
| "loss": 1.0242, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.5158389806747437, | |
| "learning_rate": 4.7401334297325244e-05, | |
| "loss": 0.7954, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.8950900435447693, | |
| "learning_rate": 4.7392050746244754e-05, | |
| "loss": 0.7603, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.7289401888847351, | |
| "learning_rate": 4.738275155472528e-05, | |
| "loss": 0.879, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.8410510420799255, | |
| "learning_rate": 4.7373436729262145e-05, | |
| "loss": 0.7399, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.7992503643035889, | |
| "learning_rate": 4.736410627636156e-05, | |
| "loss": 0.6779, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.6706194281578064, | |
| "learning_rate": 4.73547602025407e-05, | |
| "loss": 0.7878, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.7177903652191162, | |
| "learning_rate": 4.734539851432763e-05, | |
| "loss": 0.6958, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.6557692885398865, | |
| "learning_rate": 4.73360212182613e-05, | |
| "loss": 0.6695, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.6754157543182373, | |
| "learning_rate": 4.7326628320891586e-05, | |
| "loss": 0.9057, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.1403777599334717, | |
| "learning_rate": 4.731721982877926e-05, | |
| "loss": 1.0507, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "eval_loss": 0.7518497705459595, | |
| "eval_runtime": 96.4525, | |
| "eval_samples_per_second": 7.226, | |
| "eval_steps_per_second": 7.226, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.8268899321556091, | |
| "learning_rate": 4.730779574849598e-05, | |
| "loss": 0.7375, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.5358712673187256, | |
| "learning_rate": 4.72983560866243e-05, | |
| "loss": 0.7839, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.0761948823928833, | |
| "learning_rate": 4.7288900849757636e-05, | |
| "loss": 0.7936, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.7037429213523865, | |
| "learning_rate": 4.7279430044500315e-05, | |
| "loss": 0.6875, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.6378889679908752, | |
| "learning_rate": 4.726994367746751e-05, | |
| "loss": 0.9209, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.5508277416229248, | |
| "learning_rate": 4.7260441755285284e-05, | |
| "loss": 0.9402, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.9046247005462646, | |
| "learning_rate": 4.725092428459055e-05, | |
| "loss": 0.6336, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.8689594864845276, | |
| "learning_rate": 4.7241391272031096e-05, | |
| "loss": 1.1281, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.8785949945449829, | |
| "learning_rate": 4.723184272426555e-05, | |
| "loss": 0.711, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.9959015250205994, | |
| "learning_rate": 4.722227864796339e-05, | |
| "loss": 0.7432, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.6438590884208679, | |
| "learning_rate": 4.721269904980497e-05, | |
| "loss": 0.883, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.6714455485343933, | |
| "learning_rate": 4.720310393648145e-05, | |
| "loss": 1.065, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.7378780245780945, | |
| "learning_rate": 4.7193493314694846e-05, | |
| "loss": 0.5352, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.7698020935058594, | |
| "learning_rate": 4.7183867191158006e-05, | |
| "loss": 0.7016, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.952795684337616, | |
| "learning_rate": 4.7174225572594586e-05, | |
| "loss": 1.0659, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.6401458978652954, | |
| "learning_rate": 4.71645684657391e-05, | |
| "loss": 0.7335, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.8375076055526733, | |
| "learning_rate": 4.715489587733685e-05, | |
| "loss": 0.9264, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.693505048751831, | |
| "learning_rate": 4.714520781414397e-05, | |
| "loss": 1.0286, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.0239859819412231, | |
| "learning_rate": 4.7135504282927375e-05, | |
| "loss": 0.6875, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.602035403251648, | |
| "learning_rate": 4.712578529046483e-05, | |
| "loss": 0.8247, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "eval_loss": 0.7512397766113281, | |
| "eval_runtime": 96.4745, | |
| "eval_samples_per_second": 7.225, | |
| "eval_steps_per_second": 7.225, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.6859713196754456, | |
| "learning_rate": 4.711605084354487e-05, | |
| "loss": 0.7521, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.7126486301422119, | |
| "learning_rate": 4.7106300948966817e-05, | |
| "loss": 0.7656, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.4363511800765991, | |
| "learning_rate": 4.70965356135408e-05, | |
| "loss": 1.1595, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.6381859183311462, | |
| "learning_rate": 4.7086754844087724e-05, | |
| "loss": 0.6949, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.7931796312332153, | |
| "learning_rate": 4.7076958647439284e-05, | |
| "loss": 1.0821, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.9333865642547607, | |
| "learning_rate": 4.706714703043795e-05, | |
| "loss": 0.7753, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.8860915899276733, | |
| "learning_rate": 4.705731999993694e-05, | |
| "loss": 0.7257, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.6868377327919006, | |
| "learning_rate": 4.704747756280027e-05, | |
| "loss": 0.8148, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.5337914228439331, | |
| "learning_rate": 4.7037619725902706e-05, | |
| "loss": 0.7379, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.4664730429649353, | |
| "learning_rate": 4.7027746496129745e-05, | |
| "loss": 0.6226, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.7305762767791748, | |
| "learning_rate": 4.701785788037768e-05, | |
| "loss": 0.9018, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.6576158404350281, | |
| "learning_rate": 4.7007953885553525e-05, | |
| "loss": 0.7777, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.9728206396102905, | |
| "learning_rate": 4.699803451857503e-05, | |
| "loss": 0.8004, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.6211077570915222, | |
| "learning_rate": 4.69880997863707e-05, | |
| "loss": 0.7407, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.2564159631729126, | |
| "learning_rate": 4.697814969587976e-05, | |
| "loss": 0.7993, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.927930474281311, | |
| "learning_rate": 4.696818425405217e-05, | |
| "loss": 0.8803, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.9062425494194031, | |
| "learning_rate": 4.695820346784861e-05, | |
| "loss": 0.8835, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.6738875508308411, | |
| "learning_rate": 4.694820734424047e-05, | |
| "loss": 0.7817, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.326353669166565, | |
| "learning_rate": 4.6938195890209866e-05, | |
| "loss": 0.9213, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.4853856563568115, | |
| "learning_rate": 4.692816911274962e-05, | |
| "loss": 0.9835, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 0.7496011257171631, | |
| "eval_runtime": 96.515, | |
| "eval_samples_per_second": 7.222, | |
| "eval_steps_per_second": 7.222, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.5411309003829956, | |
| "learning_rate": 4.691812701886324e-05, | |
| "loss": 0.7556, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.7545793652534485, | |
| "learning_rate": 4.6908069615564966e-05, | |
| "loss": 0.8295, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.850104808807373, | |
| "learning_rate": 4.6897996909879695e-05, | |
| "loss": 1.0194, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.69708651304245, | |
| "learning_rate": 4.6887908908843026e-05, | |
| "loss": 0.7918, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 2.1333253383636475, | |
| "learning_rate": 4.687780561950126e-05, | |
| "loss": 0.7287, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.9223487973213196, | |
| "learning_rate": 4.686768704891134e-05, | |
| "loss": 0.9592, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.7700949311256409, | |
| "learning_rate": 4.685755320414091e-05, | |
| "loss": 0.8572, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.5573208332061768, | |
| "learning_rate": 4.684740409226829e-05, | |
| "loss": 0.9441, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.6346720457077026, | |
| "learning_rate": 4.6837239720382426e-05, | |
| "loss": 0.8398, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.8065741062164307, | |
| "learning_rate": 4.682706009558297e-05, | |
| "loss": 0.9325, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.3001660406589508, | |
| "learning_rate": 4.681686522498018e-05, | |
| "loss": 0.8997, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.860211968421936, | |
| "learning_rate": 4.680665511569501e-05, | |
| "loss": 0.6883, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.722518265247345, | |
| "learning_rate": 4.6796429774859015e-05, | |
| "loss": 0.8607, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.6525880694389343, | |
| "learning_rate": 4.678618920961442e-05, | |
| "loss": 0.9256, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.7581719756126404, | |
| "learning_rate": 4.6775933427114084e-05, | |
| "loss": 0.662, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.6604760885238647, | |
| "learning_rate": 4.676566243452146e-05, | |
| "loss": 0.734, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.7573785781860352, | |
| "learning_rate": 4.6755376239010665e-05, | |
| "loss": 0.7113, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.8933848738670349, | |
| "learning_rate": 4.674507484776641e-05, | |
| "loss": 0.8523, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.5941946506500244, | |
| "learning_rate": 4.6734758267984044e-05, | |
| "loss": 0.7907, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.7756261825561523, | |
| "learning_rate": 4.672442650686949e-05, | |
| "loss": 0.8407, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "eval_loss": 0.7495761513710022, | |
| "eval_runtime": 96.4482, | |
| "eval_samples_per_second": 7.227, | |
| "eval_steps_per_second": 7.227, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.6407367587089539, | |
| "learning_rate": 4.671407957163931e-05, | |
| "loss": 0.6413, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.069754719734192, | |
| "learning_rate": 4.670371746952063e-05, | |
| "loss": 0.8934, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.9698624610900879, | |
| "learning_rate": 4.669334020775122e-05, | |
| "loss": 0.7261, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.6487118005752563, | |
| "learning_rate": 4.668294779357938e-05, | |
| "loss": 0.8951, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.0640240907669067, | |
| "learning_rate": 4.667254023426404e-05, | |
| "loss": 0.8568, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.5772892236709595, | |
| "learning_rate": 4.666211753707468e-05, | |
| "loss": 0.9798, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.6915898323059082, | |
| "learning_rate": 4.665167970929137e-05, | |
| "loss": 0.8694, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.5959879159927368, | |
| "learning_rate": 4.664122675820474e-05, | |
| "loss": 0.6521, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.833991289138794, | |
| "learning_rate": 4.663075869111597e-05, | |
| "loss": 0.9194, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.9575549960136414, | |
| "learning_rate": 4.662027551533685e-05, | |
| "loss": 1.0088, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.5501818656921387, | |
| "learning_rate": 4.660977723818965e-05, | |
| "loss": 0.5997, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.6001989245414734, | |
| "learning_rate": 4.659926386700725e-05, | |
| "loss": 0.7643, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.6806654930114746, | |
| "learning_rate": 4.658873540913303e-05, | |
| "loss": 0.899, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.7098959684371948, | |
| "learning_rate": 4.657819187192094e-05, | |
| "loss": 1.0281, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.9234817028045654, | |
| "learning_rate": 4.6567633262735446e-05, | |
| "loss": 0.9495, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.596527636051178, | |
| "learning_rate": 4.655705958895153e-05, | |
| "loss": 0.6352, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.154539704322815, | |
| "learning_rate": 4.6546470857954736e-05, | |
| "loss": 0.8939, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.7502239942550659, | |
| "learning_rate": 4.653586707714108e-05, | |
| "loss": 0.692, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.7868794202804565, | |
| "learning_rate": 4.652524825391711e-05, | |
| "loss": 0.908, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.6436206102371216, | |
| "learning_rate": 4.6514614395699886e-05, | |
| "loss": 0.7417, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "eval_loss": 0.7466740012168884, | |
| "eval_runtime": 96.4309, | |
| "eval_samples_per_second": 7.228, | |
| "eval_steps_per_second": 7.228, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.8566870093345642, | |
| "learning_rate": 4.6503965509916956e-05, | |
| "loss": 0.8041, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.5982272028923035, | |
| "learning_rate": 4.649330160400639e-05, | |
| "loss": 0.4528, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.6511960029602051, | |
| "learning_rate": 4.648262268541671e-05, | |
| "loss": 0.877, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.8476071357727051, | |
| "learning_rate": 4.6471928761606965e-05, | |
| "loss": 0.7145, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.0408881902694702, | |
| "learning_rate": 4.6461219840046654e-05, | |
| "loss": 0.5539, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.7445903420448303, | |
| "learning_rate": 4.645049592821577e-05, | |
| "loss": 0.8306, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.9672279357910156, | |
| "learning_rate": 4.6439757033604756e-05, | |
| "loss": 0.8645, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.7082134485244751, | |
| "learning_rate": 4.6429003163714556e-05, | |
| "loss": 0.8188, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.8803107142448425, | |
| "learning_rate": 4.641823432605654e-05, | |
| "loss": 0.7956, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.7926101088523865, | |
| "learning_rate": 4.640745052815254e-05, | |
| "loss": 0.715, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.890519380569458, | |
| "learning_rate": 4.639665177753485e-05, | |
| "loss": 0.8825, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.9909971952438354, | |
| "learning_rate": 4.638583808174619e-05, | |
| "loss": 0.7843, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.7450726628303528, | |
| "learning_rate": 4.6375009448339743e-05, | |
| "loss": 0.9714, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.8220781683921814, | |
| "learning_rate": 4.636416588487911e-05, | |
| "loss": 0.8467, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.025499701499939, | |
| "learning_rate": 4.63533073989383e-05, | |
| "loss": 0.9301, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.8067827820777893, | |
| "learning_rate": 4.634243399810181e-05, | |
| "loss": 0.7078, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.8833619952201843, | |
| "learning_rate": 4.6331545689964475e-05, | |
| "loss": 0.699, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.0600448846817017, | |
| "learning_rate": 4.632064248213159e-05, | |
| "loss": 0.7849, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.0503095388412476, | |
| "learning_rate": 4.630972438221885e-05, | |
| "loss": 0.6215, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.5159885287284851, | |
| "learning_rate": 4.629879139785235e-05, | |
| "loss": 0.7449, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "eval_loss": 0.7472941279411316, | |
| "eval_runtime": 96.4994, | |
| "eval_samples_per_second": 7.223, | |
| "eval_steps_per_second": 7.223, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.072464108467102, | |
| "learning_rate": 4.6287843536668575e-05, | |
| "loss": 0.8511, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.9016098976135254, | |
| "learning_rate": 4.62768808063144e-05, | |
| "loss": 0.7373, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.0161947011947632, | |
| "learning_rate": 4.626590321444712e-05, | |
| "loss": 0.9035, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.7459146976470947, | |
| "learning_rate": 4.625491076873435e-05, | |
| "loss": 0.6468, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.950080394744873, | |
| "learning_rate": 4.624390347685413e-05, | |
| "loss": 0.7211, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.7308927774429321, | |
| "learning_rate": 4.623288134649485e-05, | |
| "loss": 0.9238, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.7227129340171814, | |
| "learning_rate": 4.622184438535527e-05, | |
| "loss": 0.9773, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.7054020166397095, | |
| "learning_rate": 4.62107926011445e-05, | |
| "loss": 0.7783, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.6535981297492981, | |
| "learning_rate": 4.619972600158201e-05, | |
| "loss": 0.6559, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.7245693206787109, | |
| "learning_rate": 4.618864459439762e-05, | |
| "loss": 0.8352, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.9683626890182495, | |
| "learning_rate": 4.6177548387331485e-05, | |
| "loss": 0.9397, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.1847660541534424, | |
| "learning_rate": 4.616643738813411e-05, | |
| "loss": 0.7383, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.8566804528236389, | |
| "learning_rate": 4.615531160456633e-05, | |
| "loss": 0.8066, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.7312522530555725, | |
| "learning_rate": 4.61441710443993e-05, | |
| "loss": 0.7974, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.6620572209358215, | |
| "learning_rate": 4.6133015715414484e-05, | |
| "loss": 0.9136, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.5405072569847107, | |
| "learning_rate": 4.612184562540369e-05, | |
| "loss": 0.6921, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.7474086284637451, | |
| "learning_rate": 4.611066078216901e-05, | |
| "loss": 0.8463, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.9125152230262756, | |
| "learning_rate": 4.609946119352287e-05, | |
| "loss": 0.8508, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.9998400211334229, | |
| "learning_rate": 4.608824686728797e-05, | |
| "loss": 0.8735, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.5990025401115417, | |
| "learning_rate": 4.6077017811297304e-05, | |
| "loss": 0.8562, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "eval_loss": 0.743736743927002, | |
| "eval_runtime": 96.3748, | |
| "eval_samples_per_second": 7.232, | |
| "eval_steps_per_second": 7.232, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.35676899552345276, | |
| "learning_rate": 4.606577403339418e-05, | |
| "loss": 0.8914, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.772233784198761, | |
| "learning_rate": 4.605451554143216e-05, | |
| "loss": 0.779, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.7336989641189575, | |
| "learning_rate": 4.604324234327509e-05, | |
| "loss": 0.7678, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.7039794325828552, | |
| "learning_rate": 4.603195444679711e-05, | |
| "loss": 0.8783, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.6955629587173462, | |
| "learning_rate": 4.602065185988259e-05, | |
| "loss": 0.818, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.7369412779808044, | |
| "learning_rate": 4.60093345904262e-05, | |
| "loss": 0.6942, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.6824669241905212, | |
| "learning_rate": 4.5998002646332835e-05, | |
| "loss": 0.9274, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.355720043182373, | |
| "learning_rate": 4.598665603551765e-05, | |
| "loss": 0.7219, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.8629677295684814, | |
| "learning_rate": 4.597529476590605e-05, | |
| "loss": 0.8023, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 2.2956135272979736, | |
| "learning_rate": 4.596391884543368e-05, | |
| "loss": 0.9574, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.2683722972869873, | |
| "learning_rate": 4.59525282820464e-05, | |
| "loss": 0.6996, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.7317371368408203, | |
| "learning_rate": 4.594112308370032e-05, | |
| "loss": 1.03, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.0310641527175903, | |
| "learning_rate": 4.5929703258361756e-05, | |
| "loss": 0.6917, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.9479489326477051, | |
| "learning_rate": 4.591826881400726e-05, | |
| "loss": 0.9939, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.9485552310943604, | |
| "learning_rate": 4.5906819758623576e-05, | |
| "loss": 1.0317, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.724987268447876, | |
| "learning_rate": 4.589535610020765e-05, | |
| "loss": 0.6915, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.7091718316078186, | |
| "learning_rate": 4.5883877846766654e-05, | |
| "loss": 0.8673, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.8297457098960876, | |
| "learning_rate": 4.587238500631793e-05, | |
| "loss": 0.8114, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.7213541269302368, | |
| "learning_rate": 4.586087758688903e-05, | |
| "loss": 0.863, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.1096009016036987, | |
| "learning_rate": 4.584935559651765e-05, | |
| "loss": 0.9222, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": 0.7428527474403381, | |
| "eval_runtime": 96.3993, | |
| "eval_samples_per_second": 7.23, | |
| "eval_steps_per_second": 7.23, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.5834380984306335, | |
| "learning_rate": 4.583781904325172e-05, | |
| "loss": 0.6609, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.5797068476676941, | |
| "learning_rate": 4.5826267935149285e-05, | |
| "loss": 0.7933, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.6027450561523438, | |
| "learning_rate": 4.581470228027861e-05, | |
| "loss": 0.7841, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.5674509406089783, | |
| "learning_rate": 4.5803122086718077e-05, | |
| "loss": 0.7721, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.7398461103439331, | |
| "learning_rate": 4.5791527362556235e-05, | |
| "loss": 0.7651, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.6617181301116943, | |
| "learning_rate": 4.577991811589181e-05, | |
| "loss": 0.9359, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.49279505014419556, | |
| "learning_rate": 4.576829435483362e-05, | |
| "loss": 0.6278, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.5201964378356934, | |
| "learning_rate": 4.575665608750067e-05, | |
| "loss": 0.853, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.7188725471496582, | |
| "learning_rate": 4.5745003322022084e-05, | |
| "loss": 0.8338, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.0798031091690063, | |
| "learning_rate": 4.573333606653708e-05, | |
| "loss": 0.9776, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.6439509987831116, | |
| "learning_rate": 4.5721654329195046e-05, | |
| "loss": 0.9331, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.7663920521736145, | |
| "learning_rate": 4.570995811815545e-05, | |
| "loss": 1.0533, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.7230969071388245, | |
| "learning_rate": 4.569824744158789e-05, | |
| "loss": 0.6966, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.016112208366394, | |
| "learning_rate": 4.568652230767205e-05, | |
| "loss": 0.8393, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.0165222883224487, | |
| "learning_rate": 4.567478272459773e-05, | |
| "loss": 1.0218, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.709685742855072, | |
| "learning_rate": 4.5663028700564826e-05, | |
| "loss": 0.7273, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.5664321780204773, | |
| "learning_rate": 4.565126024378328e-05, | |
| "loss": 0.9079, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.7938306927680969, | |
| "learning_rate": 4.5639477362473173e-05, | |
| "loss": 0.976, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.6710417866706848, | |
| "learning_rate": 4.5627680064864606e-05, | |
| "loss": 1.1969, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.9886580109596252, | |
| "learning_rate": 4.5615868359197796e-05, | |
| "loss": 0.9242, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "eval_loss": 0.7412505149841309, | |
| "eval_runtime": 96.4, | |
| "eval_samples_per_second": 7.23, | |
| "eval_steps_per_second": 7.23, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.8157562613487244, | |
| "learning_rate": 4.5604042253723014e-05, | |
| "loss": 0.8398, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.2368131875991821, | |
| "learning_rate": 4.559220175670054e-05, | |
| "loss": 0.8742, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.6060155034065247, | |
| "learning_rate": 4.558034687640078e-05, | |
| "loss": 0.6993, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.1366558074951172, | |
| "learning_rate": 4.556847762110415e-05, | |
| "loss": 0.9328, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.7205525636672974, | |
| "learning_rate": 4.555659399910108e-05, | |
| "loss": 0.827, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.6944175958633423, | |
| "learning_rate": 4.554469601869209e-05, | |
| "loss": 0.7805, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.6939406394958496, | |
| "learning_rate": 4.55327836881877e-05, | |
| "loss": 0.7996, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.592650830745697, | |
| "learning_rate": 4.552085701590844e-05, | |
| "loss": 0.6599, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.5287877321243286, | |
| "learning_rate": 4.5508916010184884e-05, | |
| "loss": 0.6856, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.6414081454277039, | |
| "learning_rate": 4.549696067935762e-05, | |
| "loss": 0.7622, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.2272289991378784, | |
| "learning_rate": 4.548499103177719e-05, | |
| "loss": 1.0834, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.5912505388259888, | |
| "learning_rate": 4.547300707580422e-05, | |
| "loss": 0.8738, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.6686813235282898, | |
| "learning_rate": 4.5461008819809246e-05, | |
| "loss": 0.6221, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.891153872013092, | |
| "learning_rate": 4.544899627217286e-05, | |
| "loss": 0.9009, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.1651557683944702, | |
| "learning_rate": 4.543696944128559e-05, | |
| "loss": 0.8448, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.7525443434715271, | |
| "learning_rate": 4.5424928335547964e-05, | |
| "loss": 0.6654, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6798614859580994, | |
| "learning_rate": 4.541287296337048e-05, | |
| "loss": 0.9244, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.498735249042511, | |
| "learning_rate": 4.540080333317358e-05, | |
| "loss": 0.6815, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6097673773765564, | |
| "learning_rate": 4.5388719453387694e-05, | |
| "loss": 0.8536, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6685522198677063, | |
| "learning_rate": 4.537662133245319e-05, | |
| "loss": 0.8092, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "eval_loss": 0.7402560114860535, | |
| "eval_runtime": 96.4998, | |
| "eval_samples_per_second": 7.223, | |
| "eval_steps_per_second": 7.223, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.162788987159729, | |
| "learning_rate": 4.5364508978820375e-05, | |
| "loss": 0.6143, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.8281823992729187, | |
| "learning_rate": 4.5352382400949524e-05, | |
| "loss": 0.8143, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6465135812759399, | |
| "learning_rate": 4.534024160731082e-05, | |
| "loss": 0.9152, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.5903899669647217, | |
| "learning_rate": 4.532808660638438e-05, | |
| "loss": 0.7229, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6988681554794312, | |
| "learning_rate": 4.5315917406660265e-05, | |
| "loss": 0.6863, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.7910459637641907, | |
| "learning_rate": 4.530373401663843e-05, | |
| "loss": 0.8762, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.7580087184906006, | |
| "learning_rate": 4.529153644482875e-05, | |
| "loss": 0.9896, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.6871665716171265, | |
| "learning_rate": 4.5279324699751005e-05, | |
| "loss": 0.8831, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.0093677043914795, | |
| "learning_rate": 4.526709878993488e-05, | |
| "loss": 0.742, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.9898921847343445, | |
| "learning_rate": 4.525485872391996e-05, | |
| "loss": 0.766, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.8706837296485901, | |
| "learning_rate": 4.524260451025569e-05, | |
| "loss": 0.7545, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.1715607643127441, | |
| "learning_rate": 4.523033615750142e-05, | |
| "loss": 0.84, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.017062783241272, | |
| "learning_rate": 4.521805367422638e-05, | |
| "loss": 0.7477, | |
| "step": 3765 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.6071624159812927, | |
| "learning_rate": 4.520575706900965e-05, | |
| "loss": 0.793, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.5821404457092285, | |
| "learning_rate": 4.519344635044018e-05, | |
| "loss": 0.7514, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.6849238872528076, | |
| "learning_rate": 4.51811215271168e-05, | |
| "loss": 0.862, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.8808868527412415, | |
| "learning_rate": 4.5168782607648166e-05, | |
| "loss": 0.7189, | |
| "step": 3785 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.7080340385437012, | |
| "learning_rate": 4.5156429600652774e-05, | |
| "loss": 0.6987, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.705869734287262, | |
| "learning_rate": 4.5144062514759e-05, | |
| "loss": 0.6482, | |
| "step": 3795 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6345694065093994, | |
| "learning_rate": 4.5131681358605007e-05, | |
| "loss": 0.7279, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "eval_loss": 0.7394095063209534, | |
| "eval_runtime": 96.4977, | |
| "eval_samples_per_second": 7.223, | |
| "eval_steps_per_second": 7.223, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.748913586139679, | |
| "learning_rate": 4.511928614083881e-05, | |
| "loss": 0.7474, | |
| "step": 3805 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6260043382644653, | |
| "learning_rate": 4.5106876870118255e-05, | |
| "loss": 0.7469, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.596367597579956, | |
| "learning_rate": 4.509445355511098e-05, | |
| "loss": 0.8437, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.3925014734268188, | |
| "learning_rate": 4.5082016204494445e-05, | |
| "loss": 1.0928, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.1370338201522827, | |
| "learning_rate": 4.506956482695592e-05, | |
| "loss": 0.8908, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6746950149536133, | |
| "learning_rate": 4.505709943119246e-05, | |
| "loss": 0.7121, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6608826518058777, | |
| "learning_rate": 4.504462002591091e-05, | |
| "loss": 0.9397, | |
| "step": 3835 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6542508006095886, | |
| "learning_rate": 4.5032126619827916e-05, | |
| "loss": 0.6942, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.5825070738792419, | |
| "learning_rate": 4.5019619221669895e-05, | |
| "loss": 0.7083, | |
| "step": 3845 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.8596588373184204, | |
| "learning_rate": 4.500709784017303e-05, | |
| "loss": 0.839, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.641009509563446, | |
| "learning_rate": 4.499456248408328e-05, | |
| "loss": 0.72, | |
| "step": 3855 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.213782548904419, | |
| "learning_rate": 4.498201316215635e-05, | |
| "loss": 0.7116, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.1411411762237549, | |
| "learning_rate": 4.496944988315775e-05, | |
| "loss": 1.0208, | |
| "step": 3865 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.8265553712844849, | |
| "learning_rate": 4.495687265586266e-05, | |
| "loss": 0.7664, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.9309681057929993, | |
| "learning_rate": 4.4944281489056065e-05, | |
| "loss": 0.9126, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.49171608686447144, | |
| "learning_rate": 4.493167639153266e-05, | |
| "loss": 0.6271, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.743669867515564, | |
| "learning_rate": 4.491905737209688e-05, | |
| "loss": 0.7965, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.6191633939743042, | |
| "learning_rate": 4.490642443956287e-05, | |
| "loss": 0.5884, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.5481441020965576, | |
| "learning_rate": 4.489377760275452e-05, | |
| "loss": 0.6281, | |
| "step": 3895 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.7155417203903198, | |
| "learning_rate": 4.488111687050539e-05, | |
| "loss": 0.7774, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "eval_loss": 0.738506019115448, | |
| "eval_runtime": 96.7667, | |
| "eval_samples_per_second": 7.203, | |
| "eval_steps_per_second": 7.203, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.032523274421692, | |
| "learning_rate": 4.4868442251658795e-05, | |
| "loss": 0.7621, | |
| "step": 3905 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.584082841873169, | |
| "learning_rate": 4.4855753755067703e-05, | |
| "loss": 0.6617, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.7214722037315369, | |
| "learning_rate": 4.4843051389594814e-05, | |
| "loss": 0.8669, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.6019904613494873, | |
| "learning_rate": 4.4830335164112504e-05, | |
| "loss": 0.736, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.8038384318351746, | |
| "learning_rate": 4.48176050875028e-05, | |
| "loss": 0.637, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.9631878733634949, | |
| "learning_rate": 4.4804861168657455e-05, | |
| "loss": 0.9722, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.5342935919761658, | |
| "learning_rate": 4.4792103416477836e-05, | |
| "loss": 0.8081, | |
| "step": 3935 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.5893488526344299, | |
| "learning_rate": 4.477933183987503e-05, | |
| "loss": 0.61, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.388850212097168, | |
| "learning_rate": 4.476654644776973e-05, | |
| "loss": 0.8454, | |
| "step": 3945 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.6928623914718628, | |
| "learning_rate": 4.4753747249092305e-05, | |
| "loss": 0.7209, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.2383430004119873, | |
| "learning_rate": 4.4740934252782757e-05, | |
| "loss": 0.8205, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.6005001664161682, | |
| "learning_rate": 4.472810746779074e-05, | |
| "loss": 0.6083, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.7928474545478821, | |
| "learning_rate": 4.471526690307552e-05, | |
| "loss": 0.9735, | |
| "step": 3965 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.8710891008377075, | |
| "learning_rate": 4.4702412567606014e-05, | |
| "loss": 0.7573, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.6327987313270569, | |
| "learning_rate": 4.468954447036071e-05, | |
| "loss": 0.8563, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.7048762440681458, | |
| "learning_rate": 4.467666262032777e-05, | |
| "loss": 0.9176, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.6058861017227173, | |
| "learning_rate": 4.466376702650492e-05, | |
| "loss": 0.5525, | |
| "step": 3985 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.637993574142456, | |
| "learning_rate": 4.465085769789949e-05, | |
| "loss": 0.7256, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.6992897987365723, | |
| "learning_rate": 4.463793464352842e-05, | |
| "loss": 0.8824, | |
| "step": 3995 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.7812734246253967, | |
| "learning_rate": 4.462499787241822e-05, | |
| "loss": 0.8942, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 0.7363680601119995, | |
| "eval_runtime": 96.9231, | |
| "eval_samples_per_second": 7.191, | |
| "eval_steps_per_second": 7.191, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.907598078250885, | |
| "learning_rate": 4.4612047393605e-05, | |
| "loss": 0.867, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.9081722497940063, | |
| "learning_rate": 4.459908321613442e-05, | |
| "loss": 0.8757, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.5538048148155212, | |
| "learning_rate": 4.4586105349061726e-05, | |
| "loss": 0.6709, | |
| "step": 4015 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.6632833480834961, | |
| "learning_rate": 4.457311380145173e-05, | |
| "loss": 0.8362, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.8646539449691772, | |
| "learning_rate": 4.4560108582378766e-05, | |
| "loss": 0.8527, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.6309005618095398, | |
| "learning_rate": 4.454708970092678e-05, | |
| "loss": 0.595, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.5711541175842285, | |
| "learning_rate": 4.45340571661892e-05, | |
| "loss": 0.8069, | |
| "step": 4035 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.1379880905151367, | |
| "learning_rate": 4.4521010987269006e-05, | |
| "loss": 0.8464, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.6005469560623169, | |
| "learning_rate": 4.450795117327874e-05, | |
| "loss": 0.5801, | |
| "step": 4045 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.7842866778373718, | |
| "learning_rate": 4.449487773334042e-05, | |
| "loss": 0.6238, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.7519890069961548, | |
| "learning_rate": 4.448179067658563e-05, | |
| "loss": 1.1255, | |
| "step": 4055 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.5955212712287903, | |
| "learning_rate": 4.446869001215542e-05, | |
| "loss": 0.7738, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.5085921287536621, | |
| "learning_rate": 4.4455575749200364e-05, | |
| "loss": 0.6239, | |
| "step": 4065 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.779778003692627, | |
| "learning_rate": 4.444244789688056e-05, | |
| "loss": 0.9719, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.7279208898544312, | |
| "learning_rate": 4.442930646436554e-05, | |
| "loss": 0.9854, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.9218065738677979, | |
| "learning_rate": 4.4416151460834376e-05, | |
| "loss": 0.8096, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.7595914006233215, | |
| "learning_rate": 4.44029828954756e-05, | |
| "loss": 0.7955, | |
| "step": 4085 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.785493016242981, | |
| "learning_rate": 4.43898007774872e-05, | |
| "loss": 0.8598, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.5540453195571899, | |
| "learning_rate": 4.437660511607666e-05, | |
| "loss": 0.8485, | |
| "step": 4095 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.7215760350227356, | |
| "learning_rate": 4.43633959204609e-05, | |
| "loss": 0.9286, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "eval_loss": 0.7347923517227173, | |
| "eval_runtime": 96.8658, | |
| "eval_samples_per_second": 7.196, | |
| "eval_steps_per_second": 7.196, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.7934743762016296, | |
| "learning_rate": 4.435017319986631e-05, | |
| "loss": 0.7829, | |
| "step": 4105 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.503614068031311, | |
| "learning_rate": 4.43369369635287e-05, | |
| "loss": 0.7203, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.6292420625686646, | |
| "learning_rate": 4.4323687220693365e-05, | |
| "loss": 0.7556, | |
| "step": 4115 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.6981114149093628, | |
| "learning_rate": 4.431042398061499e-05, | |
| "loss": 0.6953, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.8554514050483704, | |
| "learning_rate": 4.4297147252557715e-05, | |
| "loss": 0.7731, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.1464003324508667, | |
| "learning_rate": 4.428385704579509e-05, | |
| "loss": 0.7761, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.6772524118423462, | |
| "learning_rate": 4.427055336961008e-05, | |
| "loss": 0.7529, | |
| "step": 4135 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.5949820280075073, | |
| "learning_rate": 4.425723623329507e-05, | |
| "loss": 0.9164, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.848900318145752, | |
| "learning_rate": 4.4243905646151825e-05, | |
| "loss": 0.8385, | |
| "step": 4145 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.7119936943054199, | |
| "learning_rate": 4.4230561617491514e-05, | |
| "loss": 0.6342, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.4240078628063202, | |
| "learning_rate": 4.421720415663472e-05, | |
| "loss": 0.9921, | |
| "step": 4155 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.166399359703064, | |
| "learning_rate": 4.4203833272911355e-05, | |
| "loss": 0.6751, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.7882303595542908, | |
| "learning_rate": 4.4190448975660756e-05, | |
| "loss": 0.8711, | |
| "step": 4165 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.7739405632019043, | |
| "learning_rate": 4.417705127423162e-05, | |
| "loss": 0.7635, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.6729245781898499, | |
| "learning_rate": 4.416364017798197e-05, | |
| "loss": 1.0083, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.7291648983955383, | |
| "learning_rate": 4.4150215696279233e-05, | |
| "loss": 0.9355, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.569436252117157, | |
| "learning_rate": 4.413677783850015e-05, | |
| "loss": 0.5718, | |
| "step": 4185 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.7857233285903931, | |
| "learning_rate": 4.412332661403085e-05, | |
| "loss": 0.6356, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.124894618988037, | |
| "learning_rate": 4.410986203226672e-05, | |
| "loss": 0.9911, | |
| "step": 4195 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.7088748216629028, | |
| "learning_rate": 4.409638410261256e-05, | |
| "loss": 0.7703, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "eval_loss": 0.7353793978691101, | |
| "eval_runtime": 96.9146, | |
| "eval_samples_per_second": 7.192, | |
| "eval_steps_per_second": 7.192, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.8883334398269653, | |
| "learning_rate": 4.4082892834482456e-05, | |
| "loss": 0.7829, | |
| "step": 4205 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.5809643864631653, | |
| "learning_rate": 4.406938823729979e-05, | |
| "loss": 0.79, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 2.2371888160705566, | |
| "learning_rate": 4.405587032049731e-05, | |
| "loss": 0.9394, | |
| "step": 4215 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.6468964219093323, | |
| "learning_rate": 4.4042339093517e-05, | |
| "loss": 0.7621, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.8613569736480713, | |
| "learning_rate": 4.4028794565810194e-05, | |
| "loss": 0.9303, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.8210548162460327, | |
| "learning_rate": 4.4015236746837505e-05, | |
| "loss": 1.04, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.8066801428794861, | |
| "learning_rate": 4.4001665646068804e-05, | |
| "loss": 0.9942, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.6841477751731873, | |
| "learning_rate": 4.3988081272983263e-05, | |
| "loss": 0.6893, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.7812705636024475, | |
| "learning_rate": 4.3974483637069333e-05, | |
| "loss": 0.9125, | |
| "step": 4245 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.7913382649421692, | |
| "learning_rate": 4.3960872747824686e-05, | |
| "loss": 0.9298, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.6053805947303772, | |
| "learning_rate": 4.394724861475631e-05, | |
| "loss": 0.7055, | |
| "step": 4255 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.6879487633705139, | |
| "learning_rate": 4.393361124738039e-05, | |
| "loss": 0.605, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.7929925918579102, | |
| "learning_rate": 4.3919960655222394e-05, | |
| "loss": 0.8569, | |
| "step": 4265 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.5888631939888, | |
| "learning_rate": 4.390629684781701e-05, | |
| "loss": 0.6246, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.9546008706092834, | |
| "learning_rate": 4.389261983470815e-05, | |
| "loss": 0.7964, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.620267391204834, | |
| "learning_rate": 4.387892962544896e-05, | |
| "loss": 0.7127, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.7655039429664612, | |
| "learning_rate": 4.3865226229601805e-05, | |
| "loss": 0.6936, | |
| "step": 4285 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.5404471158981323, | |
| "learning_rate": 4.3851509656738264e-05, | |
| "loss": 0.6141, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.9140282273292542, | |
| "learning_rate": 4.38377799164391e-05, | |
| "loss": 1.152, | |
| "step": 4295 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.4845621585845947, | |
| "learning_rate": 4.382403701829429e-05, | |
| "loss": 0.8322, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "eval_loss": 0.733027458190918, | |
| "eval_runtime": 96.886, | |
| "eval_samples_per_second": 7.194, | |
| "eval_steps_per_second": 7.194, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.598147988319397, | |
| "learning_rate": 4.381028097190299e-05, | |
| "loss": 0.772, | |
| "step": 4305 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.5572992563247681, | |
| "learning_rate": 4.3796511786873574e-05, | |
| "loss": 0.7232, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.7913936376571655, | |
| "learning_rate": 4.378272947282354e-05, | |
| "loss": 0.6972, | |
| "step": 4315 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.4532865583896637, | |
| "learning_rate": 4.376893403937959e-05, | |
| "loss": 0.7454, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.8871356844902039, | |
| "learning_rate": 4.375512549617759e-05, | |
| "loss": 0.6946, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.7564520835876465, | |
| "learning_rate": 4.374130385286255e-05, | |
| "loss": 0.9257, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.7280387282371521, | |
| "learning_rate": 4.3727469119088624e-05, | |
| "loss": 0.756, | |
| "step": 4335 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.6494055986404419, | |
| "learning_rate": 4.3713621304519144e-05, | |
| "loss": 0.6358, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.6048948764801025, | |
| "learning_rate": 4.369976041882654e-05, | |
| "loss": 0.6705, | |
| "step": 4345 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.6458585858345032, | |
| "learning_rate": 4.36858864716924e-05, | |
| "loss": 0.7999, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.837872326374054, | |
| "learning_rate": 4.36719994728074e-05, | |
| "loss": 0.7671, | |
| "step": 4355 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.6451572775840759, | |
| "learning_rate": 4.365809943187138e-05, | |
| "loss": 0.8672, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.6438645124435425, | |
| "learning_rate": 4.364418635859326e-05, | |
| "loss": 0.78, | |
| "step": 4365 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.7427099347114563, | |
| "learning_rate": 4.363026026269106e-05, | |
| "loss": 0.8977, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.7844499945640564, | |
| "learning_rate": 4.36163211538919e-05, | |
| "loss": 0.7586, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.8544999361038208, | |
| "learning_rate": 4.360236904193201e-05, | |
| "loss": 0.7085, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 2.431629180908203, | |
| "learning_rate": 4.358840393655668e-05, | |
| "loss": 0.8572, | |
| "step": 4385 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.6864097118377686, | |
| "learning_rate": 4.357442584752027e-05, | |
| "loss": 0.6848, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.7158388495445251, | |
| "learning_rate": 4.356043478458623e-05, | |
| "loss": 1.0071, | |
| "step": 4395 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.7883514165878296, | |
| "learning_rate": 4.3546430757527066e-05, | |
| "loss": 0.9851, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "eval_loss": 0.732368528842926, | |
| "eval_runtime": 96.9109, | |
| "eval_samples_per_second": 7.192, | |
| "eval_steps_per_second": 7.192, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.673925518989563, | |
| "learning_rate": 4.353241377612433e-05, | |
| "loss": 0.7076, | |
| "step": 4405 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.9540270566940308, | |
| "learning_rate": 4.351838385016862e-05, | |
| "loss": 0.8989, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.8137551546096802, | |
| "learning_rate": 4.35043409894596e-05, | |
| "loss": 0.7633, | |
| "step": 4415 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.779330313205719, | |
| "learning_rate": 4.349028520380594e-05, | |
| "loss": 0.7013, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.7883580327033997, | |
| "learning_rate": 4.347621650302535e-05, | |
| "loss": 0.9788, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.7106336951255798, | |
| "learning_rate": 4.3462134896944565e-05, | |
| "loss": 0.8399, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.6303668022155762, | |
| "learning_rate": 4.344804039539933e-05, | |
| "loss": 0.5943, | |
| "step": 4435 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.2975471019744873, | |
| "learning_rate": 4.3433933008234395e-05, | |
| "loss": 0.8917, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.68232661485672, | |
| "learning_rate": 4.341981274530351e-05, | |
| "loss": 0.7756, | |
| "step": 4445 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.6689594984054565, | |
| "learning_rate": 4.340567961646943e-05, | |
| "loss": 0.772, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.102365493774414, | |
| "learning_rate": 4.339153363160388e-05, | |
| "loss": 0.738, | |
| "step": 4455 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.6535090804100037, | |
| "learning_rate": 4.337737480058758e-05, | |
| "loss": 0.9096, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.676058292388916, | |
| "learning_rate": 4.3363203133310206e-05, | |
| "loss": 0.9634, | |
| "step": 4465 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.9258711934089661, | |
| "learning_rate": 4.3349018639670415e-05, | |
| "loss": 0.8025, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.5786353349685669, | |
| "learning_rate": 4.333482132957581e-05, | |
| "loss": 0.7638, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7258582711219788, | |
| "learning_rate": 4.332061121294296e-05, | |
| "loss": 1.3538, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.9384926557540894, | |
| "learning_rate": 4.330638829969738e-05, | |
| "loss": 0.8485, | |
| "step": 4485 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.5252525806427002, | |
| "learning_rate": 4.3292152599773494e-05, | |
| "loss": 0.8547, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7551200985908508, | |
| "learning_rate": 4.32779041231147e-05, | |
| "loss": 0.7435, | |
| "step": 4495 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7492663264274597, | |
| "learning_rate": 4.3263642879673286e-05, | |
| "loss": 0.8712, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_loss": 0.7316818237304688, | |
| "eval_runtime": 96.9418, | |
| "eval_samples_per_second": 7.19, | |
| "eval_steps_per_second": 7.19, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7490917444229126, | |
| "learning_rate": 4.3249368879410475e-05, | |
| "loss": 0.7598, | |
| "step": 4505 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7305790781974792, | |
| "learning_rate": 4.323508213229639e-05, | |
| "loss": 0.8315, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7009093165397644, | |
| "learning_rate": 4.3220782648310075e-05, | |
| "loss": 0.7482, | |
| "step": 4515 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7155885100364685, | |
| "learning_rate": 4.320647043743945e-05, | |
| "loss": 0.8385, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.6159176826477051, | |
| "learning_rate": 4.319214550968133e-05, | |
| "loss": 0.6507, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7776069045066833, | |
| "learning_rate": 4.3177807875041424e-05, | |
| "loss": 0.855, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.6204195618629456, | |
| "learning_rate": 4.316345754353432e-05, | |
| "loss": 0.7169, | |
| "step": 4535 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7233458757400513, | |
| "learning_rate": 4.3149094525183426e-05, | |
| "loss": 0.5399, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.7921779155731201, | |
| "learning_rate": 4.313471883002108e-05, | |
| "loss": 0.9124, | |
| "step": 4545 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.9145547747612, | |
| "learning_rate": 4.3120330468088435e-05, | |
| "loss": 1.2346, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.8733106255531311, | |
| "learning_rate": 4.310592944943549e-05, | |
| "loss": 0.6737, | |
| "step": 4555 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.6620619297027588, | |
| "learning_rate": 4.3091515784121107e-05, | |
| "loss": 0.8041, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.7026892900466919, | |
| "learning_rate": 4.307708948221296e-05, | |
| "loss": 0.9422, | |
| "step": 4565 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.7953292727470398, | |
| "learning_rate": 4.3062650553787566e-05, | |
| "loss": 0.7398, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.6465870141983032, | |
| "learning_rate": 4.304819900893024e-05, | |
| "loss": 0.8175, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.3427163362503052, | |
| "learning_rate": 4.303373485773513e-05, | |
| "loss": 0.7331, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.6665405035018921, | |
| "learning_rate": 4.3019258110305186e-05, | |
| "loss": 0.7529, | |
| "step": 4585 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.796320915222168, | |
| "learning_rate": 4.300476877675215e-05, | |
| "loss": 0.915, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.551832377910614, | |
| "learning_rate": 4.299026686719655e-05, | |
| "loss": 0.7693, | |
| "step": 4595 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.75690096616745, | |
| "learning_rate": 4.297575239176771e-05, | |
| "loss": 0.7871, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "eval_loss": 0.730965256690979, | |
| "eval_runtime": 96.8803, | |
| "eval_samples_per_second": 7.194, | |
| "eval_steps_per_second": 7.194, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.7093445062637329, | |
| "learning_rate": 4.296122536060373e-05, | |
| "loss": 0.6279, | |
| "step": 4605 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.6522731781005859, | |
| "learning_rate": 4.294668578385147e-05, | |
| "loss": 0.5442, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.7964634299278259, | |
| "learning_rate": 4.2932133671666565e-05, | |
| "loss": 1.0221, | |
| "step": 4615 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.7733820676803589, | |
| "learning_rate": 4.2917569034213395e-05, | |
| "loss": 0.7152, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.8039364218711853, | |
| "learning_rate": 4.2902991881665097e-05, | |
| "loss": 1.0939, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.756020724773407, | |
| "learning_rate": 4.2888402224203536e-05, | |
| "loss": 0.7539, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.5059025287628174, | |
| "learning_rate": 4.2873800072019345e-05, | |
| "loss": 0.8716, | |
| "step": 4635 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.8273636102676392, | |
| "learning_rate": 4.285918543531183e-05, | |
| "loss": 0.687, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.6505921483039856, | |
| "learning_rate": 4.2844558324289076e-05, | |
| "loss": 1.0697, | |
| "step": 4645 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.6481053829193115, | |
| "learning_rate": 4.282991874916784e-05, | |
| "loss": 0.884, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.8193663954734802, | |
| "learning_rate": 4.28152667201736e-05, | |
| "loss": 0.962, | |
| "step": 4655 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.7153398990631104, | |
| "learning_rate": 4.280060224754053e-05, | |
| "loss": 0.7705, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.7197556495666504, | |
| "learning_rate": 4.278592534151149e-05, | |
| "loss": 0.8521, | |
| "step": 4665 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.009098768234253, | |
| "learning_rate": 4.2771236012338044e-05, | |
| "loss": 0.8425, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.598564088344574, | |
| "learning_rate": 4.275653427028041e-05, | |
| "loss": 0.7072, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.8035867810249329, | |
| "learning_rate": 4.2741820125607504e-05, | |
| "loss": 0.6689, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.6109891533851624, | |
| "learning_rate": 4.2727093588596866e-05, | |
| "loss": 0.7776, | |
| "step": 4685 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.7504151463508606, | |
| "learning_rate": 4.271235466953473e-05, | |
| "loss": 0.7481, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.9492344260215759, | |
| "learning_rate": 4.269760337871594e-05, | |
| "loss": 0.7261, | |
| "step": 4695 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.5792133212089539, | |
| "learning_rate": 4.2682839726444035e-05, | |
| "loss": 0.7156, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "eval_loss": 0.7283556461334229, | |
| "eval_runtime": 96.8998, | |
| "eval_samples_per_second": 7.193, | |
| "eval_steps_per_second": 7.193, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.8895491361618042, | |
| "learning_rate": 4.266806372303113e-05, | |
| "loss": 0.8466, | |
| "step": 4705 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.7995960712432861, | |
| "learning_rate": 4.2653275378798005e-05, | |
| "loss": 0.7823, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.6673771739006042, | |
| "learning_rate": 4.263847470407405e-05, | |
| "loss": 0.5461, | |
| "step": 4715 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.6228974461555481, | |
| "learning_rate": 4.262366170919726e-05, | |
| "loss": 0.7611, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.8050612807273865, | |
| "learning_rate": 4.2608836404514255e-05, | |
| "loss": 0.6524, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.8815121650695801, | |
| "learning_rate": 4.2593998800380216e-05, | |
| "loss": 0.9997, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.0408731698989868, | |
| "learning_rate": 4.257914890715897e-05, | |
| "loss": 0.7031, | |
| "step": 4735 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6744192838668823, | |
| "learning_rate": 4.256428673522287e-05, | |
| "loss": 0.6587, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.014369249343872, | |
| "learning_rate": 4.254941229495289e-05, | |
| "loss": 0.7726, | |
| "step": 4745 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.7497864365577698, | |
| "learning_rate": 4.2534525596738526e-05, | |
| "loss": 0.7327, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6479122042655945, | |
| "learning_rate": 4.2519626650977905e-05, | |
| "loss": 0.7071, | |
| "step": 4755 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6300268173217773, | |
| "learning_rate": 4.250471546807765e-05, | |
| "loss": 0.9479, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.8272077441215515, | |
| "learning_rate": 4.248979205845294e-05, | |
| "loss": 0.9013, | |
| "step": 4765 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.7070410847663879, | |
| "learning_rate": 4.2474856432527524e-05, | |
| "loss": 0.713, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.7199767231941223, | |
| "learning_rate": 4.2459908600733654e-05, | |
| "loss": 0.9308, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6886048316955566, | |
| "learning_rate": 4.244494857351212e-05, | |
| "loss": 0.8008, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6097077131271362, | |
| "learning_rate": 4.242997636131222e-05, | |
| "loss": 0.9639, | |
| "step": 4785 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.0947343111038208, | |
| "learning_rate": 4.241499197459178e-05, | |
| "loss": 0.9012, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.6965738534927368, | |
| "learning_rate": 4.239999542381712e-05, | |
| "loss": 0.6745, | |
| "step": 4795 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.8290371894836426, | |
| "learning_rate": 4.238498671946306e-05, | |
| "loss": 0.7856, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "eval_loss": 0.7277354598045349, | |
| "eval_runtime": 96.9165, | |
| "eval_samples_per_second": 7.192, | |
| "eval_steps_per_second": 7.192, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.8061904907226562, | |
| "learning_rate": 4.2369965872012904e-05, | |
| "loss": 0.7034, | |
| "step": 4805 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.6652625799179077, | |
| "learning_rate": 4.2354932891958434e-05, | |
| "loss": 0.5825, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.6529026627540588, | |
| "learning_rate": 4.2339887789799916e-05, | |
| "loss": 0.7407, | |
| "step": 4815 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 3.7802493572235107, | |
| "learning_rate": 4.232483057604607e-05, | |
| "loss": 0.8906, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.7709060907363892, | |
| "learning_rate": 4.230976126121411e-05, | |
| "loss": 0.863, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.2582249641418457, | |
| "learning_rate": 4.229467985582966e-05, | |
| "loss": 1.065, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.5523508191108704, | |
| "learning_rate": 4.22795863704268e-05, | |
| "loss": 0.5925, | |
| "step": 4835 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.3535953760147095, | |
| "learning_rate": 4.2264480815548076e-05, | |
| "loss": 0.7993, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.067133903503418, | |
| "learning_rate": 4.2249363201744425e-05, | |
| "loss": 0.7921, | |
| "step": 4845 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.6478603482246399, | |
| "learning_rate": 4.223423353957523e-05, | |
| "loss": 0.6769, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.6439855694770813, | |
| "learning_rate": 4.2219091839608276e-05, | |
| "loss": 0.9018, | |
| "step": 4855 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.5302556753158569, | |
| "learning_rate": 4.2203938112419786e-05, | |
| "loss": 0.837, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.8129810690879822, | |
| "learning_rate": 4.218877236859433e-05, | |
| "loss": 0.9195, | |
| "step": 4865 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.6531801819801331, | |
| "learning_rate": 4.217359461872493e-05, | |
| "loss": 0.6829, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.7695423364639282, | |
| "learning_rate": 4.215840487341296e-05, | |
| "loss": 0.7739, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.3163946866989136, | |
| "learning_rate": 4.2143203143268184e-05, | |
| "loss": 0.9678, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.1124577522277832, | |
| "learning_rate": 4.212798943890871e-05, | |
| "loss": 0.9327, | |
| "step": 4885 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.8979106545448303, | |
| "learning_rate": 4.2112763770961074e-05, | |
| "loss": 0.7043, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.9879763126373291, | |
| "learning_rate": 4.2097526150060085e-05, | |
| "loss": 0.8129, | |
| "step": 4895 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.7016007304191589, | |
| "learning_rate": 4.208227658684898e-05, | |
| "loss": 0.7906, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "eval_loss": 0.7254941463470459, | |
| "eval_runtime": 96.9328, | |
| "eval_samples_per_second": 7.191, | |
| "eval_steps_per_second": 7.191, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.5404706597328186, | |
| "learning_rate": 4.206701509197927e-05, | |
| "loss": 0.7769, | |
| "step": 4905 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.7096789479255676, | |
| "learning_rate": 4.205174167611085e-05, | |
| "loss": 0.5985, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.8139373660087585, | |
| "learning_rate": 4.20364563499119e-05, | |
| "loss": 0.75, | |
| "step": 4915 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.2196255922317505, | |
| "learning_rate": 4.202115912405897e-05, | |
| "loss": 0.8441, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.6912347674369812, | |
| "learning_rate": 4.200585000923689e-05, | |
| "loss": 0.8885, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.7879334688186646, | |
| "learning_rate": 4.199052901613878e-05, | |
| "loss": 0.6353, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.0302627086639404, | |
| "learning_rate": 4.197519615546608e-05, | |
| "loss": 0.7704, | |
| "step": 4935 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.5351320505142212, | |
| "learning_rate": 4.195985143792851e-05, | |
| "loss": 0.8094, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.7632457613945007, | |
| "learning_rate": 4.194449487424409e-05, | |
| "loss": 0.9625, | |
| "step": 4945 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.7706131935119629, | |
| "learning_rate": 4.1929126475139096e-05, | |
| "loss": 0.7007, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.6262048482894897, | |
| "learning_rate": 4.191374625134806e-05, | |
| "loss": 0.7768, | |
| "step": 4955 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.8302519917488098, | |
| "learning_rate": 4.189835421361381e-05, | |
| "loss": 0.8281, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.5914260149002075, | |
| "learning_rate": 4.188295037268738e-05, | |
| "loss": 0.8554, | |
| "step": 4965 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.7599936127662659, | |
| "learning_rate": 4.1867534739328085e-05, | |
| "loss": 0.9547, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.4832470417022705, | |
| "learning_rate": 4.1852107324303455e-05, | |
| "loss": 0.5212, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.8040557503700256, | |
| "learning_rate": 4.183666813838927e-05, | |
| "loss": 0.8939, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.7503822445869446, | |
| "learning_rate": 4.182121719236952e-05, | |
| "loss": 0.9279, | |
| "step": 4985 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.7608035206794739, | |
| "learning_rate": 4.180575449703639e-05, | |
| "loss": 0.7965, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.5479308366775513, | |
| "learning_rate": 4.1790280063190315e-05, | |
| "loss": 0.7478, | |
| "step": 4995 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.7714606523513794, | |
| "learning_rate": 4.177479390163989e-05, | |
| "loss": 0.7917, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 0.7249829173088074, | |
| "eval_runtime": 96.9791, | |
| "eval_samples_per_second": 7.187, | |
| "eval_steps_per_second": 7.187, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 18795, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "total_flos": 2.1577658793984e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
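
The state file above mixes two kinds of `log_history` entries: training records (with `loss`, `grad_norm`, `learning_rate`) logged every `logging_steps` steps, and evaluation records (with `eval_loss`, `eval_runtime`, throughput) logged on the `eval_steps` cadence. Below is a minimal sketch, not part of the trainer output, of how such a file can be loaded and split; the path `saves/starcoder2-7b/lora/sft/trainer_state.json` is an assumption inferred from the checkpoint path recorded in the state, and only the standard library is used.

```python
# Sketch: parse a Hugging Face trainer_state.json and separate
# train-loss entries from eval entries. Path below is an assumption.
import json

with open("saves/starcoder2-7b/lora/sft/trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss".
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

# The best checkpoint is the eval record whose eval_loss equals best_metric.
best = min(eval_log, key=lambda e: e["eval_loss"])
assert abs(best["eval_loss"] - state["best_metric"]) < 1e-9

print(f"train points: {len(train_log)} (every {state['logging_steps']} steps)")
print(f"eval points:  {len(eval_log)} (every {state['eval_steps']} steps)")
print(f"best eval_loss {best['eval_loss']:.4f} at step {best['step']}")
```

For the run logged here, the final lines would report the best eval_loss of roughly 0.7250 at step 5000, which is why `best_model_checkpoint` points at `checkpoint-5000`. The same `train_log`/`eval_log` lists can be handed to any plotting library to visualize the loss curves.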