{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.985172981878089,
  "eval_steps": 500,
  "global_step": 453,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006589785831960461,
      "grad_norm": 57166.33984375,
      "learning_rate": 0.0,
      "loss": 0.9208,
      "step": 1
    },
    {
      "epoch": 0.013179571663920923,
      "grad_norm": 24538.927734375,
      "learning_rate": 1.0869565217391306e-06,
      "loss": 0.7281,
      "step": 2
    },
    {
      "epoch": 0.019769357495881382,
      "grad_norm": 15162.59375,
      "learning_rate": 2.173913043478261e-06,
      "loss": 0.7891,
      "step": 3
    },
    {
      "epoch": 0.026359143327841845,
      "grad_norm": 19609.22265625,
      "learning_rate": 3.2608695652173914e-06,
      "loss": 0.7988,
      "step": 4
    },
    {
      "epoch": 0.032948929159802305,
      "grad_norm": 36443.21484375,
      "learning_rate": 4.347826086956522e-06,
      "loss": 0.7955,
      "step": 5
    },
    {
      "epoch": 0.039538714991762765,
      "grad_norm": 5708.90673828125,
      "learning_rate": 5.4347826086956525e-06,
      "loss": 0.756,
      "step": 6
    },
    {
      "epoch": 0.04612850082372323,
      "grad_norm": 16939.138671875,
      "learning_rate": 6.521739130434783e-06,
      "loss": 0.8382,
      "step": 7
    },
    {
      "epoch": 0.05271828665568369,
      "grad_norm": 9426.6240234375,
      "learning_rate": 7.608695652173914e-06,
      "loss": 0.8287,
      "step": 8
    },
    {
      "epoch": 0.05930807248764415,
      "grad_norm": 35329.375,
      "learning_rate": 8.695652173913044e-06,
      "loss": 0.7193,
      "step": 9
    },
    {
      "epoch": 0.06589785831960461,
      "grad_norm": 25405.27734375,
      "learning_rate": 9.782608695652175e-06,
      "loss": 0.7123,
      "step": 10
    },
    {
      "epoch": 0.07248764415156507,
      "grad_norm": 39777.97265625,
      "learning_rate": 1.0869565217391305e-05,
      "loss": 0.8391,
      "step": 11
    },
    {
      "epoch": 0.07907742998352553,
      "grad_norm": 32360.591796875,
      "learning_rate": 1.1956521739130435e-05,
      "loss": 0.7638,
      "step": 12
    },
    {
      "epoch": 0.085667215815486,
      "grad_norm": 20279.001953125,
      "learning_rate": 1.3043478260869566e-05,
      "loss": 0.8044,
      "step": 13
    },
    {
      "epoch": 0.09225700164744646,
      "grad_norm": 21474.5,
      "learning_rate": 1.4130434782608694e-05,
      "loss": 0.6984,
      "step": 14
    },
    {
      "epoch": 0.09884678747940692,
      "grad_norm": 24501.21484375,
      "learning_rate": 1.5217391304347828e-05,
      "loss": 0.7501,
      "step": 15
    },
    {
      "epoch": 0.10543657331136738,
      "grad_norm": 16850.5625,
      "learning_rate": 1.630434782608696e-05,
      "loss": 0.7073,
      "step": 16
    },
    {
      "epoch": 0.11202635914332784,
      "grad_norm": 18344.18359375,
      "learning_rate": 1.739130434782609e-05,
      "loss": 0.7623,
      "step": 17
    },
    {
      "epoch": 0.1186161449752883,
      "grad_norm": 7098.125,
      "learning_rate": 1.8478260869565216e-05,
      "loss": 0.6343,
      "step": 18
    },
    {
      "epoch": 0.12520593080724876,
      "grad_norm": 15517.8115234375,
      "learning_rate": 1.956521739130435e-05,
      "loss": 0.6598,
      "step": 19
    },
    {
      "epoch": 0.13179571663920922,
      "grad_norm": 111118.3046875,
      "learning_rate": 2.065217391304348e-05,
      "loss": 0.6546,
      "step": 20
    },
    {
      "epoch": 0.13838550247116968,
      "grad_norm": 22035.267578125,
      "learning_rate": 2.173913043478261e-05,
      "loss": 0.6148,
      "step": 21
    },
    {
      "epoch": 0.14497528830313014,
      "grad_norm": 21264.662109375,
      "learning_rate": 2.282608695652174e-05,
      "loss": 0.6115,
      "step": 22
    },
    {
      "epoch": 0.1515650741350906,
      "grad_norm": 10420.123046875,
      "learning_rate": 2.391304347826087e-05,
      "loss": 0.6716,
      "step": 23
    },
    {
      "epoch": 0.15815485996705106,
      "grad_norm": 31856.8828125,
      "learning_rate": 2.5e-05,
      "loss": 0.6439,
      "step": 24
    },
    {
      "epoch": 0.16474464579901152,
      "grad_norm": 33552.83984375,
      "learning_rate": 2.608695652173913e-05,
      "loss": 0.6781,
      "step": 25
    },
    {
      "epoch": 0.171334431630972,
      "grad_norm": 8155.12451171875,
      "learning_rate": 2.7173913043478262e-05,
      "loss": 0.6042,
      "step": 26
    },
    {
      "epoch": 0.17792421746293247,
      "grad_norm": 11283.130859375,
      "learning_rate": 2.826086956521739e-05,
      "loss": 0.6639,
      "step": 27
    },
    {
      "epoch": 0.18451400329489293,
      "grad_norm": 56492.08984375,
      "learning_rate": 2.9347826086956526e-05,
      "loss": 0.6198,
      "step": 28
    },
    {
      "epoch": 0.19110378912685339,
      "grad_norm": 36276.30859375,
      "learning_rate": 3.0434782608695656e-05,
      "loss": 0.7941,
      "step": 29
    },
    {
      "epoch": 0.19769357495881384,
      "grad_norm": 16123.35546875,
      "learning_rate": 3.152173913043479e-05,
      "loss": 0.6976,
      "step": 30
    },
    {
      "epoch": 0.2042833607907743,
      "grad_norm": 3790.1572265625,
      "learning_rate": 3.260869565217392e-05,
      "loss": 0.6401,
      "step": 31
    },
    {
      "epoch": 0.21087314662273476,
      "grad_norm": 20178.978515625,
      "learning_rate": 3.369565217391305e-05,
      "loss": 0.7029,
      "step": 32
    },
    {
      "epoch": 0.21746293245469522,
      "grad_norm": 12479.48046875,
      "learning_rate": 3.478260869565218e-05,
      "loss": 0.7024,
      "step": 33
    },
    {
      "epoch": 0.22405271828665568,
      "grad_norm": 17013.83984375,
      "learning_rate": 3.58695652173913e-05,
      "loss": 0.7124,
      "step": 34
    },
    {
      "epoch": 0.23064250411861614,
      "grad_norm": 8563.2705078125,
      "learning_rate": 3.695652173913043e-05,
      "loss": 0.6505,
      "step": 35
    },
    {
      "epoch": 0.2372322899505766,
      "grad_norm": 8254.0205078125,
      "learning_rate": 3.804347826086957e-05,
      "loss": 0.6654,
      "step": 36
    },
    {
      "epoch": 0.24382207578253706,
      "grad_norm": 49902.2890625,
      "learning_rate": 3.91304347826087e-05,
      "loss": 0.6191,
      "step": 37
    },
    {
      "epoch": 0.2504118616144975,
      "grad_norm": 18592.572265625,
      "learning_rate": 4.021739130434783e-05,
      "loss": 0.6082,
      "step": 38
    },
    {
      "epoch": 0.257001647446458,
      "grad_norm": 25148.34375,
      "learning_rate": 4.130434782608696e-05,
      "loss": 0.7318,
      "step": 39
    },
    {
      "epoch": 0.26359143327841844,
      "grad_norm": 11293.0634765625,
      "learning_rate": 4.239130434782609e-05,
      "loss": 0.6075,
      "step": 40
    },
    {
      "epoch": 0.2701812191103789,
      "grad_norm": 10365.61328125,
      "learning_rate": 4.347826086956522e-05,
      "loss": 0.5871,
      "step": 41
    },
    {
      "epoch": 0.27677100494233936,
      "grad_norm": 6548.3740234375,
      "learning_rate": 4.456521739130435e-05,
      "loss": 0.5931,
      "step": 42
    },
    {
      "epoch": 0.2833607907742998,
      "grad_norm": 17649.12109375,
      "learning_rate": 4.565217391304348e-05,
      "loss": 0.7389,
      "step": 43
    },
    {
      "epoch": 0.2899505766062603,
      "grad_norm": 14737.7197265625,
      "learning_rate": 4.673913043478261e-05,
      "loss": 0.5972,
      "step": 44
    },
    {
      "epoch": 0.29654036243822074,
      "grad_norm": 9264.072265625,
      "learning_rate": 4.782608695652174e-05,
      "loss": 0.6256,
      "step": 45
    },
    {
      "epoch": 0.3031301482701812,
      "grad_norm": 86530.90625,
      "learning_rate": 4.891304347826087e-05,
      "loss": 0.6694,
      "step": 46
    },
    {
      "epoch": 0.30971993410214166,
      "grad_norm": 10428.66015625,
      "learning_rate": 5e-05,
      "loss": 0.6411,
      "step": 47
    },
    {
      "epoch": 0.3163097199341021,
      "grad_norm": 35924.46484375,
      "learning_rate": 4.987714987714988e-05,
      "loss": 0.7293,
      "step": 48
    },
    {
      "epoch": 0.3228995057660626,
      "grad_norm": 9777.244140625,
      "learning_rate": 4.9754299754299756e-05,
      "loss": 0.6293,
      "step": 49
    },
    {
      "epoch": 0.32948929159802304,
      "grad_norm": 19732.7421875,
      "learning_rate": 4.963144963144963e-05,
      "loss": 0.686,
      "step": 50
    },
    {
      "epoch": 0.33607907742998355,
      "grad_norm": 35202.5234375,
      "learning_rate": 4.950859950859951e-05,
      "loss": 0.6859,
      "step": 51
    },
    {
      "epoch": 0.342668863261944,
      "grad_norm": 12520.201171875,
      "learning_rate": 4.9385749385749387e-05,
      "loss": 0.6202,
      "step": 52
    },
    {
      "epoch": 0.34925864909390447,
      "grad_norm": 34133.8671875,
      "learning_rate": 4.926289926289926e-05,
      "loss": 0.6566,
      "step": 53
    },
    {
      "epoch": 0.35584843492586493,
      "grad_norm": 30663.423828125,
      "learning_rate": 4.914004914004915e-05,
      "loss": 0.607,
      "step": 54
    },
    {
      "epoch": 0.3624382207578254,
      "grad_norm": 33234.12109375,
      "learning_rate": 4.901719901719902e-05,
      "loss": 0.5683,
      "step": 55
    },
    {
      "epoch": 0.36902800658978585,
      "grad_norm": 11093.591796875,
      "learning_rate": 4.8894348894348894e-05,
      "loss": 0.6449,
      "step": 56
    },
    {
      "epoch": 0.3756177924217463,
      "grad_norm": 10510.125,
      "learning_rate": 4.877149877149878e-05,
      "loss": 0.6932,
      "step": 57
    },
    {
      "epoch": 0.38220757825370677,
      "grad_norm": 76218.2109375,
      "learning_rate": 4.8648648648648654e-05,
      "loss": 0.681,
      "step": 58
    },
    {
      "epoch": 0.38879736408566723,
      "grad_norm": 25486.68359375,
      "learning_rate": 4.8525798525798524e-05,
      "loss": 0.577,
      "step": 59
    },
    {
      "epoch": 0.3953871499176277,
      "grad_norm": 39751.8671875,
      "learning_rate": 4.840294840294841e-05,
      "loss": 0.6199,
      "step": 60
    },
    {
      "epoch": 0.40197693574958815,
      "grad_norm": 14526.541015625,
      "learning_rate": 4.8280098280098285e-05,
      "loss": 0.5628,
      "step": 61
    },
    {
      "epoch": 0.4085667215815486,
      "grad_norm": 33324.5625,
      "learning_rate": 4.8157248157248155e-05,
      "loss": 0.7594,
      "step": 62
    },
    {
      "epoch": 0.41515650741350907,
      "grad_norm": 13729.7373046875,
      "learning_rate": 4.803439803439804e-05,
      "loss": 0.6227,
      "step": 63
    },
    {
      "epoch": 0.42174629324546953,
      "grad_norm": 34946.58203125,
      "learning_rate": 4.7911547911547915e-05,
      "loss": 0.6377,
      "step": 64
    },
    {
      "epoch": 0.42833607907743,
      "grad_norm": 7339.2587890625,
      "learning_rate": 4.778869778869779e-05,
      "loss": 0.7474,
      "step": 65
    },
    {
      "epoch": 0.43492586490939045,
      "grad_norm": 61978.32421875,
      "learning_rate": 4.766584766584767e-05,
      "loss": 0.7136,
      "step": 66
    },
    {
      "epoch": 0.4415156507413509,
      "grad_norm": 18481.29296875,
      "learning_rate": 4.7542997542997546e-05,
      "loss": 0.64,
      "step": 67
    },
    {
      "epoch": 0.44810543657331137,
      "grad_norm": 11941.44140625,
      "learning_rate": 4.742014742014742e-05,
      "loss": 0.6273,
      "step": 68
    },
    {
      "epoch": 0.4546952224052718,
      "grad_norm": 70506.828125,
      "learning_rate": 4.72972972972973e-05,
      "loss": 0.7494,
      "step": 69
    },
    {
      "epoch": 0.4612850082372323,
      "grad_norm": 8327.736328125,
      "learning_rate": 4.7174447174447176e-05,
      "loss": 0.6674,
      "step": 70
    },
    {
      "epoch": 0.46787479406919275,
      "grad_norm": 13985.94140625,
      "learning_rate": 4.705159705159705e-05,
      "loss": 0.6474,
      "step": 71
    },
    {
      "epoch": 0.4744645799011532,
      "grad_norm": 9354.6904296875,
      "learning_rate": 4.692874692874693e-05,
      "loss": 0.663,
      "step": 72
    },
    {
      "epoch": 0.48105436573311366,
      "grad_norm": 30927.59765625,
      "learning_rate": 4.680589680589681e-05,
      "loss": 0.6372,
      "step": 73
    },
    {
      "epoch": 0.4876441515650741,
      "grad_norm": 32827.7109375,
      "learning_rate": 4.6683046683046684e-05,
      "loss": 0.623,
      "step": 74
    },
    {
      "epoch": 0.4942339373970346,
      "grad_norm": 30631.48046875,
      "learning_rate": 4.656019656019656e-05,
      "loss": 0.6329,
      "step": 75
    },
    {
      "epoch": 0.500823723228995,
      "grad_norm": 23105.517578125,
      "learning_rate": 4.6437346437346444e-05,
      "loss": 0.719,
      "step": 76
    },
    {
      "epoch": 0.5074135090609555,
      "grad_norm": 21652.029296875,
      "learning_rate": 4.6314496314496314e-05,
      "loss": 0.7059,
      "step": 77
    },
    {
      "epoch": 0.514003294892916,
      "grad_norm": 15986.599609375,
      "learning_rate": 4.619164619164619e-05,
      "loss": 0.6683,
      "step": 78
    },
    {
      "epoch": 0.5205930807248764,
      "grad_norm": 20643.82421875,
      "learning_rate": 4.6068796068796074e-05,
      "loss": 0.7123,
      "step": 79
    },
    {
      "epoch": 0.5271828665568369,
      "grad_norm": 24529.80078125,
      "learning_rate": 4.594594594594595e-05,
      "loss": 0.6836,
      "step": 80
    },
    {
      "epoch": 0.5337726523887973,
      "grad_norm": 19665.833984375,
      "learning_rate": 4.582309582309582e-05,
      "loss": 0.6292,
      "step": 81
    },
    {
      "epoch": 0.5403624382207578,
      "grad_norm": 35243.45703125,
      "learning_rate": 4.5700245700245705e-05,
      "loss": 0.6891,
      "step": 82
    },
    {
      "epoch": 0.5469522240527183,
      "grad_norm": 15323.2197265625,
      "learning_rate": 4.557739557739558e-05,
      "loss": 0.6287,
      "step": 83
    },
    {
      "epoch": 0.5535420098846787,
      "grad_norm": 21972.83203125,
      "learning_rate": 4.545454545454546e-05,
      "loss": 0.676,
      "step": 84
    },
    {
      "epoch": 0.5601317957166392,
      "grad_norm": 38221.46484375,
      "learning_rate": 4.5331695331695335e-05,
      "loss": 0.7015,
      "step": 85
    },
    {
      "epoch": 0.5667215815485996,
      "grad_norm": 17371.90625,
      "learning_rate": 4.520884520884521e-05,
      "loss": 0.6665,
      "step": 86
    },
    {
      "epoch": 0.5733113673805601,
      "grad_norm": 23913.32421875,
      "learning_rate": 4.508599508599509e-05,
      "loss": 0.6641,
      "step": 87
    },
    {
      "epoch": 0.5799011532125206,
      "grad_norm": 14188.1494140625,
      "learning_rate": 4.4963144963144966e-05,
      "loss": 0.6153,
      "step": 88
    },
    {
      "epoch": 0.586490939044481,
      "grad_norm": 15268.400390625,
      "learning_rate": 4.484029484029484e-05,
      "loss": 0.7959,
      "step": 89
    },
    {
      "epoch": 0.5930807248764415,
      "grad_norm": 85963.09375,
      "learning_rate": 4.471744471744472e-05,
      "loss": 0.7464,
      "step": 90
    },
    {
      "epoch": 0.5996705107084019,
      "grad_norm": 48107.12890625,
      "learning_rate": 4.4594594594594596e-05,
      "loss": 0.6248,
      "step": 91
    },
    {
      "epoch": 0.6062602965403624,
      "grad_norm": 32619.958984375,
      "learning_rate": 4.447174447174447e-05,
      "loss": 0.6176,
      "step": 92
    },
    {
      "epoch": 0.6128500823723229,
      "grad_norm": 13193.05078125,
      "learning_rate": 4.434889434889435e-05,
      "loss": 0.7016,
      "step": 93
    },
    {
      "epoch": 0.6194398682042833,
      "grad_norm": 242728.328125,
      "learning_rate": 4.422604422604423e-05,
      "loss": 0.6053,
      "step": 94
    },
    {
      "epoch": 0.6260296540362438,
      "grad_norm": 13395.2021484375,
      "learning_rate": 4.4103194103194104e-05,
      "loss": 0.6552,
      "step": 95
    },
    {
      "epoch": 0.6326194398682042,
      "grad_norm": 26586.83984375,
      "learning_rate": 4.398034398034398e-05,
      "loss": 0.6451,
      "step": 96
    },
    {
      "epoch": 0.6392092257001647,
      "grad_norm": 20530.763671875,
      "learning_rate": 4.385749385749386e-05,
      "loss": 0.6142,
      "step": 97
    },
    {
      "epoch": 0.6457990115321252,
      "grad_norm": 120099.6875,
      "learning_rate": 4.373464373464374e-05,
      "loss": 0.7366,
      "step": 98
    },
    {
      "epoch": 0.6523887973640856,
      "grad_norm": 32992.8359375,
      "learning_rate": 4.361179361179362e-05,
      "loss": 0.6341,
      "step": 99
    },
    {
      "epoch": 0.6589785831960461,
      "grad_norm": 114159.0625,
      "learning_rate": 4.348894348894349e-05,
      "loss": 0.6375,
      "step": 100
    },
    {
      "epoch": 0.6655683690280065,
      "grad_norm": 59862.4453125,
      "learning_rate": 4.336609336609337e-05,
      "loss": 0.642,
      "step": 101
    },
    {
      "epoch": 0.6721581548599671,
      "grad_norm": 22114.5078125,
      "learning_rate": 4.324324324324325e-05,
      "loss": 0.6883,
      "step": 102
    },
    {
      "epoch": 0.6787479406919276,
      "grad_norm": 63335.8203125,
      "learning_rate": 4.312039312039312e-05,
      "loss": 0.695,
      "step": 103
    },
    {
      "epoch": 0.685337726523888,
      "grad_norm": 8014.02001953125,
      "learning_rate": 4.2997542997543e-05,
      "loss": 0.668,
      "step": 104
    },
    {
      "epoch": 0.6919275123558485,
      "grad_norm": 35121.1640625,
      "learning_rate": 4.287469287469288e-05,
      "loss": 0.5207,
      "step": 105
    },
    {
      "epoch": 0.6985172981878089,
      "grad_norm": 16844.70703125,
      "learning_rate": 4.2751842751842756e-05,
      "loss": 0.7609,
      "step": 106
    },
    {
      "epoch": 0.7051070840197694,
      "grad_norm": 12384.4345703125,
      "learning_rate": 4.262899262899263e-05,
      "loss": 0.6501,
      "step": 107
    },
    {
      "epoch": 0.7116968698517299,
      "grad_norm": 5228.11572265625,
      "learning_rate": 4.250614250614251e-05,
      "loss": 0.6624,
      "step": 108
    },
    {
      "epoch": 0.7182866556836903,
      "grad_norm": 112343.34375,
      "learning_rate": 4.2383292383292386e-05,
      "loss": 0.6711,
      "step": 109
    },
    {
      "epoch": 0.7248764415156508,
      "grad_norm": 11115.724609375,
      "learning_rate": 4.226044226044226e-05,
      "loss": 0.668,
      "step": 110
    },
    {
      "epoch": 0.7314662273476112,
      "grad_norm": 91624.140625,
      "learning_rate": 4.213759213759214e-05,
      "loss": 0.6913,
      "step": 111
    },
    {
      "epoch": 0.7380560131795717,
      "grad_norm": 6046.64794921875,
      "learning_rate": 4.2014742014742017e-05,
      "loss": 0.6699,
      "step": 112
    },
    {
      "epoch": 0.7446457990115322,
      "grad_norm": 26374.94921875,
      "learning_rate": 4.189189189189189e-05,
      "loss": 0.7146,
      "step": 113
    },
    {
      "epoch": 0.7512355848434926,
      "grad_norm": 27798.625,
      "learning_rate": 4.176904176904177e-05,
      "loss": 0.6188,
      "step": 114
    },
    {
      "epoch": 0.7578253706754531,
      "grad_norm": 8118.2197265625,
      "learning_rate": 4.164619164619165e-05,
      "loss": 0.6177,
      "step": 115
    },
    {
      "epoch": 0.7644151565074135,
      "grad_norm": 40166.359375,
      "learning_rate": 4.1523341523341524e-05,
      "loss": 0.66,
      "step": 116
    },
    {
      "epoch": 0.771004942339374,
      "grad_norm": 16126.8427734375,
      "learning_rate": 4.14004914004914e-05,
      "loss": 0.7109,
      "step": 117
    },
    {
      "epoch": 0.7775947281713345,
      "grad_norm": 18022.84765625,
      "learning_rate": 4.127764127764128e-05,
      "loss": 0.6224,
      "step": 118
    },
    {
      "epoch": 0.7841845140032949,
      "grad_norm": 15072.46484375,
      "learning_rate": 4.1154791154791154e-05,
      "loss": 0.5862,
      "step": 119
    },
    {
      "epoch": 0.7907742998352554,
      "grad_norm": 11486.779296875,
      "learning_rate": 4.103194103194104e-05,
      "loss": 0.591,
      "step": 120
    },
    {
      "epoch": 0.7973640856672158,
      "grad_norm": 12171.77734375,
      "learning_rate": 4.0909090909090915e-05,
      "loss": 0.6598,
      "step": 121
    },
    {
      "epoch": 0.8039538714991763,
      "grad_norm": 22686.841796875,
      "learning_rate": 4.0786240786240785e-05,
      "loss": 0.6059,
      "step": 122
    },
    {
      "epoch": 0.8105436573311368,
      "grad_norm": 27070.658203125,
      "learning_rate": 4.066339066339067e-05,
      "loss": 0.5981,
      "step": 123
    },
    {
      "epoch": 0.8171334431630972,
      "grad_norm": 15005.7265625,
      "learning_rate": 4.0540540540540545e-05,
      "loss": 0.6305,
      "step": 124
    },
    {
      "epoch": 0.8237232289950577,
      "grad_norm": 11356.5380859375,
      "learning_rate": 4.0417690417690415e-05,
      "loss": 0.6736,
      "step": 125
    },
    {
      "epoch": 0.8303130148270181,
      "grad_norm": 78888.9296875,
      "learning_rate": 4.02948402948403e-05,
      "loss": 0.6746,
      "step": 126
    },
    {
      "epoch": 0.8369028006589786,
      "grad_norm": 8765.6591796875,
      "learning_rate": 4.0171990171990176e-05,
      "loss": 0.6088,
      "step": 127
    },
    {
      "epoch": 0.8434925864909391,
      "grad_norm": 27692.36328125,
      "learning_rate": 4.004914004914005e-05,
      "loss": 0.7365,
      "step": 128
    },
    {
      "epoch": 0.8500823723228995,
      "grad_norm": 27458.75,
      "learning_rate": 3.992628992628993e-05,
      "loss": 0.6885,
      "step": 129
    },
    {
      "epoch": 0.85667215815486,
      "grad_norm": 32011.431640625,
      "learning_rate": 3.9803439803439806e-05,
      "loss": 0.6128,
      "step": 130
    },
    {
      "epoch": 0.8632619439868204,
      "grad_norm": 8888.3623046875,
      "learning_rate": 3.968058968058968e-05,
      "loss": 0.8316,
      "step": 131
    },
    {
      "epoch": 0.8698517298187809,
      "grad_norm": 31571.5,
      "learning_rate": 3.955773955773956e-05,
      "loss": 0.6086,
      "step": 132
    },
    {
      "epoch": 0.8764415156507414,
      "grad_norm": 7211.0869140625,
      "learning_rate": 3.943488943488944e-05,
      "loss": 0.6343,
      "step": 133
    },
    {
      "epoch": 0.8830313014827018,
      "grad_norm": 28377.615234375,
      "learning_rate": 3.9312039312039314e-05,
      "loss": 0.6234,
      "step": 134
    },
    {
      "epoch": 0.8896210873146623,
      "grad_norm": 19913.7734375,
      "learning_rate": 3.918918918918919e-05,
      "loss": 0.7022,
      "step": 135
    },
    {
      "epoch": 0.8962108731466227,
      "grad_norm": 38258.0546875,
      "learning_rate": 3.906633906633907e-05,
      "loss": 0.553,
      "step": 136
    },
    {
      "epoch": 0.9028006589785832,
      "grad_norm": 18686.283203125,
      "learning_rate": 3.8943488943488944e-05,
      "loss": 0.6194,
      "step": 137
    },
    {
      "epoch": 0.9093904448105437,
      "grad_norm": 25210.80078125,
      "learning_rate": 3.882063882063882e-05,
      "loss": 0.6567,
      "step": 138
    },
    {
      "epoch": 0.9159802306425041,
      "grad_norm": 24367.556640625,
      "learning_rate": 3.86977886977887e-05,
      "loss": 0.686,
      "step": 139
    },
    {
      "epoch": 0.9225700164744646,
      "grad_norm": 135211.953125,
      "learning_rate": 3.857493857493858e-05,
      "loss": 0.5756,
      "step": 140
    },
    {
      "epoch": 0.929159802306425,
      "grad_norm": 18591.775390625,
      "learning_rate": 3.845208845208845e-05,
      "loss": 0.6784,
      "step": 141
    },
    {
      "epoch": 0.9357495881383855,
      "grad_norm": 13147.021484375,
      "learning_rate": 3.8329238329238335e-05,
      "loss": 0.6022,
      "step": 142
    },
    {
      "epoch": 0.942339373970346,
      "grad_norm": 19533.595703125,
      "learning_rate": 3.820638820638821e-05,
      "loss": 0.5408,
      "step": 143
    },
    {
      "epoch": 0.9489291598023064,
      "grad_norm": 7753.4462890625,
      "learning_rate": 3.808353808353808e-05,
      "loss": 0.6628,
      "step": 144
    },
    {
      "epoch": 0.9555189456342669,
      "grad_norm": 44759.55859375,
      "learning_rate": 3.7960687960687965e-05,
      "loss": 0.5822,
      "step": 145
    },
    {
      "epoch": 0.9621087314662273,
      "grad_norm": 7180.6044921875,
      "learning_rate": 3.783783783783784e-05,
      "loss": 0.5473,
      "step": 146
    },
    {
      "epoch": 0.9686985172981878,
      "grad_norm": 37214.40625,
      "learning_rate": 3.771498771498771e-05,
      "loss": 0.6619,
      "step": 147
    },
    {
      "epoch": 0.9752883031301482,
      "grad_norm": 14762.1357421875,
      "learning_rate": 3.7592137592137596e-05,
      "loss": 0.6526,
      "step": 148
    },
    {
      "epoch": 0.9818780889621087,
      "grad_norm": 67078.1875,
      "learning_rate": 3.746928746928747e-05,
      "loss": 0.72,
      "step": 149
    },
    {
      "epoch": 0.9884678747940692,
      "grad_norm": 21195.07421875,
      "learning_rate": 3.734643734643735e-05,
      "loss": 0.6566,
      "step": 150
    },
    {
      "epoch": 0.9950576606260296,
      "grad_norm": 22123.794921875,
      "learning_rate": 3.7223587223587226e-05,
      "loss": 0.647,
      "step": 151
    },
    {
      "epoch": 1.00164744645799,
      "grad_norm": 9963.392578125,
      "learning_rate": 3.71007371007371e-05,
      "loss": 0.5923,
      "step": 152
    },
    {
      "epoch": 1.0082372322899507,
      "grad_norm": 10004.744140625,
      "learning_rate": 3.697788697788698e-05,
      "loss": 0.5811,
      "step": 153
    },
    {
      "epoch": 1.014827018121911,
      "grad_norm": 5818.58935546875,
      "learning_rate": 3.685503685503686e-05,
      "loss": 0.5744,
      "step": 154
    },
    {
      "epoch": 1.0214168039538716,
      "grad_norm": 12603.640625,
      "learning_rate": 3.6732186732186734e-05,
      "loss": 0.6278,
      "step": 155
    },
    {
      "epoch": 1.028006589785832,
      "grad_norm": 23099.1796875,
      "learning_rate": 3.660933660933661e-05,
      "loss": 0.6001,
      "step": 156
    },
    {
      "epoch": 1.0345963756177925,
      "grad_norm": 19207.27734375,
      "learning_rate": 3.648648648648649e-05,
      "loss": 0.5701,
      "step": 157
    },
    {
      "epoch": 1.0411861614497528,
      "grad_norm": 10040.673828125,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.6095,
      "step": 158
    },
    {
      "epoch": 1.0477759472817134,
      "grad_norm": 13896.298828125,
      "learning_rate": 3.624078624078625e-05,
      "loss": 0.5673,
      "step": 159
    },
    {
      "epoch": 1.0543657331136738,
      "grad_norm": 10502.177734375,
      "learning_rate": 3.611793611793612e-05,
      "loss": 0.5662,
      "step": 160
    },
    {
      "epoch": 1.0609555189456343,
      "grad_norm": 6510.84033203125,
      "learning_rate": 3.5995085995085995e-05,
      "loss": 0.5095,
      "step": 161
    },
    {
      "epoch": 1.0675453047775947,
      "grad_norm": 18709.1640625,
      "learning_rate": 3.587223587223588e-05,
      "loss": 0.5437,
      "step": 162
    },
    {
      "epoch": 1.0741350906095553,
      "grad_norm": 6607.08935546875,
      "learning_rate": 3.574938574938575e-05,
      "loss": 0.5686,
      "step": 163
    },
    {
      "epoch": 1.0807248764415156,
      "grad_norm": 11358.892578125,
      "learning_rate": 3.562653562653563e-05,
      "loss": 0.585,
      "step": 164
    },
    {
      "epoch": 1.0873146622734762,
      "grad_norm": 24775.66796875,
      "learning_rate": 3.550368550368551e-05,
      "loss": 0.5248,
      "step": 165
    },
    {
      "epoch": 1.0939044481054365,
      "grad_norm": 13074.2919921875,
      "learning_rate": 3.538083538083538e-05,
      "loss": 0.5206,
      "step": 166
    },
    {
      "epoch": 1.100494233937397,
      "grad_norm": 10716.3486328125,
      "learning_rate": 3.525798525798526e-05,
      "loss": 0.6356,
      "step": 167
    },
    {
      "epoch": 1.1070840197693574,
      "grad_norm": 19648.953125,
      "learning_rate": 3.513513513513514e-05,
      "loss": 0.5096,
      "step": 168
    },
    {
      "epoch": 1.113673805601318,
      "grad_norm": 10499.521484375,
      "learning_rate": 3.501228501228501e-05,
      "loss": 0.6298,
      "step": 169
    },
    {
      "epoch": 1.1202635914332784,
      "grad_norm": 38415.08984375,
      "learning_rate": 3.488943488943489e-05,
      "loss": 0.6272,
      "step": 170
    },
    {
      "epoch": 1.126853377265239,
      "grad_norm": 18025.1328125,
      "learning_rate": 3.476658476658477e-05,
      "loss": 0.609,
      "step": 171
    },
    {
      "epoch": 1.1334431630971993,
      "grad_norm": 17254.71484375,
      "learning_rate": 3.4643734643734647e-05,
      "loss": 0.5771,
      "step": 172
    },
    {
      "epoch": 1.1400329489291599,
      "grad_norm": 56900.84375,
      "learning_rate": 3.452088452088452e-05,
      "loss": 0.4617,
      "step": 173
    },
    {
      "epoch": 1.1466227347611202,
      "grad_norm": 94377.71875,
      "learning_rate": 3.43980343980344e-05,
      "loss": 0.5262,
      "step": 174
    },
    {
      "epoch": 1.1532125205930808,
      "grad_norm": 26418.802734375,
      "learning_rate": 3.427518427518428e-05,
      "loss": 0.5388,
      "step": 175
    },
    {
      "epoch": 1.1598023064250411,
      "grad_norm": 39558.36328125,
      "learning_rate": 3.4152334152334154e-05,
      "loss": 0.6323,
      "step": 176
    },
    {
      "epoch": 1.1663920922570017,
      "grad_norm": 12363.81640625,
      "learning_rate": 3.402948402948403e-05,
      "loss": 0.4672,
      "step": 177
    },
    {
      "epoch": 1.172981878088962,
      "grad_norm": 4902.41748046875,
      "learning_rate": 3.390663390663391e-05,
      "loss": 0.6218,
      "step": 178
    },
    {
      "epoch": 1.1795716639209226,
      "grad_norm": 18641.564453125,
      "learning_rate": 3.3783783783783784e-05,
      "loss": 0.5324,
      "step": 179
    },
    {
      "epoch": 1.186161449752883,
      "grad_norm": 20244.51171875,
      "learning_rate": 3.366093366093366e-05,
      "loss": 0.5641,
      "step": 180
    },
    {
      "epoch": 1.1927512355848435,
      "grad_norm": 10752.59765625,
      "learning_rate": 3.3538083538083545e-05,
      "loss": 0.5537,
      "step": 181
    },
    {
      "epoch": 1.1993410214168039,
      "grad_norm": 16433.798828125,
      "learning_rate": 3.3415233415233415e-05,
      "loss": 0.6038,
      "step": 182
    },
    {
      "epoch": 1.2059308072487644,
      "grad_norm": 65011.72265625,
      "learning_rate": 3.329238329238329e-05,
      "loss": 0.71,
      "step": 183
    },
    {
      "epoch": 1.2125205930807248,
      "grad_norm": 139034.671875,
      "learning_rate": 3.3169533169533175e-05,
      "loss": 0.5752,
      "step": 184
    },
    {
      "epoch": 1.2191103789126854,
      "grad_norm": 6198.3046875,
      "learning_rate": 3.3046683046683045e-05,
      "loss": 0.6047,
      "step": 185
    },
    {
      "epoch": 1.2257001647446457,
      "grad_norm": 32208.603515625,
      "learning_rate": 3.292383292383293e-05,
      "loss": 0.548,
      "step": 186
    },
    {
      "epoch": 1.2322899505766063,
      "grad_norm": 83243.2578125,
      "learning_rate": 3.2800982800982806e-05,
      "loss": 0.6223,
      "step": 187
    },
    {
      "epoch": 1.2388797364085666,
      "grad_norm": 11003.818359375,
      "learning_rate": 3.2678132678132676e-05,
      "loss": 0.5597,
      "step": 188
    },
    {
      "epoch": 1.2454695222405272,
      "grad_norm": 30586.80078125,
      "learning_rate": 3.255528255528256e-05,
      "loss": 0.5653,
      "step": 189
    },
    {
      "epoch": 1.2520593080724876,
      "grad_norm": 36942.6171875,
      "learning_rate": 3.2432432432432436e-05,
      "loss": 0.4991,
      "step": 190
    },
    {
      "epoch": 1.2586490939044481,
      "grad_norm": 34296.30859375,
      "learning_rate": 3.2309582309582306e-05,
      "loss": 0.5868,
      "step": 191
    },
    {
      "epoch": 1.2652388797364087,
      "grad_norm": 10427.1875,
      "learning_rate": 3.218673218673219e-05,
      "loss": 0.6163,
      "step": 192
    },
    {
      "epoch": 1.271828665568369,
      "grad_norm": 50948.59375,
      "learning_rate": 3.206388206388207e-05,
      "loss": 0.5254,
      "step": 193
    },
    {
      "epoch": 1.2784184514003294,
      "grad_norm": 5948.72802734375,
      "learning_rate": 3.1941031941031943e-05,
      "loss": 0.5176,
      "step": 194
    },
    {
      "epoch": 1.28500823723229,
      "grad_norm": 24703.78515625,
      "learning_rate": 3.181818181818182e-05,
      "loss": 0.5382,
      "step": 195
    },
    {
      "epoch": 1.2915980230642505,
      "grad_norm": 31584.517578125,
      "learning_rate": 3.16953316953317e-05,
      "loss": 0.5676,
      "step": 196
    },
    {
      "epoch": 1.2981878088962109,
      "grad_norm": 15364.802734375,
      "learning_rate": 3.1572481572481574e-05,
      "loss": 0.5168,
      "step": 197
    },
    {
      "epoch": 1.3047775947281712,
      "grad_norm": 20843.013671875,
      "learning_rate": 3.144963144963145e-05,
      "loss": 0.5424,
      "step": 198
    },
    {
      "epoch": 1.3113673805601318,
      "grad_norm": 17939.484375,
      "learning_rate": 3.132678132678133e-05,
      "loss": 0.5653,
      "step": 199
    },
    {
      "epoch": 1.3179571663920924,
      "grad_norm": 9113.7607421875,
      "learning_rate": 3.120393120393121e-05,
      "loss": 0.5648,
      "step": 200
    },
    {
      "epoch": 1.3245469522240527,
      "grad_norm": 32489.458984375,
      "learning_rate": 3.108108108108108e-05,
      "loss": 0.5858,
      "step": 201
    },
    {
      "epoch": 1.331136738056013,
      "grad_norm": 23361.166015625,
      "learning_rate": 3.095823095823096e-05,
      "loss": 0.5749,
      "step": 202
    },
    {
      "epoch": 1.3377265238879736,
      "grad_norm": 12417.0478515625,
      "learning_rate": 3.083538083538084e-05,
      "loss": 0.5019,
      "step": 203
    },
    {
      "epoch": 1.3443163097199342,
      "grad_norm": 24939.2265625,
      "learning_rate": 3.071253071253071e-05,
      "loss": 0.5861,
      "step": 204
    },
    {
      "epoch": 1.3509060955518946,
      "grad_norm": 20826.5859375,
      "learning_rate": 3.058968058968059e-05,
      "loss": 0.591,
      "step": 205
    },
    {
      "epoch": 1.357495881383855,
      "grad_norm": 13685.1572265625,
      "learning_rate": 3.046683046683047e-05,
      "loss": 0.6588,
      "step": 206
    },
    {
      "epoch": 1.3640856672158155,
      "grad_norm": 26390.5625,
      "learning_rate": 3.0343980343980342e-05,
      "loss": 0.5532,
      "step": 207
    },
    {
      "epoch": 1.370675453047776,
      "grad_norm": 13378.560546875,
      "learning_rate": 3.0221130221130222e-05,
      "loss": 0.5108,
      "step": 208
    },
    {
      "epoch": 1.3772652388797364,
      "grad_norm": 8095.1689453125,
      "learning_rate": 3.0098280098280103e-05,
      "loss": 0.5522,
      "step": 209
    },
    {
      "epoch": 1.3838550247116967,
      "grad_norm": 11471.375,
      "learning_rate": 2.9975429975429976e-05,
      "loss": 0.4777,
      "step": 210
    },
    {
      "epoch": 1.3904448105436573,
      "grad_norm": 11452.013671875,
      "learning_rate": 2.9852579852579853e-05,
      "loss": 0.5945,
      "step": 211
    },
    {
      "epoch": 1.3970345963756179,
      "grad_norm": 31670.03125,
      "learning_rate": 2.9729729729729733e-05,
      "loss": 0.6015,
      "step": 212
    },
    {
      "epoch": 1.4036243822075782,
      "grad_norm": 22501.869140625,
      "learning_rate": 2.9606879606879607e-05,
      "loss": 0.4813,
      "step": 213
    },
    {
      "epoch": 1.4102141680395386,
      "grad_norm": 102328.46875,
      "learning_rate": 2.9484029484029483e-05,
      "loss": 0.6696,
      "step": 214
    },
    {
      "epoch": 1.4168039538714992,
      "grad_norm": 26580.23828125,
      "learning_rate": 2.9361179361179364e-05,
      "loss": 0.4692,
      "step": 215
    },
    {
      "epoch": 1.4233937397034597,
      "grad_norm": 12255.455078125,
      "learning_rate": 2.9238329238329237e-05,
      "loss": 0.5184,
      "step": 216
    },
    {
      "epoch": 1.42998352553542,
      "grad_norm": 13490.662109375,
      "learning_rate": 2.9115479115479117e-05,
      "loss": 0.5556,
      "step": 217
    },
    {
      "epoch": 1.4365733113673804,
      "grad_norm": 14122.8046875,
      "learning_rate": 2.8992628992628994e-05,
      "loss": 0.5259,
      "step": 218
    },
    {
      "epoch": 1.443163097199341,
      "grad_norm": 14578.7783203125,
      "learning_rate": 2.8869778869778868e-05,
      "loss": 0.5878,
      "step": 219
    },
    {
      "epoch": 1.4497528830313016,
      "grad_norm": 36417.54296875,
      "learning_rate": 2.8746928746928748e-05,
      "loss": 0.6328,
      "step": 220
    },
    {
      "epoch": 1.456342668863262,
      "grad_norm": 24717.400390625,
      "learning_rate": 2.8624078624078625e-05,
      "loss": 0.5597,
      "step": 221
    },
    {
      "epoch": 1.4629324546952225,
      "grad_norm": 42796.0390625,
      "learning_rate": 2.8501228501228505e-05,
      "loss": 0.6242,
      "step": 222
    },
    {
      "epoch": 1.4695222405271828,
      "grad_norm": 11775.6015625,
      "learning_rate": 2.8378378378378378e-05,
      "loss": 0.5561,
      "step": 223
    },
    {
      "epoch": 1.4761120263591434,
      "grad_norm": 11294.82421875,
      "learning_rate": 2.825552825552826e-05,
      "loss": 0.6068,
      "step": 224
    },
    {
      "epoch": 1.4827018121911038,
      "grad_norm": 164647.03125,
      "learning_rate": 2.8132678132678135e-05,
      "loss": 0.5618,
      "step": 225
    },
    {
      "epoch": 1.4892915980230643,
      "grad_norm": 19815.3203125,
      "learning_rate": 2.800982800982801e-05,
      "loss": 0.5637,
      "step": 226
    },
    {
      "epoch": 1.4958813838550247,
      "grad_norm": 87994.859375,
      "learning_rate": 2.788697788697789e-05,
      "loss": 0.6073,
      "step": 227
    },
    {
      "epoch": 1.5024711696869852,
      "grad_norm": 9418.3603515625,
      "learning_rate": 2.776412776412777e-05,
      "loss": 0.5177,
      "step": 228
    },
    {
      "epoch": 1.5090609555189456,
      "grad_norm": 293888.625,
      "learning_rate": 2.764127764127764e-05,
      "loss": 0.6276,
      "step": 229
    },
    {
      "epoch": 1.515650741350906,
      "grad_norm": 10872.537109375,
      "learning_rate": 2.751842751842752e-05,
      "loss": 0.5746,
      "step": 230
    },
    {
      "epoch": 1.5222405271828665,
      "grad_norm": 60597.2109375,
      "learning_rate": 2.73955773955774e-05,
      "loss": 0.5874,
      "step": 231
    },
    {
      "epoch": 1.528830313014827,
      "grad_norm": 53855.8515625,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 0.5431,
      "step": 232
    },
    {
      "epoch": 1.5354200988467874,
      "grad_norm": 17162.5859375,
      "learning_rate": 2.714987714987715e-05,
      "loss": 0.5703,
      "step": 233
    },
    {
      "epoch": 1.5420098846787478,
      "grad_norm": 12093.095703125,
      "learning_rate": 2.702702702702703e-05,
      "loss": 0.5776,
      "step": 234
    },
    {
      "epoch": 1.5485996705107083,
      "grad_norm": 33037.2421875,
      "learning_rate": 2.6904176904176904e-05,
      "loss": 0.5018,
      "step": 235
    },
    {
      "epoch": 1.555189456342669,
      "grad_norm": 18464.041015625,
      "learning_rate": 2.678132678132678e-05,
      "loss": 0.5508,
      "step": 236
    },
    {
      "epoch": 1.5617792421746293,
      "grad_norm": 33591.078125,
      "learning_rate": 2.665847665847666e-05,
      "loss": 0.5522,
      "step": 237
    },
    {
      "epoch": 1.5683690280065898,
      "grad_norm": 13077.189453125,
      "learning_rate": 2.6535626535626534e-05,
      "loss": 0.4632,
      "step": 238
    },
    {
      "epoch": 1.5749588138385504,
      "grad_norm": 8347.5576171875,
      "learning_rate": 2.6412776412776414e-05,
      "loss": 0.6278,
      "step": 239
    },
    {
      "epoch": 1.5815485996705108,
      "grad_norm": 15981.6953125,
      "learning_rate": 2.628992628992629e-05,
      "loss": 0.496,
      "step": 240
    },
    {
      "epoch": 1.588138385502471,
      "grad_norm": 8308.947265625,
      "learning_rate": 2.616707616707617e-05,
      "loss": 0.659,
      "step": 241
    },
    {
      "epoch": 1.5947281713344317,
      "grad_norm": 16152.51171875,
      "learning_rate": 2.6044226044226045e-05,
      "loss": 0.5447,
      "step": 242
    },
    {
      "epoch": 1.6013179571663922,
      "grad_norm": 10750.263671875,
      "learning_rate": 2.5921375921375925e-05,
      "loss": 0.6014,
      "step": 243
    },
    {
      "epoch": 1.6079077429983526,
      "grad_norm": 16181.7021484375,
      "learning_rate": 2.5798525798525802e-05,
      "loss": 0.5084,
      "step": 244
    },
    {
      "epoch": 1.614497528830313,
      "grad_norm": 17315.806640625,
      "learning_rate": 2.5675675675675675e-05,
      "loss": 0.6028,
      "step": 245
    },
    {
      "epoch": 1.6210873146622735,
      "grad_norm": 35364.375,
      "learning_rate": 2.5552825552825555e-05,
      "loss": 0.565,
      "step": 246
    },
    {
      "epoch": 1.627677100494234,
      "grad_norm": 7373.3349609375,
      "learning_rate": 2.5429975429975432e-05,
      "loss": 0.5849,
      "step": 247
    },
    {
      "epoch": 1.6342668863261944,
      "grad_norm": 10823.8154296875,
      "learning_rate": 2.5307125307125306e-05,
      "loss": 0.4575,
      "step": 248
    },
    {
      "epoch": 1.6408566721581548,
      "grad_norm": 15178.8916015625,
      "learning_rate": 2.5184275184275186e-05,
      "loss": 0.4679,
      "step": 249
    },
    {
      "epoch": 1.6474464579901154,
      "grad_norm": 10164.9697265625,
      "learning_rate": 2.5061425061425066e-05,
      "loss": 0.4315,
      "step": 250
    },
    {
      "epoch": 1.654036243822076,
      "grad_norm": 8609.7099609375,
      "learning_rate": 2.493857493857494e-05,
      "loss": 0.6248,
      "step": 251
    },
    {
      "epoch": 1.6606260296540363,
      "grad_norm": 40045.09375,
      "learning_rate": 2.4815724815724816e-05,
      "loss": 0.5335,
      "step": 252
    },
    {
      "epoch": 1.6672158154859966,
      "grad_norm": 17241.337890625,
      "learning_rate": 2.4692874692874693e-05,
      "loss": 0.6546,
      "step": 253
    },
    {
      "epoch": 1.6738056013179572,
      "grad_norm": 23707.69921875,
      "learning_rate": 2.4570024570024573e-05,
      "loss": 0.5672,
      "step": 254
    },
    {
      "epoch": 1.6803953871499178,
      "grad_norm": 9953.140625,
      "learning_rate": 2.4447174447174447e-05,
      "loss": 0.5867,
      "step": 255
    },
    {
      "epoch": 1.6869851729818781,
      "grad_norm": 29007.3125,
      "learning_rate": 2.4324324324324327e-05,
      "loss": 0.5345,
      "step": 256
    },
    {
      "epoch": 1.6935749588138385,
      "grad_norm": 9451.4765625,
      "learning_rate": 2.4201474201474204e-05,
      "loss": 0.6363,
      "step": 257
    },
    {
      "epoch": 1.700164744645799,
      "grad_norm": 36570.16796875,
      "learning_rate": 2.4078624078624077e-05,
      "loss": 0.5335,
      "step": 258
    },
    {
      "epoch": 1.7067545304777596,
      "grad_norm": 48013.99609375,
      "learning_rate": 2.3955773955773958e-05,
      "loss": 0.5234,
      "step": 259
    },
    {
      "epoch": 1.71334431630972,
      "grad_norm": 43459.08203125,
      "learning_rate": 2.3832923832923834e-05,
      "loss": 0.5699,
      "step": 260
    },
    {
      "epoch": 1.7199341021416803,
      "grad_norm": 27575.84375,
      "learning_rate": 2.371007371007371e-05,
      "loss": 0.5577,
      "step": 261
    },
    {
      "epoch": 1.7265238879736409,
      "grad_norm": 6892.0205078125,
      "learning_rate": 2.3587223587223588e-05,
      "loss": 0.6098,
      "step": 262
    },
    {
      "epoch": 1.7331136738056014,
      "grad_norm": 79846.03125,
      "learning_rate": 2.3464373464373465e-05,
      "loss": 0.5509,
      "step": 263
    },
    {
      "epoch": 1.7397034596375618,
      "grad_norm": 13488.93359375,
      "learning_rate": 2.3341523341523342e-05,
      "loss": 0.5872,
      "step": 264
    },
    {
      "epoch": 1.7462932454695221,
      "grad_norm": 20397.814453125,
      "learning_rate": 2.3218673218673222e-05,
      "loss": 0.5657,
      "step": 265
    },
    {
      "epoch": 1.7528830313014827,
      "grad_norm": 14773.505859375,
      "learning_rate": 2.3095823095823095e-05,
      "loss": 0.5319,
      "step": 266
    },
    {
      "epoch": 1.7594728171334433,
      "grad_norm": 59875.83203125,
      "learning_rate": 2.2972972972972976e-05,
      "loss": 0.6762,
      "step": 267
    },
    {
      "epoch": 1.7660626029654036,
      "grad_norm": 29569.642578125,
      "learning_rate": 2.2850122850122852e-05,
      "loss": 0.7493,
      "step": 268
    },
    {
      "epoch": 1.772652388797364,
      "grad_norm": 96513.0625,
      "learning_rate": 2.272727272727273e-05,
      "loss": 0.5796,
      "step": 269
    },
    {
      "epoch": 1.7792421746293245,
      "grad_norm": 33220.875,
      "learning_rate": 2.2604422604422606e-05,
      "loss": 0.5513,
      "step": 270
    },
    {
      "epoch": 1.7858319604612851,
      "grad_norm": 9378.62890625,
      "learning_rate": 2.2481572481572483e-05,
      "loss": 0.5444,
      "step": 271
    },
    {
      "epoch": 1.7924217462932455,
      "grad_norm": 8672.4189453125,
      "learning_rate": 2.235872235872236e-05,
      "loss": 0.5997,
      "step": 272
    },
    {
      "epoch": 1.7990115321252058,
      "grad_norm": 75445.5390625,
      "learning_rate": 2.2235872235872237e-05,
      "loss": 0.5383,
      "step": 273
    },
    {
      "epoch": 1.8056013179571664,
      "grad_norm": 12022.205078125,
      "learning_rate": 2.2113022113022113e-05,
      "loss": 0.6333,
      "step": 274
    },
    {
      "epoch": 1.812191103789127,
      "grad_norm": 28632.75390625,
      "learning_rate": 2.199017199017199e-05,
      "loss": 0.5491,
      "step": 275
    },
    {
      "epoch": 1.8187808896210873,
      "grad_norm": 24272.568359375,
      "learning_rate": 2.186732186732187e-05,
      "loss": 0.5039,
      "step": 276
    },
    {
      "epoch": 1.8253706754530477,
      "grad_norm": 9478.4189453125,
      "learning_rate": 2.1744471744471744e-05,
      "loss": 0.4395,
      "step": 277
    },
    {
      "epoch": 1.8319604612850082,
      "grad_norm": 10953.66796875,
      "learning_rate": 2.1621621621621624e-05,
      "loss": 0.6367,
      "step": 278
    },
    {
      "epoch": 1.8385502471169688,
      "grad_norm": 18094.830078125,
      "learning_rate": 2.14987714987715e-05,
      "loss": 0.5477,
      "step": 279
    },
    {
      "epoch": 1.8451400329489291,
      "grad_norm": 18395.52734375,
      "learning_rate": 2.1375921375921378e-05,
      "loss": 0.5301,
      "step": 280
    },
    {
      "epoch": 1.8517298187808895,
      "grad_norm": 104393.421875,
      "learning_rate": 2.1253071253071255e-05,
      "loss": 0.581,
      "step": 281
    },
    {
      "epoch": 1.85831960461285,
      "grad_norm": 10463.978515625,
      "learning_rate": 2.113022113022113e-05,
      "loss": 0.5688,
      "step": 282
    },
    {
      "epoch": 1.8649093904448106,
      "grad_norm": 7907.69189453125,
      "learning_rate": 2.1007371007371008e-05,
      "loss": 0.5534,
      "step": 283
    },
    {
      "epoch": 1.871499176276771,
      "grad_norm": 12477.5712890625,
      "learning_rate": 2.0884520884520885e-05,
      "loss": 0.5412,
      "step": 284
    },
    {
      "epoch": 1.8780889621087313,
      "grad_norm": 22077.701171875,
      "learning_rate": 2.0761670761670762e-05,
      "loss": 0.6015,
      "step": 285
    },
    {
      "epoch": 1.884678747940692,
      "grad_norm": 29665.4296875,
      "learning_rate": 2.063882063882064e-05,
      "loss": 0.6349,
      "step": 286
    },
    {
      "epoch": 1.8912685337726525,
      "grad_norm": 8550.171875,
      "learning_rate": 2.051597051597052e-05,
      "loss": 0.5846,
      "step": 287
    },
    {
      "epoch": 1.8978583196046128,
      "grad_norm": 34534.80859375,
      "learning_rate": 2.0393120393120392e-05,
      "loss": 0.6063,
      "step": 288
    },
    {
      "epoch": 1.9044481054365732,
      "grad_norm": 25177.533203125,
      "learning_rate": 2.0270270270270273e-05,
      "loss": 0.5608,
      "step": 289
    },
    {
      "epoch": 1.9110378912685337,
      "grad_norm": 12976.5283203125,
      "learning_rate": 2.014742014742015e-05,
      "loss": 0.6424,
      "step": 290
    },
    {
      "epoch": 1.9176276771004943,
      "grad_norm": 36851.8359375,
      "learning_rate": 2.0024570024570026e-05,
      "loss": 0.6051,
      "step": 291
    },
    {
      "epoch": 1.9242174629324547,
      "grad_norm": 14700.5849609375,
      "learning_rate": 1.9901719901719903e-05,
      "loss": 0.6738,
      "step": 292
    },
    {
      "epoch": 1.930807248764415,
      "grad_norm": 10047.1728515625,
      "learning_rate": 1.977886977886978e-05,
      "loss": 0.5817,
      "step": 293
    },
    {
      "epoch": 1.9373970345963756,
      "grad_norm": 18526.845703125,
      "learning_rate": 1.9656019656019657e-05,
      "loss": 0.5037,
      "step": 294
    },
    {
      "epoch": 1.9439868204283361,
      "grad_norm": 51555.40234375,
      "learning_rate": 1.9533169533169534e-05,
      "loss": 0.7556,
      "step": 295
    },
    {
      "epoch": 1.9505766062602965,
      "grad_norm": 16909.0625,
      "learning_rate": 1.941031941031941e-05,
      "loss": 0.5487,
      "step": 296
    },
    {
      "epoch": 1.9571663920922568,
      "grad_norm": 22686.32421875,
      "learning_rate": 1.928746928746929e-05,
      "loss": 0.617,
      "step": 297
    },
    {
      "epoch": 1.9637561779242174,
      "grad_norm": 68786.3359375,
      "learning_rate": 1.9164619164619167e-05,
      "loss": 0.5581,
      "step": 298
    },
    {
      "epoch": 1.970345963756178,
      "grad_norm": 44203.34375,
      "learning_rate": 1.904176904176904e-05,
      "loss": 0.5491,
      "step": 299
    },
    {
      "epoch": 1.9769357495881383,
      "grad_norm": 39032.02734375,
      "learning_rate": 1.891891891891892e-05,
      "loss": 0.5876,
      "step": 300
    },
    {
      "epoch": 1.9835255354200987,
      "grad_norm": 15117.4921875,
      "learning_rate": 1.8796068796068798e-05,
      "loss": 0.6322,
      "step": 301
    },
    {
      "epoch": 1.9901153212520593,
      "grad_norm": 17359.947265625,
      "learning_rate": 1.8673218673218675e-05,
      "loss": 0.5526,
      "step": 302
    },
    {
      "epoch": 1.9967051070840198,
      "grad_norm": 26160.140625,
      "learning_rate": 1.855036855036855e-05,
      "loss": 0.4606,
      "step": 303
    },
    {
      "epoch": 2.00329489291598,
      "grad_norm": 27996.744140625,
      "learning_rate": 1.842751842751843e-05,
      "loss": 0.5775,
      "step": 304
    },
    {
      "epoch": 2.0098846787479405,
      "grad_norm": 10826.375,
      "learning_rate": 1.8304668304668305e-05,
      "loss": 0.4791,
      "step": 305
    },
    {
      "epoch": 2.0164744645799013,
      "grad_norm": 7867.39697265625,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.6314,
      "step": 306
    },
    {
      "epoch": 2.0230642504118617,
      "grad_norm": 13956.0439453125,
      "learning_rate": 1.805896805896806e-05,
      "loss": 0.4655,
      "step": 307
    },
    {
      "epoch": 2.029654036243822,
      "grad_norm": 14681.5576171875,
      "learning_rate": 1.793611793611794e-05,
      "loss": 0.536,
      "step": 308
    },
    {
      "epoch": 2.0362438220757824,
      "grad_norm": 12553.2109375,
      "learning_rate": 1.7813267813267816e-05,
      "loss": 0.5533,
      "step": 309
    },
    {
      "epoch": 2.042833607907743,
      "grad_norm": 27893.552734375,
      "learning_rate": 1.769041769041769e-05,
      "loss": 0.4929,
      "step": 310
    },
    {
      "epoch": 2.0494233937397035,
      "grad_norm": 16818.6171875,
      "learning_rate": 1.756756756756757e-05,
      "loss": 0.5123,
      "step": 311
    },
    {
      "epoch": 2.056013179571664,
      "grad_norm": 16560.30859375,
      "learning_rate": 1.7444717444717446e-05,
      "loss": 0.5537,
      "step": 312
    },
    {
      "epoch": 2.062602965403624,
      "grad_norm": 10146.822265625,
      "learning_rate": 1.7321867321867323e-05,
      "loss": 0.5042,
      "step": 313
    },
    {
      "epoch": 2.069192751235585,
      "grad_norm": 15292.8125,
      "learning_rate": 1.71990171990172e-05,
      "loss": 0.4717,
      "step": 314
    },
    {
      "epoch": 2.0757825370675453,
      "grad_norm": 89878.0078125,
      "learning_rate": 1.7076167076167077e-05,
      "loss": 0.5159,
      "step": 315
    },
    {
      "epoch": 2.0823723228995057,
      "grad_norm": 23665.125,
      "learning_rate": 1.6953316953316954e-05,
      "loss": 0.4446,
      "step": 316
    },
    {
      "epoch": 2.088962108731466,
      "grad_norm": 15712.8779296875,
      "learning_rate": 1.683046683046683e-05,
      "loss": 0.594,
      "step": 317
    },
    {
      "epoch": 2.095551894563427,
      "grad_norm": 50007.99609375,
      "learning_rate": 1.6707616707616707e-05,
      "loss": 0.5259,
      "step": 318
    },
    {
      "epoch": 2.102141680395387,
      "grad_norm": 30487.50390625,
      "learning_rate": 1.6584766584766588e-05,
      "loss": 0.5166,
      "step": 319
    },
    {
      "epoch": 2.1087314662273475,
      "grad_norm": 29152.314453125,
      "learning_rate": 1.6461916461916464e-05,
      "loss": 0.5055,
      "step": 320
    },
    {
      "epoch": 2.115321252059308,
      "grad_norm": 22360.861328125,
      "learning_rate": 1.6339066339066338e-05,
      "loss": 0.4797,
      "step": 321
    },
    {
      "epoch": 2.1219110378912687,
      "grad_norm": 45466.05078125,
      "learning_rate": 1.6216216216216218e-05,
      "loss": 0.4505,
      "step": 322
    },
    {
      "epoch": 2.128500823723229,
      "grad_norm": 18740.75390625,
      "learning_rate": 1.6093366093366095e-05,
      "loss": 0.4849,
      "step": 323
    },
    {
      "epoch": 2.1350906095551894,
      "grad_norm": 9168.376953125,
      "learning_rate": 1.5970515970515972e-05,
      "loss": 0.4991,
      "step": 324
    },
    {
      "epoch": 2.1416803953871497,
      "grad_norm": 7032.57373046875,
      "learning_rate": 1.584766584766585e-05,
      "loss": 0.5332,
      "step": 325
    },
    {
      "epoch": 2.1482701812191105,
      "grad_norm": 14946.6484375,
      "learning_rate": 1.5724815724815725e-05,
      "loss": 0.4797,
      "step": 326
    },
    {
      "epoch": 2.154859967051071,
      "grad_norm": 10197.732421875,
      "learning_rate": 1.5601965601965606e-05,
      "loss": 0.4698,
      "step": 327
    },
    {
      "epoch": 2.161449752883031,
      "grad_norm": 11579.7890625,
      "learning_rate": 1.547911547911548e-05,
      "loss": 0.4995,
      "step": 328
    },
    {
      "epoch": 2.168039538714992,
      "grad_norm": 26515.12890625,
      "learning_rate": 1.5356265356265356e-05,
      "loss": 0.4931,
      "step": 329
    },
    {
      "epoch": 2.1746293245469523,
      "grad_norm": 16076.119140625,
      "learning_rate": 1.5233415233415234e-05,
      "loss": 0.5051,
      "step": 330
    },
    {
      "epoch": 2.1812191103789127,
      "grad_norm": 12420.37890625,
      "learning_rate": 1.5110565110565111e-05,
      "loss": 0.5227,
      "step": 331
    },
    {
      "epoch": 2.187808896210873,
      "grad_norm": 17015.359375,
      "learning_rate": 1.4987714987714988e-05,
      "loss": 0.5115,
      "step": 332
    },
    {
      "epoch": 2.1943986820428334,
      "grad_norm": 13335.0068359375,
      "learning_rate": 1.4864864864864867e-05,
      "loss": 0.5835,
      "step": 333
    },
    {
      "epoch": 2.200988467874794,
      "grad_norm": 41513.1015625,
      "learning_rate": 1.4742014742014742e-05,
      "loss": 0.437,
      "step": 334
    },
    {
      "epoch": 2.2075782537067545,
      "grad_norm": 11096.04296875,
      "learning_rate": 1.4619164619164619e-05,
      "loss": 0.4619,
      "step": 335
    },
    {
      "epoch": 2.214168039538715,
      "grad_norm": 22811.95703125,
      "learning_rate": 1.4496314496314497e-05,
      "loss": 0.521,
      "step": 336
    },
    {
      "epoch": 2.2207578253706757,
      "grad_norm": 96701.7109375,
      "learning_rate": 1.4373464373464374e-05,
      "loss": 0.5015,
      "step": 337
    },
    {
      "epoch": 2.227347611202636,
      "grad_norm": 28764.791015625,
      "learning_rate": 1.4250614250614252e-05,
      "loss": 0.5669,
      "step": 338
    },
    {
      "epoch": 2.2339373970345964,
      "grad_norm": 9551.7666015625,
      "learning_rate": 1.412776412776413e-05,
      "loss": 0.5331,
      "step": 339
    },
    {
      "epoch": 2.2405271828665567,
      "grad_norm": 24150.939453125,
      "learning_rate": 1.4004914004914004e-05,
      "loss": 0.6009,
      "step": 340
    },
    {
      "epoch": 2.247116968698517,
      "grad_norm": 6333.8935546875,
      "learning_rate": 1.3882063882063885e-05,
      "loss": 0.4918,
      "step": 341
    },
    {
      "epoch": 2.253706754530478,
      "grad_norm": 12934.755859375,
      "learning_rate": 1.375921375921376e-05,
      "loss": 0.4723,
      "step": 342
    },
    {
      "epoch": 2.260296540362438,
      "grad_norm": 35915.16015625,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 0.5468,
      "step": 343
    },
    {
      "epoch": 2.2668863261943986,
      "grad_norm": 32075.701171875,
      "learning_rate": 1.3513513513513515e-05,
      "loss": 0.4188,
      "step": 344
    },
    {
      "epoch": 2.2734761120263594,
      "grad_norm": 34175.6171875,
      "learning_rate": 1.339066339066339e-05,
      "loss": 0.5332,
      "step": 345
    },
    {
      "epoch": 2.2800658978583197,
      "grad_norm": 27540.275390625,
      "learning_rate": 1.3267813267813267e-05,
      "loss": 0.4866,
      "step": 346
    },
    {
      "epoch": 2.28665568369028,
      "grad_norm": 8475.587890625,
      "learning_rate": 1.3144963144963146e-05,
      "loss": 0.4882,
      "step": 347
    },
    {
      "epoch": 2.2932454695222404,
      "grad_norm": 31548.888671875,
      "learning_rate": 1.3022113022113022e-05,
      "loss": 0.4984,
      "step": 348
    },
    {
      "epoch": 2.2998352553542007,
      "grad_norm": 13258.5400390625,
| "learning_rate": 1.2899262899262901e-05, | |
| "loss": 0.4839, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.3064250411861615, | |
| "grad_norm": 38574.671875, | |
| "learning_rate": 1.2776412776412778e-05, | |
| "loss": 0.4749, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.313014827018122, | |
| "grad_norm": 11923.9541015625, | |
| "learning_rate": 1.2653562653562653e-05, | |
| "loss": 0.4684, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.3196046128500822, | |
| "grad_norm": 26359.25390625, | |
| "learning_rate": 1.2530712530712533e-05, | |
| "loss": 0.544, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.326194398682043, | |
| "grad_norm": 13086.6357421875, | |
| "learning_rate": 1.2407862407862408e-05, | |
| "loss": 0.4722, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.3327841845140034, | |
| "grad_norm": 15811.302734375, | |
| "learning_rate": 1.2285012285012287e-05, | |
| "loss": 0.4928, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.3393739703459637, | |
| "grad_norm": 6917.1416015625, | |
| "learning_rate": 1.2162162162162164e-05, | |
| "loss": 0.5691, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.345963756177924, | |
| "grad_norm": 41444.01171875, | |
| "learning_rate": 1.2039312039312039e-05, | |
| "loss": 0.5405, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.352553542009885, | |
| "grad_norm": 139438.609375, | |
| "learning_rate": 1.1916461916461917e-05, | |
| "loss": 0.5219, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.359143327841845, | |
| "grad_norm": 9378.14453125, | |
| "learning_rate": 1.1793611793611794e-05, | |
| "loss": 0.4549, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.3657331136738056, | |
| "grad_norm": 15612.345703125, | |
| "learning_rate": 1.1670761670761671e-05, | |
| "loss": 0.551, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.372322899505766, | |
| "grad_norm": 24843.34375, | |
| "learning_rate": 1.1547911547911548e-05, | |
| "loss": 0.4928, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.3789126853377267, | |
| "grad_norm": 27883.3359375, | |
| "learning_rate": 1.1425061425061426e-05, | |
| "loss": 0.5312, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.385502471169687, | |
| "grad_norm": 29997.705078125, | |
| "learning_rate": 1.1302211302211303e-05, | |
| "loss": 0.5811, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.3920922570016474, | |
| "grad_norm": 15760.669921875, | |
| "learning_rate": 1.117936117936118e-05, | |
| "loss": 0.5359, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.3986820428336078, | |
| "grad_norm": 75893.109375, | |
| "learning_rate": 1.1056511056511057e-05, | |
| "loss": 0.4696, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.4052718286655685, | |
| "grad_norm": 20869.072265625, | |
| "learning_rate": 1.0933660933660935e-05, | |
| "loss": 0.5342, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.411861614497529, | |
| "grad_norm": 15102.4658203125, | |
| "learning_rate": 1.0810810810810812e-05, | |
| "loss": 0.505, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.4184514003294892, | |
| "grad_norm": 24776.064453125, | |
| "learning_rate": 1.0687960687960689e-05, | |
| "loss": 0.5192, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.4250411861614496, | |
| "grad_norm": 17849.11328125, | |
| "learning_rate": 1.0565110565110566e-05, | |
| "loss": 0.5461, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.4316309719934104, | |
| "grad_norm": 55960.76953125, | |
| "learning_rate": 1.0442260442260443e-05, | |
| "loss": 0.5075, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.4382207578253707, | |
| "grad_norm": 19820.607421875, | |
| "learning_rate": 1.031941031941032e-05, | |
| "loss": 0.5661, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.444810543657331, | |
| "grad_norm": 14240.271484375, | |
| "learning_rate": 1.0196560196560196e-05, | |
| "loss": 0.508, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.4514003294892914, | |
| "grad_norm": 134865.75, | |
| "learning_rate": 1.0073710073710075e-05, | |
| "loss": 0.5145, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.4579901153212522, | |
| "grad_norm": 26314.51953125, | |
| "learning_rate": 9.950859950859952e-06, | |
| "loss": 0.5024, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.4645799011532126, | |
| "grad_norm": 8372.9169921875, | |
| "learning_rate": 9.828009828009828e-06, | |
| "loss": 0.6207, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 2.471169686985173, | |
| "grad_norm": 9625.1865234375, | |
| "learning_rate": 9.705159705159705e-06, | |
| "loss": 0.4774, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.4777594728171333, | |
| "grad_norm": 12379.4033203125, | |
| "learning_rate": 9.582309582309584e-06, | |
| "loss": 0.4996, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 2.484349258649094, | |
| "grad_norm": 7536.5390625, | |
| "learning_rate": 9.45945945945946e-06, | |
| "loss": 0.5236, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 2.4909390444810544, | |
| "grad_norm": 28774.794921875, | |
| "learning_rate": 9.336609336609337e-06, | |
| "loss": 0.4793, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 2.4975288303130148, | |
| "grad_norm": 26588.126953125, | |
| "learning_rate": 9.213759213759214e-06, | |
| "loss": 0.4341, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 2.504118616144975, | |
| "grad_norm": 20777.30078125, | |
| "learning_rate": 9.090909090909091e-06, | |
| "loss": 0.5086, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.510708401976936, | |
| "grad_norm": 11463.2158203125, | |
| "learning_rate": 8.96805896805897e-06, | |
| "loss": 0.5769, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 2.5172981878088962, | |
| "grad_norm": 39708.0390625, | |
| "learning_rate": 8.845208845208845e-06, | |
| "loss": 0.5116, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 2.5238879736408566, | |
| "grad_norm": 20928.3671875, | |
| "learning_rate": 8.722358722358723e-06, | |
| "loss": 0.4997, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 2.5304777594728174, | |
| "grad_norm": 16851.19140625, | |
| "learning_rate": 8.5995085995086e-06, | |
| "loss": 0.5821, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 2.5370675453047777, | |
| "grad_norm": 12078.3798828125, | |
| "learning_rate": 8.476658476658477e-06, | |
| "loss": 0.5221, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 2.543657331136738, | |
| "grad_norm": 66684.203125, | |
| "learning_rate": 8.353808353808354e-06, | |
| "loss": 0.5621, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 2.5502471169686984, | |
| "grad_norm": 5581.15234375, | |
| "learning_rate": 8.230958230958232e-06, | |
| "loss": 0.4489, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 2.556836902800659, | |
| "grad_norm": 23770.3359375, | |
| "learning_rate": 8.108108108108109e-06, | |
| "loss": 0.5072, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 2.5634266886326196, | |
| "grad_norm": 15222.48046875, | |
| "learning_rate": 7.985257985257986e-06, | |
| "loss": 0.4957, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 2.57001647446458, | |
| "grad_norm": 14970.568359375, | |
| "learning_rate": 7.862407862407863e-06, | |
| "loss": 0.4914, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.5766062602965403, | |
| "grad_norm": 6360.3046875, | |
| "learning_rate": 7.73955773955774e-06, | |
| "loss": 0.4492, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 2.583196046128501, | |
| "grad_norm": 15164.78515625, | |
| "learning_rate": 7.616707616707617e-06, | |
| "loss": 0.4749, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 2.5897858319604614, | |
| "grad_norm": 22485.28125, | |
| "learning_rate": 7.493857493857494e-06, | |
| "loss": 0.5383, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 2.5963756177924218, | |
| "grad_norm": 30391.03515625, | |
| "learning_rate": 7.371007371007371e-06, | |
| "loss": 0.4506, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 2.602965403624382, | |
| "grad_norm": 15713.5595703125, | |
| "learning_rate": 7.2481572481572485e-06, | |
| "loss": 0.5178, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 2.6095551894563425, | |
| "grad_norm": 24513.958984375, | |
| "learning_rate": 7.125307125307126e-06, | |
| "loss": 0.4911, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 2.6161449752883033, | |
| "grad_norm": 21254.240234375, | |
| "learning_rate": 7.002457002457002e-06, | |
| "loss": 0.4883, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 2.6227347611202636, | |
| "grad_norm": 25682.513671875, | |
| "learning_rate": 6.87960687960688e-06, | |
| "loss": 0.5677, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 2.629324546952224, | |
| "grad_norm": 10649.7880859375, | |
| "learning_rate": 6.7567567567567575e-06, | |
| "loss": 0.4163, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 2.6359143327841847, | |
| "grad_norm": 11895.4677734375, | |
| "learning_rate": 6.6339066339066335e-06, | |
| "loss": 0.4617, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.642504118616145, | |
| "grad_norm": 71847.7421875, | |
| "learning_rate": 6.511056511056511e-06, | |
| "loss": 0.4855, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 2.6490939044481054, | |
| "grad_norm": 12087.333984375, | |
| "learning_rate": 6.388206388206389e-06, | |
| "loss": 0.5488, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 2.655683690280066, | |
| "grad_norm": 22596.474609375, | |
| "learning_rate": 6.2653562653562665e-06, | |
| "loss": 0.4533, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 2.662273476112026, | |
| "grad_norm": 13353.603515625, | |
| "learning_rate": 6.142506142506143e-06, | |
| "loss": 0.4916, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 2.668863261943987, | |
| "grad_norm": 28952.79296875, | |
| "learning_rate": 6.019656019656019e-06, | |
| "loss": 0.4664, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 2.6754530477759473, | |
| "grad_norm": 18681.466796875, | |
| "learning_rate": 5.896805896805897e-06, | |
| "loss": 0.5339, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 2.6820428336079076, | |
| "grad_norm": 55370.97265625, | |
| "learning_rate": 5.773955773955774e-06, | |
| "loss": 0.5157, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 2.6886326194398684, | |
| "grad_norm": 18563.974609375, | |
| "learning_rate": 5.6511056511056515e-06, | |
| "loss": 0.4533, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 2.6952224052718288, | |
| "grad_norm": 17806.52734375, | |
| "learning_rate": 5.528255528255528e-06, | |
| "loss": 0.5011, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 2.701812191103789, | |
| "grad_norm": 192237.5625, | |
| "learning_rate": 5.405405405405406e-06, | |
| "loss": 0.4728, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.7084019769357495, | |
| "grad_norm": 7906.73388671875, | |
| "learning_rate": 5.282555282555283e-06, | |
| "loss": 0.504, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 2.71499176276771, | |
| "grad_norm": 22805.4296875, | |
| "learning_rate": 5.15970515970516e-06, | |
| "loss": 0.5509, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 2.7215815485996706, | |
| "grad_norm": 12200.3623046875, | |
| "learning_rate": 5.036855036855037e-06, | |
| "loss": 0.4984, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 2.728171334431631, | |
| "grad_norm": 14147.498046875, | |
| "learning_rate": 4.914004914004914e-06, | |
| "loss": 0.4063, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 2.7347611202635913, | |
| "grad_norm": 11302.486328125, | |
| "learning_rate": 4.791154791154792e-06, | |
| "loss": 0.5305, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 2.741350906095552, | |
| "grad_norm": 12493.9912109375, | |
| "learning_rate": 4.668304668304669e-06, | |
| "loss": 0.5437, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 2.7479406919275124, | |
| "grad_norm": 12336.576171875, | |
| "learning_rate": 4.5454545454545455e-06, | |
| "loss": 0.61, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 2.754530477759473, | |
| "grad_norm": 131413.265625, | |
| "learning_rate": 4.422604422604422e-06, | |
| "loss": 0.4386, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 2.761120263591433, | |
| "grad_norm": 6280.30810546875, | |
| "learning_rate": 4.2997542997543e-06, | |
| "loss": 0.4827, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 2.7677100494233935, | |
| "grad_norm": 22697.05859375, | |
| "learning_rate": 4.176904176904177e-06, | |
| "loss": 0.5122, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.7742998352553543, | |
| "grad_norm": 24444.3203125, | |
| "learning_rate": 4.0540540540540545e-06, | |
| "loss": 0.483, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 2.7808896210873146, | |
| "grad_norm": 11774.201171875, | |
| "learning_rate": 3.931203931203931e-06, | |
| "loss": 0.5757, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 2.787479406919275, | |
| "grad_norm": 16244.38671875, | |
| "learning_rate": 3.8083538083538086e-06, | |
| "loss": 0.4689, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 2.7940691927512358, | |
| "grad_norm": 22801.099609375, | |
| "learning_rate": 3.6855036855036854e-06, | |
| "loss": 0.4832, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 2.800658978583196, | |
| "grad_norm": 12143.181640625, | |
| "learning_rate": 3.562653562653563e-06, | |
| "loss": 0.5305, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.8072487644151565, | |
| "grad_norm": 37724.21875, | |
| "learning_rate": 3.43980343980344e-06, | |
| "loss": 0.6104, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 2.813838550247117, | |
| "grad_norm": 15959.6953125, | |
| "learning_rate": 3.3169533169533168e-06, | |
| "loss": 0.4719, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 2.820428336079077, | |
| "grad_norm": 16792.8984375, | |
| "learning_rate": 3.1941031941031944e-06, | |
| "loss": 0.5533, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 2.827018121911038, | |
| "grad_norm": 11308.0615234375, | |
| "learning_rate": 3.0712530712530717e-06, | |
| "loss": 0.5628, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 2.8336079077429983, | |
| "grad_norm": 23545.369140625, | |
| "learning_rate": 2.9484029484029485e-06, | |
| "loss": 0.5278, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.8401976935749587, | |
| "grad_norm": 33122.05859375, | |
| "learning_rate": 2.8255528255528258e-06, | |
| "loss": 0.4413, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 2.8467874794069195, | |
| "grad_norm": 22283.62890625, | |
| "learning_rate": 2.702702702702703e-06, | |
| "loss": 0.5699, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 2.85337726523888, | |
| "grad_norm": 13194.3056640625, | |
| "learning_rate": 2.57985257985258e-06, | |
| "loss": 0.5302, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 2.85996705107084, | |
| "grad_norm": 20634.77734375, | |
| "learning_rate": 2.457002457002457e-06, | |
| "loss": 0.5518, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 2.8665568369028005, | |
| "grad_norm": 15800.359375, | |
| "learning_rate": 2.3341523341523343e-06, | |
| "loss": 0.5346, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 2.873146622734761, | |
| "grad_norm": 15633.765625, | |
| "learning_rate": 2.211302211302211e-06, | |
| "loss": 0.4884, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 2.8797364085667216, | |
| "grad_norm": 31339.26953125, | |
| "learning_rate": 2.0884520884520884e-06, | |
| "loss": 0.4925, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 2.886326194398682, | |
| "grad_norm": 43050.4765625, | |
| "learning_rate": 1.9656019656019657e-06, | |
| "loss": 0.5312, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 2.892915980230643, | |
| "grad_norm": 17257.248046875, | |
| "learning_rate": 1.8427518427518427e-06, | |
| "loss": 0.4499, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 2.899505766062603, | |
| "grad_norm": 53676.23046875, | |
| "learning_rate": 1.71990171990172e-06, | |
| "loss": 0.4375, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.9060955518945635, | |
| "grad_norm": 31634.296875, | |
| "learning_rate": 1.5970515970515972e-06, | |
| "loss": 0.4394, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 2.912685337726524, | |
| "grad_norm": 22569.1171875, | |
| "learning_rate": 1.4742014742014743e-06, | |
| "loss": 0.4388, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 2.919275123558484, | |
| "grad_norm": 66667.6796875, | |
| "learning_rate": 1.3513513513513515e-06, | |
| "loss": 0.4113, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 2.925864909390445, | |
| "grad_norm": 7823.2880859375, | |
| "learning_rate": 1.2285012285012285e-06, | |
| "loss": 0.4316, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 2.9324546952224053, | |
| "grad_norm": 8603.0859375, | |
| "learning_rate": 1.1056511056511056e-06, | |
| "loss": 0.4213, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.9390444810543657, | |
| "grad_norm": 26335.857421875, | |
| "learning_rate": 9.828009828009828e-07, | |
| "loss": 0.4523, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 2.9456342668863265, | |
| "grad_norm": 18206.806640625, | |
| "learning_rate": 8.5995085995086e-07, | |
| "loss": 0.4656, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 2.952224052718287, | |
| "grad_norm": 9823.25, | |
| "learning_rate": 7.371007371007371e-07, | |
| "loss": 0.5816, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 2.958813838550247, | |
| "grad_norm": 15623.380859375, | |
| "learning_rate": 6.142506142506143e-07, | |
| "loss": 0.4464, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 2.9654036243822075, | |
| "grad_norm": 22244.39453125, | |
| "learning_rate": 4.914004914004914e-07, | |
| "loss": 0.5091, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.971993410214168, | |
| "grad_norm": 7978.923828125, | |
| "learning_rate": 3.6855036855036856e-07, | |
| "loss": 0.554, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.9785831960461286, | |
| "grad_norm": 20999.830078125, | |
| "learning_rate": 2.457002457002457e-07, | |
| "loss": 0.5075, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.985172981878089, | |
| "grad_norm": 22284.568359375, | |
| "learning_rate": 1.2285012285012285e-07, | |
| "loss": 0.4835, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.985172981878089, | |
| "step": 453, | |
| "total_flos": 9.99505283128492e+17, | |
| "train_loss": 0.5814478738155323, | |
| "train_runtime": 61267.4818, | |
| "train_samples_per_second": 0.357, | |
| "train_steps_per_second": 0.007 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 453, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.99505283128492e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |