diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,48642 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6944, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00014400921658986175, + "grad_norm": 2.749812126159668, + "learning_rate": 5e-05, + "loss": 4.257, + "step": 1 + }, + { + "epoch": 0.0002880184331797235, + "grad_norm": 3.5556857585906982, + "learning_rate": 4.99999974414711e-05, + "loss": 4.7696, + "step": 2 + }, + { + "epoch": 0.0004320276497695853, + "grad_norm": 3.0576136112213135, + "learning_rate": 4.999998976588493e-05, + "loss": 3.2821, + "step": 3 + }, + { + "epoch": 0.000576036866359447, + "grad_norm": 3.3320424556732178, + "learning_rate": 4.9999976973243055e-05, + "loss": 3.7763, + "step": 4 + }, + { + "epoch": 0.0007200460829493088, + "grad_norm": 4.060346603393555, + "learning_rate": 4.99999590635481e-05, + "loss": 3.8132, + "step": 5 + }, + { + "epoch": 0.0008640552995391706, + "grad_norm": 4.074298858642578, + "learning_rate": 4.999993603680373e-05, + "loss": 2.6277, + "step": 6 + }, + { + "epoch": 0.0010080645161290322, + "grad_norm": 2.761934757232666, + "learning_rate": 4.9999907893014654e-05, + "loss": 5.7244, + "step": 7 + }, + { + "epoch": 0.001152073732718894, + "grad_norm": 4.144633769989014, + "learning_rate": 4.999987463218663e-05, + "loss": 3.0528, + "step": 8 + }, + { + "epoch": 0.0012960829493087558, + "grad_norm": 3.9093730449676514, + "learning_rate": 4.999983625432647e-05, + "loss": 3.7813, + "step": 9 + }, + { + "epoch": 0.0014400921658986176, + "grad_norm": 5.368951797485352, + "learning_rate": 4.999979275944203e-05, + "loss": 3.0779, + "step": 10 + }, + { + "epoch": 0.0015841013824884793, + "grad_norm": 4.925940990447998, + "learning_rate": 4.9999744147542205e-05, + "loss": 3.8772, + "step": 11 + }, + { + "epoch": 0.0017281105990783411, + "grad_norm": 5.794581890106201, + "learning_rate": 4.9999690418636955e-05, + "loss": 3.6812, + "step": 12 + }, + { + "epoch": 0.0018721198156682027, + "grad_norm": 4.615610599517822, + "learning_rate": 4.9999631572737285e-05, + "loss": 5.7108, + "step": 13 + }, + { + "epoch": 0.0020161290322580645, + "grad_norm": 2.7738749980926514, + "learning_rate": 4.999956760985522e-05, + "loss": 4.0871, + "step": 14 + }, + { + "epoch": 0.0021601382488479265, + "grad_norm": 2.613072156906128, + "learning_rate": 4.9999498530003866e-05, + "loss": 3.9328, + "step": 15 + }, + { + "epoch": 0.002304147465437788, + "grad_norm": 1.215724229812622, + "learning_rate": 4.999942433319735e-05, + "loss": 3.4003, + "step": 16 + }, + { + "epoch": 0.0024481566820276496, + "grad_norm": 2.498894214630127, + "learning_rate": 4.999934501945087e-05, + "loss": 4.7901, + "step": 17 + }, + { + "epoch": 0.0025921658986175116, + "grad_norm": 2.734083414077759, + "learning_rate": 4.999926058878066e-05, + "loss": 2.2025, + "step": 18 + }, + { + "epoch": 0.002736175115207373, + "grad_norm": 2.1638023853302, + "learning_rate": 4.9999171041203994e-05, + "loss": 2.2122, + "step": 19 + }, + { + "epoch": 0.002880184331797235, + "grad_norm": 2.0509259700775146, + "learning_rate": 4.99990763767392e-05, + "loss": 3.6287, + "step": 20 + }, + { + "epoch": 0.0030241935483870967, + "grad_norm": 2.9293360710144043, + "learning_rate": 4.9998976595405664e-05, + "loss": 3.8072, + "step": 21 + }, + { + "epoch": 0.0031682027649769587, + "grad_norm": 2.3165132999420166, + "learning_rate": 4.99988716972238e-05, + "loss": 1.9523, + "step": 22 + }, + { + "epoch": 0.0033122119815668202, + "grad_norm": 1.3680247068405151, + "learning_rate": 4.999876168221509e-05, + "loss": 3.4611, + "step": 23 + }, + { + "epoch": 0.0034562211981566822, + "grad_norm": 1.0889722108840942, + "learning_rate": 4.999864655040204e-05, + "loss": 2.611, + "step": 24 + }, + { + "epoch": 0.003600230414746544, + "grad_norm": 2.0757853984832764, + "learning_rate": 4.9998526301808224e-05, + "loss": 3.2987, + "step": 25 + }, + { + "epoch": 0.0037442396313364054, + "grad_norm": 2.0205531120300293, + "learning_rate": 4.9998400936458246e-05, + "loss": 2.4713, + "step": 26 + }, + { + "epoch": 0.0038882488479262674, + "grad_norm": 1.96470046043396, + "learning_rate": 4.999827045437777e-05, + "loss": 3.182, + "step": 27 + }, + { + "epoch": 0.004032258064516129, + "grad_norm": 1.2902711629867554, + "learning_rate": 4.9998134855593514e-05, + "loss": 2.9338, + "step": 28 + }, + { + "epoch": 0.004176267281105991, + "grad_norm": 1.4275892972946167, + "learning_rate": 4.999799414013322e-05, + "loss": 2.584, + "step": 29 + }, + { + "epoch": 0.004320276497695853, + "grad_norm": 1.0785837173461914, + "learning_rate": 4.999784830802569e-05, + "loss": 2.6646, + "step": 30 + }, + { + "epoch": 0.004464285714285714, + "grad_norm": 1.578644871711731, + "learning_rate": 4.9997697359300774e-05, + "loss": 3.7311, + "step": 31 + }, + { + "epoch": 0.004608294930875576, + "grad_norm": 1.5050499439239502, + "learning_rate": 4.9997541293989384e-05, + "loss": 2.5641, + "step": 32 + }, + { + "epoch": 0.004752304147465438, + "grad_norm": 1.0355409383773804, + "learning_rate": 4.999738011212344e-05, + "loss": 2.7307, + "step": 33 + }, + { + "epoch": 0.004896313364055299, + "grad_norm": 0.9430926442146301, + "learning_rate": 4.9997213813735945e-05, + "loss": 2.654, + "step": 34 + }, + { + "epoch": 0.005040322580645161, + "grad_norm": 3.4353392124176025, + "learning_rate": 4.999704239886094e-05, + "loss": 3.5457, + "step": 35 + }, + { + "epoch": 0.005184331797235023, + "grad_norm": 1.1123013496398926, + "learning_rate": 4.9996865867533496e-05, + "loss": 2.6619, + "step": 36 + }, + { + "epoch": 0.005328341013824885, + "grad_norm": 1.1416701078414917, + "learning_rate": 4.999668421978977e-05, + "loss": 2.1824, + "step": 37 + }, + { + "epoch": 0.005472350230414746, + "grad_norm": 0.8658654093742371, + "learning_rate": 4.9996497455666924e-05, + "loss": 1.8141, + "step": 38 + }, + { + "epoch": 0.005616359447004608, + "grad_norm": 1.2492862939834595, + "learning_rate": 4.999630557520319e-05, + "loss": 2.5408, + "step": 39 + }, + { + "epoch": 0.00576036866359447, + "grad_norm": 1.02650785446167, + "learning_rate": 4.999610857843784e-05, + "loss": 2.5699, + "step": 40 + }, + { + "epoch": 0.005904377880184331, + "grad_norm": 1.2021536827087402, + "learning_rate": 4.99959064654112e-05, + "loss": 2.2746, + "step": 41 + }, + { + "epoch": 0.006048387096774193, + "grad_norm": 1.2059953212738037, + "learning_rate": 4.999569923616464e-05, + "loss": 2.7195, + "step": 42 + }, + { + "epoch": 0.006192396313364055, + "grad_norm": 1.1550027132034302, + "learning_rate": 4.9995486890740573e-05, + "loss": 2.4982, + "step": 43 + }, + { + "epoch": 0.006336405529953917, + "grad_norm": 1.330669641494751, + "learning_rate": 4.999526942918247e-05, + "loss": 2.9358, + "step": 44 + }, + { + "epoch": 0.0064804147465437785, + "grad_norm": 1.404761552810669, + "learning_rate": 4.999504685153482e-05, + "loss": 2.8078, + "step": 45 + }, + { + "epoch": 0.0066244239631336405, + "grad_norm": 1.589821457862854, + "learning_rate": 4.9994819157843204e-05, + "loss": 4.18, + "step": 46 + }, + { + "epoch": 0.0067684331797235025, + "grad_norm": 1.1104768514633179, + "learning_rate": 4.999458634815422e-05, + "loss": 3.0762, + "step": 47 + }, + { + "epoch": 0.0069124423963133645, + "grad_norm": 1.8753349781036377, + "learning_rate": 4.999434842251551e-05, + "loss": 3.8214, + "step": 48 + }, + { + "epoch": 0.007056451612903226, + "grad_norm": 0.931186854839325, + "learning_rate": 4.9994105380975785e-05, + "loss": 3.1807, + "step": 49 + }, + { + "epoch": 0.007200460829493088, + "grad_norm": 1.5367690324783325, + "learning_rate": 4.999385722358479e-05, + "loss": 2.4706, + "step": 50 + }, + { + "epoch": 0.00734447004608295, + "grad_norm": 1.8072152137756348, + "learning_rate": 4.999360395039331e-05, + "loss": 3.2191, + "step": 51 + }, + { + "epoch": 0.007488479262672811, + "grad_norm": 1.6499011516571045, + "learning_rate": 4.99933455614532e-05, + "loss": 2.4073, + "step": 52 + }, + { + "epoch": 0.007632488479262673, + "grad_norm": 1.2530362606048584, + "learning_rate": 4.999308205681733e-05, + "loss": 2.5953, + "step": 53 + }, + { + "epoch": 0.007776497695852535, + "grad_norm": 1.3810861110687256, + "learning_rate": 4.9992813436539655e-05, + "loss": 2.6898, + "step": 54 + }, + { + "epoch": 0.007920506912442397, + "grad_norm": 1.6395010948181152, + "learning_rate": 4.9992539700675133e-05, + "loss": 3.4591, + "step": 55 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 2.1489076614379883, + "learning_rate": 4.999226084927982e-05, + "loss": 2.7145, + "step": 56 + }, + { + "epoch": 0.008208525345622119, + "grad_norm": 1.953791856765747, + "learning_rate": 4.999197688241076e-05, + "loss": 2.1815, + "step": 57 + }, + { + "epoch": 0.008352534562211982, + "grad_norm": 1.0499128103256226, + "learning_rate": 4.999168780012611e-05, + "loss": 2.6607, + "step": 58 + }, + { + "epoch": 0.008496543778801843, + "grad_norm": 1.781936764717102, + "learning_rate": 4.999139360248501e-05, + "loss": 2.0277, + "step": 59 + }, + { + "epoch": 0.008640552995391706, + "grad_norm": 1.5102976560592651, + "learning_rate": 4.99910942895477e-05, + "loss": 2.0316, + "step": 60 + }, + { + "epoch": 0.008784562211981567, + "grad_norm": 1.2535592317581177, + "learning_rate": 4.999078986137543e-05, + "loss": 2.6058, + "step": 61 + }, + { + "epoch": 0.008928571428571428, + "grad_norm": 1.643808126449585, + "learning_rate": 4.999048031803052e-05, + "loss": 2.2108, + "step": 62 + }, + { + "epoch": 0.009072580645161291, + "grad_norm": 0.9234294295310974, + "learning_rate": 4.999016565957633e-05, + "loss": 2.1188, + "step": 63 + }, + { + "epoch": 0.009216589861751152, + "grad_norm": 1.0726630687713623, + "learning_rate": 4.9989845886077246e-05, + "loss": 2.7139, + "step": 64 + }, + { + "epoch": 0.009360599078341013, + "grad_norm": 1.6923820972442627, + "learning_rate": 4.998952099759874e-05, + "loss": 2.2525, + "step": 65 + }, + { + "epoch": 0.009504608294930876, + "grad_norm": 1.5629535913467407, + "learning_rate": 4.99891909942073e-05, + "loss": 1.8126, + "step": 66 + }, + { + "epoch": 0.009648617511520737, + "grad_norm": 2.2390613555908203, + "learning_rate": 4.9988855875970475e-05, + "loss": 1.9558, + "step": 67 + }, + { + "epoch": 0.009792626728110598, + "grad_norm": 1.7797694206237793, + "learning_rate": 4.998851564295686e-05, + "loss": 2.7333, + "step": 68 + }, + { + "epoch": 0.009936635944700461, + "grad_norm": 1.8321067094802856, + "learning_rate": 4.998817029523609e-05, + "loss": 2.3075, + "step": 69 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 2.3713109493255615, + "learning_rate": 4.998781983287886e-05, + "loss": 3.4854, + "step": 70 + }, + { + "epoch": 0.010224654377880185, + "grad_norm": 2.17521595954895, + "learning_rate": 4.9987464255956894e-05, + "loss": 2.0593, + "step": 71 + }, + { + "epoch": 0.010368663594470046, + "grad_norm": 1.494936227798462, + "learning_rate": 4.998710356454298e-05, + "loss": 1.866, + "step": 72 + }, + { + "epoch": 0.010512672811059907, + "grad_norm": 1.21134614944458, + "learning_rate": 4.9986737758710946e-05, + "loss": 2.2625, + "step": 73 + }, + { + "epoch": 0.01065668202764977, + "grad_norm": 2.506932258605957, + "learning_rate": 4.998636683853565e-05, + "loss": 1.9831, + "step": 74 + }, + { + "epoch": 0.010800691244239631, + "grad_norm": 2.8391828536987305, + "learning_rate": 4.998599080409303e-05, + "loss": 2.9567, + "step": 75 + }, + { + "epoch": 0.010944700460829493, + "grad_norm": 1.834782361984253, + "learning_rate": 4.998560965546005e-05, + "loss": 2.7494, + "step": 76 + }, + { + "epoch": 0.011088709677419355, + "grad_norm": 1.5167925357818604, + "learning_rate": 4.998522339271472e-05, + "loss": 1.5793, + "step": 77 + }, + { + "epoch": 0.011232718894009217, + "grad_norm": 1.4462932348251343, + "learning_rate": 4.99848320159361e-05, + "loss": 3.0136, + "step": 78 + }, + { + "epoch": 0.011376728110599078, + "grad_norm": 2.923617362976074, + "learning_rate": 4.99844355252043e-05, + "loss": 1.8674, + "step": 79 + }, + { + "epoch": 0.01152073732718894, + "grad_norm": 2.2006890773773193, + "learning_rate": 4.998403392060048e-05, + "loss": 2.3389, + "step": 80 + }, + { + "epoch": 0.011664746543778802, + "grad_norm": 1.4237990379333496, + "learning_rate": 4.998362720220684e-05, + "loss": 1.978, + "step": 81 + }, + { + "epoch": 0.011808755760368663, + "grad_norm": 1.536563754081726, + "learning_rate": 4.998321537010663e-05, + "loss": 2.5149, + "step": 82 + }, + { + "epoch": 0.011952764976958526, + "grad_norm": 1.4726285934448242, + "learning_rate": 4.998279842438413e-05, + "loss": 2.3538, + "step": 83 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 2.524960994720459, + "learning_rate": 4.99823763651247e-05, + "loss": 3.1595, + "step": 84 + }, + { + "epoch": 0.01224078341013825, + "grad_norm": 2.9629170894622803, + "learning_rate": 4.998194919241471e-05, + "loss": 2.1958, + "step": 85 + }, + { + "epoch": 0.01238479262672811, + "grad_norm": 1.8485426902770996, + "learning_rate": 4.998151690634161e-05, + "loss": 2.9255, + "step": 86 + }, + { + "epoch": 0.012528801843317972, + "grad_norm": 2.132467746734619, + "learning_rate": 4.998107950699387e-05, + "loss": 2.1313, + "step": 87 + }, + { + "epoch": 0.012672811059907835, + "grad_norm": 1.389547348022461, + "learning_rate": 4.998063699446103e-05, + "loss": 2.0356, + "step": 88 + }, + { + "epoch": 0.012816820276497696, + "grad_norm": 5.147162914276123, + "learning_rate": 4.9980189368833656e-05, + "loss": 4.0889, + "step": 89 + }, + { + "epoch": 0.012960829493087557, + "grad_norm": 1.5019563436508179, + "learning_rate": 4.997973663020337e-05, + "loss": 3.3771, + "step": 90 + }, + { + "epoch": 0.01310483870967742, + "grad_norm": 2.395458459854126, + "learning_rate": 4.9979278778662844e-05, + "loss": 1.0996, + "step": 91 + }, + { + "epoch": 0.013248847926267281, + "grad_norm": 2.1102678775787354, + "learning_rate": 4.997881581430579e-05, + "loss": 2.2744, + "step": 92 + }, + { + "epoch": 0.013392857142857142, + "grad_norm": 1.8278207778930664, + "learning_rate": 4.997834773722696e-05, + "loss": 2.1892, + "step": 93 + }, + { + "epoch": 0.013536866359447005, + "grad_norm": 3.4146804809570312, + "learning_rate": 4.9977874547522175e-05, + "loss": 2.7589, + "step": 94 + }, + { + "epoch": 0.013680875576036866, + "grad_norm": 2.969169855117798, + "learning_rate": 4.9977396245288276e-05, + "loss": 2.515, + "step": 95 + }, + { + "epoch": 0.013824884792626729, + "grad_norm": 4.20405912399292, + "learning_rate": 4.997691283062318e-05, + "loss": 1.3685, + "step": 96 + }, + { + "epoch": 0.01396889400921659, + "grad_norm": 2.624204635620117, + "learning_rate": 4.9976424303625815e-05, + "loss": 1.7609, + "step": 97 + }, + { + "epoch": 0.014112903225806451, + "grad_norm": 1.8823909759521484, + "learning_rate": 4.9975930664396177e-05, + "loss": 2.1931, + "step": 98 + }, + { + "epoch": 0.014256912442396314, + "grad_norm": 1.310426950454712, + "learning_rate": 4.997543191303532e-05, + "loss": 2.2556, + "step": 99 + }, + { + "epoch": 0.014400921658986175, + "grad_norm": 3.0712099075317383, + "learning_rate": 4.997492804964531e-05, + "loss": 1.5441, + "step": 100 + }, + { + "epoch": 0.014544930875576036, + "grad_norm": 3.5586373805999756, + "learning_rate": 4.9974419074329295e-05, + "loss": 2.4907, + "step": 101 + }, + { + "epoch": 0.0146889400921659, + "grad_norm": 1.9779694080352783, + "learning_rate": 4.997390498719144e-05, + "loss": 2.4171, + "step": 102 + }, + { + "epoch": 0.01483294930875576, + "grad_norm": 1.4305269718170166, + "learning_rate": 4.9973385788336976e-05, + "loss": 1.778, + "step": 103 + }, + { + "epoch": 0.014976958525345621, + "grad_norm": 1.7448749542236328, + "learning_rate": 4.997286147787218e-05, + "loss": 1.6571, + "step": 104 + }, + { + "epoch": 0.015120967741935484, + "grad_norm": 3.7328531742095947, + "learning_rate": 4.997233205590436e-05, + "loss": 1.8411, + "step": 105 + }, + { + "epoch": 0.015264976958525345, + "grad_norm": 2.832186698913574, + "learning_rate": 4.997179752254188e-05, + "loss": 1.8991, + "step": 106 + }, + { + "epoch": 0.015408986175115207, + "grad_norm": 1.8185867071151733, + "learning_rate": 4.997125787789415e-05, + "loss": 1.873, + "step": 107 + }, + { + "epoch": 0.01555299539170507, + "grad_norm": 1.7866613864898682, + "learning_rate": 4.997071312207163e-05, + "loss": 1.8429, + "step": 108 + }, + { + "epoch": 0.015697004608294932, + "grad_norm": 1.799485683441162, + "learning_rate": 4.997016325518582e-05, + "loss": 2.1722, + "step": 109 + }, + { + "epoch": 0.015841013824884793, + "grad_norm": 2.0879735946655273, + "learning_rate": 4.996960827734927e-05, + "loss": 2.2972, + "step": 110 + }, + { + "epoch": 0.015985023041474655, + "grad_norm": 2.4061312675476074, + "learning_rate": 4.9969048188675566e-05, + "loss": 2.1405, + "step": 111 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 2.302462339401245, + "learning_rate": 4.9968482989279356e-05, + "loss": 1.4885, + "step": 112 + }, + { + "epoch": 0.016273041474654377, + "grad_norm": 2.1213808059692383, + "learning_rate": 4.9967912679276316e-05, + "loss": 2.2944, + "step": 113 + }, + { + "epoch": 0.016417050691244238, + "grad_norm": 2.1637730598449707, + "learning_rate": 4.9967337258783195e-05, + "loss": 2.1316, + "step": 114 + }, + { + "epoch": 0.016561059907834103, + "grad_norm": 1.4232454299926758, + "learning_rate": 4.9966756727917764e-05, + "loss": 1.6784, + "step": 115 + }, + { + "epoch": 0.016705069124423964, + "grad_norm": 2.3776438236236572, + "learning_rate": 4.9966171086798844e-05, + "loss": 1.8482, + "step": 116 + }, + { + "epoch": 0.016849078341013825, + "grad_norm": 2.8138346672058105, + "learning_rate": 4.996558033554631e-05, + "loss": 1.6106, + "step": 117 + }, + { + "epoch": 0.016993087557603686, + "grad_norm": 2.2720069885253906, + "learning_rate": 4.996498447428107e-05, + "loss": 2.8357, + "step": 118 + }, + { + "epoch": 0.017137096774193547, + "grad_norm": 1.8003276586532593, + "learning_rate": 4.99643835031251e-05, + "loss": 1.8319, + "step": 119 + }, + { + "epoch": 0.01728110599078341, + "grad_norm": 1.6282298564910889, + "learning_rate": 4.996377742220139e-05, + "loss": 2.2286, + "step": 120 + }, + { + "epoch": 0.017425115207373273, + "grad_norm": 3.0759527683258057, + "learning_rate": 4.996316623163401e-05, + "loss": 3.2795, + "step": 121 + }, + { + "epoch": 0.017569124423963134, + "grad_norm": 2.5114152431488037, + "learning_rate": 4.9962549931548054e-05, + "loss": 3.8899, + "step": 122 + }, + { + "epoch": 0.017713133640552995, + "grad_norm": 2.0858821868896484, + "learning_rate": 4.996192852206967e-05, + "loss": 2.3815, + "step": 123 + }, + { + "epoch": 0.017857142857142856, + "grad_norm": 2.27655029296875, + "learning_rate": 4.9961302003326045e-05, + "loss": 1.1568, + "step": 124 + }, + { + "epoch": 0.018001152073732717, + "grad_norm": 2.0592713356018066, + "learning_rate": 4.996067037544542e-05, + "loss": 1.8342, + "step": 125 + }, + { + "epoch": 0.018145161290322582, + "grad_norm": 2.7861416339874268, + "learning_rate": 4.996003363855707e-05, + "loss": 3.1344, + "step": 126 + }, + { + "epoch": 0.018289170506912443, + "grad_norm": 1.9052131175994873, + "learning_rate": 4.995939179279134e-05, + "loss": 1.4979, + "step": 127 + }, + { + "epoch": 0.018433179723502304, + "grad_norm": 2.171623468399048, + "learning_rate": 4.9958744838279594e-05, + "loss": 1.8892, + "step": 128 + }, + { + "epoch": 0.018577188940092165, + "grad_norm": 1.9321303367614746, + "learning_rate": 4.995809277515424e-05, + "loss": 2.2672, + "step": 129 + }, + { + "epoch": 0.018721198156682026, + "grad_norm": 2.160367965698242, + "learning_rate": 4.995743560354877e-05, + "loss": 1.7384, + "step": 130 + }, + { + "epoch": 0.01886520737327189, + "grad_norm": 2.0389413833618164, + "learning_rate": 4.9956773323597684e-05, + "loss": 1.6655, + "step": 131 + }, + { + "epoch": 0.019009216589861752, + "grad_norm": 3.735802173614502, + "learning_rate": 4.995610593543653e-05, + "loss": 1.2075, + "step": 132 + }, + { + "epoch": 0.019153225806451613, + "grad_norm": 2.24922513961792, + "learning_rate": 4.995543343920192e-05, + "loss": 1.92, + "step": 133 + }, + { + "epoch": 0.019297235023041474, + "grad_norm": 2.3432536125183105, + "learning_rate": 4.99547558350315e-05, + "loss": 2.133, + "step": 134 + }, + { + "epoch": 0.019441244239631335, + "grad_norm": 1.626983642578125, + "learning_rate": 4.995407312306396e-05, + "loss": 1.8947, + "step": 135 + }, + { + "epoch": 0.019585253456221197, + "grad_norm": 1.666089415550232, + "learning_rate": 4.995338530343905e-05, + "loss": 2.2266, + "step": 136 + }, + { + "epoch": 0.01972926267281106, + "grad_norm": 3.5922741889953613, + "learning_rate": 4.995269237629755e-05, + "loss": 2.3561, + "step": 137 + }, + { + "epoch": 0.019873271889400922, + "grad_norm": 2.9366064071655273, + "learning_rate": 4.995199434178128e-05, + "loss": 2.0708, + "step": 138 + }, + { + "epoch": 0.020017281105990783, + "grad_norm": 2.4218077659606934, + "learning_rate": 4.9951291200033125e-05, + "loss": 1.7481, + "step": 139 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 2.4026169776916504, + "learning_rate": 4.9950582951197e-05, + "loss": 1.5119, + "step": 140 + }, + { + "epoch": 0.020305299539170506, + "grad_norm": 1.677156686782837, + "learning_rate": 4.9949869595417876e-05, + "loss": 1.0579, + "step": 141 + }, + { + "epoch": 0.02044930875576037, + "grad_norm": 2.843226194381714, + "learning_rate": 4.994915113284177e-05, + "loss": 2.0048, + "step": 142 + }, + { + "epoch": 0.02059331797235023, + "grad_norm": 2.733941078186035, + "learning_rate": 4.994842756361572e-05, + "loss": 1.1648, + "step": 143 + }, + { + "epoch": 0.020737327188940093, + "grad_norm": 3.0132880210876465, + "learning_rate": 4.994769888788784e-05, + "loss": 2.394, + "step": 144 + }, + { + "epoch": 0.020881336405529954, + "grad_norm": 1.8009469509124756, + "learning_rate": 4.9946965105807275e-05, + "loss": 1.7166, + "step": 145 + }, + { + "epoch": 0.021025345622119815, + "grad_norm": 1.7264102697372437, + "learning_rate": 4.994622621752422e-05, + "loss": 1.3038, + "step": 146 + }, + { + "epoch": 0.021169354838709676, + "grad_norm": 2.6478819847106934, + "learning_rate": 4.994548222318991e-05, + "loss": 2.4485, + "step": 147 + }, + { + "epoch": 0.02131336405529954, + "grad_norm": 2.8302524089813232, + "learning_rate": 4.994473312295663e-05, + "loss": 1.9961, + "step": 148 + }, + { + "epoch": 0.0214573732718894, + "grad_norm": 5.505067825317383, + "learning_rate": 4.9943978916977704e-05, + "loss": 2.1109, + "step": 149 + }, + { + "epoch": 0.021601382488479263, + "grad_norm": 3.1014301776885986, + "learning_rate": 4.994321960540751e-05, + "loss": 4.0997, + "step": 150 + }, + { + "epoch": 0.021745391705069124, + "grad_norm": 2.7188234329223633, + "learning_rate": 4.994245518840146e-05, + "loss": 2.1135, + "step": 151 + }, + { + "epoch": 0.021889400921658985, + "grad_norm": 3.088263750076294, + "learning_rate": 4.994168566611601e-05, + "loss": 2.2821, + "step": 152 + }, + { + "epoch": 0.022033410138248846, + "grad_norm": 4.123351097106934, + "learning_rate": 4.9940911038708686e-05, + "loss": 2.8501, + "step": 153 + }, + { + "epoch": 0.02217741935483871, + "grad_norm": 2.0797510147094727, + "learning_rate": 4.994013130633803e-05, + "loss": 1.6376, + "step": 154 + }, + { + "epoch": 0.022321428571428572, + "grad_norm": 1.82114839553833, + "learning_rate": 4.993934646916364e-05, + "loss": 1.3916, + "step": 155 + }, + { + "epoch": 0.022465437788018433, + "grad_norm": 2.9615752696990967, + "learning_rate": 4.9938556527346155e-05, + "loss": 2.6644, + "step": 156 + }, + { + "epoch": 0.022609447004608294, + "grad_norm": 2.5467231273651123, + "learning_rate": 4.9937761481047265e-05, + "loss": 2.7126, + "step": 157 + }, + { + "epoch": 0.022753456221198155, + "grad_norm": 2.6525495052337646, + "learning_rate": 4.99369613304297e-05, + "loss": 1.6911, + "step": 158 + }, + { + "epoch": 0.02289746543778802, + "grad_norm": 1.9745975732803345, + "learning_rate": 4.9936156075657245e-05, + "loss": 1.8815, + "step": 159 + }, + { + "epoch": 0.02304147465437788, + "grad_norm": 1.9721035957336426, + "learning_rate": 4.993534571689471e-05, + "loss": 1.497, + "step": 160 + }, + { + "epoch": 0.023185483870967742, + "grad_norm": 2.5985803604125977, + "learning_rate": 4.993453025430797e-05, + "loss": 1.5745, + "step": 161 + }, + { + "epoch": 0.023329493087557603, + "grad_norm": 3.1163322925567627, + "learning_rate": 4.9933709688063935e-05, + "loss": 1.8669, + "step": 162 + }, + { + "epoch": 0.023473502304147464, + "grad_norm": 2.3309366703033447, + "learning_rate": 4.993288401833055e-05, + "loss": 3.2113, + "step": 163 + }, + { + "epoch": 0.023617511520737326, + "grad_norm": 1.8410059213638306, + "learning_rate": 4.993205324527683e-05, + "loss": 2.2339, + "step": 164 + }, + { + "epoch": 0.02376152073732719, + "grad_norm": 2.367093801498413, + "learning_rate": 4.99312173690728e-05, + "loss": 1.7787, + "step": 165 + }, + { + "epoch": 0.02390552995391705, + "grad_norm": 2.691084861755371, + "learning_rate": 4.993037638988958e-05, + "loss": 2.2402, + "step": 166 + }, + { + "epoch": 0.024049539170506912, + "grad_norm": 1.7695673704147339, + "learning_rate": 4.992953030789927e-05, + "loss": 1.5765, + "step": 167 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 2.3219804763793945, + "learning_rate": 4.9928679123275065e-05, + "loss": 2.262, + "step": 168 + }, + { + "epoch": 0.024337557603686635, + "grad_norm": 1.6684150695800781, + "learning_rate": 4.992782283619118e-05, + "loss": 1.7065, + "step": 169 + }, + { + "epoch": 0.0244815668202765, + "grad_norm": 1.434299111366272, + "learning_rate": 4.992696144682291e-05, + "loss": 1.4936, + "step": 170 + }, + { + "epoch": 0.02462557603686636, + "grad_norm": 3.129185914993286, + "learning_rate": 4.9926094955346526e-05, + "loss": 2.0031, + "step": 171 + }, + { + "epoch": 0.02476958525345622, + "grad_norm": 1.8196148872375488, + "learning_rate": 4.99252233619394e-05, + "loss": 1.9122, + "step": 172 + }, + { + "epoch": 0.024913594470046083, + "grad_norm": 2.5063042640686035, + "learning_rate": 4.992434666677993e-05, + "loss": 2.0456, + "step": 173 + }, + { + "epoch": 0.025057603686635944, + "grad_norm": 2.5241332054138184, + "learning_rate": 4.992346487004757e-05, + "loss": 2.2062, + "step": 174 + }, + { + "epoch": 0.025201612903225805, + "grad_norm": 2.5988030433654785, + "learning_rate": 4.9922577971922804e-05, + "loss": 2.2962, + "step": 175 + }, + { + "epoch": 0.02534562211981567, + "grad_norm": 4.069245338439941, + "learning_rate": 4.992168597258715e-05, + "loss": 2.0886, + "step": 176 + }, + { + "epoch": 0.02548963133640553, + "grad_norm": 2.378105878829956, + "learning_rate": 4.99207888722232e-05, + "loss": 1.746, + "step": 177 + }, + { + "epoch": 0.025633640552995392, + "grad_norm": 2.4982898235321045, + "learning_rate": 4.991988667101457e-05, + "loss": 2.3038, + "step": 178 + }, + { + "epoch": 0.025777649769585253, + "grad_norm": 2.5849082469940186, + "learning_rate": 4.991897936914593e-05, + "loss": 1.5666, + "step": 179 + }, + { + "epoch": 0.025921658986175114, + "grad_norm": 2.2449848651885986, + "learning_rate": 4.991806696680298e-05, + "loss": 2.1047, + "step": 180 + }, + { + "epoch": 0.02606566820276498, + "grad_norm": 2.2515430450439453, + "learning_rate": 4.991714946417247e-05, + "loss": 2.3806, + "step": 181 + }, + { + "epoch": 0.02620967741935484, + "grad_norm": 1.7759323120117188, + "learning_rate": 4.9916226861442204e-05, + "loss": 2.1071, + "step": 182 + }, + { + "epoch": 0.0263536866359447, + "grad_norm": 2.07549786567688, + "learning_rate": 4.991529915880103e-05, + "loss": 2.5757, + "step": 183 + }, + { + "epoch": 0.026497695852534562, + "grad_norm": 1.8926879167556763, + "learning_rate": 4.9914366356438814e-05, + "loss": 1.1825, + "step": 184 + }, + { + "epoch": 0.026641705069124423, + "grad_norm": 2.59169340133667, + "learning_rate": 4.9913428454546494e-05, + "loss": 2.7584, + "step": 185 + }, + { + "epoch": 0.026785714285714284, + "grad_norm": 2.334949493408203, + "learning_rate": 4.991248545331605e-05, + "loss": 2.1578, + "step": 186 + }, + { + "epoch": 0.02692972350230415, + "grad_norm": 1.9174121618270874, + "learning_rate": 4.991153735294049e-05, + "loss": 2.5909, + "step": 187 + }, + { + "epoch": 0.02707373271889401, + "grad_norm": 2.0596699714660645, + "learning_rate": 4.991058415361386e-05, + "loss": 2.1297, + "step": 188 + }, + { + "epoch": 0.02721774193548387, + "grad_norm": 1.6076545715332031, + "learning_rate": 4.990962585553128e-05, + "loss": 1.0265, + "step": 189 + }, + { + "epoch": 0.027361751152073732, + "grad_norm": 2.113981008529663, + "learning_rate": 4.990866245888889e-05, + "loss": 1.9824, + "step": 190 + }, + { + "epoch": 0.027505760368663593, + "grad_norm": 4.1225152015686035, + "learning_rate": 4.9907693963883884e-05, + "loss": 3.123, + "step": 191 + }, + { + "epoch": 0.027649769585253458, + "grad_norm": 1.8539068698883057, + "learning_rate": 4.99067203707145e-05, + "loss": 1.5319, + "step": 192 + }, + { + "epoch": 0.02779377880184332, + "grad_norm": 2.368051052093506, + "learning_rate": 4.9905741679580007e-05, + "loss": 2.1466, + "step": 193 + }, + { + "epoch": 0.02793778801843318, + "grad_norm": 2.628180503845215, + "learning_rate": 4.990475789068072e-05, + "loss": 1.9729, + "step": 194 + }, + { + "epoch": 0.02808179723502304, + "grad_norm": 3.4089622497558594, + "learning_rate": 4.9903769004218024e-05, + "loss": 3.2299, + "step": 195 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 1.9745320081710815, + "learning_rate": 4.990277502039431e-05, + "loss": 2.5572, + "step": 196 + }, + { + "epoch": 0.028369815668202764, + "grad_norm": 2.038966417312622, + "learning_rate": 4.9901775939413026e-05, + "loss": 1.3201, + "step": 197 + }, + { + "epoch": 0.028513824884792628, + "grad_norm": 2.2828519344329834, + "learning_rate": 4.9900771761478685e-05, + "loss": 3.4881, + "step": 198 + }, + { + "epoch": 0.02865783410138249, + "grad_norm": 2.4174892902374268, + "learning_rate": 4.9899762486796796e-05, + "loss": 1.9477, + "step": 199 + }, + { + "epoch": 0.02880184331797235, + "grad_norm": 1.1866940259933472, + "learning_rate": 4.989874811557397e-05, + "loss": 1.8625, + "step": 200 + }, + { + "epoch": 0.02894585253456221, + "grad_norm": 2.2913496494293213, + "learning_rate": 4.989772864801782e-05, + "loss": 2.4548, + "step": 201 + }, + { + "epoch": 0.029089861751152073, + "grad_norm": 1.8021252155303955, + "learning_rate": 4.9896704084337e-05, + "loss": 2.232, + "step": 202 + }, + { + "epoch": 0.029233870967741934, + "grad_norm": 1.8777146339416504, + "learning_rate": 4.989567442474123e-05, + "loss": 1.651, + "step": 203 + }, + { + "epoch": 0.0293778801843318, + "grad_norm": 1.7925447225570679, + "learning_rate": 4.989463966944127e-05, + "loss": 2.7226, + "step": 204 + }, + { + "epoch": 0.02952188940092166, + "grad_norm": 1.6317986249923706, + "learning_rate": 4.9893599818648904e-05, + "loss": 1.7204, + "step": 205 + }, + { + "epoch": 0.02966589861751152, + "grad_norm": 1.9447832107543945, + "learning_rate": 4.989255487257697e-05, + "loss": 2.3326, + "step": 206 + }, + { + "epoch": 0.029809907834101382, + "grad_norm": 2.7441060543060303, + "learning_rate": 4.9891504831439375e-05, + "loss": 2.0735, + "step": 207 + }, + { + "epoch": 0.029953917050691243, + "grad_norm": 2.145132064819336, + "learning_rate": 4.989044969545101e-05, + "loss": 1.9261, + "step": 208 + }, + { + "epoch": 0.030097926267281108, + "grad_norm": 2.2947516441345215, + "learning_rate": 4.988938946482786e-05, + "loss": 1.1858, + "step": 209 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 1.3634928464889526, + "learning_rate": 4.988832413978693e-05, + "loss": 1.7622, + "step": 210 + }, + { + "epoch": 0.03038594470046083, + "grad_norm": 2.0713212490081787, + "learning_rate": 4.988725372054629e-05, + "loss": 2.1902, + "step": 211 + }, + { + "epoch": 0.03052995391705069, + "grad_norm": 1.7390276193618774, + "learning_rate": 4.988617820732502e-05, + "loss": 1.3188, + "step": 212 + }, + { + "epoch": 0.030673963133640552, + "grad_norm": 4.161805152893066, + "learning_rate": 4.9885097600343254e-05, + "loss": 3.7369, + "step": 213 + }, + { + "epoch": 0.030817972350230413, + "grad_norm": 2.604933023452759, + "learning_rate": 4.988401189982218e-05, + "loss": 1.8913, + "step": 214 + }, + { + "epoch": 0.030961981566820278, + "grad_norm": 1.6910828351974487, + "learning_rate": 4.988292110598403e-05, + "loss": 1.8581, + "step": 215 + }, + { + "epoch": 0.03110599078341014, + "grad_norm": 2.040647029876709, + "learning_rate": 4.988182521905205e-05, + "loss": 1.801, + "step": 216 + }, + { + "epoch": 0.03125, + "grad_norm": 1.3934439420700073, + "learning_rate": 4.9880724239250565e-05, + "loss": 1.6621, + "step": 217 + }, + { + "epoch": 0.031394009216589865, + "grad_norm": 2.042985200881958, + "learning_rate": 4.987961816680492e-05, + "loss": 1.5774, + "step": 218 + }, + { + "epoch": 0.03153801843317972, + "grad_norm": 1.9562087059020996, + "learning_rate": 4.987850700194152e-05, + "loss": 1.3688, + "step": 219 + }, + { + "epoch": 0.03168202764976959, + "grad_norm": 1.5122274160385132, + "learning_rate": 4.9877390744887784e-05, + "loss": 1.4933, + "step": 220 + }, + { + "epoch": 0.031826036866359445, + "grad_norm": 3.482300281524658, + "learning_rate": 4.98762693958722e-05, + "loss": 2.2244, + "step": 221 + }, + { + "epoch": 0.03197004608294931, + "grad_norm": 2.288222312927246, + "learning_rate": 4.987514295512428e-05, + "loss": 1.4899, + "step": 222 + }, + { + "epoch": 0.032114055299539174, + "grad_norm": 2.4324841499328613, + "learning_rate": 4.987401142287459e-05, + "loss": 2.1073, + "step": 223 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 2.3672738075256348, + "learning_rate": 4.987287479935475e-05, + "loss": 1.0661, + "step": 224 + }, + { + "epoch": 0.032402073732718896, + "grad_norm": 3.114368438720703, + "learning_rate": 4.987173308479738e-05, + "loss": 2.2658, + "step": 225 + }, + { + "epoch": 0.032546082949308754, + "grad_norm": 2.737032651901245, + "learning_rate": 4.987058627943619e-05, + "loss": 1.5633, + "step": 226 + }, + { + "epoch": 0.03269009216589862, + "grad_norm": 2.61578106880188, + "learning_rate": 4.98694343835059e-05, + "loss": 1.4491, + "step": 227 + }, + { + "epoch": 0.032834101382488476, + "grad_norm": 1.891682744026184, + "learning_rate": 4.986827739724228e-05, + "loss": 2.0146, + "step": 228 + }, + { + "epoch": 0.03297811059907834, + "grad_norm": 1.9205386638641357, + "learning_rate": 4.986711532088216e-05, + "loss": 0.8661, + "step": 229 + }, + { + "epoch": 0.033122119815668205, + "grad_norm": 1.794940710067749, + "learning_rate": 4.9865948154663376e-05, + "loss": 1.4466, + "step": 230 + }, + { + "epoch": 0.03326612903225806, + "grad_norm": 3.2222681045532227, + "learning_rate": 4.986477589882485e-05, + "loss": 2.5906, + "step": 231 + }, + { + "epoch": 0.03341013824884793, + "grad_norm": 2.8244669437408447, + "learning_rate": 4.98635985536065e-05, + "loss": 1.9234, + "step": 232 + }, + { + "epoch": 0.033554147465437785, + "grad_norm": 2.8919918537139893, + "learning_rate": 4.986241611924932e-05, + "loss": 1.8669, + "step": 233 + }, + { + "epoch": 0.03369815668202765, + "grad_norm": 3.4223971366882324, + "learning_rate": 4.9861228595995326e-05, + "loss": 2.6955, + "step": 234 + }, + { + "epoch": 0.033842165898617514, + "grad_norm": 2.6255218982696533, + "learning_rate": 4.98600359840876e-05, + "loss": 3.4048, + "step": 235 + }, + { + "epoch": 0.03398617511520737, + "grad_norm": 1.9981160163879395, + "learning_rate": 4.9858838283770215e-05, + "loss": 1.8343, + "step": 236 + }, + { + "epoch": 0.034130184331797236, + "grad_norm": 2.8727142810821533, + "learning_rate": 4.985763549528835e-05, + "loss": 2.3524, + "step": 237 + }, + { + "epoch": 0.034274193548387094, + "grad_norm": 2.4221768379211426, + "learning_rate": 4.985642761888819e-05, + "loss": 1.8787, + "step": 238 + }, + { + "epoch": 0.03441820276497696, + "grad_norm": 2.3322486877441406, + "learning_rate": 4.985521465481695e-05, + "loss": 2.2922, + "step": 239 + }, + { + "epoch": 0.03456221198156682, + "grad_norm": 3.3741631507873535, + "learning_rate": 4.9853996603322916e-05, + "loss": 1.6798, + "step": 240 + }, + { + "epoch": 0.03470622119815668, + "grad_norm": 1.701639175415039, + "learning_rate": 4.98527734646554e-05, + "loss": 1.2189, + "step": 241 + }, + { + "epoch": 0.034850230414746546, + "grad_norm": 3.044304370880127, + "learning_rate": 4.9851545239064755e-05, + "loss": 1.4883, + "step": 242 + }, + { + "epoch": 0.0349942396313364, + "grad_norm": 2.7362723350524902, + "learning_rate": 4.985031192680237e-05, + "loss": 2.0001, + "step": 243 + }, + { + "epoch": 0.03513824884792627, + "grad_norm": 2.0627050399780273, + "learning_rate": 4.98490735281207e-05, + "loss": 1.8519, + "step": 244 + }, + { + "epoch": 0.03528225806451613, + "grad_norm": 2.8644959926605225, + "learning_rate": 4.984783004327321e-05, + "loss": 2.0512, + "step": 245 + }, + { + "epoch": 0.03542626728110599, + "grad_norm": 1.8248634338378906, + "learning_rate": 4.984658147251442e-05, + "loss": 1.725, + "step": 246 + }, + { + "epoch": 0.035570276497695855, + "grad_norm": 4.551502704620361, + "learning_rate": 4.984532781609989e-05, + "loss": 2.6323, + "step": 247 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 3.126960277557373, + "learning_rate": 4.984406907428623e-05, + "loss": 1.5795, + "step": 248 + }, + { + "epoch": 0.03585829493087558, + "grad_norm": 2.3643016815185547, + "learning_rate": 4.984280524733107e-05, + "loss": 1.275, + "step": 249 + }, + { + "epoch": 0.036002304147465435, + "grad_norm": 1.8689227104187012, + "learning_rate": 4.98415363354931e-05, + "loss": 5.1534, + "step": 250 + }, + { + "epoch": 0.0361463133640553, + "grad_norm": 4.540099143981934, + "learning_rate": 4.984026233903204e-05, + "loss": 2.2245, + "step": 251 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 3.6170566082000732, + "learning_rate": 4.983898325820866e-05, + "loss": 1.8452, + "step": 252 + }, + { + "epoch": 0.03643433179723502, + "grad_norm": 2.7079555988311768, + "learning_rate": 4.9837699093284765e-05, + "loss": 2.1177, + "step": 253 + }, + { + "epoch": 0.036578341013824886, + "grad_norm": 2.3701915740966797, + "learning_rate": 4.983640984452319e-05, + "loss": 2.1043, + "step": 254 + }, + { + "epoch": 0.036722350230414744, + "grad_norm": 1.8217356204986572, + "learning_rate": 4.9835115512187834e-05, + "loss": 1.2438, + "step": 255 + }, + { + "epoch": 0.03686635944700461, + "grad_norm": 4.052347183227539, + "learning_rate": 4.983381609654362e-05, + "loss": 1.866, + "step": 256 + }, + { + "epoch": 0.03701036866359447, + "grad_norm": 1.6774176359176636, + "learning_rate": 4.983251159785651e-05, + "loss": 0.9232, + "step": 257 + }, + { + "epoch": 0.03715437788018433, + "grad_norm": 3.7263576984405518, + "learning_rate": 4.983120201639353e-05, + "loss": 2.0183, + "step": 258 + }, + { + "epoch": 0.037298387096774195, + "grad_norm": 2.1659698486328125, + "learning_rate": 4.98298873524227e-05, + "loss": 1.1457, + "step": 259 + }, + { + "epoch": 0.03744239631336405, + "grad_norm": 3.9632930755615234, + "learning_rate": 4.982856760621313e-05, + "loss": 1.9733, + "step": 260 + }, + { + "epoch": 0.03758640552995392, + "grad_norm": 2.8567299842834473, + "learning_rate": 4.982724277803494e-05, + "loss": 1.5088, + "step": 261 + }, + { + "epoch": 0.03773041474654378, + "grad_norm": 3.41469144821167, + "learning_rate": 4.9825912868159304e-05, + "loss": 1.624, + "step": 262 + }, + { + "epoch": 0.03787442396313364, + "grad_norm": 4.040119647979736, + "learning_rate": 4.982457787685842e-05, + "loss": 2.6058, + "step": 263 + }, + { + "epoch": 0.038018433179723504, + "grad_norm": 2.387637138366699, + "learning_rate": 4.9823237804405556e-05, + "loss": 2.1812, + "step": 264 + }, + { + "epoch": 0.03816244239631336, + "grad_norm": 1.5378010272979736, + "learning_rate": 4.982189265107499e-05, + "loss": 1.2567, + "step": 265 + }, + { + "epoch": 0.038306451612903226, + "grad_norm": 2.40761661529541, + "learning_rate": 4.9820542417142046e-05, + "loss": 1.3743, + "step": 266 + }, + { + "epoch": 0.038450460829493084, + "grad_norm": 4.465402126312256, + "learning_rate": 4.981918710288309e-05, + "loss": 2.2828, + "step": 267 + }, + { + "epoch": 0.03859447004608295, + "grad_norm": 4.871656894683838, + "learning_rate": 4.981782670857555e-05, + "loss": 1.8028, + "step": 268 + }, + { + "epoch": 0.03873847926267281, + "grad_norm": 3.5532186031341553, + "learning_rate": 4.9816461234497866e-05, + "loss": 1.9784, + "step": 269 + }, + { + "epoch": 0.03888248847926267, + "grad_norm": 3.182777166366577, + "learning_rate": 4.981509068092952e-05, + "loss": 2.4241, + "step": 270 + }, + { + "epoch": 0.039026497695852536, + "grad_norm": 4.068339824676514, + "learning_rate": 4.9813715048151046e-05, + "loss": 1.6463, + "step": 271 + }, + { + "epoch": 0.03917050691244239, + "grad_norm": 3.398367404937744, + "learning_rate": 4.9812334336444004e-05, + "loss": 1.9214, + "step": 272 + }, + { + "epoch": 0.03931451612903226, + "grad_norm": 4.369844436645508, + "learning_rate": 4.981094854609101e-05, + "loss": 2.1146, + "step": 273 + }, + { + "epoch": 0.03945852534562212, + "grad_norm": 3.162872076034546, + "learning_rate": 4.9809557677375704e-05, + "loss": 2.5095, + "step": 274 + }, + { + "epoch": 0.03960253456221198, + "grad_norm": 2.452432155609131, + "learning_rate": 4.980816173058279e-05, + "loss": 3.4455, + "step": 275 + }, + { + "epoch": 0.039746543778801845, + "grad_norm": 3.706747055053711, + "learning_rate": 4.9806760705997966e-05, + "loss": 1.735, + "step": 276 + }, + { + "epoch": 0.0398905529953917, + "grad_norm": 1.2772737741470337, + "learning_rate": 4.980535460390801e-05, + "loss": 0.6546, + "step": 277 + }, + { + "epoch": 0.04003456221198157, + "grad_norm": 2.814746856689453, + "learning_rate": 4.980394342460074e-05, + "loss": 1.7501, + "step": 278 + }, + { + "epoch": 0.04017857142857143, + "grad_norm": 2.7781925201416016, + "learning_rate": 4.980252716836498e-05, + "loss": 2.7258, + "step": 279 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 2.146742820739746, + "learning_rate": 4.980110583549062e-05, + "loss": 1.0519, + "step": 280 + }, + { + "epoch": 0.040466589861751154, + "grad_norm": 2.560239315032959, + "learning_rate": 4.979967942626858e-05, + "loss": 1.5211, + "step": 281 + }, + { + "epoch": 0.04061059907834101, + "grad_norm": 2.88525128364563, + "learning_rate": 4.979824794099082e-05, + "loss": 2.1673, + "step": 282 + }, + { + "epoch": 0.040754608294930876, + "grad_norm": 3.5404763221740723, + "learning_rate": 4.979681137995034e-05, + "loss": 1.6509, + "step": 283 + }, + { + "epoch": 0.04089861751152074, + "grad_norm": 2.1443560123443604, + "learning_rate": 4.979536974344118e-05, + "loss": 1.4166, + "step": 284 + }, + { + "epoch": 0.0410426267281106, + "grad_norm": 2.4609785079956055, + "learning_rate": 4.979392303175842e-05, + "loss": 2.0484, + "step": 285 + }, + { + "epoch": 0.04118663594470046, + "grad_norm": 4.013987064361572, + "learning_rate": 4.979247124519817e-05, + "loss": 3.0529, + "step": 286 + }, + { + "epoch": 0.04133064516129032, + "grad_norm": 3.078587532043457, + "learning_rate": 4.979101438405759e-05, + "loss": 1.9565, + "step": 287 + }, + { + "epoch": 0.041474654377880185, + "grad_norm": 2.5302894115448, + "learning_rate": 4.9789552448634874e-05, + "loss": 1.5136, + "step": 288 + }, + { + "epoch": 0.04161866359447004, + "grad_norm": 3.192695379257202, + "learning_rate": 4.978808543922925e-05, + "loss": 2.1873, + "step": 289 + }, + { + "epoch": 0.04176267281105991, + "grad_norm": 2.1981019973754883, + "learning_rate": 4.9786613356141e-05, + "loss": 0.6374, + "step": 290 + }, + { + "epoch": 0.04190668202764977, + "grad_norm": 3.317621946334839, + "learning_rate": 4.978513619967141e-05, + "loss": 1.5994, + "step": 291 + }, + { + "epoch": 0.04205069124423963, + "grad_norm": 2.4284000396728516, + "learning_rate": 4.9783653970122854e-05, + "loss": 1.5384, + "step": 292 + }, + { + "epoch": 0.042194700460829494, + "grad_norm": 3.4165384769439697, + "learning_rate": 4.97821666677987e-05, + "loss": 1.9987, + "step": 293 + }, + { + "epoch": 0.04233870967741935, + "grad_norm": 3.0300941467285156, + "learning_rate": 4.9780674293003386e-05, + "loss": 1.2029, + "step": 294 + }, + { + "epoch": 0.042482718894009217, + "grad_norm": 2.7069251537323, + "learning_rate": 4.9779176846042366e-05, + "loss": 0.8835, + "step": 295 + }, + { + "epoch": 0.04262672811059908, + "grad_norm": 3.556356906890869, + "learning_rate": 4.977767432722215e-05, + "loss": 1.1764, + "step": 296 + }, + { + "epoch": 0.04277073732718894, + "grad_norm": 1.9838769435882568, + "learning_rate": 4.977616673685026e-05, + "loss": 1.1025, + "step": 297 + }, + { + "epoch": 0.0429147465437788, + "grad_norm": 2.8446812629699707, + "learning_rate": 4.9774654075235286e-05, + "loss": 2.5666, + "step": 298 + }, + { + "epoch": 0.04305875576036866, + "grad_norm": 3.6095120906829834, + "learning_rate": 4.9773136342686835e-05, + "loss": 1.5967, + "step": 299 + }, + { + "epoch": 0.043202764976958526, + "grad_norm": 3.8366310596466064, + "learning_rate": 4.9771613539515574e-05, + "loss": 2.0884, + "step": 300 + }, + { + "epoch": 0.04334677419354839, + "grad_norm": 5.126287460327148, + "learning_rate": 4.977008566603317e-05, + "loss": 1.0344, + "step": 301 + }, + { + "epoch": 0.04349078341013825, + "grad_norm": 2.0179128646850586, + "learning_rate": 4.976855272255239e-05, + "loss": 1.2609, + "step": 302 + }, + { + "epoch": 0.04363479262672811, + "grad_norm": 2.3522181510925293, + "learning_rate": 4.976701470938696e-05, + "loss": 2.003, + "step": 303 + }, + { + "epoch": 0.04377880184331797, + "grad_norm": 4.781628131866455, + "learning_rate": 4.9765471626851703e-05, + "loss": 1.839, + "step": 304 + }, + { + "epoch": 0.043922811059907835, + "grad_norm": 2.1846587657928467, + "learning_rate": 4.9763923475262464e-05, + "loss": 2.0024, + "step": 305 + }, + { + "epoch": 0.04406682027649769, + "grad_norm": 2.7739369869232178, + "learning_rate": 4.9762370254936115e-05, + "loss": 2.9277, + "step": 306 + }, + { + "epoch": 0.04421082949308756, + "grad_norm": 3.660083055496216, + "learning_rate": 4.976081196619057e-05, + "loss": 2.5416, + "step": 307 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 2.3524558544158936, + "learning_rate": 4.97592486093448e-05, + "loss": 1.5162, + "step": 308 + }, + { + "epoch": 0.04449884792626728, + "grad_norm": 2.235851764678955, + "learning_rate": 4.975768018471877e-05, + "loss": 2.5583, + "step": 309 + }, + { + "epoch": 0.044642857142857144, + "grad_norm": 2.432739496231079, + "learning_rate": 4.975610669263353e-05, + "loss": 1.4624, + "step": 310 + }, + { + "epoch": 0.044786866359447, + "grad_norm": 3.056042194366455, + "learning_rate": 4.975452813341114e-05, + "loss": 1.5897, + "step": 311 + }, + { + "epoch": 0.044930875576036866, + "grad_norm": 2.0111398696899414, + "learning_rate": 4.9752944507374704e-05, + "loss": 1.041, + "step": 312 + }, + { + "epoch": 0.04507488479262673, + "grad_norm": 3.064060926437378, + "learning_rate": 4.975135581484836e-05, + "loss": 1.7926, + "step": 313 + }, + { + "epoch": 0.04521889400921659, + "grad_norm": 4.319392681121826, + "learning_rate": 4.974976205615729e-05, + "loss": 2.3241, + "step": 314 + }, + { + "epoch": 0.04536290322580645, + "grad_norm": 2.588327646255493, + "learning_rate": 4.974816323162769e-05, + "loss": 1.1548, + "step": 315 + }, + { + "epoch": 0.04550691244239631, + "grad_norm": 3.4128735065460205, + "learning_rate": 4.974655934158684e-05, + "loss": 3.4577, + "step": 316 + }, + { + "epoch": 0.045650921658986175, + "grad_norm": 2.5935921669006348, + "learning_rate": 4.9744950386363e-05, + "loss": 1.7166, + "step": 317 + }, + { + "epoch": 0.04579493087557604, + "grad_norm": 2.1984691619873047, + "learning_rate": 4.974333636628552e-05, + "loss": 1.2649, + "step": 318 + }, + { + "epoch": 0.0459389400921659, + "grad_norm": 2.2653706073760986, + "learning_rate": 4.974171728168475e-05, + "loss": 2.839, + "step": 319 + }, + { + "epoch": 0.04608294930875576, + "grad_norm": 2.702493667602539, + "learning_rate": 4.974009313289207e-05, + "loss": 1.7781, + "step": 320 + }, + { + "epoch": 0.04622695852534562, + "grad_norm": 3.4411139488220215, + "learning_rate": 4.9738463920239955e-05, + "loss": 1.5396, + "step": 321 + }, + { + "epoch": 0.046370967741935484, + "grad_norm": 2.3458144664764404, + "learning_rate": 4.973682964406183e-05, + "loss": 0.9844, + "step": 322 + }, + { + "epoch": 0.04651497695852535, + "grad_norm": 2.3639841079711914, + "learning_rate": 4.973519030469225e-05, + "loss": 1.387, + "step": 323 + }, + { + "epoch": 0.04665898617511521, + "grad_norm": 4.347269535064697, + "learning_rate": 4.973354590246672e-05, + "loss": 2.0338, + "step": 324 + }, + { + "epoch": 0.04680299539170507, + "grad_norm": 2.389857530593872, + "learning_rate": 4.9731896437721826e-05, + "loss": 1.8687, + "step": 325 + }, + { + "epoch": 0.04694700460829493, + "grad_norm": 2.350520610809326, + "learning_rate": 4.973024191079521e-05, + "loss": 1.5539, + "step": 326 + }, + { + "epoch": 0.04709101382488479, + "grad_norm": 2.5230698585510254, + "learning_rate": 4.972858232202549e-05, + "loss": 1.4047, + "step": 327 + }, + { + "epoch": 0.04723502304147465, + "grad_norm": 2.2744088172912598, + "learning_rate": 4.972691767175238e-05, + "loss": 1.1333, + "step": 328 + }, + { + "epoch": 0.047379032258064516, + "grad_norm": 4.1282572746276855, + "learning_rate": 4.972524796031659e-05, + "loss": 3.3564, + "step": 329 + }, + { + "epoch": 0.04752304147465438, + "grad_norm": 2.8907487392425537, + "learning_rate": 4.9723573188059894e-05, + "loss": 0.8487, + "step": 330 + }, + { + "epoch": 0.04766705069124424, + "grad_norm": 2.6049697399139404, + "learning_rate": 4.972189335532508e-05, + "loss": 0.5925, + "step": 331 + }, + { + "epoch": 0.0478110599078341, + "grad_norm": 3.3478856086730957, + "learning_rate": 4.9720208462455975e-05, + "loss": 1.8246, + "step": 332 + }, + { + "epoch": 0.04795506912442396, + "grad_norm": 3.497406244277954, + "learning_rate": 4.971851850979745e-05, + "loss": 1.091, + "step": 333 + }, + { + "epoch": 0.048099078341013825, + "grad_norm": 3.564157485961914, + "learning_rate": 4.971682349769541e-05, + "loss": 2.6538, + "step": 334 + }, + { + "epoch": 0.04824308755760369, + "grad_norm": 3.9001972675323486, + "learning_rate": 4.97151234264968e-05, + "loss": 2.0585, + "step": 335 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 5.318079948425293, + "learning_rate": 4.971341829654959e-05, + "loss": 2.5094, + "step": 336 + }, + { + "epoch": 0.04853110599078341, + "grad_norm": 2.568713426589966, + "learning_rate": 4.971170810820279e-05, + "loss": 1.8282, + "step": 337 + }, + { + "epoch": 0.04867511520737327, + "grad_norm": 3.0869979858398438, + "learning_rate": 4.970999286180644e-05, + "loss": 1.2825, + "step": 338 + }, + { + "epoch": 0.048819124423963134, + "grad_norm": 2.8025596141815186, + "learning_rate": 4.970827255771162e-05, + "loss": 1.6043, + "step": 339 + }, + { + "epoch": 0.048963133640553, + "grad_norm": 2.5250933170318604, + "learning_rate": 4.970654719627046e-05, + "loss": 0.9027, + "step": 340 + }, + { + "epoch": 0.049107142857142856, + "grad_norm": 3.5213699340820312, + "learning_rate": 4.970481677783609e-05, + "loss": 2.714, + "step": 341 + }, + { + "epoch": 0.04925115207373272, + "grad_norm": 2.9208576679229736, + "learning_rate": 4.970308130276272e-05, + "loss": 0.6678, + "step": 342 + }, + { + "epoch": 0.04939516129032258, + "grad_norm": 2.7967045307159424, + "learning_rate": 4.970134077140556e-05, + "loss": 1.6588, + "step": 343 + }, + { + "epoch": 0.04953917050691244, + "grad_norm": 3.732151508331299, + "learning_rate": 4.9699595184120853e-05, + "loss": 2.2548, + "step": 344 + }, + { + "epoch": 0.04968317972350231, + "grad_norm": 2.9591431617736816, + "learning_rate": 4.969784454126591e-05, + "loss": 1.5615, + "step": 345 + }, + { + "epoch": 0.049827188940092165, + "grad_norm": 4.843438625335693, + "learning_rate": 4.9696088843199046e-05, + "loss": 0.4071, + "step": 346 + }, + { + "epoch": 0.04997119815668203, + "grad_norm": 4.323400020599365, + "learning_rate": 4.969432809027962e-05, + "loss": 2.0711, + "step": 347 + }, + { + "epoch": 0.05011520737327189, + "grad_norm": 2.4797301292419434, + "learning_rate": 4.969256228286804e-05, + "loss": 1.2414, + "step": 348 + }, + { + "epoch": 0.05025921658986175, + "grad_norm": 3.519249677658081, + "learning_rate": 4.969079142132571e-05, + "loss": 2.4563, + "step": 349 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 2.8808319568634033, + "learning_rate": 4.9689015506015124e-05, + "loss": 1.6561, + "step": 350 + }, + { + "epoch": 0.050547235023041474, + "grad_norm": 5.634980201721191, + "learning_rate": 4.9687234537299765e-05, + "loss": 2.8055, + "step": 351 + }, + { + "epoch": 0.05069124423963134, + "grad_norm": 3.6427059173583984, + "learning_rate": 4.9685448515544166e-05, + "loss": 1.1135, + "step": 352 + }, + { + "epoch": 0.0508352534562212, + "grad_norm": 3.313081979751587, + "learning_rate": 4.9683657441113884e-05, + "loss": 3.0836, + "step": 353 + }, + { + "epoch": 0.05097926267281106, + "grad_norm": 2.020021438598633, + "learning_rate": 4.968186131437554e-05, + "loss": 1.9957, + "step": 354 + }, + { + "epoch": 0.05112327188940092, + "grad_norm": 3.7104415893554688, + "learning_rate": 4.968006013569677e-05, + "loss": 1.4607, + "step": 355 + }, + { + "epoch": 0.051267281105990783, + "grad_norm": 7.085813999176025, + "learning_rate": 4.967825390544622e-05, + "loss": 1.7128, + "step": 356 + }, + { + "epoch": 0.05141129032258065, + "grad_norm": 3.181124210357666, + "learning_rate": 4.967644262399362e-05, + "loss": 2.8445, + "step": 357 + }, + { + "epoch": 0.051555299539170506, + "grad_norm": 2.3777191638946533, + "learning_rate": 4.967462629170969e-05, + "loss": 2.1316, + "step": 358 + }, + { + "epoch": 0.05169930875576037, + "grad_norm": 3.506185531616211, + "learning_rate": 4.96728049089662e-05, + "loss": 2.1997, + "step": 359 + }, + { + "epoch": 0.05184331797235023, + "grad_norm": 2.8498380184173584, + "learning_rate": 4.967097847613597e-05, + "loss": 1.6635, + "step": 360 + }, + { + "epoch": 0.05198732718894009, + "grad_norm": 2.8901660442352295, + "learning_rate": 4.966914699359282e-05, + "loss": 1.7879, + "step": 361 + }, + { + "epoch": 0.05213133640552996, + "grad_norm": 2.5326247215270996, + "learning_rate": 4.966731046171164e-05, + "loss": 1.4018, + "step": 362 + }, + { + "epoch": 0.052275345622119815, + "grad_norm": 3.4823482036590576, + "learning_rate": 4.966546888086833e-05, + "loss": 1.4621, + "step": 363 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 1.9757779836654663, + "learning_rate": 4.9663622251439816e-05, + "loss": 0.7964, + "step": 364 + }, + { + "epoch": 0.05256336405529954, + "grad_norm": 3.8553292751312256, + "learning_rate": 4.966177057380409e-05, + "loss": 3.1308, + "step": 365 + }, + { + "epoch": 0.0527073732718894, + "grad_norm": 1.2176287174224854, + "learning_rate": 4.965991384834014e-05, + "loss": 5.3403, + "step": 366 + }, + { + "epoch": 0.05285138248847926, + "grad_norm": 1.8233147859573364, + "learning_rate": 4.965805207542802e-05, + "loss": 1.2617, + "step": 367 + }, + { + "epoch": 0.052995391705069124, + "grad_norm": 4.132518291473389, + "learning_rate": 4.9656185255448785e-05, + "loss": 1.8934, + "step": 368 + }, + { + "epoch": 0.05313940092165899, + "grad_norm": 4.0551862716674805, + "learning_rate": 4.965431338878456e-05, + "loss": 1.6953, + "step": 369 + }, + { + "epoch": 0.053283410138248846, + "grad_norm": 6.559986591339111, + "learning_rate": 4.965243647581847e-05, + "loss": 1.6592, + "step": 370 + }, + { + "epoch": 0.05342741935483871, + "grad_norm": 3.4842450618743896, + "learning_rate": 4.965055451693469e-05, + "loss": 1.1133, + "step": 371 + }, + { + "epoch": 0.05357142857142857, + "grad_norm": 4.1158623695373535, + "learning_rate": 4.964866751251842e-05, + "loss": 1.7885, + "step": 372 + }, + { + "epoch": 0.05371543778801843, + "grad_norm": 2.6869258880615234, + "learning_rate": 4.96467754629559e-05, + "loss": 2.4739, + "step": 373 + }, + { + "epoch": 0.0538594470046083, + "grad_norm": 3.0668272972106934, + "learning_rate": 4.964487836863439e-05, + "loss": 2.0112, + "step": 374 + }, + { + "epoch": 0.054003456221198155, + "grad_norm": 3.75852370262146, + "learning_rate": 4.964297622994222e-05, + "loss": 2.6282, + "step": 375 + }, + { + "epoch": 0.05414746543778802, + "grad_norm": 2.3226561546325684, + "learning_rate": 4.9641069047268684e-05, + "loss": 0.4048, + "step": 376 + }, + { + "epoch": 0.05429147465437788, + "grad_norm": 2.2222118377685547, + "learning_rate": 4.9639156821004184e-05, + "loss": 1.1478, + "step": 377 + }, + { + "epoch": 0.05443548387096774, + "grad_norm": 3.247927188873291, + "learning_rate": 4.9637239551540096e-05, + "loss": 2.8402, + "step": 378 + }, + { + "epoch": 0.05457949308755761, + "grad_norm": 4.6755805015563965, + "learning_rate": 4.963531723926885e-05, + "loss": 2.3409, + "step": 379 + }, + { + "epoch": 0.054723502304147464, + "grad_norm": 2.433645725250244, + "learning_rate": 4.963338988458394e-05, + "loss": 5.0782, + "step": 380 + }, + { + "epoch": 0.05486751152073733, + "grad_norm": 3.889937400817871, + "learning_rate": 4.963145748787982e-05, + "loss": 3.7884, + "step": 381 + }, + { + "epoch": 0.05501152073732719, + "grad_norm": 1.4939064979553223, + "learning_rate": 4.962952004955204e-05, + "loss": 1.4219, + "step": 382 + }, + { + "epoch": 0.05515552995391705, + "grad_norm": 2.200648546218872, + "learning_rate": 4.9627577569997164e-05, + "loss": 0.8473, + "step": 383 + }, + { + "epoch": 0.055299539170506916, + "grad_norm": 2.3939387798309326, + "learning_rate": 4.962563004961276e-05, + "loss": 0.8844, + "step": 384 + }, + { + "epoch": 0.055443548387096774, + "grad_norm": 3.6952030658721924, + "learning_rate": 4.962367748879748e-05, + "loss": 1.1926, + "step": 385 + }, + { + "epoch": 0.05558755760368664, + "grad_norm": 6.031922817230225, + "learning_rate": 4.9621719887950966e-05, + "loss": 3.0587, + "step": 386 + }, + { + "epoch": 0.055731566820276496, + "grad_norm": 4.959585666656494, + "learning_rate": 4.9619757247473894e-05, + "loss": 1.9514, + "step": 387 + }, + { + "epoch": 0.05587557603686636, + "grad_norm": 1.5882185697555542, + "learning_rate": 4.9617789567767995e-05, + "loss": 0.6408, + "step": 388 + }, + { + "epoch": 0.05601958525345622, + "grad_norm": 1.7497529983520508, + "learning_rate": 4.9615816849236016e-05, + "loss": 1.2417, + "step": 389 + }, + { + "epoch": 0.05616359447004608, + "grad_norm": 2.7870688438415527, + "learning_rate": 4.9613839092281735e-05, + "loss": 2.5172, + "step": 390 + }, + { + "epoch": 0.05630760368663595, + "grad_norm": 3.337810516357422, + "learning_rate": 4.9611856297309965e-05, + "loss": 0.7899, + "step": 391 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 2.364415407180786, + "learning_rate": 4.9609868464726544e-05, + "loss": 0.958, + "step": 392 + }, + { + "epoch": 0.05659562211981567, + "grad_norm": 4.1733574867248535, + "learning_rate": 4.960787559493836e-05, + "loss": 2.3146, + "step": 393 + }, + { + "epoch": 0.05673963133640553, + "grad_norm": 2.824965715408325, + "learning_rate": 4.9605877688353294e-05, + "loss": 0.4735, + "step": 394 + }, + { + "epoch": 0.05688364055299539, + "grad_norm": 3.784836769104004, + "learning_rate": 4.960387474538031e-05, + "loss": 2.8236, + "step": 395 + }, + { + "epoch": 0.057027649769585256, + "grad_norm": 6.396073341369629, + "learning_rate": 4.9601866766429364e-05, + "loss": 2.3461, + "step": 396 + }, + { + "epoch": 0.057171658986175114, + "grad_norm": 3.8237464427948, + "learning_rate": 4.959985375191144e-05, + "loss": 2.0224, + "step": 397 + }, + { + "epoch": 0.05731566820276498, + "grad_norm": 5.0668134689331055, + "learning_rate": 4.959783570223859e-05, + "loss": 1.6756, + "step": 398 + }, + { + "epoch": 0.057459677419354836, + "grad_norm": 4.743002891540527, + "learning_rate": 4.9595812617823856e-05, + "loss": 1.7027, + "step": 399 + }, + { + "epoch": 0.0576036866359447, + "grad_norm": 3.144256114959717, + "learning_rate": 4.9593784499081336e-05, + "loss": 1.3278, + "step": 400 + }, + { + "epoch": 0.057747695852534565, + "grad_norm": 4.146528720855713, + "learning_rate": 4.959175134642614e-05, + "loss": 0.9378, + "step": 401 + }, + { + "epoch": 0.05789170506912442, + "grad_norm": 4.262664794921875, + "learning_rate": 4.958971316027443e-05, + "loss": 1.9837, + "step": 402 + }, + { + "epoch": 0.05803571428571429, + "grad_norm": 3.529796838760376, + "learning_rate": 4.9587669941043394e-05, + "loss": 1.2602, + "step": 403 + }, + { + "epoch": 0.058179723502304145, + "grad_norm": 5.2073163986206055, + "learning_rate": 4.9585621689151216e-05, + "loss": 2.0245, + "step": 404 + }, + { + "epoch": 0.05832373271889401, + "grad_norm": 3.768745183944702, + "learning_rate": 4.9583568405017155e-05, + "loss": 2.2067, + "step": 405 + }, + { + "epoch": 0.05846774193548387, + "grad_norm": 2.497687816619873, + "learning_rate": 4.9581510089061476e-05, + "loss": 2.7446, + "step": 406 + }, + { + "epoch": 0.05861175115207373, + "grad_norm": 4.276885509490967, + "learning_rate": 4.9579446741705485e-05, + "loss": 2.6405, + "step": 407 + }, + { + "epoch": 0.0587557603686636, + "grad_norm": 5.575394153594971, + "learning_rate": 4.957737836337152e-05, + "loss": 2.2467, + "step": 408 + }, + { + "epoch": 0.058899769585253454, + "grad_norm": 2.22929310798645, + "learning_rate": 4.957530495448292e-05, + "loss": 3.1937, + "step": 409 + }, + { + "epoch": 0.05904377880184332, + "grad_norm": 4.090132713317871, + "learning_rate": 4.957322651546409e-05, + "loss": 1.512, + "step": 410 + }, + { + "epoch": 0.05918778801843318, + "grad_norm": 3.085160732269287, + "learning_rate": 4.9571143046740445e-05, + "loss": 1.6105, + "step": 411 + }, + { + "epoch": 0.05933179723502304, + "grad_norm": 2.925109624862671, + "learning_rate": 4.9569054548738443e-05, + "loss": 1.5231, + "step": 412 + }, + { + "epoch": 0.059475806451612906, + "grad_norm": 4.001129150390625, + "learning_rate": 4.956696102188555e-05, + "loss": 1.1452, + "step": 413 + }, + { + "epoch": 0.059619815668202764, + "grad_norm": 2.8010239601135254, + "learning_rate": 4.9564862466610284e-05, + "loss": 2.5914, + "step": 414 + }, + { + "epoch": 0.05976382488479263, + "grad_norm": 4.639739990234375, + "learning_rate": 4.956275888334218e-05, + "loss": 3.9043, + "step": 415 + }, + { + "epoch": 0.059907834101382486, + "grad_norm": 1.5308104753494263, + "learning_rate": 4.956065027251179e-05, + "loss": 1.7838, + "step": 416 + }, + { + "epoch": 0.06005184331797235, + "grad_norm": 1.6679797172546387, + "learning_rate": 4.955853663455072e-05, + "loss": 4.8833, + "step": 417 + }, + { + "epoch": 0.060195852534562215, + "grad_norm": 2.150641918182373, + "learning_rate": 4.955641796989161e-05, + "loss": 0.9864, + "step": 418 + }, + { + "epoch": 0.06033986175115207, + "grad_norm": 2.100308656692505, + "learning_rate": 4.95542942789681e-05, + "loss": 0.7333, + "step": 419 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 2.9537367820739746, + "learning_rate": 4.955216556221485e-05, + "loss": 1.3298, + "step": 420 + }, + { + "epoch": 0.060627880184331795, + "grad_norm": 2.3451225757598877, + "learning_rate": 4.955003182006761e-05, + "loss": 0.6668, + "step": 421 + }, + { + "epoch": 0.06077188940092166, + "grad_norm": 2.964503288269043, + "learning_rate": 4.954789305296309e-05, + "loss": 0.6183, + "step": 422 + }, + { + "epoch": 0.060915898617511524, + "grad_norm": 3.4528262615203857, + "learning_rate": 4.9545749261339076e-05, + "loss": 2.1613, + "step": 423 + }, + { + "epoch": 0.06105990783410138, + "grad_norm": 4.056407451629639, + "learning_rate": 4.954360044563435e-05, + "loss": 1.1169, + "step": 424 + }, + { + "epoch": 0.061203917050691246, + "grad_norm": 3.813540458679199, + "learning_rate": 4.954144660628875e-05, + "loss": 0.4237, + "step": 425 + }, + { + "epoch": 0.061347926267281104, + "grad_norm": 3.8731722831726074, + "learning_rate": 4.953928774374312e-05, + "loss": 1.3336, + "step": 426 + }, + { + "epoch": 0.06149193548387097, + "grad_norm": 4.586178302764893, + "learning_rate": 4.953712385843934e-05, + "loss": 1.7067, + "step": 427 + }, + { + "epoch": 0.061635944700460826, + "grad_norm": 3.5186028480529785, + "learning_rate": 4.953495495082032e-05, + "loss": 1.2095, + "step": 428 + }, + { + "epoch": 0.06177995391705069, + "grad_norm": 4.8964619636535645, + "learning_rate": 4.953278102133001e-05, + "loss": 3.2778, + "step": 429 + }, + { + "epoch": 0.061923963133640555, + "grad_norm": 3.9704537391662598, + "learning_rate": 4.9530602070413356e-05, + "loss": 2.2211, + "step": 430 + }, + { + "epoch": 0.06206797235023041, + "grad_norm": 1.306645393371582, + "learning_rate": 4.952841809851636e-05, + "loss": 0.3407, + "step": 431 + }, + { + "epoch": 0.06221198156682028, + "grad_norm": 4.72696590423584, + "learning_rate": 4.9526229106086045e-05, + "loss": 1.0102, + "step": 432 + }, + { + "epoch": 0.062355990783410135, + "grad_norm": 2.176875591278076, + "learning_rate": 4.952403509357044e-05, + "loss": 0.9706, + "step": 433 + }, + { + "epoch": 0.0625, + "grad_norm": 5.783877849578857, + "learning_rate": 4.952183606141865e-05, + "loss": 1.7667, + "step": 434 + }, + { + "epoch": 0.06264400921658986, + "grad_norm": 4.741653919219971, + "learning_rate": 4.951963201008076e-05, + "loss": 3.1717, + "step": 435 + }, + { + "epoch": 0.06278801843317973, + "grad_norm": 1.8580832481384277, + "learning_rate": 4.9517422940007906e-05, + "loss": 1.6385, + "step": 436 + }, + { + "epoch": 0.06293202764976959, + "grad_norm": 4.482171535491943, + "learning_rate": 4.951520885165224e-05, + "loss": 2.0898, + "step": 437 + }, + { + "epoch": 0.06307603686635944, + "grad_norm": 3.6801071166992188, + "learning_rate": 4.9512989745466956e-05, + "loss": 2.1433, + "step": 438 + }, + { + "epoch": 0.0632200460829493, + "grad_norm": 2.419984817504883, + "learning_rate": 4.951076562190626e-05, + "loss": 1.677, + "step": 439 + }, + { + "epoch": 0.06336405529953917, + "grad_norm": 3.4546616077423096, + "learning_rate": 4.9508536481425386e-05, + "loss": 1.6818, + "step": 440 + }, + { + "epoch": 0.06350806451612903, + "grad_norm": 1.1928495168685913, + "learning_rate": 4.9506302324480605e-05, + "loss": 4.7981, + "step": 441 + }, + { + "epoch": 0.06365207373271889, + "grad_norm": 3.5785486698150635, + "learning_rate": 4.950406315152921e-05, + "loss": 1.3833, + "step": 442 + }, + { + "epoch": 0.06379608294930876, + "grad_norm": 4.5283589363098145, + "learning_rate": 4.9501818963029525e-05, + "loss": 2.1176, + "step": 443 + }, + { + "epoch": 0.06394009216589862, + "grad_norm": 4.653388023376465, + "learning_rate": 4.9499569759440875e-05, + "loss": 4.5388, + "step": 444 + }, + { + "epoch": 0.06408410138248848, + "grad_norm": 2.6872665882110596, + "learning_rate": 4.9497315541223654e-05, + "loss": 1.6344, + "step": 445 + }, + { + "epoch": 0.06422811059907835, + "grad_norm": 2.963223695755005, + "learning_rate": 4.949505630883926e-05, + "loss": 2.5404, + "step": 446 + }, + { + "epoch": 0.0643721198156682, + "grad_norm": 2.15028715133667, + "learning_rate": 4.9492792062750105e-05, + "loss": 1.317, + "step": 447 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 3.844714403152466, + "learning_rate": 4.9490522803419644e-05, + "loss": 1.7957, + "step": 448 + }, + { + "epoch": 0.06466013824884792, + "grad_norm": 3.2025198936462402, + "learning_rate": 4.948824853131236e-05, + "loss": 1.4589, + "step": 449 + }, + { + "epoch": 0.06480414746543779, + "grad_norm": 2.875706195831299, + "learning_rate": 4.948596924689376e-05, + "loss": 1.6062, + "step": 450 + }, + { + "epoch": 0.06494815668202765, + "grad_norm": 1.3538668155670166, + "learning_rate": 4.948368495063036e-05, + "loss": 4.6722, + "step": 451 + }, + { + "epoch": 0.06509216589861751, + "grad_norm": 3.0557656288146973, + "learning_rate": 4.948139564298972e-05, + "loss": 2.8941, + "step": 452 + }, + { + "epoch": 0.06523617511520738, + "grad_norm": 3.9655184745788574, + "learning_rate": 4.947910132444043e-05, + "loss": 1.0598, + "step": 453 + }, + { + "epoch": 0.06538018433179724, + "grad_norm": 2.994311571121216, + "learning_rate": 4.947680199545207e-05, + "loss": 2.3701, + "step": 454 + }, + { + "epoch": 0.0655241935483871, + "grad_norm": 3.2831859588623047, + "learning_rate": 4.9474497656495305e-05, + "loss": 1.567, + "step": 455 + }, + { + "epoch": 0.06566820276497695, + "grad_norm": 3.1021313667297363, + "learning_rate": 4.947218830804178e-05, + "loss": 2.3956, + "step": 456 + }, + { + "epoch": 0.06581221198156682, + "grad_norm": 4.257760047912598, + "learning_rate": 4.946987395056416e-05, + "loss": 1.9662, + "step": 457 + }, + { + "epoch": 0.06595622119815668, + "grad_norm": 3.4921886920928955, + "learning_rate": 4.9467554584536185e-05, + "loss": 0.6751, + "step": 458 + }, + { + "epoch": 0.06610023041474654, + "grad_norm": 2.862604856491089, + "learning_rate": 4.946523021043257e-05, + "loss": 1.0118, + "step": 459 + }, + { + "epoch": 0.06624423963133641, + "grad_norm": 2.9912452697753906, + "learning_rate": 4.9462900828729064e-05, + "loss": 1.2436, + "step": 460 + }, + { + "epoch": 0.06638824884792627, + "grad_norm": 3.270113468170166, + "learning_rate": 4.9460566439902474e-05, + "loss": 1.6808, + "step": 461 + }, + { + "epoch": 0.06653225806451613, + "grad_norm": 3.489828586578369, + "learning_rate": 4.9458227044430585e-05, + "loss": 1.4467, + "step": 462 + }, + { + "epoch": 0.066676267281106, + "grad_norm": 2.343580722808838, + "learning_rate": 4.945588264279225e-05, + "loss": 0.5919, + "step": 463 + }, + { + "epoch": 0.06682027649769585, + "grad_norm": 3.877188205718994, + "learning_rate": 4.9453533235467306e-05, + "loss": 1.6616, + "step": 464 + }, + { + "epoch": 0.06696428571428571, + "grad_norm": 2.944786310195923, + "learning_rate": 4.945117882293666e-05, + "loss": 2.0707, + "step": 465 + }, + { + "epoch": 0.06710829493087557, + "grad_norm": 3.715012311935425, + "learning_rate": 4.9448819405682193e-05, + "loss": 1.4361, + "step": 466 + }, + { + "epoch": 0.06725230414746544, + "grad_norm": 3.0289814472198486, + "learning_rate": 4.944645498418685e-05, + "loss": 0.6506, + "step": 467 + }, + { + "epoch": 0.0673963133640553, + "grad_norm": 2.126882791519165, + "learning_rate": 4.944408555893459e-05, + "loss": 2.8611, + "step": 468 + }, + { + "epoch": 0.06754032258064516, + "grad_norm": 3.1468896865844727, + "learning_rate": 4.9441711130410387e-05, + "loss": 1.2494, + "step": 469 + }, + { + "epoch": 0.06768433179723503, + "grad_norm": 3.911149740219116, + "learning_rate": 4.943933169910023e-05, + "loss": 1.9443, + "step": 470 + }, + { + "epoch": 0.06782834101382489, + "grad_norm": 2.362680673599243, + "learning_rate": 4.943694726549117e-05, + "loss": 0.8658, + "step": 471 + }, + { + "epoch": 0.06797235023041474, + "grad_norm": 5.807247161865234, + "learning_rate": 4.9434557830071246e-05, + "loss": 2.0808, + "step": 472 + }, + { + "epoch": 0.06811635944700462, + "grad_norm": 2.1090850830078125, + "learning_rate": 4.9432163393329544e-05, + "loss": 0.617, + "step": 473 + }, + { + "epoch": 0.06826036866359447, + "grad_norm": 3.0224380493164062, + "learning_rate": 4.942976395575615e-05, + "loss": 2.0481, + "step": 474 + }, + { + "epoch": 0.06840437788018433, + "grad_norm": 4.982886791229248, + "learning_rate": 4.9427359517842186e-05, + "loss": 1.8588, + "step": 475 + }, + { + "epoch": 0.06854838709677419, + "grad_norm": 4.664625644683838, + "learning_rate": 4.94249500800798e-05, + "loss": 1.5119, + "step": 476 + }, + { + "epoch": 0.06869239631336406, + "grad_norm": 5.2662434577941895, + "learning_rate": 4.942253564296218e-05, + "loss": 2.104, + "step": 477 + }, + { + "epoch": 0.06883640552995392, + "grad_norm": 2.17962908744812, + "learning_rate": 4.9420116206983494e-05, + "loss": 1.5631, + "step": 478 + }, + { + "epoch": 0.06898041474654378, + "grad_norm": 2.794525623321533, + "learning_rate": 4.941769177263896e-05, + "loss": 0.6886, + "step": 479 + }, + { + "epoch": 0.06912442396313365, + "grad_norm": 4.957353115081787, + "learning_rate": 4.941526234042483e-05, + "loss": 2.965, + "step": 480 + }, + { + "epoch": 0.0692684331797235, + "grad_norm": 4.229015827178955, + "learning_rate": 4.941282791083836e-05, + "loss": 2.1582, + "step": 481 + }, + { + "epoch": 0.06941244239631336, + "grad_norm": 5.998673915863037, + "learning_rate": 4.9410388484377835e-05, + "loss": 2.0558, + "step": 482 + }, + { + "epoch": 0.06955645161290322, + "grad_norm": 4.034383296966553, + "learning_rate": 4.940794406154256e-05, + "loss": 1.3949, + "step": 483 + }, + { + "epoch": 0.06970046082949309, + "grad_norm": 1.4390497207641602, + "learning_rate": 4.940549464283287e-05, + "loss": 1.1701, + "step": 484 + }, + { + "epoch": 0.06984447004608295, + "grad_norm": 6.197383880615234, + "learning_rate": 4.940304022875011e-05, + "loss": 1.8333, + "step": 485 + }, + { + "epoch": 0.0699884792626728, + "grad_norm": 2.4997308254241943, + "learning_rate": 4.940058081979665e-05, + "loss": 0.2556, + "step": 486 + }, + { + "epoch": 0.07013248847926268, + "grad_norm": 1.6872674226760864, + "learning_rate": 4.9398116416475916e-05, + "loss": 1.5144, + "step": 487 + }, + { + "epoch": 0.07027649769585254, + "grad_norm": 2.253967046737671, + "learning_rate": 4.9395647019292294e-05, + "loss": 0.523, + "step": 488 + }, + { + "epoch": 0.0704205069124424, + "grad_norm": 2.74290132522583, + "learning_rate": 4.939317262875125e-05, + "loss": 1.2552, + "step": 489 + }, + { + "epoch": 0.07056451612903226, + "grad_norm": 3.1476497650146484, + "learning_rate": 4.939069324535923e-05, + "loss": 1.7989, + "step": 490 + }, + { + "epoch": 0.07070852534562212, + "grad_norm": 5.522889137268066, + "learning_rate": 4.9388208869623734e-05, + "loss": 1.6278, + "step": 491 + }, + { + "epoch": 0.07085253456221198, + "grad_norm": 4.2238030433654785, + "learning_rate": 4.938571950205326e-05, + "loss": 1.6422, + "step": 492 + }, + { + "epoch": 0.07099654377880184, + "grad_norm": 2.971470355987549, + "learning_rate": 4.938322514315735e-05, + "loss": 0.3039, + "step": 493 + }, + { + "epoch": 0.07114055299539171, + "grad_norm": 4.19732666015625, + "learning_rate": 4.938072579344654e-05, + "loss": 1.413, + "step": 494 + }, + { + "epoch": 0.07128456221198157, + "grad_norm": 3.547116756439209, + "learning_rate": 4.9378221453432415e-05, + "loss": 2.7218, + "step": 495 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 3.3371427059173584, + "learning_rate": 4.937571212362756e-05, + "loss": 2.8269, + "step": 496 + }, + { + "epoch": 0.0715725806451613, + "grad_norm": 4.113306522369385, + "learning_rate": 4.937319780454559e-05, + "loss": 2.0665, + "step": 497 + }, + { + "epoch": 0.07171658986175115, + "grad_norm": 2.7800424098968506, + "learning_rate": 4.937067849670115e-05, + "loss": 1.3921, + "step": 498 + }, + { + "epoch": 0.07186059907834101, + "grad_norm": 2.5544817447662354, + "learning_rate": 4.9368154200609894e-05, + "loss": 0.8249, + "step": 499 + }, + { + "epoch": 0.07200460829493087, + "grad_norm": 2.817105770111084, + "learning_rate": 4.93656249167885e-05, + "loss": 0.5515, + "step": 500 + }, + { + "epoch": 0.07214861751152074, + "grad_norm": 2.180392265319824, + "learning_rate": 4.936309064575467e-05, + "loss": 1.1847, + "step": 501 + }, + { + "epoch": 0.0722926267281106, + "grad_norm": 4.343925952911377, + "learning_rate": 4.9360551388027124e-05, + "loss": 2.0311, + "step": 502 + }, + { + "epoch": 0.07243663594470046, + "grad_norm": 3.701134204864502, + "learning_rate": 4.935800714412559e-05, + "loss": 1.3267, + "step": 503 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 2.049494743347168, + "learning_rate": 4.935545791457085e-05, + "loss": 0.5432, + "step": 504 + }, + { + "epoch": 0.07272465437788019, + "grad_norm": 4.111993312835693, + "learning_rate": 4.935290369988468e-05, + "loss": 0.9113, + "step": 505 + }, + { + "epoch": 0.07286866359447004, + "grad_norm": 1.9071966409683228, + "learning_rate": 4.935034450058987e-05, + "loss": 0.9097, + "step": 506 + }, + { + "epoch": 0.07301267281105991, + "grad_norm": 4.0568766593933105, + "learning_rate": 4.934778031721027e-05, + "loss": 0.9441, + "step": 507 + }, + { + "epoch": 0.07315668202764977, + "grad_norm": 4.7086944580078125, + "learning_rate": 4.9345211150270685e-05, + "loss": 2.479, + "step": 508 + }, + { + "epoch": 0.07330069124423963, + "grad_norm": 3.7469518184661865, + "learning_rate": 4.934263700029701e-05, + "loss": 1.9452, + "step": 509 + }, + { + "epoch": 0.07344470046082949, + "grad_norm": 2.050428628921509, + "learning_rate": 4.934005786781612e-05, + "loss": 0.7025, + "step": 510 + }, + { + "epoch": 0.07358870967741936, + "grad_norm": 4.347752094268799, + "learning_rate": 4.9337473753355914e-05, + "loss": 0.3822, + "step": 511 + }, + { + "epoch": 0.07373271889400922, + "grad_norm": 5.8519721031188965, + "learning_rate": 4.933488465744531e-05, + "loss": 2.0545, + "step": 512 + }, + { + "epoch": 0.07387672811059907, + "grad_norm": 3.992377758026123, + "learning_rate": 4.933229058061425e-05, + "loss": 0.8563, + "step": 513 + }, + { + "epoch": 0.07402073732718895, + "grad_norm": 3.2371022701263428, + "learning_rate": 4.932969152339371e-05, + "loss": 0.6274, + "step": 514 + }, + { + "epoch": 0.0741647465437788, + "grad_norm": 4.47186279296875, + "learning_rate": 4.932708748631566e-05, + "loss": 1.5951, + "step": 515 + }, + { + "epoch": 0.07430875576036866, + "grad_norm": 2.203261375427246, + "learning_rate": 4.93244784699131e-05, + "loss": 0.3378, + "step": 516 + }, + { + "epoch": 0.07445276497695852, + "grad_norm": 3.841233730316162, + "learning_rate": 4.932186447472006e-05, + "loss": 1.0331, + "step": 517 + }, + { + "epoch": 0.07459677419354839, + "grad_norm": 5.463283061981201, + "learning_rate": 4.931924550127156e-05, + "loss": 2.3065, + "step": 518 + }, + { + "epoch": 0.07474078341013825, + "grad_norm": 3.1814353466033936, + "learning_rate": 4.931662155010367e-05, + "loss": 1.4689, + "step": 519 + }, + { + "epoch": 0.0748847926267281, + "grad_norm": 3.343526840209961, + "learning_rate": 4.931399262175347e-05, + "loss": 1.1196, + "step": 520 + }, + { + "epoch": 0.07502880184331798, + "grad_norm": 2.9247307777404785, + "learning_rate": 4.931135871675905e-05, + "loss": 0.557, + "step": 521 + }, + { + "epoch": 0.07517281105990783, + "grad_norm": 7.628046989440918, + "learning_rate": 4.9308719835659514e-05, + "loss": 2.8574, + "step": 522 + }, + { + "epoch": 0.07531682027649769, + "grad_norm": 0.9002310633659363, + "learning_rate": 4.9306075978995006e-05, + "loss": 4.7546, + "step": 523 + }, + { + "epoch": 0.07546082949308756, + "grad_norm": 7.338300704956055, + "learning_rate": 4.930342714730668e-05, + "loss": 2.3074, + "step": 524 + }, + { + "epoch": 0.07560483870967742, + "grad_norm": 3.8721814155578613, + "learning_rate": 4.93007733411367e-05, + "loss": 3.3649, + "step": 525 + }, + { + "epoch": 0.07574884792626728, + "grad_norm": 3.5318939685821533, + "learning_rate": 4.929811456102824e-05, + "loss": 0.6218, + "step": 526 + }, + { + "epoch": 0.07589285714285714, + "grad_norm": 5.605301380157471, + "learning_rate": 4.929545080752553e-05, + "loss": 1.7696, + "step": 527 + }, + { + "epoch": 0.07603686635944701, + "grad_norm": 6.065411567687988, + "learning_rate": 4.929278208117378e-05, + "loss": 2.3668, + "step": 528 + }, + { + "epoch": 0.07618087557603687, + "grad_norm": 2.1854920387268066, + "learning_rate": 4.929010838251923e-05, + "loss": 0.4661, + "step": 529 + }, + { + "epoch": 0.07632488479262672, + "grad_norm": 5.255245685577393, + "learning_rate": 4.9287429712109135e-05, + "loss": 2.5305, + "step": 530 + }, + { + "epoch": 0.0764688940092166, + "grad_norm": 3.6780412197113037, + "learning_rate": 4.928474607049178e-05, + "loss": 2.3643, + "step": 531 + }, + { + "epoch": 0.07661290322580645, + "grad_norm": 3.819746494293213, + "learning_rate": 4.9282057458216455e-05, + "loss": 1.3251, + "step": 532 + }, + { + "epoch": 0.07675691244239631, + "grad_norm": 3.044175624847412, + "learning_rate": 4.927936387583348e-05, + "loss": 0.7034, + "step": 533 + }, + { + "epoch": 0.07690092165898617, + "grad_norm": 1.3694288730621338, + "learning_rate": 4.9276665323894164e-05, + "loss": 0.1918, + "step": 534 + }, + { + "epoch": 0.07704493087557604, + "grad_norm": 4.123863697052002, + "learning_rate": 4.927396180295088e-05, + "loss": 2.6223, + "step": 535 + }, + { + "epoch": 0.0771889400921659, + "grad_norm": 3.30938458442688, + "learning_rate": 4.927125331355696e-05, + "loss": 1.3561, + "step": 536 + }, + { + "epoch": 0.07733294930875576, + "grad_norm": 4.852513790130615, + "learning_rate": 4.926853985626682e-05, + "loss": 2.3003, + "step": 537 + }, + { + "epoch": 0.07747695852534563, + "grad_norm": 4.783610820770264, + "learning_rate": 4.926582143163582e-05, + "loss": 1.5972, + "step": 538 + }, + { + "epoch": 0.07762096774193548, + "grad_norm": 3.0558996200561523, + "learning_rate": 4.92630980402204e-05, + "loss": 0.977, + "step": 539 + }, + { + "epoch": 0.07776497695852534, + "grad_norm": 1.7960939407348633, + "learning_rate": 4.9260369682577965e-05, + "loss": 1.0991, + "step": 540 + }, + { + "epoch": 0.07790898617511521, + "grad_norm": 4.815981864929199, + "learning_rate": 4.925763635926699e-05, + "loss": 1.4951, + "step": 541 + }, + { + "epoch": 0.07805299539170507, + "grad_norm": 3.361757755279541, + "learning_rate": 4.925489807084692e-05, + "loss": 1.6331, + "step": 542 + }, + { + "epoch": 0.07819700460829493, + "grad_norm": 3.6193506717681885, + "learning_rate": 4.9252154817878246e-05, + "loss": 1.3707, + "step": 543 + }, + { + "epoch": 0.07834101382488479, + "grad_norm": 2.890669584274292, + "learning_rate": 4.924940660092245e-05, + "loss": 2.938, + "step": 544 + }, + { + "epoch": 0.07848502304147466, + "grad_norm": 5.115518569946289, + "learning_rate": 4.924665342054204e-05, + "loss": 2.0888, + "step": 545 + }, + { + "epoch": 0.07862903225806452, + "grad_norm": 2.3013715744018555, + "learning_rate": 4.9243895277300566e-05, + "loss": 0.3722, + "step": 546 + }, + { + "epoch": 0.07877304147465437, + "grad_norm": 3.6673707962036133, + "learning_rate": 4.924113217176256e-05, + "loss": 2.9689, + "step": 547 + }, + { + "epoch": 0.07891705069124424, + "grad_norm": 4.170883655548096, + "learning_rate": 4.923836410449357e-05, + "loss": 2.8158, + "step": 548 + }, + { + "epoch": 0.0790610599078341, + "grad_norm": 5.868667125701904, + "learning_rate": 4.9235591076060186e-05, + "loss": 4.4373, + "step": 549 + }, + { + "epoch": 0.07920506912442396, + "grad_norm": 2.5325191020965576, + "learning_rate": 4.923281308702998e-05, + "loss": 2.0252, + "step": 550 + }, + { + "epoch": 0.07934907834101383, + "grad_norm": 4.470123291015625, + "learning_rate": 4.923003013797158e-05, + "loss": 0.4487, + "step": 551 + }, + { + "epoch": 0.07949308755760369, + "grad_norm": 3.4107258319854736, + "learning_rate": 4.922724222945459e-05, + "loss": 2.0948, + "step": 552 + }, + { + "epoch": 0.07963709677419355, + "grad_norm": 2.3222804069519043, + "learning_rate": 4.9224449362049654e-05, + "loss": 2.7608, + "step": 553 + }, + { + "epoch": 0.0797811059907834, + "grad_norm": 5.049975395202637, + "learning_rate": 4.922165153632842e-05, + "loss": 1.4864, + "step": 554 + }, + { + "epoch": 0.07992511520737328, + "grad_norm": 4.217273712158203, + "learning_rate": 4.9218848752863546e-05, + "loss": 1.5832, + "step": 555 + }, + { + "epoch": 0.08006912442396313, + "grad_norm": 4.038072109222412, + "learning_rate": 4.921604101222872e-05, + "loss": 2.3177, + "step": 556 + }, + { + "epoch": 0.08021313364055299, + "grad_norm": 1.4716880321502686, + "learning_rate": 4.9213228314998626e-05, + "loss": 1.5127, + "step": 557 + }, + { + "epoch": 0.08035714285714286, + "grad_norm": 2.2666561603546143, + "learning_rate": 4.9210410661748996e-05, + "loss": 1.4791, + "step": 558 + }, + { + "epoch": 0.08050115207373272, + "grad_norm": 4.355792999267578, + "learning_rate": 4.9207588053056545e-05, + "loss": 0.5861, + "step": 559 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 3.5639593601226807, + "learning_rate": 4.920476048949899e-05, + "loss": 1.1681, + "step": 560 + }, + { + "epoch": 0.08078917050691244, + "grad_norm": 2.8198885917663574, + "learning_rate": 4.920192797165511e-05, + "loss": 1.3955, + "step": 561 + }, + { + "epoch": 0.08093317972350231, + "grad_norm": 2.252075433731079, + "learning_rate": 4.919909050010466e-05, + "loss": 0.8658, + "step": 562 + }, + { + "epoch": 0.08107718894009217, + "grad_norm": 4.061067581176758, + "learning_rate": 4.919624807542842e-05, + "loss": 1.9344, + "step": 563 + }, + { + "epoch": 0.08122119815668202, + "grad_norm": 10.802642822265625, + "learning_rate": 4.919340069820818e-05, + "loss": 4.4959, + "step": 564 + }, + { + "epoch": 0.0813652073732719, + "grad_norm": 3.1654629707336426, + "learning_rate": 4.919054836902677e-05, + "loss": 0.4761, + "step": 565 + }, + { + "epoch": 0.08150921658986175, + "grad_norm": 3.326021432876587, + "learning_rate": 4.918769108846798e-05, + "loss": 0.8158, + "step": 566 + }, + { + "epoch": 0.08165322580645161, + "grad_norm": 3.104285717010498, + "learning_rate": 4.918482885711666e-05, + "loss": 1.0758, + "step": 567 + }, + { + "epoch": 0.08179723502304148, + "grad_norm": 3.3060977458953857, + "learning_rate": 4.918196167555866e-05, + "loss": 2.5367, + "step": 568 + }, + { + "epoch": 0.08194124423963134, + "grad_norm": 3.4435861110687256, + "learning_rate": 4.917908954438084e-05, + "loss": 1.2857, + "step": 569 + }, + { + "epoch": 0.0820852534562212, + "grad_norm": 5.923458576202393, + "learning_rate": 4.917621246417107e-05, + "loss": 1.3729, + "step": 570 + }, + { + "epoch": 0.08222926267281105, + "grad_norm": 4.117618083953857, + "learning_rate": 4.917333043551825e-05, + "loss": 3.071, + "step": 571 + }, + { + "epoch": 0.08237327188940093, + "grad_norm": 1.503610610961914, + "learning_rate": 4.917044345901226e-05, + "loss": 0.2082, + "step": 572 + }, + { + "epoch": 0.08251728110599078, + "grad_norm": 5.279428958892822, + "learning_rate": 4.916755153524403e-05, + "loss": 1.0437, + "step": 573 + }, + { + "epoch": 0.08266129032258064, + "grad_norm": 7.002418518066406, + "learning_rate": 4.916465466480548e-05, + "loss": 2.7668, + "step": 574 + }, + { + "epoch": 0.08280529953917051, + "grad_norm": 2.785033941268921, + "learning_rate": 4.916175284828955e-05, + "loss": 0.3583, + "step": 575 + }, + { + "epoch": 0.08294930875576037, + "grad_norm": 3.4449338912963867, + "learning_rate": 4.915884608629018e-05, + "loss": 1.4241, + "step": 576 + }, + { + "epoch": 0.08309331797235023, + "grad_norm": 3.197173833847046, + "learning_rate": 4.9155934379402335e-05, + "loss": 1.7574, + "step": 577 + }, + { + "epoch": 0.08323732718894009, + "grad_norm": 2.206078052520752, + "learning_rate": 4.915301772822201e-05, + "loss": 1.4601, + "step": 578 + }, + { + "epoch": 0.08338133640552996, + "grad_norm": 2.6495368480682373, + "learning_rate": 4.9150096133346165e-05, + "loss": 1.4908, + "step": 579 + }, + { + "epoch": 0.08352534562211981, + "grad_norm": 1.6625635623931885, + "learning_rate": 4.914716959537283e-05, + "loss": 0.8477, + "step": 580 + }, + { + "epoch": 0.08366935483870967, + "grad_norm": 3.8755693435668945, + "learning_rate": 4.914423811490099e-05, + "loss": 0.6434, + "step": 581 + }, + { + "epoch": 0.08381336405529954, + "grad_norm": 2.998112678527832, + "learning_rate": 4.914130169253066e-05, + "loss": 1.9001, + "step": 582 + }, + { + "epoch": 0.0839573732718894, + "grad_norm": 1.6081026792526245, + "learning_rate": 4.91383603288629e-05, + "loss": 1.0955, + "step": 583 + }, + { + "epoch": 0.08410138248847926, + "grad_norm": 4.602581024169922, + "learning_rate": 4.9135414024499746e-05, + "loss": 2.5558, + "step": 584 + }, + { + "epoch": 0.08424539170506913, + "grad_norm": 6.766359806060791, + "learning_rate": 4.913246278004425e-05, + "loss": 2.7964, + "step": 585 + }, + { + "epoch": 0.08438940092165899, + "grad_norm": 7.207889080047607, + "learning_rate": 4.9129506596100474e-05, + "loss": 1.9962, + "step": 586 + }, + { + "epoch": 0.08453341013824885, + "grad_norm": 3.610482931137085, + "learning_rate": 4.912654547327351e-05, + "loss": 0.3501, + "step": 587 + }, + { + "epoch": 0.0846774193548387, + "grad_norm": 4.866929531097412, + "learning_rate": 4.912357941216944e-05, + "loss": 1.2656, + "step": 588 + }, + { + "epoch": 0.08482142857142858, + "grad_norm": 0.9521603584289551, + "learning_rate": 4.9120608413395366e-05, + "loss": 4.8375, + "step": 589 + }, + { + "epoch": 0.08496543778801843, + "grad_norm": 4.572454929351807, + "learning_rate": 4.91176324775594e-05, + "loss": 2.0882, + "step": 590 + }, + { + "epoch": 0.08510944700460829, + "grad_norm": 3.524322748184204, + "learning_rate": 4.9114651605270654e-05, + "loss": 1.5062, + "step": 591 + }, + { + "epoch": 0.08525345622119816, + "grad_norm": 3.7897229194641113, + "learning_rate": 4.9111665797139275e-05, + "loss": 0.8662, + "step": 592 + }, + { + "epoch": 0.08539746543778802, + "grad_norm": 6.140185832977295, + "learning_rate": 4.91086750537764e-05, + "loss": 2.1775, + "step": 593 + }, + { + "epoch": 0.08554147465437788, + "grad_norm": 3.7889621257781982, + "learning_rate": 4.910567937579417e-05, + "loss": 0.9014, + "step": 594 + }, + { + "epoch": 0.08568548387096774, + "grad_norm": 6.044473648071289, + "learning_rate": 4.9102678763805766e-05, + "loss": 1.459, + "step": 595 + }, + { + "epoch": 0.0858294930875576, + "grad_norm": 5.8334479331970215, + "learning_rate": 4.909967321842535e-05, + "loss": 2.0665, + "step": 596 + }, + { + "epoch": 0.08597350230414746, + "grad_norm": 4.654296398162842, + "learning_rate": 4.909666274026809e-05, + "loss": 1.01, + "step": 597 + }, + { + "epoch": 0.08611751152073732, + "grad_norm": 4.673594951629639, + "learning_rate": 4.90936473299502e-05, + "loss": 2.917, + "step": 598 + }, + { + "epoch": 0.0862615207373272, + "grad_norm": 6.865631580352783, + "learning_rate": 4.9090626988088875e-05, + "loss": 2.1138, + "step": 599 + }, + { + "epoch": 0.08640552995391705, + "grad_norm": 3.803765058517456, + "learning_rate": 4.9087601715302326e-05, + "loss": 1.4755, + "step": 600 + }, + { + "epoch": 0.08654953917050691, + "grad_norm": 4.993073463439941, + "learning_rate": 4.908457151220976e-05, + "loss": 2.7457, + "step": 601 + }, + { + "epoch": 0.08669354838709678, + "grad_norm": 4.143373966217041, + "learning_rate": 4.908153637943144e-05, + "loss": 1.3643, + "step": 602 + }, + { + "epoch": 0.08683755760368664, + "grad_norm": 4.959485054016113, + "learning_rate": 4.9078496317588556e-05, + "loss": 1.9102, + "step": 603 + }, + { + "epoch": 0.0869815668202765, + "grad_norm": 4.468819618225098, + "learning_rate": 4.907545132730339e-05, + "loss": 1.3758, + "step": 604 + }, + { + "epoch": 0.08712557603686635, + "grad_norm": 3.762190580368042, + "learning_rate": 4.907240140919919e-05, + "loss": 1.3466, + "step": 605 + }, + { + "epoch": 0.08726958525345622, + "grad_norm": 5.152024745941162, + "learning_rate": 4.906934656390021e-05, + "loss": 1.1753, + "step": 606 + }, + { + "epoch": 0.08741359447004608, + "grad_norm": 3.8429455757141113, + "learning_rate": 4.9066286792031733e-05, + "loss": 2.7226, + "step": 607 + }, + { + "epoch": 0.08755760368663594, + "grad_norm": 3.2226269245147705, + "learning_rate": 4.9063222094220044e-05, + "loss": 1.7457, + "step": 608 + }, + { + "epoch": 0.08770161290322581, + "grad_norm": 4.298849105834961, + "learning_rate": 4.9060152471092414e-05, + "loss": 1.727, + "step": 609 + }, + { + "epoch": 0.08784562211981567, + "grad_norm": 4.290565490722656, + "learning_rate": 4.905707792327715e-05, + "loss": 2.0171, + "step": 610 + }, + { + "epoch": 0.08798963133640553, + "grad_norm": 3.650557279586792, + "learning_rate": 4.905399845140357e-05, + "loss": 1.5693, + "step": 611 + }, + { + "epoch": 0.08813364055299538, + "grad_norm": 2.3189287185668945, + "learning_rate": 4.9050914056101974e-05, + "loss": 0.9049, + "step": 612 + }, + { + "epoch": 0.08827764976958526, + "grad_norm": 5.301217555999756, + "learning_rate": 4.904782473800369e-05, + "loss": 1.251, + "step": 613 + }, + { + "epoch": 0.08842165898617511, + "grad_norm": 4.366277694702148, + "learning_rate": 4.904473049774104e-05, + "loss": 0.8223, + "step": 614 + }, + { + "epoch": 0.08856566820276497, + "grad_norm": 3.8580892086029053, + "learning_rate": 4.904163133594736e-05, + "loss": 1.3211, + "step": 615 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 4.272387504577637, + "learning_rate": 4.9038527253257005e-05, + "loss": 1.4299, + "step": 616 + }, + { + "epoch": 0.0888536866359447, + "grad_norm": 2.114982843399048, + "learning_rate": 4.9035418250305314e-05, + "loss": 0.2879, + "step": 617 + }, + { + "epoch": 0.08899769585253456, + "grad_norm": 5.419066429138184, + "learning_rate": 4.9032304327728654e-05, + "loss": 3.4535, + "step": 618 + }, + { + "epoch": 0.08914170506912443, + "grad_norm": 4.447429180145264, + "learning_rate": 4.902918548616437e-05, + "loss": 2.2316, + "step": 619 + }, + { + "epoch": 0.08928571428571429, + "grad_norm": 2.9048373699188232, + "learning_rate": 4.902606172625086e-05, + "loss": 2.1718, + "step": 620 + }, + { + "epoch": 0.08942972350230415, + "grad_norm": 6.1953816413879395, + "learning_rate": 4.9022933048627496e-05, + "loss": 1.9946, + "step": 621 + }, + { + "epoch": 0.089573732718894, + "grad_norm": 2.641713857650757, + "learning_rate": 4.9019799453934645e-05, + "loss": 0.2068, + "step": 622 + }, + { + "epoch": 0.08971774193548387, + "grad_norm": 1.7526662349700928, + "learning_rate": 4.901666094281372e-05, + "loss": 1.0958, + "step": 623 + }, + { + "epoch": 0.08986175115207373, + "grad_norm": 4.630856990814209, + "learning_rate": 4.90135175159071e-05, + "loss": 2.3058, + "step": 624 + }, + { + "epoch": 0.09000576036866359, + "grad_norm": 3.696044445037842, + "learning_rate": 4.9010369173858204e-05, + "loss": 1.5942, + "step": 625 + }, + { + "epoch": 0.09014976958525346, + "grad_norm": 3.662588357925415, + "learning_rate": 4.900721591731144e-05, + "loss": 2.3852, + "step": 626 + }, + { + "epoch": 0.09029377880184332, + "grad_norm": 3.704192638397217, + "learning_rate": 4.9004057746912226e-05, + "loss": 2.0071, + "step": 627 + }, + { + "epoch": 0.09043778801843318, + "grad_norm": 3.118990182876587, + "learning_rate": 4.9000894663306965e-05, + "loss": 1.8078, + "step": 628 + }, + { + "epoch": 0.09058179723502305, + "grad_norm": 2.2988367080688477, + "learning_rate": 4.899772666714311e-05, + "loss": 1.1864, + "step": 629 + }, + { + "epoch": 0.0907258064516129, + "grad_norm": 6.144118785858154, + "learning_rate": 4.899455375906907e-05, + "loss": 3.6735, + "step": 630 + }, + { + "epoch": 0.09086981566820276, + "grad_norm": 3.413975715637207, + "learning_rate": 4.89913759397343e-05, + "loss": 1.5754, + "step": 631 + }, + { + "epoch": 0.09101382488479262, + "grad_norm": 4.747596263885498, + "learning_rate": 4.898819320978924e-05, + "loss": 2.4665, + "step": 632 + }, + { + "epoch": 0.09115783410138249, + "grad_norm": 4.2012410163879395, + "learning_rate": 4.8985005569885325e-05, + "loss": 1.9208, + "step": 633 + }, + { + "epoch": 0.09130184331797235, + "grad_norm": 2.749567747116089, + "learning_rate": 4.8981813020675025e-05, + "loss": 1.3037, + "step": 634 + }, + { + "epoch": 0.09144585253456221, + "grad_norm": 3.8268394470214844, + "learning_rate": 4.8978615562811794e-05, + "loss": 1.9722, + "step": 635 + }, + { + "epoch": 0.09158986175115208, + "grad_norm": 3.170144557952881, + "learning_rate": 4.8975413196950096e-05, + "loss": 1.7118, + "step": 636 + }, + { + "epoch": 0.09173387096774194, + "grad_norm": 3.2698676586151123, + "learning_rate": 4.89722059237454e-05, + "loss": 1.1568, + "step": 637 + }, + { + "epoch": 0.0918778801843318, + "grad_norm": 4.3240065574646, + "learning_rate": 4.8968993743854176e-05, + "loss": 1.2602, + "step": 638 + }, + { + "epoch": 0.09202188940092165, + "grad_norm": 2.3277628421783447, + "learning_rate": 4.896577665793389e-05, + "loss": 1.2467, + "step": 639 + }, + { + "epoch": 0.09216589861751152, + "grad_norm": 3.1471645832061768, + "learning_rate": 4.8962554666643036e-05, + "loss": 1.5677, + "step": 640 + }, + { + "epoch": 0.09230990783410138, + "grad_norm": 1.7198737859725952, + "learning_rate": 4.89593277706411e-05, + "loss": 0.8508, + "step": 641 + }, + { + "epoch": 0.09245391705069124, + "grad_norm": 2.843621015548706, + "learning_rate": 4.8956095970588556e-05, + "loss": 1.6642, + "step": 642 + }, + { + "epoch": 0.09259792626728111, + "grad_norm": 4.7249627113342285, + "learning_rate": 4.895285926714691e-05, + "loss": 1.9867, + "step": 643 + }, + { + "epoch": 0.09274193548387097, + "grad_norm": 2.4013898372650146, + "learning_rate": 4.894961766097865e-05, + "loss": 1.4165, + "step": 644 + }, + { + "epoch": 0.09288594470046083, + "grad_norm": 4.531120777130127, + "learning_rate": 4.8946371152747285e-05, + "loss": 2.2532, + "step": 645 + }, + { + "epoch": 0.0930299539170507, + "grad_norm": 4.662786960601807, + "learning_rate": 4.894311974311731e-05, + "loss": 2.502, + "step": 646 + }, + { + "epoch": 0.09317396313364056, + "grad_norm": 2.5308218002319336, + "learning_rate": 4.893986343275423e-05, + "loss": 1.5497, + "step": 647 + }, + { + "epoch": 0.09331797235023041, + "grad_norm": 2.6049208641052246, + "learning_rate": 4.893660222232456e-05, + "loss": 1.2118, + "step": 648 + }, + { + "epoch": 0.09346198156682027, + "grad_norm": 2.8168673515319824, + "learning_rate": 4.893333611249581e-05, + "loss": 0.4275, + "step": 649 + }, + { + "epoch": 0.09360599078341014, + "grad_norm": 2.959716320037842, + "learning_rate": 4.8930065103936484e-05, + "loss": 1.3461, + "step": 650 + }, + { + "epoch": 0.09375, + "grad_norm": 5.086558818817139, + "learning_rate": 4.892678919731612e-05, + "loss": 1.4353, + "step": 651 + }, + { + "epoch": 0.09389400921658986, + "grad_norm": 1.5128074884414673, + "learning_rate": 4.892350839330522e-05, + "loss": 0.8178, + "step": 652 + }, + { + "epoch": 0.09403801843317973, + "grad_norm": 3.2046663761138916, + "learning_rate": 4.8920222692575324e-05, + "loss": 1.069, + "step": 653 + }, + { + "epoch": 0.09418202764976959, + "grad_norm": 3.6564619541168213, + "learning_rate": 4.891693209579894e-05, + "loss": 1.3885, + "step": 654 + }, + { + "epoch": 0.09432603686635944, + "grad_norm": 8.165285110473633, + "learning_rate": 4.89136366036496e-05, + "loss": 1.6856, + "step": 655 + }, + { + "epoch": 0.0944700460829493, + "grad_norm": 3.7258520126342773, + "learning_rate": 4.891033621680184e-05, + "loss": 1.1183, + "step": 656 + }, + { + "epoch": 0.09461405529953917, + "grad_norm": 3.808023452758789, + "learning_rate": 4.890703093593118e-05, + "loss": 1.0728, + "step": 657 + }, + { + "epoch": 0.09475806451612903, + "grad_norm": 2.7809879779815674, + "learning_rate": 4.890372076171416e-05, + "loss": 2.9125, + "step": 658 + }, + { + "epoch": 0.09490207373271889, + "grad_norm": 3.7689788341522217, + "learning_rate": 4.8900405694828313e-05, + "loss": 0.9323, + "step": 659 + }, + { + "epoch": 0.09504608294930876, + "grad_norm": 3.2435684204101562, + "learning_rate": 4.8897085735952175e-05, + "loss": 3.1194, + "step": 660 + }, + { + "epoch": 0.09519009216589862, + "grad_norm": 4.052819728851318, + "learning_rate": 4.8893760885765284e-05, + "loss": 2.5986, + "step": 661 + }, + { + "epoch": 0.09533410138248848, + "grad_norm": 3.7144775390625, + "learning_rate": 4.889043114494817e-05, + "loss": 3.1784, + "step": 662 + }, + { + "epoch": 0.09547811059907835, + "grad_norm": 6.146015644073486, + "learning_rate": 4.888709651418238e-05, + "loss": 2.0729, + "step": 663 + }, + { + "epoch": 0.0956221198156682, + "grad_norm": 4.615922451019287, + "learning_rate": 4.8883756994150455e-05, + "loss": 3.7786, + "step": 664 + }, + { + "epoch": 0.09576612903225806, + "grad_norm": 1.942203164100647, + "learning_rate": 4.8880412585535926e-05, + "loss": 0.5218, + "step": 665 + }, + { + "epoch": 0.09591013824884792, + "grad_norm": 5.326767444610596, + "learning_rate": 4.887706328902335e-05, + "loss": 2.3075, + "step": 666 + }, + { + "epoch": 0.09605414746543779, + "grad_norm": 4.801669120788574, + "learning_rate": 4.887370910529825e-05, + "loss": 3.1185, + "step": 667 + }, + { + "epoch": 0.09619815668202765, + "grad_norm": 4.3257060050964355, + "learning_rate": 4.887035003504718e-05, + "loss": 1.6127, + "step": 668 + }, + { + "epoch": 0.09634216589861751, + "grad_norm": 4.051336765289307, + "learning_rate": 4.886698607895768e-05, + "loss": 2.3667, + "step": 669 + }, + { + "epoch": 0.09648617511520738, + "grad_norm": 3.3444905281066895, + "learning_rate": 4.8863617237718296e-05, + "loss": 1.8017, + "step": 670 + }, + { + "epoch": 0.09663018433179724, + "grad_norm": 2.4796152114868164, + "learning_rate": 4.886024351201856e-05, + "loss": 0.6669, + "step": 671 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 3.8684751987457275, + "learning_rate": 4.885686490254903e-05, + "loss": 1.1613, + "step": 672 + }, + { + "epoch": 0.09691820276497695, + "grad_norm": 3.8083395957946777, + "learning_rate": 4.885348141000122e-05, + "loss": 1.456, + "step": 673 + }, + { + "epoch": 0.09706221198156682, + "grad_norm": 3.118926525115967, + "learning_rate": 4.88500930350677e-05, + "loss": 0.9014, + "step": 674 + }, + { + "epoch": 0.09720622119815668, + "grad_norm": 2.348130464553833, + "learning_rate": 4.8846699778442e-05, + "loss": 0.4631, + "step": 675 + }, + { + "epoch": 0.09735023041474654, + "grad_norm": 4.381468772888184, + "learning_rate": 4.884330164081866e-05, + "loss": 1.5186, + "step": 676 + }, + { + "epoch": 0.09749423963133641, + "grad_norm": 1.3870975971221924, + "learning_rate": 4.883989862289322e-05, + "loss": 0.3364, + "step": 677 + }, + { + "epoch": 0.09763824884792627, + "grad_norm": 4.418837070465088, + "learning_rate": 4.8836490725362206e-05, + "loss": 2.4146, + "step": 678 + }, + { + "epoch": 0.09778225806451613, + "grad_norm": 2.1697723865509033, + "learning_rate": 4.8833077948923166e-05, + "loss": 0.437, + "step": 679 + }, + { + "epoch": 0.097926267281106, + "grad_norm": 3.3119091987609863, + "learning_rate": 4.8829660294274636e-05, + "loss": 2.9176, + "step": 680 + }, + { + "epoch": 0.09807027649769585, + "grad_norm": 2.262763500213623, + "learning_rate": 4.8826237762116144e-05, + "loss": 0.5206, + "step": 681 + }, + { + "epoch": 0.09821428571428571, + "grad_norm": 4.117883205413818, + "learning_rate": 4.882281035314823e-05, + "loss": 0.3921, + "step": 682 + }, + { + "epoch": 0.09835829493087557, + "grad_norm": 4.161159038543701, + "learning_rate": 4.881937806807241e-05, + "loss": 1.4267, + "step": 683 + }, + { + "epoch": 0.09850230414746544, + "grad_norm": 4.369778633117676, + "learning_rate": 4.881594090759122e-05, + "loss": 0.8307, + "step": 684 + }, + { + "epoch": 0.0986463133640553, + "grad_norm": 2.8645386695861816, + "learning_rate": 4.8812498872408186e-05, + "loss": 0.9804, + "step": 685 + }, + { + "epoch": 0.09879032258064516, + "grad_norm": 4.639033794403076, + "learning_rate": 4.8809051963227835e-05, + "loss": 0.9559, + "step": 686 + }, + { + "epoch": 0.09893433179723503, + "grad_norm": 4.299173831939697, + "learning_rate": 4.8805600180755685e-05, + "loss": 0.6381, + "step": 687 + }, + { + "epoch": 0.09907834101382489, + "grad_norm": 4.741522789001465, + "learning_rate": 4.8802143525698255e-05, + "loss": 2.9861, + "step": 688 + }, + { + "epoch": 0.09922235023041474, + "grad_norm": 5.61417818069458, + "learning_rate": 4.879868199876305e-05, + "loss": 1.7311, + "step": 689 + }, + { + "epoch": 0.09936635944700462, + "grad_norm": 5.12117862701416, + "learning_rate": 4.8795215600658606e-05, + "loss": 1.6005, + "step": 690 + }, + { + "epoch": 0.09951036866359447, + "grad_norm": 3.5991039276123047, + "learning_rate": 4.879174433209442e-05, + "loss": 1.6696, + "step": 691 + }, + { + "epoch": 0.09965437788018433, + "grad_norm": 5.763620376586914, + "learning_rate": 4.8788268193780993e-05, + "loss": 1.6482, + "step": 692 + }, + { + "epoch": 0.09979838709677419, + "grad_norm": 4.754615783691406, + "learning_rate": 4.878478718642985e-05, + "loss": 1.1538, + "step": 693 + }, + { + "epoch": 0.09994239631336406, + "grad_norm": 3.4882516860961914, + "learning_rate": 4.878130131075347e-05, + "loss": 2.5671, + "step": 694 + }, + { + "epoch": 0.10008640552995392, + "grad_norm": 5.328705787658691, + "learning_rate": 4.877781056746535e-05, + "loss": 1.116, + "step": 695 + }, + { + "epoch": 0.10023041474654378, + "grad_norm": 5.016254425048828, + "learning_rate": 4.877431495728001e-05, + "loss": 1.1373, + "step": 696 + }, + { + "epoch": 0.10037442396313365, + "grad_norm": 3.9381635189056396, + "learning_rate": 4.877081448091291e-05, + "loss": 0.5054, + "step": 697 + }, + { + "epoch": 0.1005184331797235, + "grad_norm": 2.314626693725586, + "learning_rate": 4.8767309139080555e-05, + "loss": 0.3278, + "step": 698 + }, + { + "epoch": 0.10066244239631336, + "grad_norm": 4.0728044509887695, + "learning_rate": 4.876379893250041e-05, + "loss": 1.2344, + "step": 699 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 3.9546496868133545, + "learning_rate": 4.8760283861890964e-05, + "loss": 0.8234, + "step": 700 + }, + { + "epoch": 0.10095046082949309, + "grad_norm": 3.8217389583587646, + "learning_rate": 4.875676392797168e-05, + "loss": 1.4092, + "step": 701 + }, + { + "epoch": 0.10109447004608295, + "grad_norm": 4.677566051483154, + "learning_rate": 4.875323913146304e-05, + "loss": 1.9095, + "step": 702 + }, + { + "epoch": 0.1012384792626728, + "grad_norm": 4.792636871337891, + "learning_rate": 4.8749709473086505e-05, + "loss": 3.0213, + "step": 703 + }, + { + "epoch": 0.10138248847926268, + "grad_norm": 3.9378762245178223, + "learning_rate": 4.8746174953564525e-05, + "loss": 1.595, + "step": 704 + }, + { + "epoch": 0.10152649769585254, + "grad_norm": 2.7714548110961914, + "learning_rate": 4.874263557362056e-05, + "loss": 0.7223, + "step": 705 + }, + { + "epoch": 0.1016705069124424, + "grad_norm": 2.305675506591797, + "learning_rate": 4.873909133397905e-05, + "loss": 0.411, + "step": 706 + }, + { + "epoch": 0.10181451612903226, + "grad_norm": 4.85443639755249, + "learning_rate": 4.873554223536544e-05, + "loss": 2.0255, + "step": 707 + }, + { + "epoch": 0.10195852534562212, + "grad_norm": 3.166727304458618, + "learning_rate": 4.873198827850618e-05, + "loss": 0.7065, + "step": 708 + }, + { + "epoch": 0.10210253456221198, + "grad_norm": 2.710862636566162, + "learning_rate": 4.8728429464128687e-05, + "loss": 0.645, + "step": 709 + }, + { + "epoch": 0.10224654377880184, + "grad_norm": 3.028265953063965, + "learning_rate": 4.87248657929614e-05, + "loss": 0.6135, + "step": 710 + }, + { + "epoch": 0.10239055299539171, + "grad_norm": 3.705111265182495, + "learning_rate": 4.872129726573373e-05, + "loss": 1.6818, + "step": 711 + }, + { + "epoch": 0.10253456221198157, + "grad_norm": 3.977458953857422, + "learning_rate": 4.87177238831761e-05, + "loss": 1.3381, + "step": 712 + }, + { + "epoch": 0.10267857142857142, + "grad_norm": 6.710785388946533, + "learning_rate": 4.871414564601992e-05, + "loss": 1.5341, + "step": 713 + }, + { + "epoch": 0.1028225806451613, + "grad_norm": 4.5808186531066895, + "learning_rate": 4.871056255499757e-05, + "loss": 1.3763, + "step": 714 + }, + { + "epoch": 0.10296658986175115, + "grad_norm": 5.442184925079346, + "learning_rate": 4.8706974610842474e-05, + "loss": 1.361, + "step": 715 + }, + { + "epoch": 0.10311059907834101, + "grad_norm": 5.461939811706543, + "learning_rate": 4.8703381814289e-05, + "loss": 1.1979, + "step": 716 + }, + { + "epoch": 0.10325460829493087, + "grad_norm": 3.953097343444824, + "learning_rate": 4.869978416607253e-05, + "loss": 0.9672, + "step": 717 + }, + { + "epoch": 0.10339861751152074, + "grad_norm": 3.1251420974731445, + "learning_rate": 4.8696181666929454e-05, + "loss": 1.8849, + "step": 718 + }, + { + "epoch": 0.1035426267281106, + "grad_norm": 2.433943033218384, + "learning_rate": 4.869257431759713e-05, + "loss": 0.3662, + "step": 719 + }, + { + "epoch": 0.10368663594470046, + "grad_norm": 2.166759967803955, + "learning_rate": 4.8688962118813925e-05, + "loss": 0.3463, + "step": 720 + }, + { + "epoch": 0.10383064516129033, + "grad_norm": 2.091562271118164, + "learning_rate": 4.868534507131919e-05, + "loss": 0.1819, + "step": 721 + }, + { + "epoch": 0.10397465437788019, + "grad_norm": 7.785177707672119, + "learning_rate": 4.868172317585326e-05, + "loss": 1.894, + "step": 722 + }, + { + "epoch": 0.10411866359447004, + "grad_norm": 3.660632610321045, + "learning_rate": 4.8678096433157484e-05, + "loss": 0.61, + "step": 723 + }, + { + "epoch": 0.10426267281105991, + "grad_norm": 4.739894390106201, + "learning_rate": 4.867446484397419e-05, + "loss": 1.7969, + "step": 724 + }, + { + "epoch": 0.10440668202764977, + "grad_norm": 2.105351686477661, + "learning_rate": 4.8670828409046696e-05, + "loss": 0.731, + "step": 725 + }, + { + "epoch": 0.10455069124423963, + "grad_norm": 7.061154365539551, + "learning_rate": 4.866718712911932e-05, + "loss": 2.26, + "step": 726 + }, + { + "epoch": 0.10469470046082949, + "grad_norm": 4.487666606903076, + "learning_rate": 4.866354100493737e-05, + "loss": 1.8806, + "step": 727 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 5.036492824554443, + "learning_rate": 4.8659890037247146e-05, + "loss": 1.6266, + "step": 728 + }, + { + "epoch": 0.10498271889400922, + "grad_norm": 3.225632905960083, + "learning_rate": 4.865623422679593e-05, + "loss": 1.7557, + "step": 729 + }, + { + "epoch": 0.10512672811059907, + "grad_norm": 4.041957378387451, + "learning_rate": 4.865257357433199e-05, + "loss": 1.383, + "step": 730 + }, + { + "epoch": 0.10527073732718895, + "grad_norm": 2.961775541305542, + "learning_rate": 4.8648908080604614e-05, + "loss": 0.4406, + "step": 731 + }, + { + "epoch": 0.1054147465437788, + "grad_norm": 4.364943504333496, + "learning_rate": 4.8645237746364065e-05, + "loss": 1.2238, + "step": 732 + }, + { + "epoch": 0.10555875576036866, + "grad_norm": 1.954385757446289, + "learning_rate": 4.864156257236159e-05, + "loss": 0.2481, + "step": 733 + }, + { + "epoch": 0.10570276497695852, + "grad_norm": 5.253281116485596, + "learning_rate": 4.863788255934942e-05, + "loss": 2.5492, + "step": 734 + }, + { + "epoch": 0.10584677419354839, + "grad_norm": 5.107970237731934, + "learning_rate": 4.863419770808081e-05, + "loss": 1.3939, + "step": 735 + }, + { + "epoch": 0.10599078341013825, + "grad_norm": 3.9129045009613037, + "learning_rate": 4.8630508019309976e-05, + "loss": 0.7755, + "step": 736 + }, + { + "epoch": 0.1061347926267281, + "grad_norm": 4.145159721374512, + "learning_rate": 4.862681349379212e-05, + "loss": 1.0409, + "step": 737 + }, + { + "epoch": 0.10627880184331798, + "grad_norm": 4.26768159866333, + "learning_rate": 4.862311413228346e-05, + "loss": 2.0055, + "step": 738 + }, + { + "epoch": 0.10642281105990783, + "grad_norm": 2.7963833808898926, + "learning_rate": 4.861940993554119e-05, + "loss": 1.4514, + "step": 739 + }, + { + "epoch": 0.10656682027649769, + "grad_norm": 5.190325736999512, + "learning_rate": 4.861570090432349e-05, + "loss": 1.7057, + "step": 740 + }, + { + "epoch": 0.10671082949308756, + "grad_norm": 2.626246690750122, + "learning_rate": 4.8611987039389525e-05, + "loss": 0.5627, + "step": 741 + }, + { + "epoch": 0.10685483870967742, + "grad_norm": 5.179800510406494, + "learning_rate": 4.8608268341499465e-05, + "loss": 1.7123, + "step": 742 + }, + { + "epoch": 0.10699884792626728, + "grad_norm": 4.435484886169434, + "learning_rate": 4.8604544811414465e-05, + "loss": 1.6877, + "step": 743 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 3.5103931427001953, + "learning_rate": 4.860081644989667e-05, + "loss": 0.907, + "step": 744 + }, + { + "epoch": 0.10728686635944701, + "grad_norm": 4.484877109527588, + "learning_rate": 4.8597083257709194e-05, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.10743087557603687, + "grad_norm": 4.103257656097412, + "learning_rate": 4.8593345235616164e-05, + "loss": 1.927, + "step": 746 + }, + { + "epoch": 0.10757488479262672, + "grad_norm": 2.848867416381836, + "learning_rate": 4.858960238438268e-05, + "loss": 0.5338, + "step": 747 + }, + { + "epoch": 0.1077188940092166, + "grad_norm": 1.667094111442566, + "learning_rate": 4.858585470477486e-05, + "loss": 0.366, + "step": 748 + }, + { + "epoch": 0.10786290322580645, + "grad_norm": 4.004820823669434, + "learning_rate": 4.858210219755976e-05, + "loss": 1.3249, + "step": 749 + }, + { + "epoch": 0.10800691244239631, + "grad_norm": 2.7294037342071533, + "learning_rate": 4.8578344863505464e-05, + "loss": 0.9061, + "step": 750 + }, + { + "epoch": 0.10815092165898617, + "grad_norm": 3.5107228755950928, + "learning_rate": 4.857458270338103e-05, + "loss": 2.0001, + "step": 751 + }, + { + "epoch": 0.10829493087557604, + "grad_norm": 3.9957597255706787, + "learning_rate": 4.857081571795652e-05, + "loss": 1.3703, + "step": 752 + }, + { + "epoch": 0.1084389400921659, + "grad_norm": 3.7178354263305664, + "learning_rate": 4.856704390800294e-05, + "loss": 2.4036, + "step": 753 + }, + { + "epoch": 0.10858294930875576, + "grad_norm": 3.311417579650879, + "learning_rate": 4.8563267274292334e-05, + "loss": 0.3767, + "step": 754 + }, + { + "epoch": 0.10872695852534563, + "grad_norm": 2.7513182163238525, + "learning_rate": 4.855948581759772e-05, + "loss": 0.6057, + "step": 755 + }, + { + "epoch": 0.10887096774193548, + "grad_norm": 4.501307010650635, + "learning_rate": 4.855569953869307e-05, + "loss": 1.3641, + "step": 756 + }, + { + "epoch": 0.10901497695852534, + "grad_norm": 2.9075746536254883, + "learning_rate": 4.8551908438353374e-05, + "loss": 0.8012, + "step": 757 + }, + { + "epoch": 0.10915898617511521, + "grad_norm": 3.7261056900024414, + "learning_rate": 4.854811251735462e-05, + "loss": 0.6304, + "step": 758 + }, + { + "epoch": 0.10930299539170507, + "grad_norm": 2.646265983581543, + "learning_rate": 4.854431177647375e-05, + "loss": 2.2286, + "step": 759 + }, + { + "epoch": 0.10944700460829493, + "grad_norm": 3.3477084636688232, + "learning_rate": 4.854050621648872e-05, + "loss": 2.1883, + "step": 760 + }, + { + "epoch": 0.10959101382488479, + "grad_norm": 4.126540660858154, + "learning_rate": 4.8536695838178456e-05, + "loss": 3.0915, + "step": 761 + }, + { + "epoch": 0.10973502304147466, + "grad_norm": 3.3576998710632324, + "learning_rate": 4.8532880642322874e-05, + "loss": 2.5265, + "step": 762 + }, + { + "epoch": 0.10987903225806452, + "grad_norm": 1.146294355392456, + "learning_rate": 4.852906062970287e-05, + "loss": 0.1888, + "step": 763 + }, + { + "epoch": 0.11002304147465437, + "grad_norm": 1.7971491813659668, + "learning_rate": 4.8525235801100346e-05, + "loss": 0.1524, + "step": 764 + }, + { + "epoch": 0.11016705069124424, + "grad_norm": 1.0862432718276978, + "learning_rate": 4.8521406157298175e-05, + "loss": 4.7112, + "step": 765 + }, + { + "epoch": 0.1103110599078341, + "grad_norm": 3.496450901031494, + "learning_rate": 4.8517571699080196e-05, + "loss": 1.0907, + "step": 766 + }, + { + "epoch": 0.11045506912442396, + "grad_norm": 5.677220344543457, + "learning_rate": 4.851373242723129e-05, + "loss": 1.6873, + "step": 767 + }, + { + "epoch": 0.11059907834101383, + "grad_norm": 5.5211920738220215, + "learning_rate": 4.8509888342537266e-05, + "loss": 2.6526, + "step": 768 + }, + { + "epoch": 0.11074308755760369, + "grad_norm": 3.4525749683380127, + "learning_rate": 4.850603944578494e-05, + "loss": 0.6601, + "step": 769 + }, + { + "epoch": 0.11088709677419355, + "grad_norm": 2.907694101333618, + "learning_rate": 4.850218573776212e-05, + "loss": 0.3457, + "step": 770 + }, + { + "epoch": 0.1110311059907834, + "grad_norm": 3.3776769638061523, + "learning_rate": 4.849832721925759e-05, + "loss": 0.9116, + "step": 771 + }, + { + "epoch": 0.11117511520737328, + "grad_norm": 5.4771504402160645, + "learning_rate": 4.8494463891061124e-05, + "loss": 2.3138, + "step": 772 + }, + { + "epoch": 0.11131912442396313, + "grad_norm": 4.1573166847229, + "learning_rate": 4.849059575396347e-05, + "loss": 1.1567, + "step": 773 + }, + { + "epoch": 0.11146313364055299, + "grad_norm": 2.4449596405029297, + "learning_rate": 4.848672280875636e-05, + "loss": 0.2914, + "step": 774 + }, + { + "epoch": 0.11160714285714286, + "grad_norm": 4.028317928314209, + "learning_rate": 4.848284505623254e-05, + "loss": 2.6692, + "step": 775 + }, + { + "epoch": 0.11175115207373272, + "grad_norm": 2.9531872272491455, + "learning_rate": 4.84789624971857e-05, + "loss": 0.9912, + "step": 776 + }, + { + "epoch": 0.11189516129032258, + "grad_norm": 5.223951816558838, + "learning_rate": 4.847507513241053e-05, + "loss": 2.2249, + "step": 777 + }, + { + "epoch": 0.11203917050691244, + "grad_norm": 4.274454593658447, + "learning_rate": 4.847118296270272e-05, + "loss": 2.2503, + "step": 778 + }, + { + "epoch": 0.11218317972350231, + "grad_norm": 4.4081034660339355, + "learning_rate": 4.846728598885891e-05, + "loss": 1.0393, + "step": 779 + }, + { + "epoch": 0.11232718894009217, + "grad_norm": 3.9523470401763916, + "learning_rate": 4.846338421167676e-05, + "loss": 1.2622, + "step": 780 + }, + { + "epoch": 0.11247119815668202, + "grad_norm": 2.5422956943511963, + "learning_rate": 4.845947763195488e-05, + "loss": 0.3748, + "step": 781 + }, + { + "epoch": 0.1126152073732719, + "grad_norm": 4.281620979309082, + "learning_rate": 4.845556625049288e-05, + "loss": 0.6888, + "step": 782 + }, + { + "epoch": 0.11275921658986175, + "grad_norm": 1.7741386890411377, + "learning_rate": 4.845165006809136e-05, + "loss": 0.2711, + "step": 783 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 5.280117988586426, + "learning_rate": 4.8447729085551886e-05, + "loss": 1.2479, + "step": 784 + }, + { + "epoch": 0.11304723502304148, + "grad_norm": 7.338584899902344, + "learning_rate": 4.844380330367701e-05, + "loss": 0.9625, + "step": 785 + }, + { + "epoch": 0.11319124423963134, + "grad_norm": 2.885222911834717, + "learning_rate": 4.843987272327029e-05, + "loss": 2.2975, + "step": 786 + }, + { + "epoch": 0.1133352534562212, + "grad_norm": 3.8187179565429688, + "learning_rate": 4.8435937345136215e-05, + "loss": 0.8871, + "step": 787 + }, + { + "epoch": 0.11347926267281105, + "grad_norm": 4.084849834442139, + "learning_rate": 4.8431997170080304e-05, + "loss": 2.0531, + "step": 788 + }, + { + "epoch": 0.11362327188940093, + "grad_norm": 2.199028253555298, + "learning_rate": 4.8428052198909045e-05, + "loss": 1.0282, + "step": 789 + }, + { + "epoch": 0.11376728110599078, + "grad_norm": 3.9906249046325684, + "learning_rate": 4.84241024324299e-05, + "loss": 0.7168, + "step": 790 + }, + { + "epoch": 0.11391129032258064, + "grad_norm": 3.091651201248169, + "learning_rate": 4.842014787145132e-05, + "loss": 1.0273, + "step": 791 + }, + { + "epoch": 0.11405529953917051, + "grad_norm": 2.534789562225342, + "learning_rate": 4.8416188516782715e-05, + "loss": 0.3782, + "step": 792 + }, + { + "epoch": 0.11419930875576037, + "grad_norm": 4.18092155456543, + "learning_rate": 4.841222436923451e-05, + "loss": 0.5337, + "step": 793 + }, + { + "epoch": 0.11434331797235023, + "grad_norm": 5.815586090087891, + "learning_rate": 4.840825542961811e-05, + "loss": 1.9452, + "step": 794 + }, + { + "epoch": 0.11448732718894009, + "grad_norm": 3.7844650745391846, + "learning_rate": 4.8404281698745865e-05, + "loss": 1.7035, + "step": 795 + }, + { + "epoch": 0.11463133640552996, + "grad_norm": 4.6191558837890625, + "learning_rate": 4.840030317743114e-05, + "loss": 2.493, + "step": 796 + }, + { + "epoch": 0.11477534562211981, + "grad_norm": 1.6598927974700928, + "learning_rate": 4.839631986648825e-05, + "loss": 0.5581, + "step": 797 + }, + { + "epoch": 0.11491935483870967, + "grad_norm": 1.2069417238235474, + "learning_rate": 4.839233176673253e-05, + "loss": 4.5994, + "step": 798 + }, + { + "epoch": 0.11506336405529954, + "grad_norm": 5.031913757324219, + "learning_rate": 4.838833887898026e-05, + "loss": 1.3121, + "step": 799 + }, + { + "epoch": 0.1152073732718894, + "grad_norm": 4.255652904510498, + "learning_rate": 4.838434120404872e-05, + "loss": 0.7792, + "step": 800 + }, + { + "epoch": 0.11535138248847926, + "grad_norm": 1.5169037580490112, + "learning_rate": 4.8380338742756157e-05, + "loss": 0.7047, + "step": 801 + }, + { + "epoch": 0.11549539170506913, + "grad_norm": 1.7469778060913086, + "learning_rate": 4.837633149592181e-05, + "loss": 0.5891, + "step": 802 + }, + { + "epoch": 0.11563940092165899, + "grad_norm": 3.612107038497925, + "learning_rate": 4.837231946436589e-05, + "loss": 1.2101, + "step": 803 + }, + { + "epoch": 0.11578341013824885, + "grad_norm": 4.141978740692139, + "learning_rate": 4.836830264890959e-05, + "loss": 1.5791, + "step": 804 + }, + { + "epoch": 0.1159274193548387, + "grad_norm": 2.845172643661499, + "learning_rate": 4.836428105037508e-05, + "loss": 2.3962, + "step": 805 + }, + { + "epoch": 0.11607142857142858, + "grad_norm": 3.8117563724517822, + "learning_rate": 4.83602546695855e-05, + "loss": 0.9326, + "step": 806 + }, + { + "epoch": 0.11621543778801843, + "grad_norm": 4.023331642150879, + "learning_rate": 4.8356223507364996e-05, + "loss": 2.3817, + "step": 807 + }, + { + "epoch": 0.11635944700460829, + "grad_norm": 2.2710177898406982, + "learning_rate": 4.835218756453867e-05, + "loss": 1.5861, + "step": 808 + }, + { + "epoch": 0.11650345622119816, + "grad_norm": 8.185074806213379, + "learning_rate": 4.834814684193261e-05, + "loss": 1.8127, + "step": 809 + }, + { + "epoch": 0.11664746543778802, + "grad_norm": 4.088139057159424, + "learning_rate": 4.834410134037386e-05, + "loss": 0.9368, + "step": 810 + }, + { + "epoch": 0.11679147465437788, + "grad_norm": 5.423403263092041, + "learning_rate": 4.8340051060690494e-05, + "loss": 1.5595, + "step": 811 + }, + { + "epoch": 0.11693548387096774, + "grad_norm": 4.157839775085449, + "learning_rate": 4.833599600371152e-05, + "loss": 1.6971, + "step": 812 + }, + { + "epoch": 0.1170794930875576, + "grad_norm": 1.7779569625854492, + "learning_rate": 4.833193617026692e-05, + "loss": 0.2177, + "step": 813 + }, + { + "epoch": 0.11722350230414746, + "grad_norm": 3.138317346572876, + "learning_rate": 4.832787156118769e-05, + "loss": 3.0398, + "step": 814 + }, + { + "epoch": 0.11736751152073732, + "grad_norm": 6.497288703918457, + "learning_rate": 4.832380217730578e-05, + "loss": 1.5658, + "step": 815 + }, + { + "epoch": 0.1175115207373272, + "grad_norm": 2.4292852878570557, + "learning_rate": 4.831972801945412e-05, + "loss": 0.443, + "step": 816 + }, + { + "epoch": 0.11765552995391705, + "grad_norm": 2.7780532836914062, + "learning_rate": 4.831564908846661e-05, + "loss": 0.4888, + "step": 817 + }, + { + "epoch": 0.11779953917050691, + "grad_norm": 2.4792752265930176, + "learning_rate": 4.831156538517815e-05, + "loss": 1.47, + "step": 818 + }, + { + "epoch": 0.11794354838709678, + "grad_norm": 3.9836902618408203, + "learning_rate": 4.830747691042459e-05, + "loss": 1.7045, + "step": 819 + }, + { + "epoch": 0.11808755760368664, + "grad_norm": 7.291478157043457, + "learning_rate": 4.830338366504277e-05, + "loss": 1.9676, + "step": 820 + }, + { + "epoch": 0.1182315668202765, + "grad_norm": 3.394331455230713, + "learning_rate": 4.829928564987051e-05, + "loss": 0.6592, + "step": 821 + }, + { + "epoch": 0.11837557603686635, + "grad_norm": 4.980962753295898, + "learning_rate": 4.8295182865746604e-05, + "loss": 1.0994, + "step": 822 + }, + { + "epoch": 0.11851958525345622, + "grad_norm": 3.414033889770508, + "learning_rate": 4.82910753135108e-05, + "loss": 1.9506, + "step": 823 + }, + { + "epoch": 0.11866359447004608, + "grad_norm": 2.7856876850128174, + "learning_rate": 4.828696299400387e-05, + "loss": 0.5871, + "step": 824 + }, + { + "epoch": 0.11880760368663594, + "grad_norm": 3.0859103202819824, + "learning_rate": 4.8282845908067507e-05, + "loss": 0.5972, + "step": 825 + }, + { + "epoch": 0.11895161290322581, + "grad_norm": 4.101122856140137, + "learning_rate": 4.8278724056544424e-05, + "loss": 1.5593, + "step": 826 + }, + { + "epoch": 0.11909562211981567, + "grad_norm": 4.62990140914917, + "learning_rate": 4.827459744027828e-05, + "loss": 0.711, + "step": 827 + }, + { + "epoch": 0.11923963133640553, + "grad_norm": 3.4495649337768555, + "learning_rate": 4.827046606011372e-05, + "loss": 0.7396, + "step": 828 + }, + { + "epoch": 0.11938364055299538, + "grad_norm": 3.3950114250183105, + "learning_rate": 4.826632991689638e-05, + "loss": 1.586, + "step": 829 + }, + { + "epoch": 0.11952764976958526, + "grad_norm": 3.292325735092163, + "learning_rate": 4.8262189011472834e-05, + "loss": 0.9181, + "step": 830 + }, + { + "epoch": 0.11967165898617511, + "grad_norm": 4.442233085632324, + "learning_rate": 4.825804334469066e-05, + "loss": 2.1645, + "step": 831 + }, + { + "epoch": 0.11981566820276497, + "grad_norm": 4.785717487335205, + "learning_rate": 4.8253892917398414e-05, + "loss": 0.718, + "step": 832 + }, + { + "epoch": 0.11995967741935484, + "grad_norm": 4.78377103805542, + "learning_rate": 4.82497377304456e-05, + "loss": 0.8974, + "step": 833 + }, + { + "epoch": 0.1201036866359447, + "grad_norm": 4.096307754516602, + "learning_rate": 4.824557778468272e-05, + "loss": 1.3453, + "step": 834 + }, + { + "epoch": 0.12024769585253456, + "grad_norm": 2.7127060890197754, + "learning_rate": 4.824141308096124e-05, + "loss": 0.4644, + "step": 835 + }, + { + "epoch": 0.12039170506912443, + "grad_norm": 3.784881114959717, + "learning_rate": 4.8237243620133594e-05, + "loss": 1.0856, + "step": 836 + }, + { + "epoch": 0.12053571428571429, + "grad_norm": 3.3280937671661377, + "learning_rate": 4.82330694030532e-05, + "loss": 0.8089, + "step": 837 + }, + { + "epoch": 0.12067972350230415, + "grad_norm": 5.040460109710693, + "learning_rate": 4.822889043057446e-05, + "loss": 2.6403, + "step": 838 + }, + { + "epoch": 0.120823732718894, + "grad_norm": 5.149562358856201, + "learning_rate": 4.822470670355271e-05, + "loss": 2.6375, + "step": 839 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 3.078108310699463, + "learning_rate": 4.822051822284431e-05, + "loss": 1.2018, + "step": 840 + }, + { + "epoch": 0.12111175115207373, + "grad_norm": 5.586456298828125, + "learning_rate": 4.821632498930656e-05, + "loss": 1.9191, + "step": 841 + }, + { + "epoch": 0.12125576036866359, + "grad_norm": 5.048520565032959, + "learning_rate": 4.821212700379773e-05, + "loss": 2.5323, + "step": 842 + }, + { + "epoch": 0.12139976958525346, + "grad_norm": 1.8799121379852295, + "learning_rate": 4.8207924267177084e-05, + "loss": 0.2089, + "step": 843 + }, + { + "epoch": 0.12154377880184332, + "grad_norm": 4.67935848236084, + "learning_rate": 4.820371678030485e-05, + "loss": 2.1032, + "step": 844 + }, + { + "epoch": 0.12168778801843318, + "grad_norm": 3.911177396774292, + "learning_rate": 4.819950454404221e-05, + "loss": 1.8171, + "step": 845 + }, + { + "epoch": 0.12183179723502305, + "grad_norm": 3.54793119430542, + "learning_rate": 4.8195287559251356e-05, + "loss": 1.3324, + "step": 846 + }, + { + "epoch": 0.1219758064516129, + "grad_norm": 3.9724390506744385, + "learning_rate": 4.819106582679542e-05, + "loss": 0.7683, + "step": 847 + }, + { + "epoch": 0.12211981566820276, + "grad_norm": 5.300734519958496, + "learning_rate": 4.818683934753851e-05, + "loss": 1.8116, + "step": 848 + }, + { + "epoch": 0.12226382488479262, + "grad_norm": 3.98686146736145, + "learning_rate": 4.818260812234572e-05, + "loss": 1.1141, + "step": 849 + }, + { + "epoch": 0.12240783410138249, + "grad_norm": 3.2157142162323, + "learning_rate": 4.817837215208311e-05, + "loss": 0.8996, + "step": 850 + }, + { + "epoch": 0.12255184331797235, + "grad_norm": 5.104499340057373, + "learning_rate": 4.817413143761769e-05, + "loss": 1.4677, + "step": 851 + }, + { + "epoch": 0.12269585253456221, + "grad_norm": 7.334455966949463, + "learning_rate": 4.816988597981748e-05, + "loss": 1.6004, + "step": 852 + }, + { + "epoch": 0.12283986175115208, + "grad_norm": 6.779260158538818, + "learning_rate": 4.8165635779551446e-05, + "loss": 1.397, + "step": 853 + }, + { + "epoch": 0.12298387096774194, + "grad_norm": 4.294000625610352, + "learning_rate": 4.816138083768952e-05, + "loss": 1.3103, + "step": 854 + }, + { + "epoch": 0.1231278801843318, + "grad_norm": 4.194522857666016, + "learning_rate": 4.815712115510261e-05, + "loss": 2.3709, + "step": 855 + }, + { + "epoch": 0.12327188940092165, + "grad_norm": 6.175501823425293, + "learning_rate": 4.815285673266262e-05, + "loss": 1.3595, + "step": 856 + }, + { + "epoch": 0.12341589861751152, + "grad_norm": 3.0968589782714844, + "learning_rate": 4.8148587571242373e-05, + "loss": 0.6824, + "step": 857 + }, + { + "epoch": 0.12355990783410138, + "grad_norm": 6.115101337432861, + "learning_rate": 4.8144313671715716e-05, + "loss": 2.1205, + "step": 858 + }, + { + "epoch": 0.12370391705069124, + "grad_norm": 4.792463302612305, + "learning_rate": 4.814003503495743e-05, + "loss": 0.715, + "step": 859 + }, + { + "epoch": 0.12384792626728111, + "grad_norm": 4.197549343109131, + "learning_rate": 4.8135751661843275e-05, + "loss": 0.8249, + "step": 860 + }, + { + "epoch": 0.12399193548387097, + "grad_norm": 5.294054985046387, + "learning_rate": 4.813146355324998e-05, + "loss": 1.9881, + "step": 861 + }, + { + "epoch": 0.12413594470046083, + "grad_norm": 5.673846244812012, + "learning_rate": 4.812717071005525e-05, + "loss": 1.9075, + "step": 862 + }, + { + "epoch": 0.1242799539170507, + "grad_norm": 4.169858455657959, + "learning_rate": 4.8122873133137756e-05, + "loss": 0.8117, + "step": 863 + }, + { + "epoch": 0.12442396313364056, + "grad_norm": 2.3217875957489014, + "learning_rate": 4.811857082337713e-05, + "loss": 0.6826, + "step": 864 + }, + { + "epoch": 0.12456797235023041, + "grad_norm": 4.1543707847595215, + "learning_rate": 4.811426378165398e-05, + "loss": 1.6596, + "step": 865 + }, + { + "epoch": 0.12471198156682027, + "grad_norm": 2.812429666519165, + "learning_rate": 4.810995200884988e-05, + "loss": 0.8303, + "step": 866 + }, + { + "epoch": 0.12485599078341014, + "grad_norm": 4.086765289306641, + "learning_rate": 4.8105635505847376e-05, + "loss": 0.6705, + "step": 867 + }, + { + "epoch": 0.125, + "grad_norm": 4.196064472198486, + "learning_rate": 4.8101314273529976e-05, + "loss": 1.8157, + "step": 868 + }, + { + "epoch": 0.12514400921658986, + "grad_norm": 2.5024776458740234, + "learning_rate": 4.8096988312782174e-05, + "loss": 2.3085, + "step": 869 + }, + { + "epoch": 0.12528801843317972, + "grad_norm": 4.333360195159912, + "learning_rate": 4.80926576244894e-05, + "loss": 1.0333, + "step": 870 + }, + { + "epoch": 0.12543202764976957, + "grad_norm": 3.9248955249786377, + "learning_rate": 4.8088322209538074e-05, + "loss": 1.3525, + "step": 871 + }, + { + "epoch": 0.12557603686635946, + "grad_norm": 5.198975086212158, + "learning_rate": 4.8083982068815586e-05, + "loss": 1.3615, + "step": 872 + }, + { + "epoch": 0.12572004608294932, + "grad_norm": 5.518678188323975, + "learning_rate": 4.807963720321028e-05, + "loss": 1.6529, + "step": 873 + }, + { + "epoch": 0.12586405529953917, + "grad_norm": 7.256701946258545, + "learning_rate": 4.807528761361147e-05, + "loss": 1.7532, + "step": 874 + }, + { + "epoch": 0.12600806451612903, + "grad_norm": 5.851320743560791, + "learning_rate": 4.807093330090945e-05, + "loss": 1.1737, + "step": 875 + }, + { + "epoch": 0.1261520737327189, + "grad_norm": 5.852200031280518, + "learning_rate": 4.8066574265995464e-05, + "loss": 1.1501, + "step": 876 + }, + { + "epoch": 0.12629608294930875, + "grad_norm": 4.689886093139648, + "learning_rate": 4.806221050976173e-05, + "loss": 2.0623, + "step": 877 + }, + { + "epoch": 0.1264400921658986, + "grad_norm": 5.351761341094971, + "learning_rate": 4.805784203310143e-05, + "loss": 1.3581, + "step": 878 + }, + { + "epoch": 0.1265841013824885, + "grad_norm": 5.624961853027344, + "learning_rate": 4.805346883690871e-05, + "loss": 0.7498, + "step": 879 + }, + { + "epoch": 0.12672811059907835, + "grad_norm": 3.694092035293579, + "learning_rate": 4.80490909220787e-05, + "loss": 0.8784, + "step": 880 + }, + { + "epoch": 0.1268721198156682, + "grad_norm": 3.476034641265869, + "learning_rate": 4.804470828950748e-05, + "loss": 2.2656, + "step": 881 + }, + { + "epoch": 0.12701612903225806, + "grad_norm": 5.1272711753845215, + "learning_rate": 4.8040320940092076e-05, + "loss": 1.1823, + "step": 882 + }, + { + "epoch": 0.12716013824884792, + "grad_norm": 8.784689903259277, + "learning_rate": 4.803592887473053e-05, + "loss": 2.8748, + "step": 883 + }, + { + "epoch": 0.12730414746543778, + "grad_norm": 5.5509538650512695, + "learning_rate": 4.80315320943218e-05, + "loss": 0.843, + "step": 884 + }, + { + "epoch": 0.12744815668202766, + "grad_norm": 4.456205368041992, + "learning_rate": 4.802713059976583e-05, + "loss": 1.2347, + "step": 885 + }, + { + "epoch": 0.12759216589861752, + "grad_norm": 4.454754829406738, + "learning_rate": 4.802272439196354e-05, + "loss": 0.8432, + "step": 886 + }, + { + "epoch": 0.12773617511520738, + "grad_norm": 5.512842655181885, + "learning_rate": 4.801831347181679e-05, + "loss": 2.3318, + "step": 887 + }, + { + "epoch": 0.12788018433179724, + "grad_norm": 3.344829797744751, + "learning_rate": 4.801389784022843e-05, + "loss": 0.275, + "step": 888 + }, + { + "epoch": 0.1280241935483871, + "grad_norm": 3.842590093612671, + "learning_rate": 4.800947749810224e-05, + "loss": 0.5523, + "step": 889 + }, + { + "epoch": 0.12816820276497695, + "grad_norm": 7.323307991027832, + "learning_rate": 4.8005052446343016e-05, + "loss": 2.9852, + "step": 890 + }, + { + "epoch": 0.1283122119815668, + "grad_norm": 5.162419319152832, + "learning_rate": 4.800062268585647e-05, + "loss": 1.296, + "step": 891 + }, + { + "epoch": 0.1284562211981567, + "grad_norm": 3.5330557823181152, + "learning_rate": 4.79961882175493e-05, + "loss": 1.1257, + "step": 892 + }, + { + "epoch": 0.12860023041474655, + "grad_norm": 2.1054258346557617, + "learning_rate": 4.799174904232916e-05, + "loss": 0.18, + "step": 893 + }, + { + "epoch": 0.1287442396313364, + "grad_norm": 6.363369941711426, + "learning_rate": 4.7987305161104665e-05, + "loss": 1.725, + "step": 894 + }, + { + "epoch": 0.12888824884792627, + "grad_norm": 3.811525344848633, + "learning_rate": 4.7982856574785415e-05, + "loss": 0.6996, + "step": 895 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 2.372326374053955, + "learning_rate": 4.7978403284281946e-05, + "loss": 0.3132, + "step": 896 + }, + { + "epoch": 0.12917626728110598, + "grad_norm": 4.67622709274292, + "learning_rate": 4.7973945290505766e-05, + "loss": 1.5803, + "step": 897 + }, + { + "epoch": 0.12932027649769584, + "grad_norm": 3.453455686569214, + "learning_rate": 4.7969482594369354e-05, + "loss": 0.8545, + "step": 898 + }, + { + "epoch": 0.12946428571428573, + "grad_norm": 4.14625358581543, + "learning_rate": 4.7965015196786143e-05, + "loss": 2.3974, + "step": 899 + }, + { + "epoch": 0.12960829493087558, + "grad_norm": 1.5085270404815674, + "learning_rate": 4.796054309867053e-05, + "loss": 0.1274, + "step": 900 + }, + { + "epoch": 0.12975230414746544, + "grad_norm": 2.173369884490967, + "learning_rate": 4.795606630093788e-05, + "loss": 0.3193, + "step": 901 + }, + { + "epoch": 0.1298963133640553, + "grad_norm": 2.495344638824463, + "learning_rate": 4.795158480450449e-05, + "loss": 0.7611, + "step": 902 + }, + { + "epoch": 0.13004032258064516, + "grad_norm": 4.839320182800293, + "learning_rate": 4.794709861028768e-05, + "loss": 1.4171, + "step": 903 + }, + { + "epoch": 0.13018433179723501, + "grad_norm": 3.609281301498413, + "learning_rate": 4.7942607719205663e-05, + "loss": 2.807, + "step": 904 + }, + { + "epoch": 0.13032834101382487, + "grad_norm": 6.588963985443115, + "learning_rate": 4.793811213217766e-05, + "loss": 3.2934, + "step": 905 + }, + { + "epoch": 0.13047235023041476, + "grad_norm": 5.225557327270508, + "learning_rate": 4.793361185012384e-05, + "loss": 1.1375, + "step": 906 + }, + { + "epoch": 0.13061635944700462, + "grad_norm": 7.463988780975342, + "learning_rate": 4.792910687396533e-05, + "loss": 1.976, + "step": 907 + }, + { + "epoch": 0.13076036866359447, + "grad_norm": 2.3279218673706055, + "learning_rate": 4.79245972046242e-05, + "loss": 0.5052, + "step": 908 + }, + { + "epoch": 0.13090437788018433, + "grad_norm": 2.6533284187316895, + "learning_rate": 4.7920082843023527e-05, + "loss": 0.2591, + "step": 909 + }, + { + "epoch": 0.1310483870967742, + "grad_norm": 5.206713676452637, + "learning_rate": 4.791556379008731e-05, + "loss": 2.5531, + "step": 910 + }, + { + "epoch": 0.13119239631336405, + "grad_norm": 4.001986980438232, + "learning_rate": 4.791104004674052e-05, + "loss": 3.0069, + "step": 911 + }, + { + "epoch": 0.1313364055299539, + "grad_norm": 4.066767692565918, + "learning_rate": 4.7906511613909087e-05, + "loss": 3.6898, + "step": 912 + }, + { + "epoch": 0.1314804147465438, + "grad_norm": 4.131237030029297, + "learning_rate": 4.7901978492519894e-05, + "loss": 1.2579, + "step": 913 + }, + { + "epoch": 0.13162442396313365, + "grad_norm": 3.6639254093170166, + "learning_rate": 4.78974406835008e-05, + "loss": 0.8012, + "step": 914 + }, + { + "epoch": 0.1317684331797235, + "grad_norm": 3.2663846015930176, + "learning_rate": 4.789289818778061e-05, + "loss": 0.7792, + "step": 915 + }, + { + "epoch": 0.13191244239631336, + "grad_norm": 2.3043107986450195, + "learning_rate": 4.78883510062891e-05, + "loss": 0.2905, + "step": 916 + }, + { + "epoch": 0.13205645161290322, + "grad_norm": 5.111736297607422, + "learning_rate": 4.788379913995698e-05, + "loss": 2.0021, + "step": 917 + }, + { + "epoch": 0.13220046082949308, + "grad_norm": 3.3639087677001953, + "learning_rate": 4.7879242589715955e-05, + "loss": 0.8653, + "step": 918 + }, + { + "epoch": 0.13234447004608296, + "grad_norm": 2.072726249694824, + "learning_rate": 4.7874681356498657e-05, + "loss": 0.2582, + "step": 919 + }, + { + "epoch": 0.13248847926267282, + "grad_norm": 4.815186977386475, + "learning_rate": 4.78701154412387e-05, + "loss": 1.2409, + "step": 920 + }, + { + "epoch": 0.13263248847926268, + "grad_norm": 3.5665135383605957, + "learning_rate": 4.786554484487064e-05, + "loss": 0.6835, + "step": 921 + }, + { + "epoch": 0.13277649769585254, + "grad_norm": 5.397080421447754, + "learning_rate": 4.786096956833001e-05, + "loss": 1.5267, + "step": 922 + }, + { + "epoch": 0.1329205069124424, + "grad_norm": 5.17304801940918, + "learning_rate": 4.7856389612553256e-05, + "loss": 1.3293, + "step": 923 + }, + { + "epoch": 0.13306451612903225, + "grad_norm": 3.445164918899536, + "learning_rate": 4.785180497847786e-05, + "loss": 1.0404, + "step": 924 + }, + { + "epoch": 0.1332085253456221, + "grad_norm": 6.623275279998779, + "learning_rate": 4.7847215667042165e-05, + "loss": 2.7474, + "step": 925 + }, + { + "epoch": 0.133352534562212, + "grad_norm": 5.312112331390381, + "learning_rate": 4.784262167918556e-05, + "loss": 0.4643, + "step": 926 + }, + { + "epoch": 0.13349654377880185, + "grad_norm": 3.0414247512817383, + "learning_rate": 4.783802301584834e-05, + "loss": 1.0526, + "step": 927 + }, + { + "epoch": 0.1336405529953917, + "grad_norm": 4.880441665649414, + "learning_rate": 4.783341967797177e-05, + "loss": 2.3499, + "step": 928 + }, + { + "epoch": 0.13378456221198157, + "grad_norm": 3.704800844192505, + "learning_rate": 4.782881166649808e-05, + "loss": 1.1866, + "step": 929 + }, + { + "epoch": 0.13392857142857142, + "grad_norm": 5.5400214195251465, + "learning_rate": 4.782419898237044e-05, + "loss": 0.8907, + "step": 930 + }, + { + "epoch": 0.13407258064516128, + "grad_norm": 6.657557487487793, + "learning_rate": 4.781958162653297e-05, + "loss": 1.0145, + "step": 931 + }, + { + "epoch": 0.13421658986175114, + "grad_norm": 4.112020015716553, + "learning_rate": 4.7814959599930794e-05, + "loss": 0.991, + "step": 932 + }, + { + "epoch": 0.13436059907834103, + "grad_norm": 2.7419893741607666, + "learning_rate": 4.781033290350993e-05, + "loss": 2.3155, + "step": 933 + }, + { + "epoch": 0.13450460829493088, + "grad_norm": 5.777390480041504, + "learning_rate": 4.7805701538217404e-05, + "loss": 0.9644, + "step": 934 + }, + { + "epoch": 0.13464861751152074, + "grad_norm": 2.982146978378296, + "learning_rate": 4.7801065505001155e-05, + "loss": 0.6968, + "step": 935 + }, + { + "epoch": 0.1347926267281106, + "grad_norm": 8.27589225769043, + "learning_rate": 4.779642480481011e-05, + "loss": 3.125, + "step": 936 + }, + { + "epoch": 0.13493663594470046, + "grad_norm": 1.5288752317428589, + "learning_rate": 4.779177943859413e-05, + "loss": 0.1515, + "step": 937 + }, + { + "epoch": 0.1350806451612903, + "grad_norm": 4.467295169830322, + "learning_rate": 4.778712940730404e-05, + "loss": 1.1438, + "step": 938 + }, + { + "epoch": 0.13522465437788017, + "grad_norm": 4.909008502960205, + "learning_rate": 4.778247471189163e-05, + "loss": 0.7315, + "step": 939 + }, + { + "epoch": 0.13536866359447006, + "grad_norm": 4.6534905433654785, + "learning_rate": 4.777781535330962e-05, + "loss": 1.0966, + "step": 940 + }, + { + "epoch": 0.13551267281105991, + "grad_norm": 5.853962421417236, + "learning_rate": 4.777315133251171e-05, + "loss": 2.4386, + "step": 941 + }, + { + "epoch": 0.13565668202764977, + "grad_norm": 6.04210901260376, + "learning_rate": 4.776848265045253e-05, + "loss": 1.8792, + "step": 942 + }, + { + "epoch": 0.13580069124423963, + "grad_norm": 2.639909029006958, + "learning_rate": 4.776380930808769e-05, + "loss": 0.3711, + "step": 943 + }, + { + "epoch": 0.1359447004608295, + "grad_norm": 2.4481422901153564, + "learning_rate": 4.775913130637373e-05, + "loss": 0.527, + "step": 944 + }, + { + "epoch": 0.13608870967741934, + "grad_norm": 2.013697624206543, + "learning_rate": 4.775444864626816e-05, + "loss": 0.4213, + "step": 945 + }, + { + "epoch": 0.13623271889400923, + "grad_norm": 2.25225567817688, + "learning_rate": 4.7749761328729436e-05, + "loss": 0.3395, + "step": 946 + }, + { + "epoch": 0.1363767281105991, + "grad_norm": 3.857779026031494, + "learning_rate": 4.774506935471697e-05, + "loss": 0.7154, + "step": 947 + }, + { + "epoch": 0.13652073732718895, + "grad_norm": 3.1421754360198975, + "learning_rate": 4.774037272519112e-05, + "loss": 0.3293, + "step": 948 + }, + { + "epoch": 0.1366647465437788, + "grad_norm": 4.067907333374023, + "learning_rate": 4.773567144111321e-05, + "loss": 2.7753, + "step": 949 + }, + { + "epoch": 0.13680875576036866, + "grad_norm": 5.429750442504883, + "learning_rate": 4.77309655034455e-05, + "loss": 1.2917, + "step": 950 + }, + { + "epoch": 0.13695276497695852, + "grad_norm": 3.5713839530944824, + "learning_rate": 4.772625491315123e-05, + "loss": 0.6445, + "step": 951 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 3.5764806270599365, + "learning_rate": 4.772153967119456e-05, + "loss": 2.1058, + "step": 952 + }, + { + "epoch": 0.13724078341013826, + "grad_norm": 5.686327934265137, + "learning_rate": 4.7716819778540625e-05, + "loss": 1.816, + "step": 953 + }, + { + "epoch": 0.13738479262672812, + "grad_norm": 1.4938932657241821, + "learning_rate": 4.7712095236155496e-05, + "loss": 0.1286, + "step": 954 + }, + { + "epoch": 0.13752880184331798, + "grad_norm": 2.21421480178833, + "learning_rate": 4.7707366045006205e-05, + "loss": 0.2147, + "step": 955 + }, + { + "epoch": 0.13767281105990783, + "grad_norm": 3.802833318710327, + "learning_rate": 4.770263220606074e-05, + "loss": 2.7477, + "step": 956 + }, + { + "epoch": 0.1378168202764977, + "grad_norm": 2.062763214111328, + "learning_rate": 4.7697893720288037e-05, + "loss": 0.2776, + "step": 957 + }, + { + "epoch": 0.13796082949308755, + "grad_norm": 6.797392845153809, + "learning_rate": 4.769315058865796e-05, + "loss": 2.8932, + "step": 958 + }, + { + "epoch": 0.1381048387096774, + "grad_norm": 2.544973611831665, + "learning_rate": 4.768840281214136e-05, + "loss": 0.4144, + "step": 959 + }, + { + "epoch": 0.1382488479262673, + "grad_norm": 4.850215435028076, + "learning_rate": 4.768365039171002e-05, + "loss": 0.7269, + "step": 960 + }, + { + "epoch": 0.13839285714285715, + "grad_norm": 10.345398902893066, + "learning_rate": 4.767889332833667e-05, + "loss": 2.77, + "step": 961 + }, + { + "epoch": 0.138536866359447, + "grad_norm": 3.908308267593384, + "learning_rate": 4.767413162299501e-05, + "loss": 0.8191, + "step": 962 + }, + { + "epoch": 0.13868087557603687, + "grad_norm": 3.1685791015625, + "learning_rate": 4.766936527665967e-05, + "loss": 0.5054, + "step": 963 + }, + { + "epoch": 0.13882488479262672, + "grad_norm": 3.2342231273651123, + "learning_rate": 4.766459429030624e-05, + "loss": 0.768, + "step": 964 + }, + { + "epoch": 0.13896889400921658, + "grad_norm": 2.964406967163086, + "learning_rate": 4.765981866491125e-05, + "loss": 1.5783, + "step": 965 + }, + { + "epoch": 0.13911290322580644, + "grad_norm": 5.215517044067383, + "learning_rate": 4.765503840145219e-05, + "loss": 1.1551, + "step": 966 + }, + { + "epoch": 0.13925691244239632, + "grad_norm": 2.869062900543213, + "learning_rate": 4.7650253500907494e-05, + "loss": 1.1666, + "step": 967 + }, + { + "epoch": 0.13940092165898618, + "grad_norm": 5.752099514007568, + "learning_rate": 4.764546396425654e-05, + "loss": 2.9742, + "step": 968 + }, + { + "epoch": 0.13954493087557604, + "grad_norm": 0.9724917411804199, + "learning_rate": 4.7640669792479676e-05, + "loss": 4.7665, + "step": 969 + }, + { + "epoch": 0.1396889400921659, + "grad_norm": 5.367128849029541, + "learning_rate": 4.763587098655817e-05, + "loss": 1.019, + "step": 970 + }, + { + "epoch": 0.13983294930875576, + "grad_norm": 5.6281867027282715, + "learning_rate": 4.7631067547474265e-05, + "loss": 2.1102, + "step": 971 + }, + { + "epoch": 0.1399769585253456, + "grad_norm": 5.329975128173828, + "learning_rate": 4.7626259476211135e-05, + "loss": 1.8632, + "step": 972 + }, + { + "epoch": 0.14012096774193547, + "grad_norm": 3.2802412509918213, + "learning_rate": 4.762144677375291e-05, + "loss": 1.2663, + "step": 973 + }, + { + "epoch": 0.14026497695852536, + "grad_norm": 6.589924335479736, + "learning_rate": 4.7616629441084655e-05, + "loss": 1.2158, + "step": 974 + }, + { + "epoch": 0.1404089861751152, + "grad_norm": 2.8345420360565186, + "learning_rate": 4.76118074791924e-05, + "loss": 0.3298, + "step": 975 + }, + { + "epoch": 0.14055299539170507, + "grad_norm": 4.956574440002441, + "learning_rate": 4.7606980889063114e-05, + "loss": 1.8916, + "step": 976 + }, + { + "epoch": 0.14069700460829493, + "grad_norm": 7.475661754608154, + "learning_rate": 4.760214967168472e-05, + "loss": 2.2554, + "step": 977 + }, + { + "epoch": 0.1408410138248848, + "grad_norm": 1.0388163328170776, + "learning_rate": 4.7597313828046075e-05, + "loss": 4.1948, + "step": 978 + }, + { + "epoch": 0.14098502304147464, + "grad_norm": 5.04349422454834, + "learning_rate": 4.759247335913699e-05, + "loss": 1.6292, + "step": 979 + }, + { + "epoch": 0.14112903225806453, + "grad_norm": 4.223476886749268, + "learning_rate": 4.7587628265948235e-05, + "loss": 0.6039, + "step": 980 + }, + { + "epoch": 0.1412730414746544, + "grad_norm": 4.342326641082764, + "learning_rate": 4.7582778549471494e-05, + "loss": 1.9528, + "step": 981 + }, + { + "epoch": 0.14141705069124424, + "grad_norm": 5.761305809020996, + "learning_rate": 4.757792421069944e-05, + "loss": 2.6692, + "step": 982 + }, + { + "epoch": 0.1415610599078341, + "grad_norm": 5.287510395050049, + "learning_rate": 4.757306525062567e-05, + "loss": 2.2327, + "step": 983 + }, + { + "epoch": 0.14170506912442396, + "grad_norm": 4.362831115722656, + "learning_rate": 4.75682016702447e-05, + "loss": 1.2499, + "step": 984 + }, + { + "epoch": 0.14184907834101382, + "grad_norm": 3.312119960784912, + "learning_rate": 4.756333347055205e-05, + "loss": 0.5755, + "step": 985 + }, + { + "epoch": 0.14199308755760368, + "grad_norm": 6.774049282073975, + "learning_rate": 4.7558460652544146e-05, + "loss": 1.5815, + "step": 986 + }, + { + "epoch": 0.14213709677419356, + "grad_norm": 3.497359275817871, + "learning_rate": 4.755358321721836e-05, + "loss": 2.6552, + "step": 987 + }, + { + "epoch": 0.14228110599078342, + "grad_norm": 3.019765615463257, + "learning_rate": 4.7548701165573003e-05, + "loss": 1.038, + "step": 988 + }, + { + "epoch": 0.14242511520737328, + "grad_norm": 6.13399600982666, + "learning_rate": 4.754381449860738e-05, + "loss": 1.563, + "step": 989 + }, + { + "epoch": 0.14256912442396313, + "grad_norm": 4.460932731628418, + "learning_rate": 4.753892321732169e-05, + "loss": 0.7232, + "step": 990 + }, + { + "epoch": 0.142713133640553, + "grad_norm": 2.0888030529022217, + "learning_rate": 4.7534027322717076e-05, + "loss": 0.2593, + "step": 991 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 3.104433298110962, + "learning_rate": 4.7529126815795656e-05, + "loss": 0.7811, + "step": 992 + }, + { + "epoch": 0.1430011520737327, + "grad_norm": 6.3725080490112305, + "learning_rate": 4.752422169756048e-05, + "loss": 1.7914, + "step": 993 + }, + { + "epoch": 0.1431451612903226, + "grad_norm": 2.019559621810913, + "learning_rate": 4.751931196901553e-05, + "loss": 0.2594, + "step": 994 + }, + { + "epoch": 0.14328917050691245, + "grad_norm": 0.7305165529251099, + "learning_rate": 4.751439763116575e-05, + "loss": 0.065, + "step": 995 + }, + { + "epoch": 0.1434331797235023, + "grad_norm": 4.603130340576172, + "learning_rate": 4.750947868501701e-05, + "loss": 1.9054, + "step": 996 + }, + { + "epoch": 0.14357718894009217, + "grad_norm": 5.432317733764648, + "learning_rate": 4.7504555131576136e-05, + "loss": 1.1731, + "step": 997 + }, + { + "epoch": 0.14372119815668202, + "grad_norm": 7.25808048248291, + "learning_rate": 4.749962697185089e-05, + "loss": 1.9381, + "step": 998 + }, + { + "epoch": 0.14386520737327188, + "grad_norm": 4.7334370613098145, + "learning_rate": 4.749469420684997e-05, + "loss": 1.5904, + "step": 999 + }, + { + "epoch": 0.14400921658986174, + "grad_norm": 4.454786777496338, + "learning_rate": 4.748975683758304e-05, + "loss": 1.7573, + "step": 1000 + }, + { + "epoch": 0.14415322580645162, + "grad_norm": 4.903131008148193, + "learning_rate": 4.748481486506069e-05, + "loss": 2.6217, + "step": 1001 + }, + { + "epoch": 0.14429723502304148, + "grad_norm": 3.6456902027130127, + "learning_rate": 4.747986829029445e-05, + "loss": 1.1801, + "step": 1002 + }, + { + "epoch": 0.14444124423963134, + "grad_norm": 3.5882229804992676, + "learning_rate": 4.74749171142968e-05, + "loss": 1.1332, + "step": 1003 + }, + { + "epoch": 0.1445852534562212, + "grad_norm": 3.243081569671631, + "learning_rate": 4.746996133808115e-05, + "loss": 0.5814, + "step": 1004 + }, + { + "epoch": 0.14472926267281105, + "grad_norm": 4.828197956085205, + "learning_rate": 4.746500096266187e-05, + "loss": 1.6727, + "step": 1005 + }, + { + "epoch": 0.1448732718894009, + "grad_norm": 3.8699898719787598, + "learning_rate": 4.7460035989054255e-05, + "loss": 1.909, + "step": 1006 + }, + { + "epoch": 0.14501728110599077, + "grad_norm": 5.871050834655762, + "learning_rate": 4.745506641827455e-05, + "loss": 2.1942, + "step": 1007 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 3.163455009460449, + "learning_rate": 4.745009225133994e-05, + "loss": 0.5846, + "step": 1008 + }, + { + "epoch": 0.1453052995391705, + "grad_norm": 2.1266796588897705, + "learning_rate": 4.7445113489268544e-05, + "loss": 0.5698, + "step": 1009 + }, + { + "epoch": 0.14544930875576037, + "grad_norm": 2.7002837657928467, + "learning_rate": 4.744013013307943e-05, + "loss": 0.6612, + "step": 1010 + }, + { + "epoch": 0.14559331797235023, + "grad_norm": 2.763690948486328, + "learning_rate": 4.74351421837926e-05, + "loss": 0.3552, + "step": 1011 + }, + { + "epoch": 0.14573732718894009, + "grad_norm": 3.7716236114501953, + "learning_rate": 4.7430149642429e-05, + "loss": 1.1019, + "step": 1012 + }, + { + "epoch": 0.14588133640552994, + "grad_norm": 2.1239678859710693, + "learning_rate": 4.7425152510010514e-05, + "loss": 0.6279, + "step": 1013 + }, + { + "epoch": 0.14602534562211983, + "grad_norm": 4.8709211349487305, + "learning_rate": 4.742015078755998e-05, + "loss": 1.97, + "step": 1014 + }, + { + "epoch": 0.1461693548387097, + "grad_norm": 1.7189128398895264, + "learning_rate": 4.741514447610114e-05, + "loss": 4.6868, + "step": 1015 + }, + { + "epoch": 0.14631336405529954, + "grad_norm": 3.7994470596313477, + "learning_rate": 4.741013357665871e-05, + "loss": 1.1611, + "step": 1016 + }, + { + "epoch": 0.1464573732718894, + "grad_norm": 2.5511951446533203, + "learning_rate": 4.740511809025833e-05, + "loss": 0.2934, + "step": 1017 + }, + { + "epoch": 0.14660138248847926, + "grad_norm": 2.198805809020996, + "learning_rate": 4.740009801792658e-05, + "loss": 0.4056, + "step": 1018 + }, + { + "epoch": 0.14674539170506912, + "grad_norm": 2.951573610305786, + "learning_rate": 4.7395073360690985e-05, + "loss": 0.607, + "step": 1019 + }, + { + "epoch": 0.14688940092165897, + "grad_norm": 3.108076810836792, + "learning_rate": 4.739004411958e-05, + "loss": 0.4693, + "step": 1020 + }, + { + "epoch": 0.14703341013824886, + "grad_norm": 1.4672623872756958, + "learning_rate": 4.738501029562302e-05, + "loss": 0.2732, + "step": 1021 + }, + { + "epoch": 0.14717741935483872, + "grad_norm": 4.295193195343018, + "learning_rate": 4.737997188985038e-05, + "loss": 0.5802, + "step": 1022 + }, + { + "epoch": 0.14732142857142858, + "grad_norm": 3.8947227001190186, + "learning_rate": 4.737492890329335e-05, + "loss": 0.983, + "step": 1023 + }, + { + "epoch": 0.14746543778801843, + "grad_norm": 1.286924958229065, + "learning_rate": 4.7369881336984153e-05, + "loss": 0.0923, + "step": 1024 + }, + { + "epoch": 0.1476094470046083, + "grad_norm": 4.526074409484863, + "learning_rate": 4.736482919195593e-05, + "loss": 0.3747, + "step": 1025 + }, + { + "epoch": 0.14775345622119815, + "grad_norm": 5.0510687828063965, + "learning_rate": 4.735977246924275e-05, + "loss": 1.872, + "step": 1026 + }, + { + "epoch": 0.147897465437788, + "grad_norm": 3.918890953063965, + "learning_rate": 4.735471116987966e-05, + "loss": 0.5827, + "step": 1027 + }, + { + "epoch": 0.1480414746543779, + "grad_norm": 4.931419372558594, + "learning_rate": 4.73496452949026e-05, + "loss": 2.5229, + "step": 1028 + }, + { + "epoch": 0.14818548387096775, + "grad_norm": 10.018424987792969, + "learning_rate": 4.734457484534848e-05, + "loss": 3.2631, + "step": 1029 + }, + { + "epoch": 0.1483294930875576, + "grad_norm": 4.651330471038818, + "learning_rate": 4.733949982225511e-05, + "loss": 1.432, + "step": 1030 + }, + { + "epoch": 0.14847350230414746, + "grad_norm": 10.178098678588867, + "learning_rate": 4.733442022666128e-05, + "loss": 2.5269, + "step": 1031 + }, + { + "epoch": 0.14861751152073732, + "grad_norm": 6.973705291748047, + "learning_rate": 4.7329336059606684e-05, + "loss": 2.5524, + "step": 1032 + }, + { + "epoch": 0.14876152073732718, + "grad_norm": 5.217042446136475, + "learning_rate": 4.7324247322131955e-05, + "loss": 2.568, + "step": 1033 + }, + { + "epoch": 0.14890552995391704, + "grad_norm": 3.3824234008789062, + "learning_rate": 4.731915401527868e-05, + "loss": 2.6328, + "step": 1034 + }, + { + "epoch": 0.14904953917050692, + "grad_norm": 5.357469081878662, + "learning_rate": 4.731405614008936e-05, + "loss": 2.4347, + "step": 1035 + }, + { + "epoch": 0.14919354838709678, + "grad_norm": 8.216401100158691, + "learning_rate": 4.730895369760744e-05, + "loss": 1.4305, + "step": 1036 + }, + { + "epoch": 0.14933755760368664, + "grad_norm": 5.289060115814209, + "learning_rate": 4.73038466888773e-05, + "loss": 3.1935, + "step": 1037 + }, + { + "epoch": 0.1494815668202765, + "grad_norm": 2.581969738006592, + "learning_rate": 4.729873511494426e-05, + "loss": 0.3961, + "step": 1038 + }, + { + "epoch": 0.14962557603686635, + "grad_norm": 4.703259468078613, + "learning_rate": 4.729361897685456e-05, + "loss": 1.8665, + "step": 1039 + }, + { + "epoch": 0.1497695852534562, + "grad_norm": 3.5451676845550537, + "learning_rate": 4.72884982756554e-05, + "loss": 0.6132, + "step": 1040 + }, + { + "epoch": 0.1499135944700461, + "grad_norm": 4.779509544372559, + "learning_rate": 4.728337301239487e-05, + "loss": 1.1976, + "step": 1041 + }, + { + "epoch": 0.15005760368663595, + "grad_norm": 7.04539155960083, + "learning_rate": 4.727824318812205e-05, + "loss": 2.4858, + "step": 1042 + }, + { + "epoch": 0.1502016129032258, + "grad_norm": 4.8611674308776855, + "learning_rate": 4.72731088038869e-05, + "loss": 0.8454, + "step": 1043 + }, + { + "epoch": 0.15034562211981567, + "grad_norm": 3.003816604614258, + "learning_rate": 4.726796986074034e-05, + "loss": 0.6905, + "step": 1044 + }, + { + "epoch": 0.15048963133640553, + "grad_norm": 2.9667913913726807, + "learning_rate": 4.7262826359734244e-05, + "loss": 0.8539, + "step": 1045 + }, + { + "epoch": 0.15063364055299538, + "grad_norm": 4.070890426635742, + "learning_rate": 4.7257678301921384e-05, + "loss": 1.5223, + "step": 1046 + }, + { + "epoch": 0.15077764976958524, + "grad_norm": 2.555475950241089, + "learning_rate": 4.725252568835545e-05, + "loss": 0.4024, + "step": 1047 + }, + { + "epoch": 0.15092165898617513, + "grad_norm": 6.376065254211426, + "learning_rate": 4.724736852009113e-05, + "loss": 1.3314, + "step": 1048 + }, + { + "epoch": 0.15106566820276499, + "grad_norm": 3.068768262863159, + "learning_rate": 4.7242206798183984e-05, + "loss": 1.4603, + "step": 1049 + }, + { + "epoch": 0.15120967741935484, + "grad_norm": 2.7716128826141357, + "learning_rate": 4.723704052369053e-05, + "loss": 0.5073, + "step": 1050 + }, + { + "epoch": 0.1513536866359447, + "grad_norm": 6.292183876037598, + "learning_rate": 4.7231869697668214e-05, + "loss": 1.8174, + "step": 1051 + }, + { + "epoch": 0.15149769585253456, + "grad_norm": 4.512652397155762, + "learning_rate": 4.7226694321175415e-05, + "loss": 1.9654, + "step": 1052 + }, + { + "epoch": 0.15164170506912442, + "grad_norm": 7.256927013397217, + "learning_rate": 4.722151439527143e-05, + "loss": 1.5214, + "step": 1053 + }, + { + "epoch": 0.15178571428571427, + "grad_norm": 2.5060362815856934, + "learning_rate": 4.72163299210165e-05, + "loss": 0.4104, + "step": 1054 + }, + { + "epoch": 0.15192972350230416, + "grad_norm": 4.580010890960693, + "learning_rate": 4.721114089947181e-05, + "loss": 0.5552, + "step": 1055 + }, + { + "epoch": 0.15207373271889402, + "grad_norm": 5.826709270477295, + "learning_rate": 4.7205947331699454e-05, + "loss": 1.3746, + "step": 1056 + }, + { + "epoch": 0.15221774193548387, + "grad_norm": 4.352139949798584, + "learning_rate": 4.720074921876245e-05, + "loss": 2.1119, + "step": 1057 + }, + { + "epoch": 0.15236175115207373, + "grad_norm": 3.6313316822052, + "learning_rate": 4.719554656172478e-05, + "loss": 0.5056, + "step": 1058 + }, + { + "epoch": 0.1525057603686636, + "grad_norm": 4.81522274017334, + "learning_rate": 4.719033936165132e-05, + "loss": 1.6793, + "step": 1059 + }, + { + "epoch": 0.15264976958525345, + "grad_norm": 2.5163376331329346, + "learning_rate": 4.7185127619607905e-05, + "loss": 0.2028, + "step": 1060 + }, + { + "epoch": 0.1527937788018433, + "grad_norm": 4.455949306488037, + "learning_rate": 4.717991133666128e-05, + "loss": 1.5808, + "step": 1061 + }, + { + "epoch": 0.1529377880184332, + "grad_norm": 3.6083574295043945, + "learning_rate": 4.7174690513879114e-05, + "loss": 0.4673, + "step": 1062 + }, + { + "epoch": 0.15308179723502305, + "grad_norm": 5.251559257507324, + "learning_rate": 4.716946515233004e-05, + "loss": 0.9122, + "step": 1063 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 3.887223243713379, + "learning_rate": 4.716423525308358e-05, + "loss": 0.6456, + "step": 1064 + }, + { + "epoch": 0.15336981566820276, + "grad_norm": 5.3739094734191895, + "learning_rate": 4.7159000817210205e-05, + "loss": 1.8706, + "step": 1065 + }, + { + "epoch": 0.15351382488479262, + "grad_norm": 2.0411269664764404, + "learning_rate": 4.715376184578132e-05, + "loss": 0.245, + "step": 1066 + }, + { + "epoch": 0.15365783410138248, + "grad_norm": 6.339127063751221, + "learning_rate": 4.714851833986924e-05, + "loss": 1.2812, + "step": 1067 + }, + { + "epoch": 0.15380184331797234, + "grad_norm": 8.484160423278809, + "learning_rate": 4.714327030054722e-05, + "loss": 1.3692, + "step": 1068 + }, + { + "epoch": 0.15394585253456222, + "grad_norm": 2.3006114959716797, + "learning_rate": 4.7138017728889464e-05, + "loss": 0.2552, + "step": 1069 + }, + { + "epoch": 0.15408986175115208, + "grad_norm": 6.321800231933594, + "learning_rate": 4.713276062597104e-05, + "loss": 0.9075, + "step": 1070 + }, + { + "epoch": 0.15423387096774194, + "grad_norm": 1.8357454538345337, + "learning_rate": 4.7127498992868e-05, + "loss": 0.1981, + "step": 1071 + }, + { + "epoch": 0.1543778801843318, + "grad_norm": 4.8191704750061035, + "learning_rate": 4.7122232830657315e-05, + "loss": 0.8219, + "step": 1072 + }, + { + "epoch": 0.15452188940092165, + "grad_norm": 7.852665901184082, + "learning_rate": 4.711696214041687e-05, + "loss": 1.6117, + "step": 1073 + }, + { + "epoch": 0.1546658986175115, + "grad_norm": 1.091920256614685, + "learning_rate": 4.7111686923225485e-05, + "loss": 4.4827, + "step": 1074 + }, + { + "epoch": 0.1548099078341014, + "grad_norm": 1.9806959629058838, + "learning_rate": 4.7106407180162904e-05, + "loss": 0.3767, + "step": 1075 + }, + { + "epoch": 0.15495391705069125, + "grad_norm": 3.768179416656494, + "learning_rate": 4.710112291230978e-05, + "loss": 0.5165, + "step": 1076 + }, + { + "epoch": 0.1550979262672811, + "grad_norm": 1.6886502504348755, + "learning_rate": 4.709583412074774e-05, + "loss": 0.2177, + "step": 1077 + }, + { + "epoch": 0.15524193548387097, + "grad_norm": 5.564102649688721, + "learning_rate": 4.709054080655928e-05, + "loss": 1.2401, + "step": 1078 + }, + { + "epoch": 0.15538594470046083, + "grad_norm": 4.780123233795166, + "learning_rate": 4.708524297082786e-05, + "loss": 1.8888, + "step": 1079 + }, + { + "epoch": 0.15552995391705068, + "grad_norm": 3.551426887512207, + "learning_rate": 4.707994061463785e-05, + "loss": 0.922, + "step": 1080 + }, + { + "epoch": 0.15567396313364054, + "grad_norm": 3.0311851501464844, + "learning_rate": 4.7074633739074555e-05, + "loss": 0.8965, + "step": 1081 + }, + { + "epoch": 0.15581797235023043, + "grad_norm": 1.9502406120300293, + "learning_rate": 4.706932234522419e-05, + "loss": 0.3864, + "step": 1082 + }, + { + "epoch": 0.15596198156682028, + "grad_norm": 2.0368826389312744, + "learning_rate": 4.70640064341739e-05, + "loss": 0.3027, + "step": 1083 + }, + { + "epoch": 0.15610599078341014, + "grad_norm": 3.0014302730560303, + "learning_rate": 4.7058686007011765e-05, + "loss": 0.3261, + "step": 1084 + }, + { + "epoch": 0.15625, + "grad_norm": 4.899149417877197, + "learning_rate": 4.7053361064826785e-05, + "loss": 0.8522, + "step": 1085 + }, + { + "epoch": 0.15639400921658986, + "grad_norm": 4.331324100494385, + "learning_rate": 4.7048031608708876e-05, + "loss": 0.9338, + "step": 1086 + }, + { + "epoch": 0.15653801843317972, + "grad_norm": 4.871620178222656, + "learning_rate": 4.704269763974889e-05, + "loss": 1.7257, + "step": 1087 + }, + { + "epoch": 0.15668202764976957, + "grad_norm": 5.093155384063721, + "learning_rate": 4.703735915903859e-05, + "loss": 0.777, + "step": 1088 + }, + { + "epoch": 0.15682603686635946, + "grad_norm": 7.003866672515869, + "learning_rate": 4.703201616767067e-05, + "loss": 2.4777, + "step": 1089 + }, + { + "epoch": 0.15697004608294932, + "grad_norm": 2.8218986988067627, + "learning_rate": 4.702666866673874e-05, + "loss": 0.4777, + "step": 1090 + }, + { + "epoch": 0.15711405529953917, + "grad_norm": 4.131559371948242, + "learning_rate": 4.7021316657337344e-05, + "loss": 1.7819, + "step": 1091 + }, + { + "epoch": 0.15725806451612903, + "grad_norm": 2.8610498905181885, + "learning_rate": 4.701596014056194e-05, + "loss": 0.408, + "step": 1092 + }, + { + "epoch": 0.1574020737327189, + "grad_norm": 3.667104482650757, + "learning_rate": 4.701059911750893e-05, + "loss": 0.3934, + "step": 1093 + }, + { + "epoch": 0.15754608294930875, + "grad_norm": 4.785576343536377, + "learning_rate": 4.70052335892756e-05, + "loss": 2.0558, + "step": 1094 + }, + { + "epoch": 0.1576900921658986, + "grad_norm": 3.241441488265991, + "learning_rate": 4.699986355696019e-05, + "loss": 2.4281, + "step": 1095 + }, + { + "epoch": 0.1578341013824885, + "grad_norm": 5.013128757476807, + "learning_rate": 4.699448902166184e-05, + "loss": 0.5433, + "step": 1096 + }, + { + "epoch": 0.15797811059907835, + "grad_norm": 2.391993999481201, + "learning_rate": 4.6989109984480636e-05, + "loss": 0.4286, + "step": 1097 + }, + { + "epoch": 0.1581221198156682, + "grad_norm": 3.5514471530914307, + "learning_rate": 4.6983726446517565e-05, + "loss": 0.5289, + "step": 1098 + }, + { + "epoch": 0.15826612903225806, + "grad_norm": 2.242741107940674, + "learning_rate": 4.6978338408874534e-05, + "loss": 0.5241, + "step": 1099 + }, + { + "epoch": 0.15841013824884792, + "grad_norm": 3.3669443130493164, + "learning_rate": 4.697294587265438e-05, + "loss": 0.3633, + "step": 1100 + }, + { + "epoch": 0.15855414746543778, + "grad_norm": 1.4069124460220337, + "learning_rate": 4.6967548838960884e-05, + "loss": 0.131, + "step": 1101 + }, + { + "epoch": 0.15869815668202766, + "grad_norm": 7.251487731933594, + "learning_rate": 4.69621473088987e-05, + "loss": 0.9413, + "step": 1102 + }, + { + "epoch": 0.15884216589861752, + "grad_norm": 9.032854080200195, + "learning_rate": 4.6956741283573427e-05, + "loss": 1.6579, + "step": 1103 + }, + { + "epoch": 0.15898617511520738, + "grad_norm": 3.904507875442505, + "learning_rate": 4.6951330764091584e-05, + "loss": 3.0384, + "step": 1104 + }, + { + "epoch": 0.15913018433179724, + "grad_norm": 2.8903002738952637, + "learning_rate": 4.694591575156061e-05, + "loss": 0.2542, + "step": 1105 + }, + { + "epoch": 0.1592741935483871, + "grad_norm": 1.5627715587615967, + "learning_rate": 4.6940496247088873e-05, + "loss": 0.109, + "step": 1106 + }, + { + "epoch": 0.15941820276497695, + "grad_norm": 3.6437063217163086, + "learning_rate": 4.693507225178564e-05, + "loss": 0.6515, + "step": 1107 + }, + { + "epoch": 0.1595622119815668, + "grad_norm": 5.242105007171631, + "learning_rate": 4.692964376676111e-05, + "loss": 3.2675, + "step": 1108 + }, + { + "epoch": 0.1597062211981567, + "grad_norm": 3.9502604007720947, + "learning_rate": 4.692421079312639e-05, + "loss": 0.4804, + "step": 1109 + }, + { + "epoch": 0.15985023041474655, + "grad_norm": 2.299016237258911, + "learning_rate": 4.6918773331993515e-05, + "loss": 0.5842, + "step": 1110 + }, + { + "epoch": 0.1599942396313364, + "grad_norm": 5.19213342666626, + "learning_rate": 4.6913331384475446e-05, + "loss": 2.3741, + "step": 1111 + }, + { + "epoch": 0.16013824884792627, + "grad_norm": 3.835947275161743, + "learning_rate": 4.690788495168605e-05, + "loss": 0.4874, + "step": 1112 + }, + { + "epoch": 0.16028225806451613, + "grad_norm": 6.400421142578125, + "learning_rate": 4.690243403474011e-05, + "loss": 1.1205, + "step": 1113 + }, + { + "epoch": 0.16042626728110598, + "grad_norm": 4.078497886657715, + "learning_rate": 4.689697863475334e-05, + "loss": 0.5484, + "step": 1114 + }, + { + "epoch": 0.16057027649769584, + "grad_norm": 4.127893447875977, + "learning_rate": 4.6891518752842354e-05, + "loss": 0.7541, + "step": 1115 + }, + { + "epoch": 0.16071428571428573, + "grad_norm": 6.253756046295166, + "learning_rate": 4.6886054390124706e-05, + "loss": 1.2544, + "step": 1116 + }, + { + "epoch": 0.16085829493087558, + "grad_norm": 3.5180530548095703, + "learning_rate": 4.6880585547718845e-05, + "loss": 0.4032, + "step": 1117 + }, + { + "epoch": 0.16100230414746544, + "grad_norm": 3.175388813018799, + "learning_rate": 4.687511222674415e-05, + "loss": 0.4835, + "step": 1118 + }, + { + "epoch": 0.1611463133640553, + "grad_norm": 3.283808946609497, + "learning_rate": 4.686963442832091e-05, + "loss": 1.6917, + "step": 1119 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 2.759634494781494, + "learning_rate": 4.686415215357034e-05, + "loss": 0.414, + "step": 1120 + }, + { + "epoch": 0.16143433179723501, + "grad_norm": 1.9749915599822998, + "learning_rate": 4.685866540361456e-05, + "loss": 0.2748, + "step": 1121 + }, + { + "epoch": 0.16157834101382487, + "grad_norm": 3.1433115005493164, + "learning_rate": 4.6853174179576605e-05, + "loss": 0.4566, + "step": 1122 + }, + { + "epoch": 0.16172235023041476, + "grad_norm": 1.7608540058135986, + "learning_rate": 4.6847678482580435e-05, + "loss": 0.186, + "step": 1123 + }, + { + "epoch": 0.16186635944700462, + "grad_norm": 5.711893558502197, + "learning_rate": 4.6842178313750934e-05, + "loss": 1.6381, + "step": 1124 + }, + { + "epoch": 0.16201036866359447, + "grad_norm": 1.3590450286865234, + "learning_rate": 4.683667367421387e-05, + "loss": 0.2374, + "step": 1125 + }, + { + "epoch": 0.16215437788018433, + "grad_norm": 8.4840726852417, + "learning_rate": 4.6831164565095965e-05, + "loss": 2.9274, + "step": 1126 + }, + { + "epoch": 0.1622983870967742, + "grad_norm": 2.0950727462768555, + "learning_rate": 4.6825650987524825e-05, + "loss": 0.2772, + "step": 1127 + }, + { + "epoch": 0.16244239631336405, + "grad_norm": 5.359508991241455, + "learning_rate": 4.6820132942628974e-05, + "loss": 1.1181, + "step": 1128 + }, + { + "epoch": 0.1625864055299539, + "grad_norm": 5.5400285720825195, + "learning_rate": 4.6814610431537874e-05, + "loss": 1.5373, + "step": 1129 + }, + { + "epoch": 0.1627304147465438, + "grad_norm": 3.9514713287353516, + "learning_rate": 4.680908345538187e-05, + "loss": 0.6286, + "step": 1130 + }, + { + "epoch": 0.16287442396313365, + "grad_norm": 5.69553804397583, + "learning_rate": 4.6803552015292254e-05, + "loss": 1.4226, + "step": 1131 + }, + { + "epoch": 0.1630184331797235, + "grad_norm": 5.451788902282715, + "learning_rate": 4.6798016112401196e-05, + "loss": 1.5376, + "step": 1132 + }, + { + "epoch": 0.16316244239631336, + "grad_norm": 7.845460891723633, + "learning_rate": 4.679247574784182e-05, + "loss": 1.0276, + "step": 1133 + }, + { + "epoch": 0.16330645161290322, + "grad_norm": 3.02510666847229, + "learning_rate": 4.678693092274812e-05, + "loss": 0.6168, + "step": 1134 + }, + { + "epoch": 0.16345046082949308, + "grad_norm": 7.16134786605835, + "learning_rate": 4.678138163825503e-05, + "loss": 1.8862, + "step": 1135 + }, + { + "epoch": 0.16359447004608296, + "grad_norm": 3.544630765914917, + "learning_rate": 4.677582789549838e-05, + "loss": 1.4146, + "step": 1136 + }, + { + "epoch": 0.16373847926267282, + "grad_norm": 8.591194152832031, + "learning_rate": 4.677026969561494e-05, + "loss": 1.4823, + "step": 1137 + }, + { + "epoch": 0.16388248847926268, + "grad_norm": 5.422079563140869, + "learning_rate": 4.6764707039742375e-05, + "loss": 1.0422, + "step": 1138 + }, + { + "epoch": 0.16402649769585254, + "grad_norm": 1.962151050567627, + "learning_rate": 4.6759139929019256e-05, + "loss": 0.1818, + "step": 1139 + }, + { + "epoch": 0.1641705069124424, + "grad_norm": 3.824105978012085, + "learning_rate": 4.675356836458506e-05, + "loss": 0.7527, + "step": 1140 + }, + { + "epoch": 0.16431451612903225, + "grad_norm": 3.7476954460144043, + "learning_rate": 4.674799234758022e-05, + "loss": 2.6368, + "step": 1141 + }, + { + "epoch": 0.1644585253456221, + "grad_norm": 4.408247947692871, + "learning_rate": 4.674241187914601e-05, + "loss": 0.4987, + "step": 1142 + }, + { + "epoch": 0.164602534562212, + "grad_norm": 3.977386713027954, + "learning_rate": 4.673682696042468e-05, + "loss": 1.1065, + "step": 1143 + }, + { + "epoch": 0.16474654377880185, + "grad_norm": 8.769730567932129, + "learning_rate": 4.673123759255935e-05, + "loss": 1.2238, + "step": 1144 + }, + { + "epoch": 0.1648905529953917, + "grad_norm": 3.5165998935699463, + "learning_rate": 4.6725643776694074e-05, + "loss": 0.9757, + "step": 1145 + }, + { + "epoch": 0.16503456221198157, + "grad_norm": 3.004091262817383, + "learning_rate": 4.6720045513973795e-05, + "loss": 2.3558, + "step": 1146 + }, + { + "epoch": 0.16517857142857142, + "grad_norm": 4.725552082061768, + "learning_rate": 4.6714442805544395e-05, + "loss": 1.7544, + "step": 1147 + }, + { + "epoch": 0.16532258064516128, + "grad_norm": 5.647760391235352, + "learning_rate": 4.670883565255264e-05, + "loss": 0.8933, + "step": 1148 + }, + { + "epoch": 0.16546658986175114, + "grad_norm": 4.220912933349609, + "learning_rate": 4.670322405614621e-05, + "loss": 1.2451, + "step": 1149 + }, + { + "epoch": 0.16561059907834103, + "grad_norm": 2.5643632411956787, + "learning_rate": 4.6697608017473714e-05, + "loss": 0.3592, + "step": 1150 + }, + { + "epoch": 0.16575460829493088, + "grad_norm": 3.7353005409240723, + "learning_rate": 4.669198753768463e-05, + "loss": 0.6629, + "step": 1151 + }, + { + "epoch": 0.16589861751152074, + "grad_norm": 4.129006385803223, + "learning_rate": 4.668636261792941e-05, + "loss": 2.3104, + "step": 1152 + }, + { + "epoch": 0.1660426267281106, + "grad_norm": 2.880114793777466, + "learning_rate": 4.6680733259359346e-05, + "loss": 0.4712, + "step": 1153 + }, + { + "epoch": 0.16618663594470046, + "grad_norm": 1.0414623022079468, + "learning_rate": 4.667509946312667e-05, + "loss": 0.0994, + "step": 1154 + }, + { + "epoch": 0.1663306451612903, + "grad_norm": 2.997178316116333, + "learning_rate": 4.666946123038452e-05, + "loss": 2.3279, + "step": 1155 + }, + { + "epoch": 0.16647465437788017, + "grad_norm": 6.172976970672607, + "learning_rate": 4.666381856228697e-05, + "loss": 1.539, + "step": 1156 + }, + { + "epoch": 0.16661866359447006, + "grad_norm": 4.148237228393555, + "learning_rate": 4.6658171459988934e-05, + "loss": 0.6973, + "step": 1157 + }, + { + "epoch": 0.16676267281105991, + "grad_norm": 2.2921736240386963, + "learning_rate": 4.665251992464629e-05, + "loss": 0.279, + "step": 1158 + }, + { + "epoch": 0.16690668202764977, + "grad_norm": 5.573154449462891, + "learning_rate": 4.664686395741582e-05, + "loss": 1.2863, + "step": 1159 + }, + { + "epoch": 0.16705069124423963, + "grad_norm": 5.427803993225098, + "learning_rate": 4.664120355945519e-05, + "loss": 0.7495, + "step": 1160 + }, + { + "epoch": 0.1671947004608295, + "grad_norm": 9.967765808105469, + "learning_rate": 4.663553873192299e-05, + "loss": 3.5931, + "step": 1161 + }, + { + "epoch": 0.16733870967741934, + "grad_norm": 4.0860514640808105, + "learning_rate": 4.662986947597869e-05, + "loss": 1.6532, + "step": 1162 + }, + { + "epoch": 0.16748271889400923, + "grad_norm": 4.581136226654053, + "learning_rate": 4.662419579278271e-05, + "loss": 1.3193, + "step": 1163 + }, + { + "epoch": 0.1676267281105991, + "grad_norm": 4.764404296875, + "learning_rate": 4.661851768349633e-05, + "loss": 2.1301, + "step": 1164 + }, + { + "epoch": 0.16777073732718895, + "grad_norm": 3.5355751514434814, + "learning_rate": 4.661283514928179e-05, + "loss": 1.0111, + "step": 1165 + }, + { + "epoch": 0.1679147465437788, + "grad_norm": 1.355900764465332, + "learning_rate": 4.6607148191302175e-05, + "loss": 0.1501, + "step": 1166 + }, + { + "epoch": 0.16805875576036866, + "grad_norm": 6.633650302886963, + "learning_rate": 4.6601456810721516e-05, + "loss": 2.2768, + "step": 1167 + }, + { + "epoch": 0.16820276497695852, + "grad_norm": 1.920723557472229, + "learning_rate": 4.659576100870474e-05, + "loss": 0.3429, + "step": 1168 + }, + { + "epoch": 0.16834677419354838, + "grad_norm": 3.0395169258117676, + "learning_rate": 4.659006078641767e-05, + "loss": 2.5046, + "step": 1169 + }, + { + "epoch": 0.16849078341013826, + "grad_norm": 2.9273669719696045, + "learning_rate": 4.658435614502705e-05, + "loss": 0.2589, + "step": 1170 + }, + { + "epoch": 0.16863479262672812, + "grad_norm": 3.7137887477874756, + "learning_rate": 4.6578647085700514e-05, + "loss": 0.2955, + "step": 1171 + }, + { + "epoch": 0.16877880184331798, + "grad_norm": 2.1954097747802734, + "learning_rate": 4.6572933609606596e-05, + "loss": 0.286, + "step": 1172 + }, + { + "epoch": 0.16892281105990783, + "grad_norm": 2.6488037109375, + "learning_rate": 4.656721571791476e-05, + "loss": 0.3412, + "step": 1173 + }, + { + "epoch": 0.1690668202764977, + "grad_norm": 2.49554705619812, + "learning_rate": 4.656149341179535e-05, + "loss": 0.5089, + "step": 1174 + }, + { + "epoch": 0.16921082949308755, + "grad_norm": 1.6316190958023071, + "learning_rate": 4.6555766692419625e-05, + "loss": 0.1857, + "step": 1175 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 5.8382697105407715, + "learning_rate": 4.6550035560959735e-05, + "loss": 1.0447, + "step": 1176 + }, + { + "epoch": 0.1694988479262673, + "grad_norm": 3.232395648956299, + "learning_rate": 4.654430001858874e-05, + "loss": 0.8684, + "step": 1177 + }, + { + "epoch": 0.16964285714285715, + "grad_norm": 4.149288654327393, + "learning_rate": 4.653856006648062e-05, + "loss": 0.7744, + "step": 1178 + }, + { + "epoch": 0.169786866359447, + "grad_norm": 4.66519832611084, + "learning_rate": 4.653281570581023e-05, + "loss": 0.4718, + "step": 1179 + }, + { + "epoch": 0.16993087557603687, + "grad_norm": 4.096548080444336, + "learning_rate": 4.652706693775333e-05, + "loss": 1.2296, + "step": 1180 + }, + { + "epoch": 0.17007488479262672, + "grad_norm": 4.488333702087402, + "learning_rate": 4.652131376348661e-05, + "loss": 2.0445, + "step": 1181 + }, + { + "epoch": 0.17021889400921658, + "grad_norm": 3.780850887298584, + "learning_rate": 4.651555618418764e-05, + "loss": 1.0086, + "step": 1182 + }, + { + "epoch": 0.17036290322580644, + "grad_norm": 8.470001220703125, + "learning_rate": 4.650979420103488e-05, + "loss": 0.852, + "step": 1183 + }, + { + "epoch": 0.17050691244239632, + "grad_norm": 2.525731086730957, + "learning_rate": 4.650402781520772e-05, + "loss": 0.643, + "step": 1184 + }, + { + "epoch": 0.17065092165898618, + "grad_norm": 5.716236591339111, + "learning_rate": 4.649825702788643e-05, + "loss": 2.3174, + "step": 1185 + }, + { + "epoch": 0.17079493087557604, + "grad_norm": 6.406065464019775, + "learning_rate": 4.649248184025219e-05, + "loss": 2.1219, + "step": 1186 + }, + { + "epoch": 0.1709389400921659, + "grad_norm": 1.649021029472351, + "learning_rate": 4.648670225348707e-05, + "loss": 0.2491, + "step": 1187 + }, + { + "epoch": 0.17108294930875576, + "grad_norm": 1.9114370346069336, + "learning_rate": 4.648091826877408e-05, + "loss": 0.2885, + "step": 1188 + }, + { + "epoch": 0.1712269585253456, + "grad_norm": 4.858530044555664, + "learning_rate": 4.6475129887297056e-05, + "loss": 0.2863, + "step": 1189 + }, + { + "epoch": 0.17137096774193547, + "grad_norm": 3.7883124351501465, + "learning_rate": 4.646933711024081e-05, + "loss": 0.4631, + "step": 1190 + }, + { + "epoch": 0.17151497695852536, + "grad_norm": 6.575172424316406, + "learning_rate": 4.6463539938791e-05, + "loss": 0.6788, + "step": 1191 + }, + { + "epoch": 0.1716589861751152, + "grad_norm": 4.898850917816162, + "learning_rate": 4.645773837413423e-05, + "loss": 2.2173, + "step": 1192 + }, + { + "epoch": 0.17180299539170507, + "grad_norm": 4.136491775512695, + "learning_rate": 4.6451932417457954e-05, + "loss": 2.0383, + "step": 1193 + }, + { + "epoch": 0.17194700460829493, + "grad_norm": 7.328372478485107, + "learning_rate": 4.644612206995056e-05, + "loss": 1.8275, + "step": 1194 + }, + { + "epoch": 0.1720910138248848, + "grad_norm": 5.4280524253845215, + "learning_rate": 4.6440307332801314e-05, + "loss": 1.561, + "step": 1195 + }, + { + "epoch": 0.17223502304147464, + "grad_norm": 2.6843135356903076, + "learning_rate": 4.64344882072004e-05, + "loss": 0.3029, + "step": 1196 + }, + { + "epoch": 0.17237903225806453, + "grad_norm": 4.390672206878662, + "learning_rate": 4.642866469433889e-05, + "loss": 2.2257, + "step": 1197 + }, + { + "epoch": 0.1725230414746544, + "grad_norm": 6.0431437492370605, + "learning_rate": 4.642283679540874e-05, + "loss": 1.7213, + "step": 1198 + }, + { + "epoch": 0.17266705069124424, + "grad_norm": 0.6336512565612793, + "learning_rate": 4.6417004511602835e-05, + "loss": 4.694, + "step": 1199 + }, + { + "epoch": 0.1728110599078341, + "grad_norm": 2.6938278675079346, + "learning_rate": 4.6411167844114936e-05, + "loss": 0.2585, + "step": 1200 + }, + { + "epoch": 0.17295506912442396, + "grad_norm": 3.5139193534851074, + "learning_rate": 4.6405326794139696e-05, + "loss": 0.7723, + "step": 1201 + }, + { + "epoch": 0.17309907834101382, + "grad_norm": 2.3406288623809814, + "learning_rate": 4.6399481362872685e-05, + "loss": 0.3452, + "step": 1202 + }, + { + "epoch": 0.17324308755760368, + "grad_norm": 4.682081699371338, + "learning_rate": 4.6393631551510356e-05, + "loss": 1.0774, + "step": 1203 + }, + { + "epoch": 0.17338709677419356, + "grad_norm": 2.4185116291046143, + "learning_rate": 4.638777736125006e-05, + "loss": 0.2763, + "step": 1204 + }, + { + "epoch": 0.17353110599078342, + "grad_norm": 7.938871383666992, + "learning_rate": 4.6381918793290055e-05, + "loss": 2.7264, + "step": 1205 + }, + { + "epoch": 0.17367511520737328, + "grad_norm": 2.1163229942321777, + "learning_rate": 4.637605584882947e-05, + "loss": 0.2183, + "step": 1206 + }, + { + "epoch": 0.17381912442396313, + "grad_norm": 0.8464052081108093, + "learning_rate": 4.637018852906836e-05, + "loss": 4.6005, + "step": 1207 + }, + { + "epoch": 0.173963133640553, + "grad_norm": 5.525642395019531, + "learning_rate": 4.636431683520765e-05, + "loss": 0.8613, + "step": 1208 + }, + { + "epoch": 0.17410714285714285, + "grad_norm": 4.458215713500977, + "learning_rate": 4.635844076844919e-05, + "loss": 1.4657, + "step": 1209 + }, + { + "epoch": 0.1742511520737327, + "grad_norm": 3.5176138877868652, + "learning_rate": 4.6352560329995686e-05, + "loss": 2.2067, + "step": 1210 + }, + { + "epoch": 0.1743951612903226, + "grad_norm": 2.2411715984344482, + "learning_rate": 4.634667552105077e-05, + "loss": 1.1365, + "step": 1211 + }, + { + "epoch": 0.17453917050691245, + "grad_norm": 6.108483791351318, + "learning_rate": 4.6340786342818964e-05, + "loss": 1.5791, + "step": 1212 + }, + { + "epoch": 0.1746831797235023, + "grad_norm": 5.166891574859619, + "learning_rate": 4.633489279650567e-05, + "loss": 1.1668, + "step": 1213 + }, + { + "epoch": 0.17482718894009217, + "grad_norm": 2.55145263671875, + "learning_rate": 4.63289948833172e-05, + "loss": 0.6177, + "step": 1214 + }, + { + "epoch": 0.17497119815668202, + "grad_norm": 6.562422752380371, + "learning_rate": 4.632309260446074e-05, + "loss": 0.6814, + "step": 1215 + }, + { + "epoch": 0.17511520737327188, + "grad_norm": 2.5345230102539062, + "learning_rate": 4.6317185961144396e-05, + "loss": 0.3074, + "step": 1216 + }, + { + "epoch": 0.17525921658986174, + "grad_norm": 6.262691974639893, + "learning_rate": 4.631127495457713e-05, + "loss": 1.1577, + "step": 1217 + }, + { + "epoch": 0.17540322580645162, + "grad_norm": 4.928733825683594, + "learning_rate": 4.6305359585968855e-05, + "loss": 2.1601, + "step": 1218 + }, + { + "epoch": 0.17554723502304148, + "grad_norm": 5.252901077270508, + "learning_rate": 4.629943985653032e-05, + "loss": 2.4854, + "step": 1219 + }, + { + "epoch": 0.17569124423963134, + "grad_norm": 4.5968523025512695, + "learning_rate": 4.62935157674732e-05, + "loss": 0.6352, + "step": 1220 + }, + { + "epoch": 0.1758352534562212, + "grad_norm": 3.825725793838501, + "learning_rate": 4.628758732001003e-05, + "loss": 2.2603, + "step": 1221 + }, + { + "epoch": 0.17597926267281105, + "grad_norm": 2.5671486854553223, + "learning_rate": 4.628165451535428e-05, + "loss": 2.2095, + "step": 1222 + }, + { + "epoch": 0.1761232718894009, + "grad_norm": 2.0698771476745605, + "learning_rate": 4.627571735472028e-05, + "loss": 0.2377, + "step": 1223 + }, + { + "epoch": 0.17626728110599077, + "grad_norm": 8.847590446472168, + "learning_rate": 4.6269775839323274e-05, + "loss": 2.4264, + "step": 1224 + }, + { + "epoch": 0.17641129032258066, + "grad_norm": 2.1573774814605713, + "learning_rate": 4.626382997037938e-05, + "loss": 0.2367, + "step": 1225 + }, + { + "epoch": 0.1765552995391705, + "grad_norm": 4.744299411773682, + "learning_rate": 4.625787974910559e-05, + "loss": 2.5361, + "step": 1226 + }, + { + "epoch": 0.17669930875576037, + "grad_norm": 5.418185234069824, + "learning_rate": 4.625192517671984e-05, + "loss": 1.4589, + "step": 1227 + }, + { + "epoch": 0.17684331797235023, + "grad_norm": 5.9997406005859375, + "learning_rate": 4.6245966254440916e-05, + "loss": 1.5707, + "step": 1228 + }, + { + "epoch": 0.17698732718894009, + "grad_norm": 1.2879303693771362, + "learning_rate": 4.6240002983488495e-05, + "loss": 4.2591, + "step": 1229 + }, + { + "epoch": 0.17713133640552994, + "grad_norm": 3.9846878051757812, + "learning_rate": 4.623403536508316e-05, + "loss": 0.836, + "step": 1230 + }, + { + "epoch": 0.17727534562211983, + "grad_norm": 3.678454637527466, + "learning_rate": 4.622806340044638e-05, + "loss": 1.4844, + "step": 1231 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 3.2452173233032227, + "learning_rate": 4.6222087090800506e-05, + "loss": 0.8329, + "step": 1232 + }, + { + "epoch": 0.17756336405529954, + "grad_norm": 4.739253520965576, + "learning_rate": 4.621610643736878e-05, + "loss": 0.8809, + "step": 1233 + }, + { + "epoch": 0.1777073732718894, + "grad_norm": 2.464580535888672, + "learning_rate": 4.6210121441375334e-05, + "loss": 0.2796, + "step": 1234 + }, + { + "epoch": 0.17785138248847926, + "grad_norm": 5.445340633392334, + "learning_rate": 4.6204132104045205e-05, + "loss": 1.2101, + "step": 1235 + }, + { + "epoch": 0.17799539170506912, + "grad_norm": 5.001343727111816, + "learning_rate": 4.61981384266043e-05, + "loss": 1.1739, + "step": 1236 + }, + { + "epoch": 0.17813940092165897, + "grad_norm": 1.9970438480377197, + "learning_rate": 4.6192140410279406e-05, + "loss": 0.225, + "step": 1237 + }, + { + "epoch": 0.17828341013824886, + "grad_norm": 4.901859283447266, + "learning_rate": 4.618613805629822e-05, + "loss": 0.62, + "step": 1238 + }, + { + "epoch": 0.17842741935483872, + "grad_norm": 2.635409116744995, + "learning_rate": 4.618013136588932e-05, + "loss": 0.316, + "step": 1239 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 4.700124263763428, + "learning_rate": 4.617412034028217e-05, + "loss": 1.4786, + "step": 1240 + }, + { + "epoch": 0.17871543778801843, + "grad_norm": 2.238205671310425, + "learning_rate": 4.6168104980707107e-05, + "loss": 0.4521, + "step": 1241 + }, + { + "epoch": 0.1788594470046083, + "grad_norm": 6.6261701583862305, + "learning_rate": 4.616208528839539e-05, + "loss": 2.1075, + "step": 1242 + }, + { + "epoch": 0.17900345622119815, + "grad_norm": 1.2374069690704346, + "learning_rate": 4.615606126457912e-05, + "loss": 0.1592, + "step": 1243 + }, + { + "epoch": 0.179147465437788, + "grad_norm": 5.160887241363525, + "learning_rate": 4.6150032910491325e-05, + "loss": 0.9043, + "step": 1244 + }, + { + "epoch": 0.1792914746543779, + "grad_norm": 5.888960361480713, + "learning_rate": 4.61440002273659e-05, + "loss": 1.2122, + "step": 1245 + }, + { + "epoch": 0.17943548387096775, + "grad_norm": 8.011823654174805, + "learning_rate": 4.613796321643763e-05, + "loss": 1.3548, + "step": 1246 + }, + { + "epoch": 0.1795794930875576, + "grad_norm": 3.5232951641082764, + "learning_rate": 4.613192187894218e-05, + "loss": 1.6229, + "step": 1247 + }, + { + "epoch": 0.17972350230414746, + "grad_norm": 5.965337753295898, + "learning_rate": 4.612587621611609e-05, + "loss": 1.8168, + "step": 1248 + }, + { + "epoch": 0.17986751152073732, + "grad_norm": 2.7078325748443604, + "learning_rate": 4.611982622919683e-05, + "loss": 0.5896, + "step": 1249 + }, + { + "epoch": 0.18001152073732718, + "grad_norm": 5.043031215667725, + "learning_rate": 4.6113771919422713e-05, + "loss": 0.7877, + "step": 1250 + }, + { + "epoch": 0.18015552995391704, + "grad_norm": 2.7861876487731934, + "learning_rate": 4.6107713288032945e-05, + "loss": 0.3545, + "step": 1251 + }, + { + "epoch": 0.18029953917050692, + "grad_norm": 6.058360576629639, + "learning_rate": 4.6101650336267624e-05, + "loss": 0.9732, + "step": 1252 + }, + { + "epoch": 0.18044354838709678, + "grad_norm": 1.8182666301727295, + "learning_rate": 4.609558306536772e-05, + "loss": 0.2274, + "step": 1253 + }, + { + "epoch": 0.18058755760368664, + "grad_norm": 1.059184193611145, + "learning_rate": 4.608951147657511e-05, + "loss": 0.0921, + "step": 1254 + }, + { + "epoch": 0.1807315668202765, + "grad_norm": 2.8558902740478516, + "learning_rate": 4.608343557113254e-05, + "loss": 0.7824, + "step": 1255 + }, + { + "epoch": 0.18087557603686635, + "grad_norm": 8.867794036865234, + "learning_rate": 4.607735535028362e-05, + "loss": 1.7204, + "step": 1256 + }, + { + "epoch": 0.1810195852534562, + "grad_norm": 3.949436664581299, + "learning_rate": 4.6071270815272896e-05, + "loss": 0.9247, + "step": 1257 + }, + { + "epoch": 0.1811635944700461, + "grad_norm": 6.7760844230651855, + "learning_rate": 4.606518196734574e-05, + "loss": 3.4155, + "step": 1258 + }, + { + "epoch": 0.18130760368663595, + "grad_norm": 3.1367948055267334, + "learning_rate": 4.6059088807748435e-05, + "loss": 0.3961, + "step": 1259 + }, + { + "epoch": 0.1814516129032258, + "grad_norm": 7.659944534301758, + "learning_rate": 4.6052991337728146e-05, + "loss": 2.0161, + "step": 1260 + }, + { + "epoch": 0.18159562211981567, + "grad_norm": 3.7291078567504883, + "learning_rate": 4.604688955853293e-05, + "loss": 1.9646, + "step": 1261 + }, + { + "epoch": 0.18173963133640553, + "grad_norm": 3.6046507358551025, + "learning_rate": 4.604078347141169e-05, + "loss": 0.398, + "step": 1262 + }, + { + "epoch": 0.18188364055299538, + "grad_norm": 3.758216619491577, + "learning_rate": 4.6034673077614253e-05, + "loss": 2.9535, + "step": 1263 + }, + { + "epoch": 0.18202764976958524, + "grad_norm": 4.301087379455566, + "learning_rate": 4.6028558378391295e-05, + "loss": 1.4229, + "step": 1264 + }, + { + "epoch": 0.18217165898617513, + "grad_norm": 4.118902206420898, + "learning_rate": 4.6022439374994396e-05, + "loss": 1.1094, + "step": 1265 + }, + { + "epoch": 0.18231566820276499, + "grad_norm": 2.297697067260742, + "learning_rate": 4.6016316068676e-05, + "loss": 0.3225, + "step": 1266 + }, + { + "epoch": 0.18245967741935484, + "grad_norm": 4.758309841156006, + "learning_rate": 4.601018846068945e-05, + "loss": 1.3965, + "step": 1267 + }, + { + "epoch": 0.1826036866359447, + "grad_norm": 7.013579368591309, + "learning_rate": 4.6004056552288956e-05, + "loss": 1.152, + "step": 1268 + }, + { + "epoch": 0.18274769585253456, + "grad_norm": 2.8923635482788086, + "learning_rate": 4.5997920344729606e-05, + "loss": 0.3917, + "step": 1269 + }, + { + "epoch": 0.18289170506912442, + "grad_norm": 3.093027114868164, + "learning_rate": 4.599177983926737e-05, + "loss": 2.1205, + "step": 1270 + }, + { + "epoch": 0.18303571428571427, + "grad_norm": 4.366365432739258, + "learning_rate": 4.5985635037159117e-05, + "loss": 1.5897, + "step": 1271 + }, + { + "epoch": 0.18317972350230416, + "grad_norm": 1.2704883813858032, + "learning_rate": 4.597948593966256e-05, + "loss": 0.1904, + "step": 1272 + }, + { + "epoch": 0.18332373271889402, + "grad_norm": 1.889454960823059, + "learning_rate": 4.597333254803632e-05, + "loss": 0.3757, + "step": 1273 + }, + { + "epoch": 0.18346774193548387, + "grad_norm": 3.8275647163391113, + "learning_rate": 4.596717486353988e-05, + "loss": 1.2401, + "step": 1274 + }, + { + "epoch": 0.18361175115207373, + "grad_norm": 4.659786224365234, + "learning_rate": 4.596101288743362e-05, + "loss": 0.6137, + "step": 1275 + }, + { + "epoch": 0.1837557603686636, + "grad_norm": 5.211240291595459, + "learning_rate": 4.5954846620978795e-05, + "loss": 0.6706, + "step": 1276 + }, + { + "epoch": 0.18389976958525345, + "grad_norm": 0.531303346157074, + "learning_rate": 4.594867606543751e-05, + "loss": 4.8342, + "step": 1277 + }, + { + "epoch": 0.1840437788018433, + "grad_norm": 4.047116279602051, + "learning_rate": 4.594250122207277e-05, + "loss": 0.4033, + "step": 1278 + }, + { + "epoch": 0.1841877880184332, + "grad_norm": 8.415393829345703, + "learning_rate": 4.593632209214847e-05, + "loss": 2.1995, + "step": 1279 + }, + { + "epoch": 0.18433179723502305, + "grad_norm": 6.347961902618408, + "learning_rate": 4.593013867692937e-05, + "loss": 0.6266, + "step": 1280 + }, + { + "epoch": 0.1844758064516129, + "grad_norm": 5.424032688140869, + "learning_rate": 4.5923950977681084e-05, + "loss": 1.073, + "step": 1281 + }, + { + "epoch": 0.18461981566820276, + "grad_norm": 5.255316257476807, + "learning_rate": 4.591775899567015e-05, + "loss": 0.7359, + "step": 1282 + }, + { + "epoch": 0.18476382488479262, + "grad_norm": 10.739800453186035, + "learning_rate": 4.5911562732163935e-05, + "loss": 1.8796, + "step": 1283 + }, + { + "epoch": 0.18490783410138248, + "grad_norm": 4.801985263824463, + "learning_rate": 4.5905362188430724e-05, + "loss": 0.7417, + "step": 1284 + }, + { + "epoch": 0.18505184331797234, + "grad_norm": 6.1646013259887695, + "learning_rate": 4.589915736573965e-05, + "loss": 0.8882, + "step": 1285 + }, + { + "epoch": 0.18519585253456222, + "grad_norm": 4.197654724121094, + "learning_rate": 4.5892948265360725e-05, + "loss": 1.934, + "step": 1286 + }, + { + "epoch": 0.18533986175115208, + "grad_norm": 5.098633766174316, + "learning_rate": 4.5886734888564845e-05, + "loss": 0.9171, + "step": 1287 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 3.854879140853882, + "learning_rate": 4.5880517236623786e-05, + "loss": 1.2084, + "step": 1288 + }, + { + "epoch": 0.1856278801843318, + "grad_norm": 3.606015682220459, + "learning_rate": 4.587429531081019e-05, + "loss": 1.1005, + "step": 1289 + }, + { + "epoch": 0.18577188940092165, + "grad_norm": 2.764026641845703, + "learning_rate": 4.586806911239756e-05, + "loss": 0.3618, + "step": 1290 + }, + { + "epoch": 0.1859158986175115, + "grad_norm": 7.270665168762207, + "learning_rate": 4.586183864266031e-05, + "loss": 0.9549, + "step": 1291 + }, + { + "epoch": 0.1860599078341014, + "grad_norm": 4.746792316436768, + "learning_rate": 4.585560390287369e-05, + "loss": 0.6641, + "step": 1292 + }, + { + "epoch": 0.18620391705069125, + "grad_norm": 3.6638782024383545, + "learning_rate": 4.584936489431385e-05, + "loss": 0.3654, + "step": 1293 + }, + { + "epoch": 0.1863479262672811, + "grad_norm": 4.516382694244385, + "learning_rate": 4.5843121618257804e-05, + "loss": 1.4612, + "step": 1294 + }, + { + "epoch": 0.18649193548387097, + "grad_norm": 4.848468780517578, + "learning_rate": 4.583687407598344e-05, + "loss": 0.3291, + "step": 1295 + }, + { + "epoch": 0.18663594470046083, + "grad_norm": 2.2902939319610596, + "learning_rate": 4.583062226876952e-05, + "loss": 0.3275, + "step": 1296 + }, + { + "epoch": 0.18677995391705068, + "grad_norm": 7.674658298492432, + "learning_rate": 4.582436619789566e-05, + "loss": 1.5168, + "step": 1297 + }, + { + "epoch": 0.18692396313364054, + "grad_norm": 5.339498996734619, + "learning_rate": 4.5818105864642404e-05, + "loss": 3.513, + "step": 1298 + }, + { + "epoch": 0.18706797235023043, + "grad_norm": 6.4102373123168945, + "learning_rate": 4.58118412702911e-05, + "loss": 1.8385, + "step": 1299 + }, + { + "epoch": 0.18721198156682028, + "grad_norm": 4.1127119064331055, + "learning_rate": 4.580557241612401e-05, + "loss": 0.7476, + "step": 1300 + }, + { + "epoch": 0.18735599078341014, + "grad_norm": 6.617447376251221, + "learning_rate": 4.579929930342426e-05, + "loss": 0.8739, + "step": 1301 + }, + { + "epoch": 0.1875, + "grad_norm": 4.205621719360352, + "learning_rate": 4.579302193347585e-05, + "loss": 0.7241, + "step": 1302 + }, + { + "epoch": 0.18764400921658986, + "grad_norm": 7.354917049407959, + "learning_rate": 4.5786740307563636e-05, + "loss": 1.8201, + "step": 1303 + }, + { + "epoch": 0.18778801843317972, + "grad_norm": 2.2132749557495117, + "learning_rate": 4.578045442697336e-05, + "loss": 0.1866, + "step": 1304 + }, + { + "epoch": 0.18793202764976957, + "grad_norm": 1.4462056159973145, + "learning_rate": 4.5774164292991625e-05, + "loss": 0.188, + "step": 1305 + }, + { + "epoch": 0.18807603686635946, + "grad_norm": 4.672199249267578, + "learning_rate": 4.576786990690592e-05, + "loss": 1.2868, + "step": 1306 + }, + { + "epoch": 0.18822004608294932, + "grad_norm": 9.002864837646484, + "learning_rate": 4.5761571270004586e-05, + "loss": 1.4818, + "step": 1307 + }, + { + "epoch": 0.18836405529953917, + "grad_norm": 3.4855809211730957, + "learning_rate": 4.575526838357685e-05, + "loss": 0.5541, + "step": 1308 + }, + { + "epoch": 0.18850806451612903, + "grad_norm": 5.406837463378906, + "learning_rate": 4.5748961248912793e-05, + "loss": 2.856, + "step": 1309 + }, + { + "epoch": 0.1886520737327189, + "grad_norm": 2.598715305328369, + "learning_rate": 4.5742649867303386e-05, + "loss": 0.336, + "step": 1310 + }, + { + "epoch": 0.18879608294930875, + "grad_norm": 2.1883950233459473, + "learning_rate": 4.573633424004045e-05, + "loss": 0.1408, + "step": 1311 + }, + { + "epoch": 0.1889400921658986, + "grad_norm": 4.042177677154541, + "learning_rate": 4.573001436841667e-05, + "loss": 0.5948, + "step": 1312 + }, + { + "epoch": 0.1890841013824885, + "grad_norm": 3.709496259689331, + "learning_rate": 4.572369025372564e-05, + "loss": 2.4765, + "step": 1313 + }, + { + "epoch": 0.18922811059907835, + "grad_norm": 7.751738548278809, + "learning_rate": 4.571736189726177e-05, + "loss": 3.5755, + "step": 1314 + }, + { + "epoch": 0.1893721198156682, + "grad_norm": 4.648199558258057, + "learning_rate": 4.5711029300320366e-05, + "loss": 0.5929, + "step": 1315 + }, + { + "epoch": 0.18951612903225806, + "grad_norm": 5.0924787521362305, + "learning_rate": 4.570469246419761e-05, + "loss": 1.0571, + "step": 1316 + }, + { + "epoch": 0.18966013824884792, + "grad_norm": 6.496212482452393, + "learning_rate": 4.569835139019054e-05, + "loss": 0.8834, + "step": 1317 + }, + { + "epoch": 0.18980414746543778, + "grad_norm": 5.029287815093994, + "learning_rate": 4.569200607959705e-05, + "loss": 0.4847, + "step": 1318 + }, + { + "epoch": 0.18994815668202766, + "grad_norm": 1.6988235712051392, + "learning_rate": 4.5685656533715916e-05, + "loss": 0.1794, + "step": 1319 + }, + { + "epoch": 0.19009216589861752, + "grad_norm": 8.303763389587402, + "learning_rate": 4.5679302753846774e-05, + "loss": 2.3569, + "step": 1320 + }, + { + "epoch": 0.19023617511520738, + "grad_norm": 3.90885853767395, + "learning_rate": 4.567294474129015e-05, + "loss": 1.2504, + "step": 1321 + }, + { + "epoch": 0.19038018433179724, + "grad_norm": 3.9610800743103027, + "learning_rate": 4.56665824973474e-05, + "loss": 1.222, + "step": 1322 + }, + { + "epoch": 0.1905241935483871, + "grad_norm": 3.474600076675415, + "learning_rate": 4.566021602332076e-05, + "loss": 2.2739, + "step": 1323 + }, + { + "epoch": 0.19066820276497695, + "grad_norm": 1.130947470664978, + "learning_rate": 4.565384532051335e-05, + "loss": 0.1453, + "step": 1324 + }, + { + "epoch": 0.1908122119815668, + "grad_norm": 3.4442367553710938, + "learning_rate": 4.564747039022912e-05, + "loss": 0.7562, + "step": 1325 + }, + { + "epoch": 0.1909562211981567, + "grad_norm": 2.30643630027771, + "learning_rate": 4.564109123377292e-05, + "loss": 0.3736, + "step": 1326 + }, + { + "epoch": 0.19110023041474655, + "grad_norm": 4.437337398529053, + "learning_rate": 4.563470785245045e-05, + "loss": 0.9075, + "step": 1327 + }, + { + "epoch": 0.1912442396313364, + "grad_norm": 3.9820821285247803, + "learning_rate": 4.562832024756827e-05, + "loss": 1.1573, + "step": 1328 + }, + { + "epoch": 0.19138824884792627, + "grad_norm": 4.441662311553955, + "learning_rate": 4.562192842043381e-05, + "loss": 1.1276, + "step": 1329 + }, + { + "epoch": 0.19153225806451613, + "grad_norm": 7.867371082305908, + "learning_rate": 4.561553237235538e-05, + "loss": 1.5248, + "step": 1330 + }, + { + "epoch": 0.19167626728110598, + "grad_norm": 7.414824962615967, + "learning_rate": 4.56091321046421e-05, + "loss": 1.1909, + "step": 1331 + }, + { + "epoch": 0.19182027649769584, + "grad_norm": 2.6729843616485596, + "learning_rate": 4.560272761860403e-05, + "loss": 0.2521, + "step": 1332 + }, + { + "epoch": 0.19196428571428573, + "grad_norm": 3.485581874847412, + "learning_rate": 4.5596318915552036e-05, + "loss": 1.8179, + "step": 1333 + }, + { + "epoch": 0.19210829493087558, + "grad_norm": 2.8844757080078125, + "learning_rate": 4.558990599679787e-05, + "loss": 0.3549, + "step": 1334 + }, + { + "epoch": 0.19225230414746544, + "grad_norm": 4.984114646911621, + "learning_rate": 4.558348886365414e-05, + "loss": 1.9386, + "step": 1335 + }, + { + "epoch": 0.1923963133640553, + "grad_norm": 2.7209994792938232, + "learning_rate": 4.557706751743433e-05, + "loss": 0.3666, + "step": 1336 + }, + { + "epoch": 0.19254032258064516, + "grad_norm": 3.466996908187866, + "learning_rate": 4.557064195945277e-05, + "loss": 1.394, + "step": 1337 + }, + { + "epoch": 0.19268433179723501, + "grad_norm": 0.7452341914176941, + "learning_rate": 4.556421219102466e-05, + "loss": 4.7508, + "step": 1338 + }, + { + "epoch": 0.19282834101382487, + "grad_norm": 6.134788990020752, + "learning_rate": 4.5557778213466044e-05, + "loss": 1.6923, + "step": 1339 + }, + { + "epoch": 0.19297235023041476, + "grad_norm": 7.707910537719727, + "learning_rate": 4.555134002809386e-05, + "loss": 1.3671, + "step": 1340 + }, + { + "epoch": 0.19311635944700462, + "grad_norm": 0.6672204732894897, + "learning_rate": 4.554489763622589e-05, + "loss": 0.0727, + "step": 1341 + }, + { + "epoch": 0.19326036866359447, + "grad_norm": 2.3044564723968506, + "learning_rate": 4.553845103918079e-05, + "loss": 0.5711, + "step": 1342 + }, + { + "epoch": 0.19340437788018433, + "grad_norm": 2.7839770317077637, + "learning_rate": 4.553200023827803e-05, + "loss": 0.425, + "step": 1343 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 4.417322635650635, + "learning_rate": 4.5525545234837994e-05, + "loss": 1.5991, + "step": 1344 + }, + { + "epoch": 0.19369239631336405, + "grad_norm": 6.078741550445557, + "learning_rate": 4.551908603018191e-05, + "loss": 0.9462, + "step": 1345 + }, + { + "epoch": 0.1938364055299539, + "grad_norm": 7.039916515350342, + "learning_rate": 4.551262262563186e-05, + "loss": 1.4637, + "step": 1346 + }, + { + "epoch": 0.1939804147465438, + "grad_norm": 0.8199433088302612, + "learning_rate": 4.5506155022510787e-05, + "loss": 4.2992, + "step": 1347 + }, + { + "epoch": 0.19412442396313365, + "grad_norm": 2.488051414489746, + "learning_rate": 4.54996832221425e-05, + "loss": 0.2939, + "step": 1348 + }, + { + "epoch": 0.1942684331797235, + "grad_norm": 3.0229804515838623, + "learning_rate": 4.5493207225851665e-05, + "loss": 1.1047, + "step": 1349 + }, + { + "epoch": 0.19441244239631336, + "grad_norm": 3.0513532161712646, + "learning_rate": 4.5486727034963785e-05, + "loss": 0.319, + "step": 1350 + }, + { + "epoch": 0.19455645161290322, + "grad_norm": 5.573044776916504, + "learning_rate": 4.548024265080526e-05, + "loss": 2.268, + "step": 1351 + }, + { + "epoch": 0.19470046082949308, + "grad_norm": 2.65724515914917, + "learning_rate": 4.5473754074703324e-05, + "loss": 0.5577, + "step": 1352 + }, + { + "epoch": 0.19484447004608296, + "grad_norm": 5.1597371101379395, + "learning_rate": 4.546726130798606e-05, + "loss": 1.4077, + "step": 1353 + }, + { + "epoch": 0.19498847926267282, + "grad_norm": 3.290896415710449, + "learning_rate": 4.5460764351982446e-05, + "loss": 0.3354, + "step": 1354 + }, + { + "epoch": 0.19513248847926268, + "grad_norm": 4.185632228851318, + "learning_rate": 4.5454263208022274e-05, + "loss": 0.3217, + "step": 1355 + }, + { + "epoch": 0.19527649769585254, + "grad_norm": 4.208024024963379, + "learning_rate": 4.5447757877436224e-05, + "loss": 0.9298, + "step": 1356 + }, + { + "epoch": 0.1954205069124424, + "grad_norm": 3.0885162353515625, + "learning_rate": 4.544124836155582e-05, + "loss": 0.3842, + "step": 1357 + }, + { + "epoch": 0.19556451612903225, + "grad_norm": 7.391604900360107, + "learning_rate": 4.543473466171344e-05, + "loss": 1.7926, + "step": 1358 + }, + { + "epoch": 0.1957085253456221, + "grad_norm": 6.110589981079102, + "learning_rate": 4.5428216779242336e-05, + "loss": 0.7325, + "step": 1359 + }, + { + "epoch": 0.195852534562212, + "grad_norm": 1.826169490814209, + "learning_rate": 4.5421694715476584e-05, + "loss": 0.1976, + "step": 1360 + }, + { + "epoch": 0.19599654377880185, + "grad_norm": 3.0660598278045654, + "learning_rate": 4.541516847175115e-05, + "loss": 0.5016, + "step": 1361 + }, + { + "epoch": 0.1961405529953917, + "grad_norm": 3.5116050243377686, + "learning_rate": 4.5408638049401836e-05, + "loss": 0.7362, + "step": 1362 + }, + { + "epoch": 0.19628456221198157, + "grad_norm": 3.514094114303589, + "learning_rate": 4.54021034497653e-05, + "loss": 2.5884, + "step": 1363 + }, + { + "epoch": 0.19642857142857142, + "grad_norm": 1.9167139530181885, + "learning_rate": 4.539556467417907e-05, + "loss": 0.2109, + "step": 1364 + }, + { + "epoch": 0.19657258064516128, + "grad_norm": 2.729886531829834, + "learning_rate": 4.538902172398151e-05, + "loss": 0.3238, + "step": 1365 + }, + { + "epoch": 0.19671658986175114, + "grad_norm": 4.585836410522461, + "learning_rate": 4.538247460051184e-05, + "loss": 1.612, + "step": 1366 + }, + { + "epoch": 0.19686059907834103, + "grad_norm": 4.493183135986328, + "learning_rate": 4.5375923305110155e-05, + "loss": 1.1658, + "step": 1367 + }, + { + "epoch": 0.19700460829493088, + "grad_norm": 3.8819870948791504, + "learning_rate": 4.536936783911737e-05, + "loss": 0.4798, + "step": 1368 + }, + { + "epoch": 0.19714861751152074, + "grad_norm": 1.7101675271987915, + "learning_rate": 4.5362808203875295e-05, + "loss": 0.1741, + "step": 1369 + }, + { + "epoch": 0.1972926267281106, + "grad_norm": 4.828441143035889, + "learning_rate": 4.5356244400726556e-05, + "loss": 1.0387, + "step": 1370 + }, + { + "epoch": 0.19743663594470046, + "grad_norm": 1.0270754098892212, + "learning_rate": 4.534967643101465e-05, + "loss": 0.1394, + "step": 1371 + }, + { + "epoch": 0.1975806451612903, + "grad_norm": 1.3045107126235962, + "learning_rate": 4.534310429608394e-05, + "loss": 0.1177, + "step": 1372 + }, + { + "epoch": 0.19772465437788017, + "grad_norm": 3.8122308254241943, + "learning_rate": 4.53365279972796e-05, + "loss": 0.6293, + "step": 1373 + }, + { + "epoch": 0.19786866359447006, + "grad_norm": 8.710725784301758, + "learning_rate": 4.53299475359477e-05, + "loss": 2.8533, + "step": 1374 + }, + { + "epoch": 0.19801267281105991, + "grad_norm": 8.160808563232422, + "learning_rate": 4.532336291343513e-05, + "loss": 2.5372, + "step": 1375 + }, + { + "epoch": 0.19815668202764977, + "grad_norm": 4.41232442855835, + "learning_rate": 4.531677413108965e-05, + "loss": 0.8123, + "step": 1376 + }, + { + "epoch": 0.19830069124423963, + "grad_norm": 13.120766639709473, + "learning_rate": 4.531018119025989e-05, + "loss": 3.173, + "step": 1377 + }, + { + "epoch": 0.1984447004608295, + "grad_norm": 3.3141586780548096, + "learning_rate": 4.530358409229528e-05, + "loss": 0.8458, + "step": 1378 + }, + { + "epoch": 0.19858870967741934, + "grad_norm": 5.385952949523926, + "learning_rate": 4.529698283854614e-05, + "loss": 2.5617, + "step": 1379 + }, + { + "epoch": 0.19873271889400923, + "grad_norm": 2.2721996307373047, + "learning_rate": 4.529037743036362e-05, + "loss": 0.159, + "step": 1380 + }, + { + "epoch": 0.1988767281105991, + "grad_norm": 5.4742865562438965, + "learning_rate": 4.5283767869099746e-05, + "loss": 1.3538, + "step": 1381 + }, + { + "epoch": 0.19902073732718895, + "grad_norm": 1.2851258516311646, + "learning_rate": 4.5277154156107374e-05, + "loss": 0.2789, + "step": 1382 + }, + { + "epoch": 0.1991647465437788, + "grad_norm": 1.9926263093948364, + "learning_rate": 4.527053629274021e-05, + "loss": 0.1938, + "step": 1383 + }, + { + "epoch": 0.19930875576036866, + "grad_norm": 6.515530586242676, + "learning_rate": 4.526391428035281e-05, + "loss": 0.6761, + "step": 1384 + }, + { + "epoch": 0.19945276497695852, + "grad_norm": 4.749672889709473, + "learning_rate": 4.525728812030059e-05, + "loss": 0.8032, + "step": 1385 + }, + { + "epoch": 0.19959677419354838, + "grad_norm": 9.109404563903809, + "learning_rate": 4.52506578139398e-05, + "loss": 1.1249, + "step": 1386 + }, + { + "epoch": 0.19974078341013826, + "grad_norm": 6.957667827606201, + "learning_rate": 4.524402336262756e-05, + "loss": 0.8964, + "step": 1387 + }, + { + "epoch": 0.19988479262672812, + "grad_norm": 5.974212646484375, + "learning_rate": 4.523738476772182e-05, + "loss": 1.7797, + "step": 1388 + }, + { + "epoch": 0.20002880184331798, + "grad_norm": 4.746833801269531, + "learning_rate": 4.5230742030581374e-05, + "loss": 1.1845, + "step": 1389 + }, + { + "epoch": 0.20017281105990783, + "grad_norm": 3.9150187969207764, + "learning_rate": 4.522409515256588e-05, + "loss": 0.4974, + "step": 1390 + }, + { + "epoch": 0.2003168202764977, + "grad_norm": 3.8776493072509766, + "learning_rate": 4.521744413503583e-05, + "loss": 0.6816, + "step": 1391 + }, + { + "epoch": 0.20046082949308755, + "grad_norm": 2.6132709980010986, + "learning_rate": 4.521078897935258e-05, + "loss": 0.197, + "step": 1392 + }, + { + "epoch": 0.2006048387096774, + "grad_norm": 6.9391937255859375, + "learning_rate": 4.520412968687832e-05, + "loss": 2.1835, + "step": 1393 + }, + { + "epoch": 0.2007488479262673, + "grad_norm": 6.293370723724365, + "learning_rate": 4.519746625897607e-05, + "loss": 1.1257, + "step": 1394 + }, + { + "epoch": 0.20089285714285715, + "grad_norm": 5.341616153717041, + "learning_rate": 4.519079869700975e-05, + "loss": 0.4343, + "step": 1395 + }, + { + "epoch": 0.201036866359447, + "grad_norm": 4.876648426055908, + "learning_rate": 4.518412700234406e-05, + "loss": 0.532, + "step": 1396 + }, + { + "epoch": 0.20118087557603687, + "grad_norm": 3.4397571086883545, + "learning_rate": 4.51774511763446e-05, + "loss": 0.3, + "step": 1397 + }, + { + "epoch": 0.20132488479262672, + "grad_norm": 0.8648613095283508, + "learning_rate": 4.5170771220377785e-05, + "loss": 0.0849, + "step": 1398 + }, + { + "epoch": 0.20146889400921658, + "grad_norm": 1.6853604316711426, + "learning_rate": 4.5164087135810886e-05, + "loss": 0.2195, + "step": 1399 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 3.216339588165283, + "learning_rate": 4.5157398924012017e-05, + "loss": 0.415, + "step": 1400 + }, + { + "epoch": 0.20175691244239632, + "grad_norm": 6.971455097198486, + "learning_rate": 4.515070658635013e-05, + "loss": 1.211, + "step": 1401 + }, + { + "epoch": 0.20190092165898618, + "grad_norm": 3.726780414581299, + "learning_rate": 4.5144010124195034e-05, + "loss": 1.1299, + "step": 1402 + }, + { + "epoch": 0.20204493087557604, + "grad_norm": 1.432807445526123, + "learning_rate": 4.513730953891738e-05, + "loss": 0.1585, + "step": 1403 + }, + { + "epoch": 0.2021889400921659, + "grad_norm": 5.515259742736816, + "learning_rate": 4.5130604831888644e-05, + "loss": 1.9673, + "step": 1404 + }, + { + "epoch": 0.20233294930875576, + "grad_norm": 1.7029646635055542, + "learning_rate": 4.512389600448118e-05, + "loss": 0.1888, + "step": 1405 + }, + { + "epoch": 0.2024769585253456, + "grad_norm": 5.417266845703125, + "learning_rate": 4.5117183058068156e-05, + "loss": 0.9089, + "step": 1406 + }, + { + "epoch": 0.20262096774193547, + "grad_norm": 5.959532737731934, + "learning_rate": 4.51104659940236e-05, + "loss": 2.7487, + "step": 1407 + }, + { + "epoch": 0.20276497695852536, + "grad_norm": 2.1861493587493896, + "learning_rate": 4.5103744813722374e-05, + "loss": 0.2969, + "step": 1408 + }, + { + "epoch": 0.2029089861751152, + "grad_norm": 3.5031847953796387, + "learning_rate": 4.509701951854017e-05, + "loss": 0.5153, + "step": 1409 + }, + { + "epoch": 0.20305299539170507, + "grad_norm": 6.736876964569092, + "learning_rate": 4.5090290109853556e-05, + "loss": 0.8829, + "step": 1410 + }, + { + "epoch": 0.20319700460829493, + "grad_norm": 3.3671412467956543, + "learning_rate": 4.5083556589039915e-05, + "loss": 0.6377, + "step": 1411 + }, + { + "epoch": 0.2033410138248848, + "grad_norm": 6.346477031707764, + "learning_rate": 4.507681895747748e-05, + "loss": 2.2456, + "step": 1412 + }, + { + "epoch": 0.20348502304147464, + "grad_norm": 7.725283145904541, + "learning_rate": 4.5070077216545326e-05, + "loss": 2.7307, + "step": 1413 + }, + { + "epoch": 0.20362903225806453, + "grad_norm": 3.868990182876587, + "learning_rate": 4.5063331367623376e-05, + "loss": 1.3027, + "step": 1414 + }, + { + "epoch": 0.2037730414746544, + "grad_norm": 1.4304883480072021, + "learning_rate": 4.505658141209237e-05, + "loss": 0.1958, + "step": 1415 + }, + { + "epoch": 0.20391705069124424, + "grad_norm": 4.119123458862305, + "learning_rate": 4.504982735133391e-05, + "loss": 0.7675, + "step": 1416 + }, + { + "epoch": 0.2040610599078341, + "grad_norm": 4.620575428009033, + "learning_rate": 4.504306918673044e-05, + "loss": 0.5779, + "step": 1417 + }, + { + "epoch": 0.20420506912442396, + "grad_norm": 4.549313545227051, + "learning_rate": 4.503630691966523e-05, + "loss": 0.8373, + "step": 1418 + }, + { + "epoch": 0.20434907834101382, + "grad_norm": 4.494303226470947, + "learning_rate": 4.50295405515224e-05, + "loss": 1.0445, + "step": 1419 + }, + { + "epoch": 0.20449308755760368, + "grad_norm": 3.7534356117248535, + "learning_rate": 4.5022770083686906e-05, + "loss": 1.8496, + "step": 1420 + }, + { + "epoch": 0.20463709677419356, + "grad_norm": 4.114515781402588, + "learning_rate": 4.501599551754454e-05, + "loss": 2.3784, + "step": 1421 + }, + { + "epoch": 0.20478110599078342, + "grad_norm": 8.67730712890625, + "learning_rate": 4.500921685448193e-05, + "loss": 2.4864, + "step": 1422 + }, + { + "epoch": 0.20492511520737328, + "grad_norm": 2.316235065460205, + "learning_rate": 4.500243409588656e-05, + "loss": 0.4962, + "step": 1423 + }, + { + "epoch": 0.20506912442396313, + "grad_norm": 6.120822429656982, + "learning_rate": 4.4995647243146745e-05, + "loss": 2.7019, + "step": 1424 + }, + { + "epoch": 0.205213133640553, + "grad_norm": 5.481921195983887, + "learning_rate": 4.498885629765162e-05, + "loss": 0.7814, + "step": 1425 + }, + { + "epoch": 0.20535714285714285, + "grad_norm": 2.9454941749572754, + "learning_rate": 4.498206126079117e-05, + "loss": 0.2601, + "step": 1426 + }, + { + "epoch": 0.2055011520737327, + "grad_norm": 2.7093818187713623, + "learning_rate": 4.497526213395623e-05, + "loss": 0.3766, + "step": 1427 + }, + { + "epoch": 0.2056451612903226, + "grad_norm": 2.2650601863861084, + "learning_rate": 4.496845891853845e-05, + "loss": 0.2856, + "step": 1428 + }, + { + "epoch": 0.20578917050691245, + "grad_norm": 1.7868082523345947, + "learning_rate": 4.496165161593035e-05, + "loss": 0.1732, + "step": 1429 + }, + { + "epoch": 0.2059331797235023, + "grad_norm": 4.39101505279541, + "learning_rate": 4.495484022752523e-05, + "loss": 0.3372, + "step": 1430 + }, + { + "epoch": 0.20607718894009217, + "grad_norm": 1.9814434051513672, + "learning_rate": 4.494802475471729e-05, + "loss": 0.4992, + "step": 1431 + }, + { + "epoch": 0.20622119815668202, + "grad_norm": 4.366975784301758, + "learning_rate": 4.4941205198901527e-05, + "loss": 0.6529, + "step": 1432 + }, + { + "epoch": 0.20636520737327188, + "grad_norm": 2.245333194732666, + "learning_rate": 4.4934381561473776e-05, + "loss": 0.2515, + "step": 1433 + }, + { + "epoch": 0.20650921658986174, + "grad_norm": 2.7915987968444824, + "learning_rate": 4.492755384383073e-05, + "loss": 0.2552, + "step": 1434 + }, + { + "epoch": 0.20665322580645162, + "grad_norm": 2.9963219165802, + "learning_rate": 4.4920722047369876e-05, + "loss": 0.2082, + "step": 1435 + }, + { + "epoch": 0.20679723502304148, + "grad_norm": 2.0955357551574707, + "learning_rate": 4.491388617348959e-05, + "loss": 0.1763, + "step": 1436 + }, + { + "epoch": 0.20694124423963134, + "grad_norm": 8.821084022521973, + "learning_rate": 4.490704622358905e-05, + "loss": 0.157, + "step": 1437 + }, + { + "epoch": 0.2070852534562212, + "grad_norm": 4.912189960479736, + "learning_rate": 4.490020219906827e-05, + "loss": 0.6027, + "step": 1438 + }, + { + "epoch": 0.20722926267281105, + "grad_norm": 4.5851898193359375, + "learning_rate": 4.489335410132808e-05, + "loss": 0.945, + "step": 1439 + }, + { + "epoch": 0.2073732718894009, + "grad_norm": 2.3893752098083496, + "learning_rate": 4.488650193177019e-05, + "loss": 0.2072, + "step": 1440 + }, + { + "epoch": 0.20751728110599077, + "grad_norm": 5.9714179039001465, + "learning_rate": 4.487964569179711e-05, + "loss": 1.1973, + "step": 1441 + }, + { + "epoch": 0.20766129032258066, + "grad_norm": 1.0164263248443604, + "learning_rate": 4.487278538281219e-05, + "loss": 0.1078, + "step": 1442 + }, + { + "epoch": 0.2078052995391705, + "grad_norm": 7.101818561553955, + "learning_rate": 4.486592100621961e-05, + "loss": 2.2229, + "step": 1443 + }, + { + "epoch": 0.20794930875576037, + "grad_norm": 4.450230121612549, + "learning_rate": 4.48590525634244e-05, + "loss": 2.1034, + "step": 1444 + }, + { + "epoch": 0.20809331797235023, + "grad_norm": 3.754061460494995, + "learning_rate": 4.4852180055832396e-05, + "loss": 0.8176, + "step": 1445 + }, + { + "epoch": 0.20823732718894009, + "grad_norm": 7.4632391929626465, + "learning_rate": 4.484530348485029e-05, + "loss": 1.6172, + "step": 1446 + }, + { + "epoch": 0.20838133640552994, + "grad_norm": 4.511023998260498, + "learning_rate": 4.483842285188557e-05, + "loss": 0.9971, + "step": 1447 + }, + { + "epoch": 0.20852534562211983, + "grad_norm": 5.067302703857422, + "learning_rate": 4.483153815834661e-05, + "loss": 0.9166, + "step": 1448 + }, + { + "epoch": 0.2086693548387097, + "grad_norm": 2.1708266735076904, + "learning_rate": 4.482464940564257e-05, + "loss": 0.3405, + "step": 1449 + }, + { + "epoch": 0.20881336405529954, + "grad_norm": 4.083951950073242, + "learning_rate": 4.481775659518346e-05, + "loss": 0.4759, + "step": 1450 + }, + { + "epoch": 0.2089573732718894, + "grad_norm": 3.7692196369171143, + "learning_rate": 4.481085972838011e-05, + "loss": 0.4796, + "step": 1451 + }, + { + "epoch": 0.20910138248847926, + "grad_norm": 0.9811137318611145, + "learning_rate": 4.4803958806644185e-05, + "loss": 4.6377, + "step": 1452 + }, + { + "epoch": 0.20924539170506912, + "grad_norm": 5.515955924987793, + "learning_rate": 4.47970538313882e-05, + "loss": 2.4578, + "step": 1453 + }, + { + "epoch": 0.20938940092165897, + "grad_norm": 1.6550893783569336, + "learning_rate": 4.4790144804025456e-05, + "loss": 0.0778, + "step": 1454 + }, + { + "epoch": 0.20953341013824886, + "grad_norm": 5.949126243591309, + "learning_rate": 4.478323172597013e-05, + "loss": 0.8592, + "step": 1455 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 1.199725866317749, + "learning_rate": 4.477631459863719e-05, + "loss": 0.2314, + "step": 1456 + }, + { + "epoch": 0.20982142857142858, + "grad_norm": 3.7951977252960205, + "learning_rate": 4.476939342344246e-05, + "loss": 1.1699, + "step": 1457 + }, + { + "epoch": 0.20996543778801843, + "grad_norm": 2.517693519592285, + "learning_rate": 4.476246820180259e-05, + "loss": 0.291, + "step": 1458 + }, + { + "epoch": 0.2101094470046083, + "grad_norm": 3.1958301067352295, + "learning_rate": 4.475553893513503e-05, + "loss": 0.2588, + "step": 1459 + }, + { + "epoch": 0.21025345622119815, + "grad_norm": 4.985546588897705, + "learning_rate": 4.4748605624858097e-05, + "loss": 1.9965, + "step": 1460 + }, + { + "epoch": 0.210397465437788, + "grad_norm": 4.266109466552734, + "learning_rate": 4.47416682723909e-05, + "loss": 1.4141, + "step": 1461 + }, + { + "epoch": 0.2105414746543779, + "grad_norm": 2.314887285232544, + "learning_rate": 4.473472687915341e-05, + "loss": 0.6092, + "step": 1462 + }, + { + "epoch": 0.21068548387096775, + "grad_norm": 3.947113037109375, + "learning_rate": 4.4727781446566385e-05, + "loss": 0.1091, + "step": 1463 + }, + { + "epoch": 0.2108294930875576, + "grad_norm": 3.858152389526367, + "learning_rate": 4.472083197605146e-05, + "loss": 2.3034, + "step": 1464 + }, + { + "epoch": 0.21097350230414746, + "grad_norm": 2.240093231201172, + "learning_rate": 4.471387846903104e-05, + "loss": 0.2249, + "step": 1465 + }, + { + "epoch": 0.21111751152073732, + "grad_norm": 2.39314603805542, + "learning_rate": 4.470692092692841e-05, + "loss": 0.3464, + "step": 1466 + }, + { + "epoch": 0.21126152073732718, + "grad_norm": 4.3874616622924805, + "learning_rate": 4.469995935116764e-05, + "loss": 1.0922, + "step": 1467 + }, + { + "epoch": 0.21140552995391704, + "grad_norm": 3.198436975479126, + "learning_rate": 4.469299374317365e-05, + "loss": 2.0788, + "step": 1468 + }, + { + "epoch": 0.21154953917050692, + "grad_norm": 1.9973032474517822, + "learning_rate": 4.468602410437217e-05, + "loss": 0.2565, + "step": 1469 + }, + { + "epoch": 0.21169354838709678, + "grad_norm": 3.8564374446868896, + "learning_rate": 4.467905043618976e-05, + "loss": 1.7883, + "step": 1470 + }, + { + "epoch": 0.21183755760368664, + "grad_norm": 4.838620185852051, + "learning_rate": 4.4672072740053816e-05, + "loss": 1.7328, + "step": 1471 + }, + { + "epoch": 0.2119815668202765, + "grad_norm": 5.150815010070801, + "learning_rate": 4.466509101739254e-05, + "loss": 0.6111, + "step": 1472 + }, + { + "epoch": 0.21212557603686635, + "grad_norm": 2.4825439453125, + "learning_rate": 4.465810526963499e-05, + "loss": 0.1057, + "step": 1473 + }, + { + "epoch": 0.2122695852534562, + "grad_norm": 4.321075439453125, + "learning_rate": 4.465111549821099e-05, + "loss": 0.4661, + "step": 1474 + }, + { + "epoch": 0.2124135944700461, + "grad_norm": 2.4407191276550293, + "learning_rate": 4.464412170455124e-05, + "loss": 0.2946, + "step": 1475 + }, + { + "epoch": 0.21255760368663595, + "grad_norm": 0.9786215424537659, + "learning_rate": 4.463712389008725e-05, + "loss": 4.0751, + "step": 1476 + }, + { + "epoch": 0.2127016129032258, + "grad_norm": 5.1047210693359375, + "learning_rate": 4.4630122056251334e-05, + "loss": 0.9287, + "step": 1477 + }, + { + "epoch": 0.21284562211981567, + "grad_norm": 5.083205223083496, + "learning_rate": 4.462311620447666e-05, + "loss": 1.349, + "step": 1478 + }, + { + "epoch": 0.21298963133640553, + "grad_norm": 4.024742126464844, + "learning_rate": 4.461610633619719e-05, + "loss": 0.5878, + "step": 1479 + }, + { + "epoch": 0.21313364055299538, + "grad_norm": 4.686280250549316, + "learning_rate": 4.460909245284773e-05, + "loss": 0.3837, + "step": 1480 + }, + { + "epoch": 0.21327764976958524, + "grad_norm": 2.746426582336426, + "learning_rate": 4.46020745558639e-05, + "loss": 0.3792, + "step": 1481 + }, + { + "epoch": 0.21342165898617513, + "grad_norm": 1.3425133228302002, + "learning_rate": 4.459505264668212e-05, + "loss": 0.0998, + "step": 1482 + }, + { + "epoch": 0.21356566820276499, + "grad_norm": 4.476913928985596, + "learning_rate": 4.458802672673967e-05, + "loss": 0.4936, + "step": 1483 + }, + { + "epoch": 0.21370967741935484, + "grad_norm": 6.621892929077148, + "learning_rate": 4.458099679747463e-05, + "loss": 1.4241, + "step": 1484 + }, + { + "epoch": 0.2138536866359447, + "grad_norm": 3.1843454837799072, + "learning_rate": 4.457396286032589e-05, + "loss": 0.305, + "step": 1485 + }, + { + "epoch": 0.21399769585253456, + "grad_norm": 1.644788384437561, + "learning_rate": 4.4566924916733175e-05, + "loss": 0.1854, + "step": 1486 + }, + { + "epoch": 0.21414170506912442, + "grad_norm": 4.447836875915527, + "learning_rate": 4.455988296813704e-05, + "loss": 2.121, + "step": 1487 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.5974571704864502, + "learning_rate": 4.4552837015978835e-05, + "loss": 0.1569, + "step": 1488 + }, + { + "epoch": 0.21442972350230416, + "grad_norm": 1.4337561130523682, + "learning_rate": 4.454578706170075e-05, + "loss": 0.1856, + "step": 1489 + }, + { + "epoch": 0.21457373271889402, + "grad_norm": 2.18281626701355, + "learning_rate": 4.453873310674578e-05, + "loss": 0.2296, + "step": 1490 + }, + { + "epoch": 0.21471774193548387, + "grad_norm": 8.338083267211914, + "learning_rate": 4.453167515255774e-05, + "loss": 1.6805, + "step": 1491 + }, + { + "epoch": 0.21486175115207373, + "grad_norm": 14.71151065826416, + "learning_rate": 4.4524613200581284e-05, + "loss": 3.5251, + "step": 1492 + }, + { + "epoch": 0.2150057603686636, + "grad_norm": 3.6314713954925537, + "learning_rate": 4.451754725226185e-05, + "loss": 0.4159, + "step": 1493 + }, + { + "epoch": 0.21514976958525345, + "grad_norm": 2.9425625801086426, + "learning_rate": 4.4510477309045735e-05, + "loss": 0.6486, + "step": 1494 + }, + { + "epoch": 0.2152937788018433, + "grad_norm": 6.003162384033203, + "learning_rate": 4.450340337238002e-05, + "loss": 2.5301, + "step": 1495 + }, + { + "epoch": 0.2154377880184332, + "grad_norm": 5.44991397857666, + "learning_rate": 4.4496325443712597e-05, + "loss": 2.7118, + "step": 1496 + }, + { + "epoch": 0.21558179723502305, + "grad_norm": 4.128443717956543, + "learning_rate": 4.448924352449222e-05, + "loss": 1.1978, + "step": 1497 + }, + { + "epoch": 0.2157258064516129, + "grad_norm": 2.1557252407073975, + "learning_rate": 4.448215761616842e-05, + "loss": 0.2943, + "step": 1498 + }, + { + "epoch": 0.21586981566820276, + "grad_norm": 1.7729600667953491, + "learning_rate": 4.447506772019155e-05, + "loss": 0.1928, + "step": 1499 + }, + { + "epoch": 0.21601382488479262, + "grad_norm": 4.62984037399292, + "learning_rate": 4.446797383801281e-05, + "loss": 0.4867, + "step": 1500 + }, + { + "epoch": 0.21615783410138248, + "grad_norm": 9.55689525604248, + "learning_rate": 4.446087597108417e-05, + "loss": 2.2614, + "step": 1501 + }, + { + "epoch": 0.21630184331797234, + "grad_norm": 1.0617728233337402, + "learning_rate": 4.445377412085845e-05, + "loss": 0.0579, + "step": 1502 + }, + { + "epoch": 0.21644585253456222, + "grad_norm": 4.0536274909973145, + "learning_rate": 4.4446668288789265e-05, + "loss": 0.5662, + "step": 1503 + }, + { + "epoch": 0.21658986175115208, + "grad_norm": 0.7927526235580444, + "learning_rate": 4.443955847633106e-05, + "loss": 0.0942, + "step": 1504 + }, + { + "epoch": 0.21673387096774194, + "grad_norm": 3.9817726612091064, + "learning_rate": 4.4432444684939077e-05, + "loss": 0.4948, + "step": 1505 + }, + { + "epoch": 0.2168778801843318, + "grad_norm": 2.5141959190368652, + "learning_rate": 4.44253269160694e-05, + "loss": 1.2407, + "step": 1506 + }, + { + "epoch": 0.21702188940092165, + "grad_norm": 1.7796618938446045, + "learning_rate": 4.4418205171178895e-05, + "loss": 0.1793, + "step": 1507 + }, + { + "epoch": 0.2171658986175115, + "grad_norm": 5.5435357093811035, + "learning_rate": 4.441107945172527e-05, + "loss": 1.4178, + "step": 1508 + }, + { + "epoch": 0.2173099078341014, + "grad_norm": 3.4273555278778076, + "learning_rate": 4.440394975916702e-05, + "loss": 0.3799, + "step": 1509 + }, + { + "epoch": 0.21745391705069125, + "grad_norm": 3.384782552719116, + "learning_rate": 4.4396816094963464e-05, + "loss": 0.6339, + "step": 1510 + }, + { + "epoch": 0.2175979262672811, + "grad_norm": 3.756720542907715, + "learning_rate": 4.438967846057477e-05, + "loss": 0.2339, + "step": 1511 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 4.834780216217041, + "learning_rate": 4.438253685746184e-05, + "loss": 0.8795, + "step": 1512 + }, + { + "epoch": 0.21788594470046083, + "grad_norm": 4.464601039886475, + "learning_rate": 4.437539128708647e-05, + "loss": 1.0961, + "step": 1513 + }, + { + "epoch": 0.21802995391705068, + "grad_norm": 3.4182140827178955, + "learning_rate": 4.436824175091121e-05, + "loss": 1.9705, + "step": 1514 + }, + { + "epoch": 0.21817396313364054, + "grad_norm": 1.996928334236145, + "learning_rate": 4.4361088250399465e-05, + "loss": 0.2786, + "step": 1515 + }, + { + "epoch": 0.21831797235023043, + "grad_norm": 1.3663182258605957, + "learning_rate": 4.435393078701541e-05, + "loss": 0.2671, + "step": 1516 + }, + { + "epoch": 0.21846198156682028, + "grad_norm": 2.27770733833313, + "learning_rate": 4.434676936222405e-05, + "loss": 0.2136, + "step": 1517 + }, + { + "epoch": 0.21860599078341014, + "grad_norm": 2.184112787246704, + "learning_rate": 4.433960397749122e-05, + "loss": 0.2448, + "step": 1518 + }, + { + "epoch": 0.21875, + "grad_norm": 3.8087546825408936, + "learning_rate": 4.433243463428353e-05, + "loss": 0.7618, + "step": 1519 + }, + { + "epoch": 0.21889400921658986, + "grad_norm": 5.904870986938477, + "learning_rate": 4.4325261334068426e-05, + "loss": 2.9173, + "step": 1520 + }, + { + "epoch": 0.21903801843317972, + "grad_norm": 9.057807922363281, + "learning_rate": 4.431808407831416e-05, + "loss": 2.1475, + "step": 1521 + }, + { + "epoch": 0.21918202764976957, + "grad_norm": 2.9627199172973633, + "learning_rate": 4.431090286848978e-05, + "loss": 0.4436, + "step": 1522 + }, + { + "epoch": 0.21932603686635946, + "grad_norm": 3.138866424560547, + "learning_rate": 4.430371770606515e-05, + "loss": 2.4178, + "step": 1523 + }, + { + "epoch": 0.21947004608294932, + "grad_norm": 2.9001362323760986, + "learning_rate": 4.4296528592510966e-05, + "loss": 3.2009, + "step": 1524 + }, + { + "epoch": 0.21961405529953917, + "grad_norm": 1.9725819826126099, + "learning_rate": 4.428933552929869e-05, + "loss": 3.7439, + "step": 1525 + }, + { + "epoch": 0.21975806451612903, + "grad_norm": 3.6198065280914307, + "learning_rate": 4.428213851790063e-05, + "loss": 0.9563, + "step": 1526 + }, + { + "epoch": 0.2199020737327189, + "grad_norm": 0.9161674976348877, + "learning_rate": 4.427493755978987e-05, + "loss": 0.0931, + "step": 1527 + }, + { + "epoch": 0.22004608294930875, + "grad_norm": 8.027008056640625, + "learning_rate": 4.426773265644033e-05, + "loss": 1.3838, + "step": 1528 + }, + { + "epoch": 0.2201900921658986, + "grad_norm": 2.376009464263916, + "learning_rate": 4.426052380932674e-05, + "loss": 0.5107, + "step": 1529 + }, + { + "epoch": 0.2203341013824885, + "grad_norm": 3.4144937992095947, + "learning_rate": 4.4253311019924595e-05, + "loss": 0.3732, + "step": 1530 + }, + { + "epoch": 0.22047811059907835, + "grad_norm": 6.7700982093811035, + "learning_rate": 4.4246094289710245e-05, + "loss": 1.7359, + "step": 1531 + }, + { + "epoch": 0.2206221198156682, + "grad_norm": 1.9006189107894897, + "learning_rate": 4.423887362016082e-05, + "loss": 0.2283, + "step": 1532 + }, + { + "epoch": 0.22076612903225806, + "grad_norm": 4.193785190582275, + "learning_rate": 4.423164901275426e-05, + "loss": 0.7531, + "step": 1533 + }, + { + "epoch": 0.22091013824884792, + "grad_norm": 1.839159369468689, + "learning_rate": 4.422442046896933e-05, + "loss": 0.2337, + "step": 1534 + }, + { + "epoch": 0.22105414746543778, + "grad_norm": 3.5598080158233643, + "learning_rate": 4.421718799028557e-05, + "loss": 0.3531, + "step": 1535 + }, + { + "epoch": 0.22119815668202766, + "grad_norm": 3.624290704727173, + "learning_rate": 4.420995157818334e-05, + "loss": 0.233, + "step": 1536 + }, + { + "epoch": 0.22134216589861752, + "grad_norm": 1.3312181234359741, + "learning_rate": 4.420271123414381e-05, + "loss": 0.2043, + "step": 1537 + }, + { + "epoch": 0.22148617511520738, + "grad_norm": 5.870731353759766, + "learning_rate": 4.419546695964895e-05, + "loss": 1.1031, + "step": 1538 + }, + { + "epoch": 0.22163018433179724, + "grad_norm": 11.462679862976074, + "learning_rate": 4.418821875618154e-05, + "loss": 2.8946, + "step": 1539 + }, + { + "epoch": 0.2217741935483871, + "grad_norm": 4.360170364379883, + "learning_rate": 4.418096662522515e-05, + "loss": 2.6399, + "step": 1540 + }, + { + "epoch": 0.22191820276497695, + "grad_norm": 4.440066814422607, + "learning_rate": 4.417371056826417e-05, + "loss": 1.8975, + "step": 1541 + }, + { + "epoch": 0.2220622119815668, + "grad_norm": 1.1133826971054077, + "learning_rate": 4.416645058678379e-05, + "loss": 0.1923, + "step": 1542 + }, + { + "epoch": 0.2222062211981567, + "grad_norm": 3.699592113494873, + "learning_rate": 4.415918668226998e-05, + "loss": 0.4812, + "step": 1543 + }, + { + "epoch": 0.22235023041474655, + "grad_norm": 0.770598828792572, + "learning_rate": 4.4151918856209556e-05, + "loss": 0.0768, + "step": 1544 + }, + { + "epoch": 0.2224942396313364, + "grad_norm": 4.258463382720947, + "learning_rate": 4.4144647110090105e-05, + "loss": 0.4525, + "step": 1545 + }, + { + "epoch": 0.22263824884792627, + "grad_norm": 4.36538028717041, + "learning_rate": 4.413737144540002e-05, + "loss": 0.495, + "step": 1546 + }, + { + "epoch": 0.22278225806451613, + "grad_norm": 2.0962324142456055, + "learning_rate": 4.4130091863628506e-05, + "loss": 0.1812, + "step": 1547 + }, + { + "epoch": 0.22292626728110598, + "grad_norm": 2.3305351734161377, + "learning_rate": 4.4122808366265556e-05, + "loss": 2.0502, + "step": 1548 + }, + { + "epoch": 0.22307027649769584, + "grad_norm": 3.8904080390930176, + "learning_rate": 4.4115520954801995e-05, + "loss": 2.9, + "step": 1549 + }, + { + "epoch": 0.22321428571428573, + "grad_norm": 4.163515090942383, + "learning_rate": 4.4108229630729394e-05, + "loss": 1.5436, + "step": 1550 + }, + { + "epoch": 0.22335829493087558, + "grad_norm": 2.6302030086517334, + "learning_rate": 4.410093439554019e-05, + "loss": 0.3016, + "step": 1551 + }, + { + "epoch": 0.22350230414746544, + "grad_norm": 3.066129207611084, + "learning_rate": 4.409363525072757e-05, + "loss": 1.1951, + "step": 1552 + }, + { + "epoch": 0.2236463133640553, + "grad_norm": 6.892364025115967, + "learning_rate": 4.408633219778555e-05, + "loss": 1.4371, + "step": 1553 + }, + { + "epoch": 0.22379032258064516, + "grad_norm": 1.6694854497909546, + "learning_rate": 4.4079025238208925e-05, + "loss": 0.2019, + "step": 1554 + }, + { + "epoch": 0.22393433179723501, + "grad_norm": 4.946491241455078, + "learning_rate": 4.40717143734933e-05, + "loss": 0.9585, + "step": 1555 + }, + { + "epoch": 0.22407834101382487, + "grad_norm": 2.481088399887085, + "learning_rate": 4.40643996051351e-05, + "loss": 0.3134, + "step": 1556 + }, + { + "epoch": 0.22422235023041476, + "grad_norm": 5.175475597381592, + "learning_rate": 4.40570809346315e-05, + "loss": 0.461, + "step": 1557 + }, + { + "epoch": 0.22436635944700462, + "grad_norm": 1.0879896879196167, + "learning_rate": 4.404975836348053e-05, + "loss": 0.1462, + "step": 1558 + }, + { + "epoch": 0.22451036866359447, + "grad_norm": 3.4529311656951904, + "learning_rate": 4.404243189318097e-05, + "loss": 0.672, + "step": 1559 + }, + { + "epoch": 0.22465437788018433, + "grad_norm": 3.2076714038848877, + "learning_rate": 4.403510152523243e-05, + "loss": 0.2913, + "step": 1560 + }, + { + "epoch": 0.2247983870967742, + "grad_norm": 2.4183435440063477, + "learning_rate": 4.40277672611353e-05, + "loss": 0.3982, + "step": 1561 + }, + { + "epoch": 0.22494239631336405, + "grad_norm": 6.919425010681152, + "learning_rate": 4.402042910239078e-05, + "loss": 0.622, + "step": 1562 + }, + { + "epoch": 0.2250864055299539, + "grad_norm": 2.598705768585205, + "learning_rate": 4.4013087050500855e-05, + "loss": 0.4611, + "step": 1563 + }, + { + "epoch": 0.2252304147465438, + "grad_norm": 5.333256244659424, + "learning_rate": 4.4005741106968325e-05, + "loss": 2.1074, + "step": 1564 + }, + { + "epoch": 0.22537442396313365, + "grad_norm": 5.0902605056762695, + "learning_rate": 4.399839127329676e-05, + "loss": 2.1136, + "step": 1565 + }, + { + "epoch": 0.2255184331797235, + "grad_norm": 7.413620471954346, + "learning_rate": 4.399103755099054e-05, + "loss": 1.4601, + "step": 1566 + }, + { + "epoch": 0.22566244239631336, + "grad_norm": 5.150853157043457, + "learning_rate": 4.3983679941554865e-05, + "loss": 0.2472, + "step": 1567 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 1.544714093208313, + "learning_rate": 4.397631844649568e-05, + "loss": 0.1712, + "step": 1568 + }, + { + "epoch": 0.22595046082949308, + "grad_norm": 5.388582706451416, + "learning_rate": 4.3968953067319777e-05, + "loss": 0.3277, + "step": 1569 + }, + { + "epoch": 0.22609447004608296, + "grad_norm": 8.074249267578125, + "learning_rate": 4.39615838055347e-05, + "loss": 0.821, + "step": 1570 + }, + { + "epoch": 0.22623847926267282, + "grad_norm": 3.393373727798462, + "learning_rate": 4.395421066264881e-05, + "loss": 0.7481, + "step": 1571 + }, + { + "epoch": 0.22638248847926268, + "grad_norm": 2.4190642833709717, + "learning_rate": 4.394683364017126e-05, + "loss": 0.3474, + "step": 1572 + }, + { + "epoch": 0.22652649769585254, + "grad_norm": 1.101222038269043, + "learning_rate": 4.3939452739612e-05, + "loss": 0.0892, + "step": 1573 + }, + { + "epoch": 0.2266705069124424, + "grad_norm": 3.839137554168701, + "learning_rate": 4.393206796248177e-05, + "loss": 0.417, + "step": 1574 + }, + { + "epoch": 0.22681451612903225, + "grad_norm": 4.606639385223389, + "learning_rate": 4.392467931029211e-05, + "loss": 0.3758, + "step": 1575 + }, + { + "epoch": 0.2269585253456221, + "grad_norm": 4.087855815887451, + "learning_rate": 4.3917286784555325e-05, + "loss": 1.8527, + "step": 1576 + }, + { + "epoch": 0.227102534562212, + "grad_norm": 3.9653613567352295, + "learning_rate": 4.390989038678455e-05, + "loss": 0.3278, + "step": 1577 + }, + { + "epoch": 0.22724654377880185, + "grad_norm": 4.4431471824646, + "learning_rate": 4.390249011849369e-05, + "loss": 0.4264, + "step": 1578 + }, + { + "epoch": 0.2273905529953917, + "grad_norm": 5.110714435577393, + "learning_rate": 4.3895085981197455e-05, + "loss": 0.4535, + "step": 1579 + }, + { + "epoch": 0.22753456221198157, + "grad_norm": 6.627864360809326, + "learning_rate": 4.3887677976411335e-05, + "loss": 1.816, + "step": 1580 + }, + { + "epoch": 0.22767857142857142, + "grad_norm": 3.8172996044158936, + "learning_rate": 4.388026610565163e-05, + "loss": 2.6399, + "step": 1581 + }, + { + "epoch": 0.22782258064516128, + "grad_norm": 1.9998592138290405, + "learning_rate": 4.38728503704354e-05, + "loss": 0.2045, + "step": 1582 + }, + { + "epoch": 0.22796658986175114, + "grad_norm": 3.64190673828125, + "learning_rate": 4.386543077228053e-05, + "loss": 0.7766, + "step": 1583 + }, + { + "epoch": 0.22811059907834103, + "grad_norm": 8.382646560668945, + "learning_rate": 4.385800731270567e-05, + "loss": 1.5269, + "step": 1584 + }, + { + "epoch": 0.22825460829493088, + "grad_norm": 1.3336576223373413, + "learning_rate": 4.3850579993230284e-05, + "loss": 0.1358, + "step": 1585 + }, + { + "epoch": 0.22839861751152074, + "grad_norm": 1.2846561670303345, + "learning_rate": 4.38431488153746e-05, + "loss": 0.2813, + "step": 1586 + }, + { + "epoch": 0.2285426267281106, + "grad_norm": 4.663119316101074, + "learning_rate": 4.383571378065966e-05, + "loss": 1.1874, + "step": 1587 + }, + { + "epoch": 0.22868663594470046, + "grad_norm": 3.8051607608795166, + "learning_rate": 4.382827489060727e-05, + "loss": 0.5865, + "step": 1588 + }, + { + "epoch": 0.2288306451612903, + "grad_norm": 2.868292808532715, + "learning_rate": 4.3820832146740055e-05, + "loss": 0.2462, + "step": 1589 + }, + { + "epoch": 0.22897465437788017, + "grad_norm": 1.1000248193740845, + "learning_rate": 4.38133855505814e-05, + "loss": 0.1182, + "step": 1590 + }, + { + "epoch": 0.22911866359447006, + "grad_norm": 3.5942296981811523, + "learning_rate": 4.380593510365549e-05, + "loss": 0.3121, + "step": 1591 + }, + { + "epoch": 0.22926267281105991, + "grad_norm": 2.5674984455108643, + "learning_rate": 4.379848080748731e-05, + "loss": 0.2875, + "step": 1592 + }, + { + "epoch": 0.22940668202764977, + "grad_norm": 4.9007134437561035, + "learning_rate": 4.3791022663602624e-05, + "loss": 0.7187, + "step": 1593 + }, + { + "epoch": 0.22955069124423963, + "grad_norm": 6.493061065673828, + "learning_rate": 4.3783560673527975e-05, + "loss": 1.0735, + "step": 1594 + }, + { + "epoch": 0.2296947004608295, + "grad_norm": 4.228397369384766, + "learning_rate": 4.37760948387907e-05, + "loss": 0.7495, + "step": 1595 + }, + { + "epoch": 0.22983870967741934, + "grad_norm": 6.315403461456299, + "learning_rate": 4.376862516091893e-05, + "loss": 3.0807, + "step": 1596 + }, + { + "epoch": 0.22998271889400923, + "grad_norm": 2.318319082260132, + "learning_rate": 4.376115164144157e-05, + "loss": 0.2627, + "step": 1597 + }, + { + "epoch": 0.2301267281105991, + "grad_norm": 4.797454357147217, + "learning_rate": 4.375367428188831e-05, + "loss": 0.5209, + "step": 1598 + }, + { + "epoch": 0.23027073732718895, + "grad_norm": 1.2859771251678467, + "learning_rate": 4.374619308378965e-05, + "loss": 0.145, + "step": 1599 + }, + { + "epoch": 0.2304147465437788, + "grad_norm": 4.895183563232422, + "learning_rate": 4.3738708048676846e-05, + "loss": 0.3569, + "step": 1600 + }, + { + "epoch": 0.23055875576036866, + "grad_norm": 3.9941205978393555, + "learning_rate": 4.373121917808196e-05, + "loss": 1.5473, + "step": 1601 + }, + { + "epoch": 0.23070276497695852, + "grad_norm": 5.122671604156494, + "learning_rate": 4.372372647353783e-05, + "loss": 0.9855, + "step": 1602 + }, + { + "epoch": 0.23084677419354838, + "grad_norm": 4.309559345245361, + "learning_rate": 4.371622993657808e-05, + "loss": 0.4389, + "step": 1603 + }, + { + "epoch": 0.23099078341013826, + "grad_norm": 3.1174542903900146, + "learning_rate": 4.370872956873712e-05, + "loss": 0.4379, + "step": 1604 + }, + { + "epoch": 0.23113479262672812, + "grad_norm": 4.149661540985107, + "learning_rate": 4.3701225371550124e-05, + "loss": 0.9477, + "step": 1605 + }, + { + "epoch": 0.23127880184331798, + "grad_norm": 1.5468080043792725, + "learning_rate": 4.36937173465531e-05, + "loss": 0.2327, + "step": 1606 + }, + { + "epoch": 0.23142281105990783, + "grad_norm": 3.7441158294677734, + "learning_rate": 4.3686205495282786e-05, + "loss": 0.4971, + "step": 1607 + }, + { + "epoch": 0.2315668202764977, + "grad_norm": 6.536379814147949, + "learning_rate": 4.367868981927673e-05, + "loss": 0.6786, + "step": 1608 + }, + { + "epoch": 0.23171082949308755, + "grad_norm": 1.8638758659362793, + "learning_rate": 4.367117032007326e-05, + "loss": 0.7952, + "step": 1609 + }, + { + "epoch": 0.2318548387096774, + "grad_norm": 4.959320068359375, + "learning_rate": 4.3663646999211495e-05, + "loss": 1.0529, + "step": 1610 + }, + { + "epoch": 0.2319988479262673, + "grad_norm": 2.925222635269165, + "learning_rate": 4.36561198582313e-05, + "loss": 0.4173, + "step": 1611 + }, + { + "epoch": 0.23214285714285715, + "grad_norm": 4.588637351989746, + "learning_rate": 4.364858889867336e-05, + "loss": 1.1227, + "step": 1612 + }, + { + "epoch": 0.232286866359447, + "grad_norm": 3.15014910697937, + "learning_rate": 4.364105412207914e-05, + "loss": 0.8187, + "step": 1613 + }, + { + "epoch": 0.23243087557603687, + "grad_norm": 0.9260169267654419, + "learning_rate": 4.363351552999086e-05, + "loss": 0.099, + "step": 1614 + }, + { + "epoch": 0.23257488479262672, + "grad_norm": 2.864626169204712, + "learning_rate": 4.362597312395156e-05, + "loss": 0.1819, + "step": 1615 + }, + { + "epoch": 0.23271889400921658, + "grad_norm": 3.081322193145752, + "learning_rate": 4.361842690550501e-05, + "loss": 0.5923, + "step": 1616 + }, + { + "epoch": 0.23286290322580644, + "grad_norm": 1.2546461820602417, + "learning_rate": 4.361087687619579e-05, + "loss": 0.1588, + "step": 1617 + }, + { + "epoch": 0.23300691244239632, + "grad_norm": 4.562083721160889, + "learning_rate": 4.3603323037569265e-05, + "loss": 0.3748, + "step": 1618 + }, + { + "epoch": 0.23315092165898618, + "grad_norm": 3.451495409011841, + "learning_rate": 4.3595765391171576e-05, + "loss": 0.3007, + "step": 1619 + }, + { + "epoch": 0.23329493087557604, + "grad_norm": 2.4026293754577637, + "learning_rate": 4.3588203938549645e-05, + "loss": 0.4767, + "step": 1620 + }, + { + "epoch": 0.2334389400921659, + "grad_norm": 5.052242755889893, + "learning_rate": 4.358063868125115e-05, + "loss": 0.4193, + "step": 1621 + }, + { + "epoch": 0.23358294930875576, + "grad_norm": 6.112339496612549, + "learning_rate": 4.357306962082457e-05, + "loss": 1.8987, + "step": 1622 + }, + { + "epoch": 0.2337269585253456, + "grad_norm": 7.636193752288818, + "learning_rate": 4.3565496758819166e-05, + "loss": 0.8093, + "step": 1623 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 0.6553455591201782, + "learning_rate": 4.3557920096784966e-05, + "loss": 0.0765, + "step": 1624 + }, + { + "epoch": 0.23401497695852536, + "grad_norm": 4.777616024017334, + "learning_rate": 4.3550339636272775e-05, + "loss": 0.5652, + "step": 1625 + }, + { + "epoch": 0.2341589861751152, + "grad_norm": 7.743527412414551, + "learning_rate": 4.3542755378834174e-05, + "loss": 1.9319, + "step": 1626 + }, + { + "epoch": 0.23430299539170507, + "grad_norm": 5.095020771026611, + "learning_rate": 4.353516732602155e-05, + "loss": 0.7128, + "step": 1627 + }, + { + "epoch": 0.23444700460829493, + "grad_norm": 1.5581172704696655, + "learning_rate": 4.352757547938802e-05, + "loss": 0.1549, + "step": 1628 + }, + { + "epoch": 0.2345910138248848, + "grad_norm": 4.988015174865723, + "learning_rate": 4.35199798404875e-05, + "loss": 0.432, + "step": 1629 + }, + { + "epoch": 0.23473502304147464, + "grad_norm": 5.985219955444336, + "learning_rate": 4.3512380410874696e-05, + "loss": 1.824, + "step": 1630 + }, + { + "epoch": 0.23487903225806453, + "grad_norm": 7.848267078399658, + "learning_rate": 4.3504777192105074e-05, + "loss": 1.3155, + "step": 1631 + }, + { + "epoch": 0.2350230414746544, + "grad_norm": 1.2344437837600708, + "learning_rate": 4.349717018573487e-05, + "loss": 0.121, + "step": 1632 + }, + { + "epoch": 0.23516705069124424, + "grad_norm": 2.5877041816711426, + "learning_rate": 4.348955939332111e-05, + "loss": 0.3009, + "step": 1633 + }, + { + "epoch": 0.2353110599078341, + "grad_norm": 2.9782180786132812, + "learning_rate": 4.348194481642159e-05, + "loss": 0.2867, + "step": 1634 + }, + { + "epoch": 0.23545506912442396, + "grad_norm": 3.124105453491211, + "learning_rate": 4.347432645659488e-05, + "loss": 0.6584, + "step": 1635 + }, + { + "epoch": 0.23559907834101382, + "grad_norm": 2.00016450881958, + "learning_rate": 4.346670431540032e-05, + "loss": 0.2958, + "step": 1636 + }, + { + "epoch": 0.23574308755760368, + "grad_norm": 1.9136375188827515, + "learning_rate": 4.345907839439802e-05, + "loss": 0.2133, + "step": 1637 + }, + { + "epoch": 0.23588709677419356, + "grad_norm": 3.729694366455078, + "learning_rate": 4.3451448695148895e-05, + "loss": 1.6846, + "step": 1638 + }, + { + "epoch": 0.23603110599078342, + "grad_norm": 2.4781768321990967, + "learning_rate": 4.344381521921458e-05, + "loss": 0.8492, + "step": 1639 + }, + { + "epoch": 0.23617511520737328, + "grad_norm": 3.971006155014038, + "learning_rate": 4.3436177968157534e-05, + "loss": 0.3663, + "step": 1640 + }, + { + "epoch": 0.23631912442396313, + "grad_norm": 0.6137856841087341, + "learning_rate": 4.342853694354095e-05, + "loss": 0.048, + "step": 1641 + }, + { + "epoch": 0.236463133640553, + "grad_norm": 1.5536867380142212, + "learning_rate": 4.342089214692883e-05, + "loss": 0.1704, + "step": 1642 + }, + { + "epoch": 0.23660714285714285, + "grad_norm": 4.79181432723999, + "learning_rate": 4.341324357988592e-05, + "loss": 0.3482, + "step": 1643 + }, + { + "epoch": 0.2367511520737327, + "grad_norm": 10.326250076293945, + "learning_rate": 4.3405591243977736e-05, + "loss": 1.7751, + "step": 1644 + }, + { + "epoch": 0.2368951612903226, + "grad_norm": 1.386892557144165, + "learning_rate": 4.339793514077059e-05, + "loss": 0.1438, + "step": 1645 + }, + { + "epoch": 0.23703917050691245, + "grad_norm": 4.585718154907227, + "learning_rate": 4.339027527183154e-05, + "loss": 0.4974, + "step": 1646 + }, + { + "epoch": 0.2371831797235023, + "grad_norm": 4.6489577293396, + "learning_rate": 4.338261163872844e-05, + "loss": 0.5738, + "step": 1647 + }, + { + "epoch": 0.23732718894009217, + "grad_norm": 2.2828660011291504, + "learning_rate": 4.337494424302989e-05, + "loss": 0.1609, + "step": 1648 + }, + { + "epoch": 0.23747119815668202, + "grad_norm": 0.8000209331512451, + "learning_rate": 4.336727308630527e-05, + "loss": 0.1416, + "step": 1649 + }, + { + "epoch": 0.23761520737327188, + "grad_norm": 1.1230424642562866, + "learning_rate": 4.335959817012473e-05, + "loss": 0.0693, + "step": 1650 + }, + { + "epoch": 0.23775921658986174, + "grad_norm": 4.985502243041992, + "learning_rate": 4.3351919496059194e-05, + "loss": 0.5485, + "step": 1651 + }, + { + "epoch": 0.23790322580645162, + "grad_norm": 1.8275142908096313, + "learning_rate": 4.334423706568035e-05, + "loss": 0.2324, + "step": 1652 + }, + { + "epoch": 0.23804723502304148, + "grad_norm": 3.1726090908050537, + "learning_rate": 4.333655088056065e-05, + "loss": 0.2203, + "step": 1653 + }, + { + "epoch": 0.23819124423963134, + "grad_norm": 2.679636240005493, + "learning_rate": 4.332886094227333e-05, + "loss": 0.3675, + "step": 1654 + }, + { + "epoch": 0.2383352534562212, + "grad_norm": 6.7782487869262695, + "learning_rate": 4.332116725239237e-05, + "loss": 1.0576, + "step": 1655 + }, + { + "epoch": 0.23847926267281105, + "grad_norm": 1.19471275806427, + "learning_rate": 4.331346981249255e-05, + "loss": 0.151, + "step": 1656 + }, + { + "epoch": 0.2386232718894009, + "grad_norm": 3.211522102355957, + "learning_rate": 4.330576862414938e-05, + "loss": 2.4977, + "step": 1657 + }, + { + "epoch": 0.23876728110599077, + "grad_norm": 4.60874080657959, + "learning_rate": 4.329806368893917e-05, + "loss": 1.0654, + "step": 1658 + }, + { + "epoch": 0.23891129032258066, + "grad_norm": 1.9762176275253296, + "learning_rate": 4.329035500843899e-05, + "loss": 0.2692, + "step": 1659 + }, + { + "epoch": 0.2390552995391705, + "grad_norm": 3.284135341644287, + "learning_rate": 4.328264258422665e-05, + "loss": 0.2853, + "step": 1660 + }, + { + "epoch": 0.23919930875576037, + "grad_norm": 4.886958599090576, + "learning_rate": 4.327492641788077e-05, + "loss": 0.3676, + "step": 1661 + }, + { + "epoch": 0.23934331797235023, + "grad_norm": 2.941633462905884, + "learning_rate": 4.32672065109807e-05, + "loss": 0.4823, + "step": 1662 + }, + { + "epoch": 0.23948732718894009, + "grad_norm": 2.2450294494628906, + "learning_rate": 4.325948286510656e-05, + "loss": 0.2413, + "step": 1663 + }, + { + "epoch": 0.23963133640552994, + "grad_norm": 6.936364650726318, + "learning_rate": 4.325175548183926e-05, + "loss": 1.1443, + "step": 1664 + }, + { + "epoch": 0.23977534562211983, + "grad_norm": 4.563480377197266, + "learning_rate": 4.324402436276046e-05, + "loss": 1.8734, + "step": 1665 + }, + { + "epoch": 0.2399193548387097, + "grad_norm": 6.506825923919678, + "learning_rate": 4.323628950945257e-05, + "loss": 0.573, + "step": 1666 + }, + { + "epoch": 0.24006336405529954, + "grad_norm": 3.7553000450134277, + "learning_rate": 4.322855092349878e-05, + "loss": 1.3896, + "step": 1667 + }, + { + "epoch": 0.2402073732718894, + "grad_norm": 3.5245859622955322, + "learning_rate": 4.3220808606483044e-05, + "loss": 0.9139, + "step": 1668 + }, + { + "epoch": 0.24035138248847926, + "grad_norm": 7.4112420082092285, + "learning_rate": 4.321306255999008e-05, + "loss": 1.5385, + "step": 1669 + }, + { + "epoch": 0.24049539170506912, + "grad_norm": 1.3005380630493164, + "learning_rate": 4.320531278560537e-05, + "loss": 0.1869, + "step": 1670 + }, + { + "epoch": 0.24063940092165897, + "grad_norm": 0.7764245867729187, + "learning_rate": 4.319755928491515e-05, + "loss": 0.0583, + "step": 1671 + }, + { + "epoch": 0.24078341013824886, + "grad_norm": 2.8615782260894775, + "learning_rate": 4.318980205950641e-05, + "loss": 0.8308, + "step": 1672 + }, + { + "epoch": 0.24092741935483872, + "grad_norm": 6.053670406341553, + "learning_rate": 4.318204111096695e-05, + "loss": 1.7689, + "step": 1673 + }, + { + "epoch": 0.24107142857142858, + "grad_norm": 2.399231433868408, + "learning_rate": 4.3174276440885276e-05, + "loss": 0.1902, + "step": 1674 + }, + { + "epoch": 0.24121543778801843, + "grad_norm": 3.390796661376953, + "learning_rate": 4.316650805085068e-05, + "loss": 1.4952, + "step": 1675 + }, + { + "epoch": 0.2413594470046083, + "grad_norm": 9.760526657104492, + "learning_rate": 4.315873594245322e-05, + "loss": 1.9167, + "step": 1676 + }, + { + "epoch": 0.24150345622119815, + "grad_norm": 3.0889554023742676, + "learning_rate": 4.3150960117283703e-05, + "loss": 0.42, + "step": 1677 + }, + { + "epoch": 0.241647465437788, + "grad_norm": 1.5060477256774902, + "learning_rate": 4.314318057693372e-05, + "loss": 0.4765, + "step": 1678 + }, + { + "epoch": 0.2417914746543779, + "grad_norm": 1.8230472803115845, + "learning_rate": 4.3135397322995576e-05, + "loss": 0.216, + "step": 1679 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 2.491593599319458, + "learning_rate": 4.3127610357062386e-05, + "loss": 0.1977, + "step": 1680 + }, + { + "epoch": 0.2420794930875576, + "grad_norm": 7.767695426940918, + "learning_rate": 4.3119819680728e-05, + "loss": 1.0477, + "step": 1681 + }, + { + "epoch": 0.24222350230414746, + "grad_norm": 2.483482837677002, + "learning_rate": 4.311202529558703e-05, + "loss": 0.289, + "step": 1682 + }, + { + "epoch": 0.24236751152073732, + "grad_norm": 3.0540754795074463, + "learning_rate": 4.3104227203234856e-05, + "loss": 1.8806, + "step": 1683 + }, + { + "epoch": 0.24251152073732718, + "grad_norm": 3.0346460342407227, + "learning_rate": 4.30964254052676e-05, + "loss": 0.3497, + "step": 1684 + }, + { + "epoch": 0.24265552995391704, + "grad_norm": 4.389759063720703, + "learning_rate": 4.3088619903282154e-05, + "loss": 0.5568, + "step": 1685 + }, + { + "epoch": 0.24279953917050692, + "grad_norm": 5.831572532653809, + "learning_rate": 4.3080810698876175e-05, + "loss": 0.9935, + "step": 1686 + }, + { + "epoch": 0.24294354838709678, + "grad_norm": 5.865657806396484, + "learning_rate": 4.307299779364805e-05, + "loss": 0.8885, + "step": 1687 + }, + { + "epoch": 0.24308755760368664, + "grad_norm": 5.443741798400879, + "learning_rate": 4.3065181189196956e-05, + "loss": 1.1406, + "step": 1688 + }, + { + "epoch": 0.2432315668202765, + "grad_norm": 1.9749964475631714, + "learning_rate": 4.305736088712282e-05, + "loss": 0.1844, + "step": 1689 + }, + { + "epoch": 0.24337557603686635, + "grad_norm": 4.703548431396484, + "learning_rate": 4.304953688902631e-05, + "loss": 2.6429, + "step": 1690 + }, + { + "epoch": 0.2435195852534562, + "grad_norm": 5.623354434967041, + "learning_rate": 4.304170919650885e-05, + "loss": 1.396, + "step": 1691 + }, + { + "epoch": 0.2436635944700461, + "grad_norm": 7.722415924072266, + "learning_rate": 4.3033877811172654e-05, + "loss": 1.3147, + "step": 1692 + }, + { + "epoch": 0.24380760368663595, + "grad_norm": 4.217384338378906, + "learning_rate": 4.3026042734620656e-05, + "loss": 0.8428, + "step": 1693 + }, + { + "epoch": 0.2439516129032258, + "grad_norm": 2.3445522785186768, + "learning_rate": 4.301820396845655e-05, + "loss": 0.2399, + "step": 1694 + }, + { + "epoch": 0.24409562211981567, + "grad_norm": 4.446591377258301, + "learning_rate": 4.30103615142848e-05, + "loss": 2.3763, + "step": 1695 + }, + { + "epoch": 0.24423963133640553, + "grad_norm": 3.1515023708343506, + "learning_rate": 4.300251537371062e-05, + "loss": 0.3204, + "step": 1696 + }, + { + "epoch": 0.24438364055299538, + "grad_norm": 2.6645121574401855, + "learning_rate": 4.299466554833997e-05, + "loss": 0.3961, + "step": 1697 + }, + { + "epoch": 0.24452764976958524, + "grad_norm": 5.519476413726807, + "learning_rate": 4.298681203977959e-05, + "loss": 0.5363, + "step": 1698 + }, + { + "epoch": 0.24467165898617513, + "grad_norm": 2.0937066078186035, + "learning_rate": 4.297895484963692e-05, + "loss": 0.1629, + "step": 1699 + }, + { + "epoch": 0.24481566820276499, + "grad_norm": 3.6989657878875732, + "learning_rate": 4.297109397952022e-05, + "loss": 0.6407, + "step": 1700 + }, + { + "epoch": 0.24495967741935484, + "grad_norm": 10.077070236206055, + "learning_rate": 4.2963229431038446e-05, + "loss": 2.3704, + "step": 1701 + }, + { + "epoch": 0.2451036866359447, + "grad_norm": 2.8898515701293945, + "learning_rate": 4.295536120580135e-05, + "loss": 0.2275, + "step": 1702 + }, + { + "epoch": 0.24524769585253456, + "grad_norm": 6.446637153625488, + "learning_rate": 4.294748930541941e-05, + "loss": 1.3712, + "step": 1703 + }, + { + "epoch": 0.24539170506912442, + "grad_norm": 0.8312081694602966, + "learning_rate": 4.293961373150387e-05, + "loss": 4.6856, + "step": 1704 + }, + { + "epoch": 0.24553571428571427, + "grad_norm": 3.879485607147217, + "learning_rate": 4.293173448566671e-05, + "loss": 0.7079, + "step": 1705 + }, + { + "epoch": 0.24567972350230416, + "grad_norm": 6.7522993087768555, + "learning_rate": 4.2923851569520685e-05, + "loss": 0.7752, + "step": 1706 + }, + { + "epoch": 0.24582373271889402, + "grad_norm": 4.632371425628662, + "learning_rate": 4.291596498467928e-05, + "loss": 0.6941, + "step": 1707 + }, + { + "epoch": 0.24596774193548387, + "grad_norm": 2.9715583324432373, + "learning_rate": 4.290807473275675e-05, + "loss": 0.6269, + "step": 1708 + }, + { + "epoch": 0.24611175115207373, + "grad_norm": 2.279111623764038, + "learning_rate": 4.2900180815368076e-05, + "loss": 0.2515, + "step": 1709 + }, + { + "epoch": 0.2462557603686636, + "grad_norm": 1.7524594068527222, + "learning_rate": 4.289228323412901e-05, + "loss": 0.166, + "step": 1710 + }, + { + "epoch": 0.24639976958525345, + "grad_norm": 5.009512424468994, + "learning_rate": 4.288438199065605e-05, + "loss": 0.747, + "step": 1711 + }, + { + "epoch": 0.2465437788018433, + "grad_norm": 3.365826368331909, + "learning_rate": 4.2876477086566434e-05, + "loss": 0.2984, + "step": 1712 + }, + { + "epoch": 0.2466877880184332, + "grad_norm": 3.4564478397369385, + "learning_rate": 4.286856852347816e-05, + "loss": 2.0772, + "step": 1713 + }, + { + "epoch": 0.24683179723502305, + "grad_norm": 3.6714789867401123, + "learning_rate": 4.286065630300998e-05, + "loss": 0.6038, + "step": 1714 + }, + { + "epoch": 0.2469758064516129, + "grad_norm": 6.021542549133301, + "learning_rate": 4.2852740426781365e-05, + "loss": 1.6648, + "step": 1715 + }, + { + "epoch": 0.24711981566820276, + "grad_norm": 4.211161136627197, + "learning_rate": 4.284482089641257e-05, + "loss": 0.5153, + "step": 1716 + }, + { + "epoch": 0.24726382488479262, + "grad_norm": 1.5346018075942993, + "learning_rate": 4.2836897713524585e-05, + "loss": 0.1426, + "step": 1717 + }, + { + "epoch": 0.24740783410138248, + "grad_norm": 3.361175775527954, + "learning_rate": 4.2828970879739136e-05, + "loss": 0.4983, + "step": 1718 + }, + { + "epoch": 0.24755184331797234, + "grad_norm": 5.983315467834473, + "learning_rate": 4.28210403966787e-05, + "loss": 0.7323, + "step": 1719 + }, + { + "epoch": 0.24769585253456222, + "grad_norm": 5.462739944458008, + "learning_rate": 4.281310626596653e-05, + "loss": 1.6749, + "step": 1720 + }, + { + "epoch": 0.24783986175115208, + "grad_norm": 2.5435595512390137, + "learning_rate": 4.280516848922658e-05, + "loss": 0.2702, + "step": 1721 + }, + { + "epoch": 0.24798387096774194, + "grad_norm": 1.6644847393035889, + "learning_rate": 4.279722706808358e-05, + "loss": 0.1331, + "step": 1722 + }, + { + "epoch": 0.2481278801843318, + "grad_norm": 4.597958087921143, + "learning_rate": 4.2789282004163e-05, + "loss": 0.6459, + "step": 1723 + }, + { + "epoch": 0.24827188940092165, + "grad_norm": 5.642190456390381, + "learning_rate": 4.2781333299091054e-05, + "loss": 1.3547, + "step": 1724 + }, + { + "epoch": 0.2484158986175115, + "grad_norm": 2.108733654022217, + "learning_rate": 4.27733809544947e-05, + "loss": 0.1247, + "step": 1725 + }, + { + "epoch": 0.2485599078341014, + "grad_norm": 7.0447258949279785, + "learning_rate": 4.276542497200164e-05, + "loss": 0.6815, + "step": 1726 + }, + { + "epoch": 0.24870391705069125, + "grad_norm": 1.0886191129684448, + "learning_rate": 4.275746535324033e-05, + "loss": 0.1549, + "step": 1727 + }, + { + "epoch": 0.2488479262672811, + "grad_norm": 2.5591118335723877, + "learning_rate": 4.2749502099839956e-05, + "loss": 0.3466, + "step": 1728 + }, + { + "epoch": 0.24899193548387097, + "grad_norm": 7.096731662750244, + "learning_rate": 4.274153521343046e-05, + "loss": 2.4738, + "step": 1729 + }, + { + "epoch": 0.24913594470046083, + "grad_norm": 4.997349262237549, + "learning_rate": 4.273356469564251e-05, + "loss": 0.4727, + "step": 1730 + }, + { + "epoch": 0.24927995391705068, + "grad_norm": 4.450806617736816, + "learning_rate": 4.2725590548107555e-05, + "loss": 1.94, + "step": 1731 + }, + { + "epoch": 0.24942396313364054, + "grad_norm": 2.77359938621521, + "learning_rate": 4.271761277245774e-05, + "loss": 0.2491, + "step": 1732 + }, + { + "epoch": 0.24956797235023043, + "grad_norm": 2.877174139022827, + "learning_rate": 4.270963137032599e-05, + "loss": 0.2722, + "step": 1733 + }, + { + "epoch": 0.24971198156682028, + "grad_norm": 4.875844478607178, + "learning_rate": 4.2701646343345934e-05, + "loss": 0.9175, + "step": 1734 + }, + { + "epoch": 0.24985599078341014, + "grad_norm": 1.2326208353042603, + "learning_rate": 4.269365769315199e-05, + "loss": 0.165, + "step": 1735 + }, + { + "epoch": 0.25, + "grad_norm": 2.636434555053711, + "learning_rate": 4.268566542137928e-05, + "loss": 0.26, + "step": 1736 + }, + { + "epoch": 0.25014400921658986, + "grad_norm": 6.082031726837158, + "learning_rate": 4.267766952966369e-05, + "loss": 2.0589, + "step": 1737 + }, + { + "epoch": 0.2502880184331797, + "grad_norm": 3.3067023754119873, + "learning_rate": 4.266967001964183e-05, + "loss": 2.0986, + "step": 1738 + }, + { + "epoch": 0.2504320276497696, + "grad_norm": 1.3861817121505737, + "learning_rate": 4.2661666892951056e-05, + "loss": 0.1277, + "step": 1739 + }, + { + "epoch": 0.25057603686635943, + "grad_norm": 7.206782817840576, + "learning_rate": 4.265366015122948e-05, + "loss": 1.4771, + "step": 1740 + }, + { + "epoch": 0.2507200460829493, + "grad_norm": 4.064801216125488, + "learning_rate": 4.2645649796115924e-05, + "loss": 1.9083, + "step": 1741 + }, + { + "epoch": 0.25086405529953915, + "grad_norm": 2.173316717147827, + "learning_rate": 4.263763582924998e-05, + "loss": 0.5254, + "step": 1742 + }, + { + "epoch": 0.25100806451612906, + "grad_norm": 1.447555422782898, + "learning_rate": 4.262961825227195e-05, + "loss": 0.1226, + "step": 1743 + }, + { + "epoch": 0.2511520737327189, + "grad_norm": 5.316477298736572, + "learning_rate": 4.262159706682291e-05, + "loss": 2.6041, + "step": 1744 + }, + { + "epoch": 0.2512960829493088, + "grad_norm": 3.8550682067871094, + "learning_rate": 4.261357227454463e-05, + "loss": 0.3149, + "step": 1745 + }, + { + "epoch": 0.25144009216589863, + "grad_norm": 3.3757991790771484, + "learning_rate": 4.2605543877079654e-05, + "loss": 0.7383, + "step": 1746 + }, + { + "epoch": 0.2515841013824885, + "grad_norm": 2.72967529296875, + "learning_rate": 4.259751187607127e-05, + "loss": 0.6502, + "step": 1747 + }, + { + "epoch": 0.25172811059907835, + "grad_norm": 3.0264291763305664, + "learning_rate": 4.258947627316347e-05, + "loss": 0.3887, + "step": 1748 + }, + { + "epoch": 0.2518721198156682, + "grad_norm": 1.906705617904663, + "learning_rate": 4.2581437070001e-05, + "loss": 0.1755, + "step": 1749 + }, + { + "epoch": 0.25201612903225806, + "grad_norm": 3.415621519088745, + "learning_rate": 4.257339426822934e-05, + "loss": 0.4236, + "step": 1750 + }, + { + "epoch": 0.2521601382488479, + "grad_norm": 1.318006157875061, + "learning_rate": 4.256534786949472e-05, + "loss": 0.1073, + "step": 1751 + }, + { + "epoch": 0.2523041474654378, + "grad_norm": 2.4789226055145264, + "learning_rate": 4.255729787544408e-05, + "loss": 0.1556, + "step": 1752 + }, + { + "epoch": 0.25244815668202764, + "grad_norm": 2.3323006629943848, + "learning_rate": 4.2549244287725135e-05, + "loss": 0.4383, + "step": 1753 + }, + { + "epoch": 0.2525921658986175, + "grad_norm": 5.0875563621521, + "learning_rate": 4.254118710798629e-05, + "loss": 1.1122, + "step": 1754 + }, + { + "epoch": 0.25273617511520735, + "grad_norm": 3.8800928592681885, + "learning_rate": 4.253312633787671e-05, + "loss": 1.2086, + "step": 1755 + }, + { + "epoch": 0.2528801843317972, + "grad_norm": 0.6841304898262024, + "learning_rate": 4.25250619790463e-05, + "loss": 0.0665, + "step": 1756 + }, + { + "epoch": 0.2530241935483871, + "grad_norm": 1.4760183095932007, + "learning_rate": 4.251699403314569e-05, + "loss": 0.1428, + "step": 1757 + }, + { + "epoch": 0.253168202764977, + "grad_norm": 6.564279079437256, + "learning_rate": 4.2508922501826244e-05, + "loss": 0.8638, + "step": 1758 + }, + { + "epoch": 0.25331221198156684, + "grad_norm": 5.8889594078063965, + "learning_rate": 4.250084738674006e-05, + "loss": 1.4248, + "step": 1759 + }, + { + "epoch": 0.2534562211981567, + "grad_norm": 3.845834255218506, + "learning_rate": 4.249276868953998e-05, + "loss": 2.5883, + "step": 1760 + }, + { + "epoch": 0.25360023041474655, + "grad_norm": 3.347489356994629, + "learning_rate": 4.2484686411879554e-05, + "loss": 0.1949, + "step": 1761 + }, + { + "epoch": 0.2537442396313364, + "grad_norm": 3.328758478164673, + "learning_rate": 4.2476600555413096e-05, + "loss": 0.6995, + "step": 1762 + }, + { + "epoch": 0.25388824884792627, + "grad_norm": 2.1843883991241455, + "learning_rate": 4.246851112179563e-05, + "loss": 0.3842, + "step": 1763 + }, + { + "epoch": 0.2540322580645161, + "grad_norm": 5.954964637756348, + "learning_rate": 4.2460418112682934e-05, + "loss": 0.5113, + "step": 1764 + }, + { + "epoch": 0.254176267281106, + "grad_norm": 2.7671701908111572, + "learning_rate": 4.2452321529731475e-05, + "loss": 0.1895, + "step": 1765 + }, + { + "epoch": 0.25432027649769584, + "grad_norm": 6.885192394256592, + "learning_rate": 4.244422137459851e-05, + "loss": 1.1376, + "step": 1766 + }, + { + "epoch": 0.2544642857142857, + "grad_norm": 3.8382301330566406, + "learning_rate": 4.243611764894198e-05, + "loss": 0.817, + "step": 1767 + }, + { + "epoch": 0.25460829493087556, + "grad_norm": 2.300360918045044, + "learning_rate": 4.242801035442058e-05, + "loss": 0.4711, + "step": 1768 + }, + { + "epoch": 0.2547523041474654, + "grad_norm": 8.265625, + "learning_rate": 4.2419899492693737e-05, + "loss": 1.0088, + "step": 1769 + }, + { + "epoch": 0.2548963133640553, + "grad_norm": 4.542981147766113, + "learning_rate": 4.2411785065421584e-05, + "loss": 0.5255, + "step": 1770 + }, + { + "epoch": 0.2550403225806452, + "grad_norm": 6.505258083343506, + "learning_rate": 4.2403667074265015e-05, + "loss": 1.255, + "step": 1771 + }, + { + "epoch": 0.25518433179723504, + "grad_norm": 4.375340461730957, + "learning_rate": 4.239554552088563e-05, + "loss": 0.4134, + "step": 1772 + }, + { + "epoch": 0.2553283410138249, + "grad_norm": 1.556134819984436, + "learning_rate": 4.238742040694578e-05, + "loss": 0.2126, + "step": 1773 + }, + { + "epoch": 0.25547235023041476, + "grad_norm": 7.359821319580078, + "learning_rate": 4.237929173410851e-05, + "loss": 0.8662, + "step": 1774 + }, + { + "epoch": 0.2556163594470046, + "grad_norm": 5.584040641784668, + "learning_rate": 4.237115950403764e-05, + "loss": 0.345, + "step": 1775 + }, + { + "epoch": 0.2557603686635945, + "grad_norm": 2.163572311401367, + "learning_rate": 4.2363023718397676e-05, + "loss": 0.1367, + "step": 1776 + }, + { + "epoch": 0.25590437788018433, + "grad_norm": 4.6089301109313965, + "learning_rate": 4.235488437885388e-05, + "loss": 2.8236, + "step": 1777 + }, + { + "epoch": 0.2560483870967742, + "grad_norm": 3.6452081203460693, + "learning_rate": 4.2346741487072227e-05, + "loss": 1.2125, + "step": 1778 + }, + { + "epoch": 0.25619239631336405, + "grad_norm": 3.3031299114227295, + "learning_rate": 4.233859504471943e-05, + "loss": 0.4367, + "step": 1779 + }, + { + "epoch": 0.2563364055299539, + "grad_norm": 3.1914401054382324, + "learning_rate": 4.233044505346291e-05, + "loss": 0.3445, + "step": 1780 + }, + { + "epoch": 0.25648041474654376, + "grad_norm": 2.5479886531829834, + "learning_rate": 4.2322291514970826e-05, + "loss": 0.3397, + "step": 1781 + }, + { + "epoch": 0.2566244239631336, + "grad_norm": 1.9198329448699951, + "learning_rate": 4.231413443091207e-05, + "loss": 0.3449, + "step": 1782 + }, + { + "epoch": 0.2567684331797235, + "grad_norm": 0.9707812070846558, + "learning_rate": 4.230597380295626e-05, + "loss": 0.1075, + "step": 1783 + }, + { + "epoch": 0.2569124423963134, + "grad_norm": 1.564378261566162, + "learning_rate": 4.229780963277371e-05, + "loss": 0.25, + "step": 1784 + }, + { + "epoch": 0.25705645161290325, + "grad_norm": 6.11513090133667, + "learning_rate": 4.2289641922035493e-05, + "loss": 0.5205, + "step": 1785 + }, + { + "epoch": 0.2572004608294931, + "grad_norm": 2.903700113296509, + "learning_rate": 4.22814706724134e-05, + "loss": 0.3265, + "step": 1786 + }, + { + "epoch": 0.25734447004608296, + "grad_norm": 1.2192085981369019, + "learning_rate": 4.227329588557994e-05, + "loss": 0.1965, + "step": 1787 + }, + { + "epoch": 0.2574884792626728, + "grad_norm": 1.690506935119629, + "learning_rate": 4.2265117563208344e-05, + "loss": 4.103, + "step": 1788 + }, + { + "epoch": 0.2576324884792627, + "grad_norm": 11.871247291564941, + "learning_rate": 4.225693570697257e-05, + "loss": 1.7252, + "step": 1789 + }, + { + "epoch": 0.25777649769585254, + "grad_norm": 9.27865219116211, + "learning_rate": 4.2248750318547303e-05, + "loss": 0.8647, + "step": 1790 + }, + { + "epoch": 0.2579205069124424, + "grad_norm": 5.170888900756836, + "learning_rate": 4.2240561399607935e-05, + "loss": 0.7311, + "step": 1791 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 3.116222381591797, + "learning_rate": 4.223236895183061e-05, + "loss": 0.2112, + "step": 1792 + }, + { + "epoch": 0.2582085253456221, + "grad_norm": 1.8321833610534668, + "learning_rate": 4.222417297689217e-05, + "loss": 0.1201, + "step": 1793 + }, + { + "epoch": 0.25835253456221197, + "grad_norm": 3.9959166049957275, + "learning_rate": 4.221597347647018e-05, + "loss": 0.5169, + "step": 1794 + }, + { + "epoch": 0.2584965437788018, + "grad_norm": 7.569512367248535, + "learning_rate": 4.220777045224294e-05, + "loss": 1.4929, + "step": 1795 + }, + { + "epoch": 0.2586405529953917, + "grad_norm": 2.7059288024902344, + "learning_rate": 4.219956390588946e-05, + "loss": 0.352, + "step": 1796 + }, + { + "epoch": 0.25878456221198154, + "grad_norm": 1.4586970806121826, + "learning_rate": 4.2191353839089474e-05, + "loss": 0.2024, + "step": 1797 + }, + { + "epoch": 0.25892857142857145, + "grad_norm": 7.026905059814453, + "learning_rate": 4.218314025352345e-05, + "loss": 1.13, + "step": 1798 + }, + { + "epoch": 0.2590725806451613, + "grad_norm": 3.8711137771606445, + "learning_rate": 4.2174923150872544e-05, + "loss": 2.3742, + "step": 1799 + }, + { + "epoch": 0.25921658986175117, + "grad_norm": 3.425266981124878, + "learning_rate": 4.2166702532818665e-05, + "loss": 0.3324, + "step": 1800 + }, + { + "epoch": 0.259360599078341, + "grad_norm": 3.681381940841675, + "learning_rate": 4.215847840104442e-05, + "loss": 0.3738, + "step": 1801 + }, + { + "epoch": 0.2595046082949309, + "grad_norm": 1.8047692775726318, + "learning_rate": 4.2150250757233155e-05, + "loss": 0.1472, + "step": 1802 + }, + { + "epoch": 0.25964861751152074, + "grad_norm": 1.4236788749694824, + "learning_rate": 4.2142019603068915e-05, + "loss": 0.1347, + "step": 1803 + }, + { + "epoch": 0.2597926267281106, + "grad_norm": 2.784181594848633, + "learning_rate": 4.2133784940236464e-05, + "loss": 0.4102, + "step": 1804 + }, + { + "epoch": 0.25993663594470046, + "grad_norm": 2.7353250980377197, + "learning_rate": 4.212554677042131e-05, + "loss": 0.5123, + "step": 1805 + }, + { + "epoch": 0.2600806451612903, + "grad_norm": 1.8754914999008179, + "learning_rate": 4.211730509530965e-05, + "loss": 0.1289, + "step": 1806 + }, + { + "epoch": 0.26022465437788017, + "grad_norm": 3.9132699966430664, + "learning_rate": 4.2109059916588414e-05, + "loss": 0.4809, + "step": 1807 + }, + { + "epoch": 0.26036866359447003, + "grad_norm": 2.3678412437438965, + "learning_rate": 4.210081123594523e-05, + "loss": 0.1673, + "step": 1808 + }, + { + "epoch": 0.2605126728110599, + "grad_norm": 1.2913620471954346, + "learning_rate": 4.209255905506847e-05, + "loss": 0.1791, + "step": 1809 + }, + { + "epoch": 0.26065668202764974, + "grad_norm": 1.834547996520996, + "learning_rate": 4.208430337564721e-05, + "loss": 0.194, + "step": 1810 + }, + { + "epoch": 0.26080069124423966, + "grad_norm": 11.58283519744873, + "learning_rate": 4.2076044199371236e-05, + "loss": 2.8354, + "step": 1811 + }, + { + "epoch": 0.2609447004608295, + "grad_norm": 6.729573726654053, + "learning_rate": 4.206778152793106e-05, + "loss": 1.2384, + "step": 1812 + }, + { + "epoch": 0.2610887096774194, + "grad_norm": 6.0404372215271, + "learning_rate": 4.20595153630179e-05, + "loss": 2.1315, + "step": 1813 + }, + { + "epoch": 0.26123271889400923, + "grad_norm": 2.221287250518799, + "learning_rate": 4.2051245706323696e-05, + "loss": 0.23, + "step": 1814 + }, + { + "epoch": 0.2613767281105991, + "grad_norm": 2.2088356018066406, + "learning_rate": 4.20429725595411e-05, + "loss": 0.2162, + "step": 1815 + }, + { + "epoch": 0.26152073732718895, + "grad_norm": 2.0051872730255127, + "learning_rate": 4.203469592436349e-05, + "loss": 0.2302, + "step": 1816 + }, + { + "epoch": 0.2616647465437788, + "grad_norm": 4.15115213394165, + "learning_rate": 4.202641580248492e-05, + "loss": 0.623, + "step": 1817 + }, + { + "epoch": 0.26180875576036866, + "grad_norm": 4.005507469177246, + "learning_rate": 4.2018132195600214e-05, + "loss": 1.8342, + "step": 1818 + }, + { + "epoch": 0.2619527649769585, + "grad_norm": 4.730959415435791, + "learning_rate": 4.2009845105404856e-05, + "loss": 0.5523, + "step": 1819 + }, + { + "epoch": 0.2620967741935484, + "grad_norm": 4.167786598205566, + "learning_rate": 4.200155453359508e-05, + "loss": 0.6435, + "step": 1820 + }, + { + "epoch": 0.26224078341013823, + "grad_norm": 4.406939506530762, + "learning_rate": 4.199326048186782e-05, + "loss": 0.6364, + "step": 1821 + }, + { + "epoch": 0.2623847926267281, + "grad_norm": 0.6667211055755615, + "learning_rate": 4.198496295192073e-05, + "loss": 0.0669, + "step": 1822 + }, + { + "epoch": 0.26252880184331795, + "grad_norm": 3.9150824546813965, + "learning_rate": 4.197666194545213e-05, + "loss": 0.9158, + "step": 1823 + }, + { + "epoch": 0.2626728110599078, + "grad_norm": 3.7909445762634277, + "learning_rate": 4.196835746416113e-05, + "loss": 2.3201, + "step": 1824 + }, + { + "epoch": 0.2628168202764977, + "grad_norm": 1.693711280822754, + "learning_rate": 4.19600495097475e-05, + "loss": 0.137, + "step": 1825 + }, + { + "epoch": 0.2629608294930876, + "grad_norm": 1.3409957885742188, + "learning_rate": 4.1951738083911716e-05, + "loss": 0.2709, + "step": 1826 + }, + { + "epoch": 0.26310483870967744, + "grad_norm": 4.296470642089844, + "learning_rate": 4.1943423188355e-05, + "loss": 1.1307, + "step": 1827 + }, + { + "epoch": 0.2632488479262673, + "grad_norm": 4.6562275886535645, + "learning_rate": 4.1935104824779246e-05, + "loss": 1.1994, + "step": 1828 + }, + { + "epoch": 0.26339285714285715, + "grad_norm": 2.328695297241211, + "learning_rate": 4.192678299488709e-05, + "loss": 0.1886, + "step": 1829 + }, + { + "epoch": 0.263536866359447, + "grad_norm": 1.6344799995422363, + "learning_rate": 4.1918457700381855e-05, + "loss": 0.2771, + "step": 1830 + }, + { + "epoch": 0.26368087557603687, + "grad_norm": 1.9185982942581177, + "learning_rate": 4.1910128942967594e-05, + "loss": 0.2407, + "step": 1831 + }, + { + "epoch": 0.2638248847926267, + "grad_norm": 5.572973251342773, + "learning_rate": 4.190179672434904e-05, + "loss": 0.5667, + "step": 1832 + }, + { + "epoch": 0.2639688940092166, + "grad_norm": 2.757094144821167, + "learning_rate": 4.1893461046231656e-05, + "loss": 0.2256, + "step": 1833 + }, + { + "epoch": 0.26411290322580644, + "grad_norm": 4.6310014724731445, + "learning_rate": 4.188512191032161e-05, + "loss": 2.0226, + "step": 1834 + }, + { + "epoch": 0.2642569124423963, + "grad_norm": 2.3964736461639404, + "learning_rate": 4.187677931832578e-05, + "loss": 0.197, + "step": 1835 + }, + { + "epoch": 0.26440092165898615, + "grad_norm": 6.69654655456543, + "learning_rate": 4.186843327195174e-05, + "loss": 0.7773, + "step": 1836 + }, + { + "epoch": 0.264544930875576, + "grad_norm": 1.4410678148269653, + "learning_rate": 4.1860083772907775e-05, + "loss": 0.1839, + "step": 1837 + }, + { + "epoch": 0.2646889400921659, + "grad_norm": 2.4200599193573, + "learning_rate": 4.185173082290289e-05, + "loss": 0.3404, + "step": 1838 + }, + { + "epoch": 0.2648329493087558, + "grad_norm": 5.566043376922607, + "learning_rate": 4.184337442364678e-05, + "loss": 0.7939, + "step": 1839 + }, + { + "epoch": 0.26497695852534564, + "grad_norm": 7.6449151039123535, + "learning_rate": 4.1835014576849854e-05, + "loss": 1.7608, + "step": 1840 + }, + { + "epoch": 0.2651209677419355, + "grad_norm": 4.3222270011901855, + "learning_rate": 4.182665128422323e-05, + "loss": 1.5206, + "step": 1841 + }, + { + "epoch": 0.26526497695852536, + "grad_norm": 4.178720951080322, + "learning_rate": 4.181828454747872e-05, + "loss": 0.7546, + "step": 1842 + }, + { + "epoch": 0.2654089861751152, + "grad_norm": 1.0186909437179565, + "learning_rate": 4.180991436832883e-05, + "loss": 0.1671, + "step": 1843 + }, + { + "epoch": 0.26555299539170507, + "grad_norm": 7.914861679077148, + "learning_rate": 4.180154074848682e-05, + "loss": 1.4745, + "step": 1844 + }, + { + "epoch": 0.26569700460829493, + "grad_norm": 3.7962234020233154, + "learning_rate": 4.17931636896666e-05, + "loss": 1.6313, + "step": 1845 + }, + { + "epoch": 0.2658410138248848, + "grad_norm": 3.1487245559692383, + "learning_rate": 4.1784783193582814e-05, + "loss": 0.3221, + "step": 1846 + }, + { + "epoch": 0.26598502304147464, + "grad_norm": 8.38532829284668, + "learning_rate": 4.1776399261950806e-05, + "loss": 0.8963, + "step": 1847 + }, + { + "epoch": 0.2661290322580645, + "grad_norm": 5.019575595855713, + "learning_rate": 4.17680118964866e-05, + "loss": 1.8602, + "step": 1848 + }, + { + "epoch": 0.26627304147465436, + "grad_norm": 0.8892652988433838, + "learning_rate": 4.175962109890696e-05, + "loss": 0.094, + "step": 1849 + }, + { + "epoch": 0.2664170506912442, + "grad_norm": 0.9484119415283203, + "learning_rate": 4.175122687092934e-05, + "loss": 0.1072, + "step": 1850 + }, + { + "epoch": 0.2665610599078341, + "grad_norm": 1.7350047826766968, + "learning_rate": 4.174282921427186e-05, + "loss": 0.1783, + "step": 1851 + }, + { + "epoch": 0.266705069124424, + "grad_norm": 6.216547012329102, + "learning_rate": 4.17344281306534e-05, + "loss": 1.7231, + "step": 1852 + }, + { + "epoch": 0.26684907834101385, + "grad_norm": 3.3825066089630127, + "learning_rate": 4.172602362179349e-05, + "loss": 0.6774, + "step": 1853 + }, + { + "epoch": 0.2669930875576037, + "grad_norm": 4.923544406890869, + "learning_rate": 4.1717615689412404e-05, + "loss": 2.4563, + "step": 1854 + }, + { + "epoch": 0.26713709677419356, + "grad_norm": 0.7998258471488953, + "learning_rate": 4.170920433523109e-05, + "loss": 0.1148, + "step": 1855 + }, + { + "epoch": 0.2672811059907834, + "grad_norm": 1.80278480052948, + "learning_rate": 4.170078956097121e-05, + "loss": 0.0988, + "step": 1856 + }, + { + "epoch": 0.2674251152073733, + "grad_norm": 0.8689360618591309, + "learning_rate": 4.16923713683551e-05, + "loss": 0.0919, + "step": 1857 + }, + { + "epoch": 0.26756912442396313, + "grad_norm": 4.3748064041137695, + "learning_rate": 4.1683949759105835e-05, + "loss": 0.7607, + "step": 1858 + }, + { + "epoch": 0.267713133640553, + "grad_norm": 1.3120216131210327, + "learning_rate": 4.167552473494716e-05, + "loss": 0.1445, + "step": 1859 + }, + { + "epoch": 0.26785714285714285, + "grad_norm": 2.3478403091430664, + "learning_rate": 4.166709629760353e-05, + "loss": 0.2052, + "step": 1860 + }, + { + "epoch": 0.2680011520737327, + "grad_norm": 0.9885870814323425, + "learning_rate": 4.16586644488001e-05, + "loss": 0.1148, + "step": 1861 + }, + { + "epoch": 0.26814516129032256, + "grad_norm": 2.984651803970337, + "learning_rate": 4.165022919026272e-05, + "loss": 1.5429, + "step": 1862 + }, + { + "epoch": 0.2682891705069124, + "grad_norm": 0.8723156452178955, + "learning_rate": 4.1641790523717935e-05, + "loss": 0.0872, + "step": 1863 + }, + { + "epoch": 0.2684331797235023, + "grad_norm": 1.8822827339172363, + "learning_rate": 4.163334845089298e-05, + "loss": 0.2521, + "step": 1864 + }, + { + "epoch": 0.2685771889400922, + "grad_norm": 10.952579498291016, + "learning_rate": 4.162490297351583e-05, + "loss": 2.6423, + "step": 1865 + }, + { + "epoch": 0.26872119815668205, + "grad_norm": 3.971419095993042, + "learning_rate": 4.16164540933151e-05, + "loss": 0.319, + "step": 1866 + }, + { + "epoch": 0.2688652073732719, + "grad_norm": 2.3392364978790283, + "learning_rate": 4.160800181202012e-05, + "loss": 0.2007, + "step": 1867 + }, + { + "epoch": 0.26900921658986177, + "grad_norm": 7.440077304840088, + "learning_rate": 4.159954613136093e-05, + "loss": 0.5886, + "step": 1868 + }, + { + "epoch": 0.2691532258064516, + "grad_norm": 6.329635143280029, + "learning_rate": 4.159108705306828e-05, + "loss": 0.672, + "step": 1869 + }, + { + "epoch": 0.2692972350230415, + "grad_norm": 5.702576637268066, + "learning_rate": 4.158262457887356e-05, + "loss": 0.7576, + "step": 1870 + }, + { + "epoch": 0.26944124423963134, + "grad_norm": 2.7081470489501953, + "learning_rate": 4.157415871050891e-05, + "loss": 0.392, + "step": 1871 + }, + { + "epoch": 0.2695852534562212, + "grad_norm": 2.688750982284546, + "learning_rate": 4.156568944970714e-05, + "loss": 0.209, + "step": 1872 + }, + { + "epoch": 0.26972926267281105, + "grad_norm": 1.5038955211639404, + "learning_rate": 4.155721679820176e-05, + "loss": 0.11, + "step": 1873 + }, + { + "epoch": 0.2698732718894009, + "grad_norm": 0.856002926826477, + "learning_rate": 4.1548740757726964e-05, + "loss": 0.1212, + "step": 1874 + }, + { + "epoch": 0.27001728110599077, + "grad_norm": 3.066659450531006, + "learning_rate": 4.154026133001765e-05, + "loss": 0.4645, + "step": 1875 + }, + { + "epoch": 0.2701612903225806, + "grad_norm": 4.6553473472595215, + "learning_rate": 4.153177851680941e-05, + "loss": 0.2959, + "step": 1876 + }, + { + "epoch": 0.2703052995391705, + "grad_norm": 1.8654708862304688, + "learning_rate": 4.1523292319838524e-05, + "loss": 0.1736, + "step": 1877 + }, + { + "epoch": 0.27044930875576034, + "grad_norm": 2.2919540405273438, + "learning_rate": 4.151480274084196e-05, + "loss": 0.2605, + "step": 1878 + }, + { + "epoch": 0.27059331797235026, + "grad_norm": 1.7809946537017822, + "learning_rate": 4.15063097815574e-05, + "loss": 0.1494, + "step": 1879 + }, + { + "epoch": 0.2707373271889401, + "grad_norm": 2.286181688308716, + "learning_rate": 4.1497813443723186e-05, + "loss": 0.3772, + "step": 1880 + }, + { + "epoch": 0.27088133640552997, + "grad_norm": 4.1037116050720215, + "learning_rate": 4.1489313729078376e-05, + "loss": 0.2737, + "step": 1881 + }, + { + "epoch": 0.27102534562211983, + "grad_norm": 4.246210098266602, + "learning_rate": 4.1480810639362713e-05, + "loss": 0.5038, + "step": 1882 + }, + { + "epoch": 0.2711693548387097, + "grad_norm": 2.5108678340911865, + "learning_rate": 4.1472304176316634e-05, + "loss": 0.4555, + "step": 1883 + }, + { + "epoch": 0.27131336405529954, + "grad_norm": 3.4118196964263916, + "learning_rate": 4.1463794341681244e-05, + "loss": 0.3666, + "step": 1884 + }, + { + "epoch": 0.2714573732718894, + "grad_norm": 1.3287795782089233, + "learning_rate": 4.145528113719837e-05, + "loss": 0.1433, + "step": 1885 + }, + { + "epoch": 0.27160138248847926, + "grad_norm": 5.041428089141846, + "learning_rate": 4.1446764564610505e-05, + "loss": 0.5829, + "step": 1886 + }, + { + "epoch": 0.2717453917050691, + "grad_norm": 3.2877755165100098, + "learning_rate": 4.143824462566086e-05, + "loss": 0.3674, + "step": 1887 + }, + { + "epoch": 0.271889400921659, + "grad_norm": 4.631596088409424, + "learning_rate": 4.142972132209329e-05, + "loss": 0.3656, + "step": 1888 + }, + { + "epoch": 0.27203341013824883, + "grad_norm": 4.0649333000183105, + "learning_rate": 4.142119465565238e-05, + "loss": 0.3359, + "step": 1889 + }, + { + "epoch": 0.2721774193548387, + "grad_norm": 4.43428897857666, + "learning_rate": 4.1412664628083386e-05, + "loss": 0.5811, + "step": 1890 + }, + { + "epoch": 0.27232142857142855, + "grad_norm": 5.553092002868652, + "learning_rate": 4.140413124113225e-05, + "loss": 0.5941, + "step": 1891 + }, + { + "epoch": 0.27246543778801846, + "grad_norm": 1.183394432067871, + "learning_rate": 4.139559449654561e-05, + "loss": 0.1028, + "step": 1892 + }, + { + "epoch": 0.2726094470046083, + "grad_norm": 3.6401050090789795, + "learning_rate": 4.138705439607077e-05, + "loss": 2.3992, + "step": 1893 + }, + { + "epoch": 0.2727534562211982, + "grad_norm": 4.274582862854004, + "learning_rate": 4.1378510941455767e-05, + "loss": 0.3377, + "step": 1894 + }, + { + "epoch": 0.27289746543778803, + "grad_norm": 4.659739017486572, + "learning_rate": 4.1369964134449276e-05, + "loss": 0.5231, + "step": 1895 + }, + { + "epoch": 0.2730414746543779, + "grad_norm": 1.0716779232025146, + "learning_rate": 4.136141397680068e-05, + "loss": 0.1147, + "step": 1896 + }, + { + "epoch": 0.27318548387096775, + "grad_norm": 4.758059978485107, + "learning_rate": 4.135286047026005e-05, + "loss": 0.6372, + "step": 1897 + }, + { + "epoch": 0.2733294930875576, + "grad_norm": 7.586665630340576, + "learning_rate": 4.134430361657813e-05, + "loss": 0.9863, + "step": 1898 + }, + { + "epoch": 0.27347350230414746, + "grad_norm": 3.7666985988616943, + "learning_rate": 4.133574341750636e-05, + "loss": 0.4273, + "step": 1899 + }, + { + "epoch": 0.2736175115207373, + "grad_norm": 0.8394392132759094, + "learning_rate": 4.132717987479685e-05, + "loss": 0.0786, + "step": 1900 + }, + { + "epoch": 0.2737615207373272, + "grad_norm": 3.1121013164520264, + "learning_rate": 4.1318612990202434e-05, + "loss": 0.2269, + "step": 1901 + }, + { + "epoch": 0.27390552995391704, + "grad_norm": 2.6151793003082275, + "learning_rate": 4.1310042765476574e-05, + "loss": 0.4501, + "step": 1902 + }, + { + "epoch": 0.2740495391705069, + "grad_norm": 3.9376344680786133, + "learning_rate": 4.1301469202373464e-05, + "loss": 0.7933, + "step": 1903 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 6.150395393371582, + "learning_rate": 4.1292892302647946e-05, + "loss": 2.5617, + "step": 1904 + }, + { + "epoch": 0.2743375576036866, + "grad_norm": 2.904545545578003, + "learning_rate": 4.128431206805557e-05, + "loss": 0.3372, + "step": 1905 + }, + { + "epoch": 0.2744815668202765, + "grad_norm": 1.9146217107772827, + "learning_rate": 4.127572850035253e-05, + "loss": 0.1581, + "step": 1906 + }, + { + "epoch": 0.2746255760368664, + "grad_norm": 1.6870476007461548, + "learning_rate": 4.126714160129577e-05, + "loss": 0.0995, + "step": 1907 + }, + { + "epoch": 0.27476958525345624, + "grad_norm": 5.618216037750244, + "learning_rate": 4.125855137264286e-05, + "loss": 1.6303, + "step": 1908 + }, + { + "epoch": 0.2749135944700461, + "grad_norm": 6.0849928855896, + "learning_rate": 4.1249957816152066e-05, + "loss": 3.0748, + "step": 1909 + }, + { + "epoch": 0.27505760368663595, + "grad_norm": 1.2262433767318726, + "learning_rate": 4.124136093358234e-05, + "loss": 0.1658, + "step": 1910 + }, + { + "epoch": 0.2752016129032258, + "grad_norm": 5.91195821762085, + "learning_rate": 4.123276072669331e-05, + "loss": 0.6933, + "step": 1911 + }, + { + "epoch": 0.27534562211981567, + "grad_norm": 1.8630363941192627, + "learning_rate": 4.122415719724528e-05, + "loss": 0.2912, + "step": 1912 + }, + { + "epoch": 0.2754896313364055, + "grad_norm": 3.0336225032806396, + "learning_rate": 4.121555034699925e-05, + "loss": 0.5182, + "step": 1913 + }, + { + "epoch": 0.2756336405529954, + "grad_norm": 3.5020666122436523, + "learning_rate": 4.1206940177716894e-05, + "loss": 0.3711, + "step": 1914 + }, + { + "epoch": 0.27577764976958524, + "grad_norm": 2.8599259853363037, + "learning_rate": 4.119832669116055e-05, + "loss": 0.3404, + "step": 1915 + }, + { + "epoch": 0.2759216589861751, + "grad_norm": 6.982171058654785, + "learning_rate": 4.118970988909325e-05, + "loss": 0.6769, + "step": 1916 + }, + { + "epoch": 0.27606566820276496, + "grad_norm": 1.9426212310791016, + "learning_rate": 4.11810897732787e-05, + "loss": 0.1634, + "step": 1917 + }, + { + "epoch": 0.2762096774193548, + "grad_norm": 5.009031772613525, + "learning_rate": 4.1172466345481286e-05, + "loss": 0.7864, + "step": 1918 + }, + { + "epoch": 0.2763536866359447, + "grad_norm": 3.6844394207000732, + "learning_rate": 4.1163839607466084e-05, + "loss": 0.2049, + "step": 1919 + }, + { + "epoch": 0.2764976958525346, + "grad_norm": 3.197066307067871, + "learning_rate": 4.115520956099881e-05, + "loss": 0.2155, + "step": 1920 + }, + { + "epoch": 0.27664170506912444, + "grad_norm": 3.983478307723999, + "learning_rate": 4.114657620784589e-05, + "loss": 0.6906, + "step": 1921 + }, + { + "epoch": 0.2767857142857143, + "grad_norm": 2.332625389099121, + "learning_rate": 4.113793954977443e-05, + "loss": 0.2038, + "step": 1922 + }, + { + "epoch": 0.27692972350230416, + "grad_norm": 4.586831092834473, + "learning_rate": 4.1129299588552193e-05, + "loss": 0.3648, + "step": 1923 + }, + { + "epoch": 0.277073732718894, + "grad_norm": 3.2778899669647217, + "learning_rate": 4.112065632594762e-05, + "loss": 0.4021, + "step": 1924 + }, + { + "epoch": 0.2772177419354839, + "grad_norm": 2.043217182159424, + "learning_rate": 4.111200976372985e-05, + "loss": 0.1869, + "step": 1925 + }, + { + "epoch": 0.27736175115207373, + "grad_norm": 4.428118705749512, + "learning_rate": 4.110335990366868e-05, + "loss": 0.4439, + "step": 1926 + }, + { + "epoch": 0.2775057603686636, + "grad_norm": 3.666869878768921, + "learning_rate": 4.109470674753457e-05, + "loss": 2.6114, + "step": 1927 + }, + { + "epoch": 0.27764976958525345, + "grad_norm": 6.04480504989624, + "learning_rate": 4.1086050297098666e-05, + "loss": 0.724, + "step": 1928 + }, + { + "epoch": 0.2777937788018433, + "grad_norm": 1.8455337285995483, + "learning_rate": 4.107739055413281e-05, + "loss": 0.1602, + "step": 1929 + }, + { + "epoch": 0.27793778801843316, + "grad_norm": 1.5022509098052979, + "learning_rate": 4.1068727520409476e-05, + "loss": 0.1627, + "step": 1930 + }, + { + "epoch": 0.278081797235023, + "grad_norm": 1.2657842636108398, + "learning_rate": 4.106006119770185e-05, + "loss": 0.1175, + "step": 1931 + }, + { + "epoch": 0.2782258064516129, + "grad_norm": 3.205331563949585, + "learning_rate": 4.105139158778377e-05, + "loss": 0.3168, + "step": 1932 + }, + { + "epoch": 0.2783698156682028, + "grad_norm": 4.636321544647217, + "learning_rate": 4.104271869242975e-05, + "loss": 0.7245, + "step": 1933 + }, + { + "epoch": 0.27851382488479265, + "grad_norm": 1.0321346521377563, + "learning_rate": 4.1034042513414976e-05, + "loss": 0.16, + "step": 1934 + }, + { + "epoch": 0.2786578341013825, + "grad_norm": 3.8067312240600586, + "learning_rate": 4.102536305251532e-05, + "loss": 0.4447, + "step": 1935 + }, + { + "epoch": 0.27880184331797236, + "grad_norm": 1.6017229557037354, + "learning_rate": 4.10166803115073e-05, + "loss": 0.14, + "step": 1936 + }, + { + "epoch": 0.2789458525345622, + "grad_norm": 5.704813480377197, + "learning_rate": 4.1007994292168126e-05, + "loss": 0.4893, + "step": 1937 + }, + { + "epoch": 0.2790898617511521, + "grad_norm": 6.100917339324951, + "learning_rate": 4.099930499627567e-05, + "loss": 0.4315, + "step": 1938 + }, + { + "epoch": 0.27923387096774194, + "grad_norm": 2.613396644592285, + "learning_rate": 4.099061242560848e-05, + "loss": 0.2202, + "step": 1939 + }, + { + "epoch": 0.2793778801843318, + "grad_norm": 3.5631139278411865, + "learning_rate": 4.098191658194578e-05, + "loss": 0.2532, + "step": 1940 + }, + { + "epoch": 0.27952188940092165, + "grad_norm": 1.2212433815002441, + "learning_rate": 4.0973217467067434e-05, + "loss": 0.158, + "step": 1941 + }, + { + "epoch": 0.2796658986175115, + "grad_norm": 4.221761226654053, + "learning_rate": 4.096451508275401e-05, + "loss": 0.3763, + "step": 1942 + }, + { + "epoch": 0.27980990783410137, + "grad_norm": 2.0426082611083984, + "learning_rate": 4.0955809430786743e-05, + "loss": 0.1593, + "step": 1943 + }, + { + "epoch": 0.2799539170506912, + "grad_norm": 5.858448505401611, + "learning_rate": 4.09471005129475e-05, + "loss": 0.7413, + "step": 1944 + }, + { + "epoch": 0.2800979262672811, + "grad_norm": 6.2442216873168945, + "learning_rate": 4.0938388331018864e-05, + "loss": 1.1461, + "step": 1945 + }, + { + "epoch": 0.28024193548387094, + "grad_norm": 4.77587890625, + "learning_rate": 4.092967288678405e-05, + "loss": 0.362, + "step": 1946 + }, + { + "epoch": 0.28038594470046085, + "grad_norm": 0.8365405201911926, + "learning_rate": 4.0920954182026965e-05, + "loss": 0.095, + "step": 1947 + }, + { + "epoch": 0.2805299539170507, + "grad_norm": 2.456843614578247, + "learning_rate": 4.091223221853217e-05, + "loss": 0.1967, + "step": 1948 + }, + { + "epoch": 0.28067396313364057, + "grad_norm": 0.8882625102996826, + "learning_rate": 4.09035069980849e-05, + "loss": 0.103, + "step": 1949 + }, + { + "epoch": 0.2808179723502304, + "grad_norm": 2.4644734859466553, + "learning_rate": 4.089477852247105e-05, + "loss": 0.2427, + "step": 1950 + }, + { + "epoch": 0.2809619815668203, + "grad_norm": 1.2387189865112305, + "learning_rate": 4.088604679347718e-05, + "loss": 4.3841, + "step": 1951 + }, + { + "epoch": 0.28110599078341014, + "grad_norm": 2.108671188354492, + "learning_rate": 4.087731181289054e-05, + "loss": 0.2076, + "step": 1952 + }, + { + "epoch": 0.28125, + "grad_norm": 3.079277276992798, + "learning_rate": 4.0868573582499004e-05, + "loss": 0.3948, + "step": 1953 + }, + { + "epoch": 0.28139400921658986, + "grad_norm": 3.9141855239868164, + "learning_rate": 4.085983210409114e-05, + "loss": 0.3464, + "step": 1954 + }, + { + "epoch": 0.2815380184331797, + "grad_norm": 7.538147449493408, + "learning_rate": 4.0851087379456175e-05, + "loss": 0.8881, + "step": 1955 + }, + { + "epoch": 0.2816820276497696, + "grad_norm": 1.960270881652832, + "learning_rate": 4.0842339410384e-05, + "loss": 0.217, + "step": 1956 + }, + { + "epoch": 0.28182603686635943, + "grad_norm": 5.284159183502197, + "learning_rate": 4.0833588198665176e-05, + "loss": 0.5567, + "step": 1957 + }, + { + "epoch": 0.2819700460829493, + "grad_norm": 1.8096565008163452, + "learning_rate": 4.0824833746090906e-05, + "loss": 0.143, + "step": 1958 + }, + { + "epoch": 0.28211405529953915, + "grad_norm": 6.636285781860352, + "learning_rate": 4.0816076054453076e-05, + "loss": 1.9299, + "step": 1959 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 1.3157185316085815, + "learning_rate": 4.080731512554424e-05, + "loss": 0.2107, + "step": 1960 + }, + { + "epoch": 0.2824020737327189, + "grad_norm": 5.0150346755981445, + "learning_rate": 4.07985509611576e-05, + "loss": 0.5841, + "step": 1961 + }, + { + "epoch": 0.2825460829493088, + "grad_norm": 2.0354669094085693, + "learning_rate": 4.0789783563087026e-05, + "loss": 0.1439, + "step": 1962 + }, + { + "epoch": 0.28269009216589863, + "grad_norm": 1.9379587173461914, + "learning_rate": 4.078101293312705e-05, + "loss": 0.1458, + "step": 1963 + }, + { + "epoch": 0.2828341013824885, + "grad_norm": 6.625720500946045, + "learning_rate": 4.077223907307286e-05, + "loss": 0.8265, + "step": 1964 + }, + { + "epoch": 0.28297811059907835, + "grad_norm": 2.859713077545166, + "learning_rate": 4.076346198472031e-05, + "loss": 0.7049, + "step": 1965 + }, + { + "epoch": 0.2831221198156682, + "grad_norm": 2.8508360385894775, + "learning_rate": 4.075468166986592e-05, + "loss": 0.2848, + "step": 1966 + }, + { + "epoch": 0.28326612903225806, + "grad_norm": 4.889984607696533, + "learning_rate": 4.074589813030687e-05, + "loss": 0.4908, + "step": 1967 + }, + { + "epoch": 0.2834101382488479, + "grad_norm": 5.133331298828125, + "learning_rate": 4.073711136784099e-05, + "loss": 2.1452, + "step": 1968 + }, + { + "epoch": 0.2835541474654378, + "grad_norm": 4.76059103012085, + "learning_rate": 4.072832138426676e-05, + "loss": 1.7722, + "step": 1969 + }, + { + "epoch": 0.28369815668202764, + "grad_norm": 3.9528214931488037, + "learning_rate": 4.0719528181383356e-05, + "loss": 0.2895, + "step": 1970 + }, + { + "epoch": 0.2838421658986175, + "grad_norm": 2.562316656112671, + "learning_rate": 4.0710731760990576e-05, + "loss": 0.3274, + "step": 1971 + }, + { + "epoch": 0.28398617511520735, + "grad_norm": 8.489856719970703, + "learning_rate": 4.070193212488891e-05, + "loss": 2.3231, + "step": 1972 + }, + { + "epoch": 0.2841301843317972, + "grad_norm": 1.452985405921936, + "learning_rate": 4.069312927487946e-05, + "loss": 0.2249, + "step": 1973 + }, + { + "epoch": 0.2842741935483871, + "grad_norm": 1.861785888671875, + "learning_rate": 4.068432321276404e-05, + "loss": 0.1945, + "step": 1974 + }, + { + "epoch": 0.284418202764977, + "grad_norm": 3.5662271976470947, + "learning_rate": 4.067551394034508e-05, + "loss": 0.5848, + "step": 1975 + }, + { + "epoch": 0.28456221198156684, + "grad_norm": 3.130138397216797, + "learning_rate": 4.066670145942569e-05, + "loss": 2.3405, + "step": 1976 + }, + { + "epoch": 0.2847062211981567, + "grad_norm": 6.059991359710693, + "learning_rate": 4.065788577180962e-05, + "loss": 2.1377, + "step": 1977 + }, + { + "epoch": 0.28485023041474655, + "grad_norm": 1.0617119073867798, + "learning_rate": 4.06490668793013e-05, + "loss": 0.1502, + "step": 1978 + }, + { + "epoch": 0.2849942396313364, + "grad_norm": 3.249626398086548, + "learning_rate": 4.064024478370579e-05, + "loss": 0.2577, + "step": 1979 + }, + { + "epoch": 0.28513824884792627, + "grad_norm": 2.7177553176879883, + "learning_rate": 4.0631419486828816e-05, + "loss": 1.4263, + "step": 1980 + }, + { + "epoch": 0.2852822580645161, + "grad_norm": 1.0415911674499512, + "learning_rate": 4.062259099047677e-05, + "loss": 0.1454, + "step": 1981 + }, + { + "epoch": 0.285426267281106, + "grad_norm": 1.7099741697311401, + "learning_rate": 4.0613759296456675e-05, + "loss": 0.2816, + "step": 1982 + }, + { + "epoch": 0.28557027649769584, + "grad_norm": 1.0137842893600464, + "learning_rate": 4.060492440657624e-05, + "loss": 0.1242, + "step": 1983 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 4.059482574462891, + "learning_rate": 4.059608632264379e-05, + "loss": 0.8922, + "step": 1984 + }, + { + "epoch": 0.28585829493087556, + "grad_norm": 6.086939811706543, + "learning_rate": 4.058724504646834e-05, + "loss": 1.7378, + "step": 1985 + }, + { + "epoch": 0.2860023041474654, + "grad_norm": 1.0864372253417969, + "learning_rate": 4.057840057985954e-05, + "loss": 0.1308, + "step": 1986 + }, + { + "epoch": 0.2861463133640553, + "grad_norm": 2.4234046936035156, + "learning_rate": 4.05695529246277e-05, + "loss": 0.1902, + "step": 1987 + }, + { + "epoch": 0.2862903225806452, + "grad_norm": 2.8020894527435303, + "learning_rate": 4.056070208258376e-05, + "loss": 0.2969, + "step": 1988 + }, + { + "epoch": 0.28643433179723504, + "grad_norm": 2.2715985774993896, + "learning_rate": 4.0551848055539345e-05, + "loss": 0.2551, + "step": 1989 + }, + { + "epoch": 0.2865783410138249, + "grad_norm": 5.2068328857421875, + "learning_rate": 4.054299084530672e-05, + "loss": 0.422, + "step": 1990 + }, + { + "epoch": 0.28672235023041476, + "grad_norm": 4.2811970710754395, + "learning_rate": 4.0534130453698796e-05, + "loss": 0.3644, + "step": 1991 + }, + { + "epoch": 0.2868663594470046, + "grad_norm": 2.2135090827941895, + "learning_rate": 4.052526688252914e-05, + "loss": 0.2956, + "step": 1992 + }, + { + "epoch": 0.2870103686635945, + "grad_norm": 3.9497737884521484, + "learning_rate": 4.0516400133611964e-05, + "loss": 0.5544, + "step": 1993 + }, + { + "epoch": 0.28715437788018433, + "grad_norm": 0.6752537488937378, + "learning_rate": 4.050753020876213e-05, + "loss": 0.0604, + "step": 1994 + }, + { + "epoch": 0.2872983870967742, + "grad_norm": 1.5059171915054321, + "learning_rate": 4.049865710979517e-05, + "loss": 0.1378, + "step": 1995 + }, + { + "epoch": 0.28744239631336405, + "grad_norm": 2.3345770835876465, + "learning_rate": 4.048978083852724e-05, + "loss": 0.29, + "step": 1996 + }, + { + "epoch": 0.2875864055299539, + "grad_norm": 2.149207353591919, + "learning_rate": 4.048090139677516e-05, + "loss": 0.3748, + "step": 1997 + }, + { + "epoch": 0.28773041474654376, + "grad_norm": 3.2103323936462402, + "learning_rate": 4.047201878635639e-05, + "loss": 0.1934, + "step": 1998 + }, + { + "epoch": 0.2878744239631336, + "grad_norm": 6.433139324188232, + "learning_rate": 4.046313300908904e-05, + "loss": 2.0772, + "step": 1999 + }, + { + "epoch": 0.2880184331797235, + "grad_norm": 3.970954418182373, + "learning_rate": 4.0454244066791885e-05, + "loss": 1.8781, + "step": 2000 + }, + { + "epoch": 0.2881624423963134, + "grad_norm": 3.5272796154022217, + "learning_rate": 4.0445351961284326e-05, + "loss": 0.6948, + "step": 2001 + }, + { + "epoch": 0.28830645161290325, + "grad_norm": 3.6040914058685303, + "learning_rate": 4.0436456694386414e-05, + "loss": 0.486, + "step": 2002 + }, + { + "epoch": 0.2884504608294931, + "grad_norm": 2.249372720718384, + "learning_rate": 4.042755826791886e-05, + "loss": 0.1621, + "step": 2003 + }, + { + "epoch": 0.28859447004608296, + "grad_norm": 2.7197794914245605, + "learning_rate": 4.041865668370301e-05, + "loss": 0.314, + "step": 2004 + }, + { + "epoch": 0.2887384792626728, + "grad_norm": 6.245969772338867, + "learning_rate": 4.0409751943560876e-05, + "loss": 0.7833, + "step": 2005 + }, + { + "epoch": 0.2888824884792627, + "grad_norm": 1.3757646083831787, + "learning_rate": 4.040084404931508e-05, + "loss": 0.1677, + "step": 2006 + }, + { + "epoch": 0.28902649769585254, + "grad_norm": 2.140007734298706, + "learning_rate": 4.0391933002788926e-05, + "loss": 0.1776, + "step": 2007 + }, + { + "epoch": 0.2891705069124424, + "grad_norm": 12.367351531982422, + "learning_rate": 4.0383018805806334e-05, + "loss": 1.7978, + "step": 2008 + }, + { + "epoch": 0.28931451612903225, + "grad_norm": 4.407439231872559, + "learning_rate": 4.0374101460191895e-05, + "loss": 0.3686, + "step": 2009 + }, + { + "epoch": 0.2894585253456221, + "grad_norm": 1.6088802814483643, + "learning_rate": 4.036518096777082e-05, + "loss": 0.1229, + "step": 2010 + }, + { + "epoch": 0.28960253456221197, + "grad_norm": 1.9081406593322754, + "learning_rate": 4.0356257330368986e-05, + "loss": 0.1928, + "step": 2011 + }, + { + "epoch": 0.2897465437788018, + "grad_norm": 1.4468098878860474, + "learning_rate": 4.03473305498129e-05, + "loss": 0.1632, + "step": 2012 + }, + { + "epoch": 0.2898905529953917, + "grad_norm": 6.235609531402588, + "learning_rate": 4.0338400627929715e-05, + "loss": 0.6776, + "step": 2013 + }, + { + "epoch": 0.29003456221198154, + "grad_norm": 3.702686071395874, + "learning_rate": 4.032946756654723e-05, + "loss": 3.0652, + "step": 2014 + }, + { + "epoch": 0.29017857142857145, + "grad_norm": 6.394821643829346, + "learning_rate": 4.032053136749388e-05, + "loss": 0.4628, + "step": 2015 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 0.7020028829574585, + "learning_rate": 4.0311592032598754e-05, + "loss": 0.0965, + "step": 2016 + }, + { + "epoch": 0.29046658986175117, + "grad_norm": 1.5867488384246826, + "learning_rate": 4.030264956369157e-05, + "loss": 0.1426, + "step": 2017 + }, + { + "epoch": 0.290610599078341, + "grad_norm": 0.6601081490516663, + "learning_rate": 4.0293703962602704e-05, + "loss": 0.0678, + "step": 2018 + }, + { + "epoch": 0.2907546082949309, + "grad_norm": 2.6614418029785156, + "learning_rate": 4.028475523116314e-05, + "loss": 0.202, + "step": 2019 + }, + { + "epoch": 0.29089861751152074, + "grad_norm": 3.7385926246643066, + "learning_rate": 4.027580337120455e-05, + "loss": 0.2247, + "step": 2020 + }, + { + "epoch": 0.2910426267281106, + "grad_norm": 6.584449768066406, + "learning_rate": 4.026684838455921e-05, + "loss": 1.8659, + "step": 2021 + }, + { + "epoch": 0.29118663594470046, + "grad_norm": 3.501401662826538, + "learning_rate": 4.025789027306004e-05, + "loss": 2.5032, + "step": 2022 + }, + { + "epoch": 0.2913306451612903, + "grad_norm": 5.993541240692139, + "learning_rate": 4.024892903854062e-05, + "loss": 2.8456, + "step": 2023 + }, + { + "epoch": 0.29147465437788017, + "grad_norm": 5.890476226806641, + "learning_rate": 4.023996468283515e-05, + "loss": 0.2983, + "step": 2024 + }, + { + "epoch": 0.29161866359447003, + "grad_norm": 7.715660095214844, + "learning_rate": 4.023099720777848e-05, + "loss": 1.6717, + "step": 2025 + }, + { + "epoch": 0.2917626728110599, + "grad_norm": 1.7790091037750244, + "learning_rate": 4.022202661520609e-05, + "loss": 4.0451, + "step": 2026 + }, + { + "epoch": 0.29190668202764974, + "grad_norm": 1.8816241025924683, + "learning_rate": 4.0213052906954096e-05, + "loss": 0.2431, + "step": 2027 + }, + { + "epoch": 0.29205069124423966, + "grad_norm": 1.3280261754989624, + "learning_rate": 4.020407608485926e-05, + "loss": 0.0892, + "step": 2028 + }, + { + "epoch": 0.2921947004608295, + "grad_norm": 4.094665050506592, + "learning_rate": 4.019509615075898e-05, + "loss": 0.2716, + "step": 2029 + }, + { + "epoch": 0.2923387096774194, + "grad_norm": 4.752663612365723, + "learning_rate": 4.01861131064913e-05, + "loss": 1.6947, + "step": 2030 + }, + { + "epoch": 0.29248271889400923, + "grad_norm": 0.653541088104248, + "learning_rate": 4.017712695389487e-05, + "loss": 4.3142, + "step": 2031 + }, + { + "epoch": 0.2926267281105991, + "grad_norm": 2.7103681564331055, + "learning_rate": 4.016813769480902e-05, + "loss": 0.2862, + "step": 2032 + }, + { + "epoch": 0.29277073732718895, + "grad_norm": 2.6564440727233887, + "learning_rate": 4.015914533107367e-05, + "loss": 0.1952, + "step": 2033 + }, + { + "epoch": 0.2929147465437788, + "grad_norm": 1.2803257703781128, + "learning_rate": 4.015014986452941e-05, + "loss": 0.1163, + "step": 2034 + }, + { + "epoch": 0.29305875576036866, + "grad_norm": 4.380311012268066, + "learning_rate": 4.014115129701746e-05, + "loss": 0.2917, + "step": 2035 + }, + { + "epoch": 0.2932027649769585, + "grad_norm": 2.5272858142852783, + "learning_rate": 4.013214963037965e-05, + "loss": 0.2926, + "step": 2036 + }, + { + "epoch": 0.2933467741935484, + "grad_norm": 2.1242337226867676, + "learning_rate": 4.0123144866458465e-05, + "loss": 0.3857, + "step": 2037 + }, + { + "epoch": 0.29349078341013823, + "grad_norm": 2.531869649887085, + "learning_rate": 4.011413700709703e-05, + "loss": 0.1883, + "step": 2038 + }, + { + "epoch": 0.2936347926267281, + "grad_norm": 0.9375856518745422, + "learning_rate": 4.0105126054139094e-05, + "loss": 0.1172, + "step": 2039 + }, + { + "epoch": 0.29377880184331795, + "grad_norm": 1.1880216598510742, + "learning_rate": 4.009611200942904e-05, + "loss": 0.128, + "step": 2040 + }, + { + "epoch": 0.2939228110599078, + "grad_norm": 2.066082239151001, + "learning_rate": 4.008709487481187e-05, + "loss": 0.2637, + "step": 2041 + }, + { + "epoch": 0.2940668202764977, + "grad_norm": 1.3181419372558594, + "learning_rate": 4.007807465213325e-05, + "loss": 0.1417, + "step": 2042 + }, + { + "epoch": 0.2942108294930876, + "grad_norm": 1.239729642868042, + "learning_rate": 4.006905134323944e-05, + "loss": 0.1156, + "step": 2043 + }, + { + "epoch": 0.29435483870967744, + "grad_norm": 5.8050737380981445, + "learning_rate": 4.006002494997737e-05, + "loss": 1.1582, + "step": 2044 + }, + { + "epoch": 0.2944988479262673, + "grad_norm": 4.023856163024902, + "learning_rate": 4.0050995474194576e-05, + "loss": 2.8927, + "step": 2045 + }, + { + "epoch": 0.29464285714285715, + "grad_norm": 3.1647682189941406, + "learning_rate": 4.0041962917739236e-05, + "loss": 1.8864, + "step": 2046 + }, + { + "epoch": 0.294786866359447, + "grad_norm": 2.9548861980438232, + "learning_rate": 4.0032927282460146e-05, + "loss": 2.8732, + "step": 2047 + }, + { + "epoch": 0.29493087557603687, + "grad_norm": 2.227893114089966, + "learning_rate": 4.0023888570206746e-05, + "loss": 0.2781, + "step": 2048 + }, + { + "epoch": 0.2950748847926267, + "grad_norm": 1.4947367906570435, + "learning_rate": 4.0014846782829104e-05, + "loss": 0.1666, + "step": 2049 + }, + { + "epoch": 0.2952188940092166, + "grad_norm": 6.4715657234191895, + "learning_rate": 4.000580192217791e-05, + "loss": 0.5602, + "step": 2050 + }, + { + "epoch": 0.29536290322580644, + "grad_norm": 4.639443397521973, + "learning_rate": 3.9996753990104484e-05, + "loss": 2.8212, + "step": 2051 + }, + { + "epoch": 0.2955069124423963, + "grad_norm": 4.49601411819458, + "learning_rate": 3.998770298846079e-05, + "loss": 1.0588, + "step": 2052 + }, + { + "epoch": 0.29565092165898615, + "grad_norm": 1.1419059038162231, + "learning_rate": 3.9978648919099386e-05, + "loss": 0.099, + "step": 2053 + }, + { + "epoch": 0.295794930875576, + "grad_norm": 12.567187309265137, + "learning_rate": 3.9969591783873495e-05, + "loss": 2.7997, + "step": 2054 + }, + { + "epoch": 0.2959389400921659, + "grad_norm": 4.682185649871826, + "learning_rate": 3.996053158463695e-05, + "loss": 2.0544, + "step": 2055 + }, + { + "epoch": 0.2960829493087558, + "grad_norm": 1.8835757970809937, + "learning_rate": 3.995146832324422e-05, + "loss": 0.1752, + "step": 2056 + }, + { + "epoch": 0.29622695852534564, + "grad_norm": 1.061055064201355, + "learning_rate": 3.994240200155038e-05, + "loss": 0.0953, + "step": 2057 + }, + { + "epoch": 0.2963709677419355, + "grad_norm": 3.656014919281006, + "learning_rate": 3.993333262141116e-05, + "loss": 0.4948, + "step": 2058 + }, + { + "epoch": 0.29651497695852536, + "grad_norm": 1.4476035833358765, + "learning_rate": 3.9924260184682894e-05, + "loss": 0.217, + "step": 2059 + }, + { + "epoch": 0.2966589861751152, + "grad_norm": 2.674288272857666, + "learning_rate": 3.991518469322255e-05, + "loss": 2.2219, + "step": 2060 + }, + { + "epoch": 0.29680299539170507, + "grad_norm": 1.4405978918075562, + "learning_rate": 3.990610614888772e-05, + "loss": 0.1609, + "step": 2061 + }, + { + "epoch": 0.29694700460829493, + "grad_norm": 2.9820072650909424, + "learning_rate": 3.989702455353662e-05, + "loss": 0.5765, + "step": 2062 + }, + { + "epoch": 0.2970910138248848, + "grad_norm": 2.242094039916992, + "learning_rate": 3.9887939909028096e-05, + "loss": 0.1658, + "step": 2063 + }, + { + "epoch": 0.29723502304147464, + "grad_norm": 1.3575830459594727, + "learning_rate": 3.987885221722162e-05, + "loss": 0.148, + "step": 2064 + }, + { + "epoch": 0.2973790322580645, + "grad_norm": 1.6266669034957886, + "learning_rate": 3.9869761479977266e-05, + "loss": 0.2275, + "step": 2065 + }, + { + "epoch": 0.29752304147465436, + "grad_norm": 5.657979488372803, + "learning_rate": 3.986066769915575e-05, + "loss": 2.5077, + "step": 2066 + }, + { + "epoch": 0.2976670506912442, + "grad_norm": 2.573279619216919, + "learning_rate": 3.985157087661843e-05, + "loss": 0.2677, + "step": 2067 + }, + { + "epoch": 0.2978110599078341, + "grad_norm": 5.302773475646973, + "learning_rate": 3.984247101422724e-05, + "loss": 0.5879, + "step": 2068 + }, + { + "epoch": 0.297955069124424, + "grad_norm": 6.743826866149902, + "learning_rate": 3.983336811384476e-05, + "loss": 1.6989, + "step": 2069 + }, + { + "epoch": 0.29809907834101385, + "grad_norm": 2.174928903579712, + "learning_rate": 3.982426217733421e-05, + "loss": 0.1495, + "step": 2070 + }, + { + "epoch": 0.2982430875576037, + "grad_norm": 4.983500003814697, + "learning_rate": 3.981515320655941e-05, + "loss": 0.6823, + "step": 2071 + }, + { + "epoch": 0.29838709677419356, + "grad_norm": 1.7479591369628906, + "learning_rate": 3.980604120338479e-05, + "loss": 0.1516, + "step": 2072 + }, + { + "epoch": 0.2985311059907834, + "grad_norm": 5.852272987365723, + "learning_rate": 3.979692616967543e-05, + "loss": 3.3149, + "step": 2073 + }, + { + "epoch": 0.2986751152073733, + "grad_norm": 7.268279552459717, + "learning_rate": 3.978780810729702e-05, + "loss": 0.8786, + "step": 2074 + }, + { + "epoch": 0.29881912442396313, + "grad_norm": 0.8006262183189392, + "learning_rate": 3.9778687018115856e-05, + "loss": 0.1204, + "step": 2075 + }, + { + "epoch": 0.298963133640553, + "grad_norm": 1.7946444749832153, + "learning_rate": 3.976956290399886e-05, + "loss": 0.197, + "step": 2076 + }, + { + "epoch": 0.29910714285714285, + "grad_norm": 3.8825526237487793, + "learning_rate": 3.9760435766813596e-05, + "loss": 0.3823, + "step": 2077 + }, + { + "epoch": 0.2992511520737327, + "grad_norm": 3.657999277114868, + "learning_rate": 3.9751305608428205e-05, + "loss": 0.3439, + "step": 2078 + }, + { + "epoch": 0.29939516129032256, + "grad_norm": 6.453387260437012, + "learning_rate": 3.974217243071149e-05, + "loss": 0.6203, + "step": 2079 + }, + { + "epoch": 0.2995391705069124, + "grad_norm": 0.7543643712997437, + "learning_rate": 3.973303623553283e-05, + "loss": 0.0865, + "step": 2080 + }, + { + "epoch": 0.2996831797235023, + "grad_norm": 2.008950710296631, + "learning_rate": 3.9723897024762255e-05, + "loss": 0.2022, + "step": 2081 + }, + { + "epoch": 0.2998271889400922, + "grad_norm": 1.3772034645080566, + "learning_rate": 3.9714754800270395e-05, + "loss": 0.1687, + "step": 2082 + }, + { + "epoch": 0.29997119815668205, + "grad_norm": 2.80077862739563, + "learning_rate": 3.97056095639285e-05, + "loss": 0.3971, + "step": 2083 + }, + { + "epoch": 0.3001152073732719, + "grad_norm": 5.1128363609313965, + "learning_rate": 3.969646131760845e-05, + "loss": 0.3762, + "step": 2084 + }, + { + "epoch": 0.30025921658986177, + "grad_norm": 2.8570680618286133, + "learning_rate": 3.968731006318272e-05, + "loss": 0.367, + "step": 2085 + }, + { + "epoch": 0.3004032258064516, + "grad_norm": 3.4348673820495605, + "learning_rate": 3.967815580252441e-05, + "loss": 2.0204, + "step": 2086 + }, + { + "epoch": 0.3005472350230415, + "grad_norm": 3.695319652557373, + "learning_rate": 3.966899853750724e-05, + "loss": 0.6022, + "step": 2087 + }, + { + "epoch": 0.30069124423963134, + "grad_norm": 6.134025573730469, + "learning_rate": 3.9659838270005535e-05, + "loss": 0.5184, + "step": 2088 + }, + { + "epoch": 0.3008352534562212, + "grad_norm": 2.233690023422241, + "learning_rate": 3.965067500189424e-05, + "loss": 0.2417, + "step": 2089 + }, + { + "epoch": 0.30097926267281105, + "grad_norm": 1.1850144863128662, + "learning_rate": 3.9641508735048915e-05, + "loss": 0.1406, + "step": 2090 + }, + { + "epoch": 0.3011232718894009, + "grad_norm": 2.5641422271728516, + "learning_rate": 3.963233947134573e-05, + "loss": 0.2471, + "step": 2091 + }, + { + "epoch": 0.30126728110599077, + "grad_norm": 2.870542526245117, + "learning_rate": 3.962316721266148e-05, + "loss": 0.2752, + "step": 2092 + }, + { + "epoch": 0.3014112903225806, + "grad_norm": 0.6397125720977783, + "learning_rate": 3.961399196087355e-05, + "loss": 0.0599, + "step": 2093 + }, + { + "epoch": 0.3015552995391705, + "grad_norm": 8.188117980957031, + "learning_rate": 3.960481371785997e-05, + "loss": 2.5369, + "step": 2094 + }, + { + "epoch": 0.30169930875576034, + "grad_norm": 1.2071136236190796, + "learning_rate": 3.959563248549935e-05, + "loss": 0.1319, + "step": 2095 + }, + { + "epoch": 0.30184331797235026, + "grad_norm": 2.0631582736968994, + "learning_rate": 3.958644826567093e-05, + "loss": 0.2875, + "step": 2096 + }, + { + "epoch": 0.3019873271889401, + "grad_norm": 6.129966735839844, + "learning_rate": 3.957726106025455e-05, + "loss": 0.9344, + "step": 2097 + }, + { + "epoch": 0.30213133640552997, + "grad_norm": 4.816929340362549, + "learning_rate": 3.956807087113068e-05, + "loss": 0.341, + "step": 2098 + }, + { + "epoch": 0.30227534562211983, + "grad_norm": 3.5142290592193604, + "learning_rate": 3.955887770018039e-05, + "loss": 3.0031, + "step": 2099 + }, + { + "epoch": 0.3024193548387097, + "grad_norm": 2.0251941680908203, + "learning_rate": 3.954968154928534e-05, + "loss": 0.2824, + "step": 2100 + }, + { + "epoch": 0.30256336405529954, + "grad_norm": 3.1265854835510254, + "learning_rate": 3.9540482420327845e-05, + "loss": 1.0645, + "step": 2101 + }, + { + "epoch": 0.3027073732718894, + "grad_norm": 2.448774576187134, + "learning_rate": 3.953128031519079e-05, + "loss": 0.2839, + "step": 2102 + }, + { + "epoch": 0.30285138248847926, + "grad_norm": 0.6740416884422302, + "learning_rate": 3.9522075235757686e-05, + "loss": 0.0851, + "step": 2103 + }, + { + "epoch": 0.3029953917050691, + "grad_norm": 11.441324234008789, + "learning_rate": 3.951286718391265e-05, + "loss": 1.109, + "step": 2104 + }, + { + "epoch": 0.303139400921659, + "grad_norm": 1.8419533967971802, + "learning_rate": 3.950365616154042e-05, + "loss": 0.2905, + "step": 2105 + }, + { + "epoch": 0.30328341013824883, + "grad_norm": 4.1370368003845215, + "learning_rate": 3.949444217052629e-05, + "loss": 2.6524, + "step": 2106 + }, + { + "epoch": 0.3034274193548387, + "grad_norm": 0.5555270910263062, + "learning_rate": 3.9485225212756246e-05, + "loss": 4.5559, + "step": 2107 + }, + { + "epoch": 0.30357142857142855, + "grad_norm": 2.566009044647217, + "learning_rate": 3.9476005290116814e-05, + "loss": 0.2747, + "step": 2108 + }, + { + "epoch": 0.30371543778801846, + "grad_norm": 4.517889499664307, + "learning_rate": 3.946678240449515e-05, + "loss": 0.4303, + "step": 2109 + }, + { + "epoch": 0.3038594470046083, + "grad_norm": 3.4803295135498047, + "learning_rate": 3.9457556557779015e-05, + "loss": 0.3578, + "step": 2110 + }, + { + "epoch": 0.3040034562211982, + "grad_norm": 3.625251531600952, + "learning_rate": 3.944832775185678e-05, + "loss": 1.0418, + "step": 2111 + }, + { + "epoch": 0.30414746543778803, + "grad_norm": 4.674493312835693, + "learning_rate": 3.9439095988617424e-05, + "loss": 0.6701, + "step": 2112 + }, + { + "epoch": 0.3042914746543779, + "grad_norm": 1.3278777599334717, + "learning_rate": 3.942986126995052e-05, + "loss": 0.1264, + "step": 2113 + }, + { + "epoch": 0.30443548387096775, + "grad_norm": 1.2654496431350708, + "learning_rate": 3.942062359774625e-05, + "loss": 0.1198, + "step": 2114 + }, + { + "epoch": 0.3045794930875576, + "grad_norm": 4.950197696685791, + "learning_rate": 3.94113829738954e-05, + "loss": 2.4079, + "step": 2115 + }, + { + "epoch": 0.30472350230414746, + "grad_norm": 3.7724227905273438, + "learning_rate": 3.940213940028937e-05, + "loss": 2.1213, + "step": 2116 + }, + { + "epoch": 0.3048675115207373, + "grad_norm": 4.282036304473877, + "learning_rate": 3.939289287882015e-05, + "loss": 0.3587, + "step": 2117 + }, + { + "epoch": 0.3050115207373272, + "grad_norm": 0.9235988259315491, + "learning_rate": 3.938364341138034e-05, + "loss": 0.0878, + "step": 2118 + }, + { + "epoch": 0.30515552995391704, + "grad_norm": 2.0376830101013184, + "learning_rate": 3.937439099986314e-05, + "loss": 0.1939, + "step": 2119 + }, + { + "epoch": 0.3052995391705069, + "grad_norm": 2.137742042541504, + "learning_rate": 3.9365135646162366e-05, + "loss": 0.1994, + "step": 2120 + }, + { + "epoch": 0.30544354838709675, + "grad_norm": 3.088731527328491, + "learning_rate": 3.935587735217242e-05, + "loss": 0.7616, + "step": 2121 + }, + { + "epoch": 0.3055875576036866, + "grad_norm": 3.1897430419921875, + "learning_rate": 3.93466161197883e-05, + "loss": 0.3874, + "step": 2122 + }, + { + "epoch": 0.3057315668202765, + "grad_norm": 3.9973809719085693, + "learning_rate": 3.933735195090562e-05, + "loss": 0.3863, + "step": 2123 + }, + { + "epoch": 0.3058755760368664, + "grad_norm": 5.075820446014404, + "learning_rate": 3.932808484742061e-05, + "loss": 2.9451, + "step": 2124 + }, + { + "epoch": 0.30601958525345624, + "grad_norm": 4.184928894042969, + "learning_rate": 3.931881481123006e-05, + "loss": 0.2678, + "step": 2125 + }, + { + "epoch": 0.3061635944700461, + "grad_norm": 5.812377452850342, + "learning_rate": 3.9309541844231395e-05, + "loss": 0.8624, + "step": 2126 + }, + { + "epoch": 0.30630760368663595, + "grad_norm": 4.427703380584717, + "learning_rate": 3.930026594832262e-05, + "loss": 0.6083, + "step": 2127 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 1.1754651069641113, + "learning_rate": 3.929098712540236e-05, + "loss": 0.1177, + "step": 2128 + }, + { + "epoch": 0.30659562211981567, + "grad_norm": 4.063302516937256, + "learning_rate": 3.928170537736981e-05, + "loss": 0.2437, + "step": 2129 + }, + { + "epoch": 0.3067396313364055, + "grad_norm": 4.390351295471191, + "learning_rate": 3.927242070612478e-05, + "loss": 1.9056, + "step": 2130 + }, + { + "epoch": 0.3068836405529954, + "grad_norm": 6.4713640213012695, + "learning_rate": 3.9263133113567695e-05, + "loss": 0.7685, + "step": 2131 + }, + { + "epoch": 0.30702764976958524, + "grad_norm": 2.4808502197265625, + "learning_rate": 3.925384260159954e-05, + "loss": 0.2473, + "step": 2132 + }, + { + "epoch": 0.3071716589861751, + "grad_norm": 2.2772724628448486, + "learning_rate": 3.9244549172121934e-05, + "loss": 0.2338, + "step": 2133 + }, + { + "epoch": 0.30731566820276496, + "grad_norm": 0.5927713513374329, + "learning_rate": 3.923525282703707e-05, + "loss": 0.0407, + "step": 2134 + }, + { + "epoch": 0.3074596774193548, + "grad_norm": 4.241677761077881, + "learning_rate": 3.922595356824775e-05, + "loss": 0.5445, + "step": 2135 + }, + { + "epoch": 0.3076036866359447, + "grad_norm": 1.0521068572998047, + "learning_rate": 3.9216651397657364e-05, + "loss": 0.1277, + "step": 2136 + }, + { + "epoch": 0.3077476958525346, + "grad_norm": 6.1771769523620605, + "learning_rate": 3.920734631716991e-05, + "loss": 2.1345, + "step": 2137 + }, + { + "epoch": 0.30789170506912444, + "grad_norm": 1.0851842164993286, + "learning_rate": 3.919803832868996e-05, + "loss": 0.1085, + "step": 2138 + }, + { + "epoch": 0.3080357142857143, + "grad_norm": 4.714212417602539, + "learning_rate": 3.9188727434122695e-05, + "loss": 0.538, + "step": 2139 + }, + { + "epoch": 0.30817972350230416, + "grad_norm": 4.032148361206055, + "learning_rate": 3.9179413635373897e-05, + "loss": 0.4547, + "step": 2140 + }, + { + "epoch": 0.308323732718894, + "grad_norm": 2.39846134185791, + "learning_rate": 3.9170096934349944e-05, + "loss": 0.1784, + "step": 2141 + }, + { + "epoch": 0.3084677419354839, + "grad_norm": 5.754051208496094, + "learning_rate": 3.916077733295778e-05, + "loss": 2.5418, + "step": 2142 + }, + { + "epoch": 0.30861175115207373, + "grad_norm": 2.8066203594207764, + "learning_rate": 3.915145483310498e-05, + "loss": 3.0208, + "step": 2143 + }, + { + "epoch": 0.3087557603686636, + "grad_norm": 3.480288505554199, + "learning_rate": 3.914212943669969e-05, + "loss": 0.501, + "step": 2144 + }, + { + "epoch": 0.30889976958525345, + "grad_norm": 2.1268150806427, + "learning_rate": 3.913280114565066e-05, + "loss": 0.1266, + "step": 2145 + }, + { + "epoch": 0.3090437788018433, + "grad_norm": 1.6570589542388916, + "learning_rate": 3.91234699618672e-05, + "loss": 0.1664, + "step": 2146 + }, + { + "epoch": 0.30918778801843316, + "grad_norm": 0.9880962371826172, + "learning_rate": 3.911413588725926e-05, + "loss": 0.1073, + "step": 2147 + }, + { + "epoch": 0.309331797235023, + "grad_norm": 4.7364935874938965, + "learning_rate": 3.910479892373737e-05, + "loss": 2.4711, + "step": 2148 + }, + { + "epoch": 0.3094758064516129, + "grad_norm": 4.790311336517334, + "learning_rate": 3.9095459073212615e-05, + "loss": 1.0822, + "step": 2149 + }, + { + "epoch": 0.3096198156682028, + "grad_norm": 3.4176459312438965, + "learning_rate": 3.908611633759672e-05, + "loss": 0.1559, + "step": 2150 + }, + { + "epoch": 0.30976382488479265, + "grad_norm": 2.829763412475586, + "learning_rate": 3.907677071880196e-05, + "loss": 0.1727, + "step": 2151 + }, + { + "epoch": 0.3099078341013825, + "grad_norm": 2.1061532497406006, + "learning_rate": 3.906742221874122e-05, + "loss": 0.2156, + "step": 2152 + }, + { + "epoch": 0.31005184331797236, + "grad_norm": 4.212060451507568, + "learning_rate": 3.905807083932799e-05, + "loss": 0.4488, + "step": 2153 + }, + { + "epoch": 0.3101958525345622, + "grad_norm": 3.397585391998291, + "learning_rate": 3.9048716582476316e-05, + "loss": 1.1264, + "step": 2154 + }, + { + "epoch": 0.3103398617511521, + "grad_norm": 3.926377058029175, + "learning_rate": 3.903935945010085e-05, + "loss": 0.454, + "step": 2155 + }, + { + "epoch": 0.31048387096774194, + "grad_norm": 3.0604660511016846, + "learning_rate": 3.902999944411685e-05, + "loss": 0.2307, + "step": 2156 + }, + { + "epoch": 0.3106278801843318, + "grad_norm": 5.656445026397705, + "learning_rate": 3.902063656644012e-05, + "loss": 1.2403, + "step": 2157 + }, + { + "epoch": 0.31077188940092165, + "grad_norm": 1.9675745964050293, + "learning_rate": 3.901127081898708e-05, + "loss": 0.1684, + "step": 2158 + }, + { + "epoch": 0.3109158986175115, + "grad_norm": 6.309079170227051, + "learning_rate": 3.900190220367473e-05, + "loss": 2.163, + "step": 2159 + }, + { + "epoch": 0.31105990783410137, + "grad_norm": 2.098220109939575, + "learning_rate": 3.899253072242067e-05, + "loss": 0.2373, + "step": 2160 + }, + { + "epoch": 0.3112039170506912, + "grad_norm": 1.7800476551055908, + "learning_rate": 3.898315637714308e-05, + "loss": 0.175, + "step": 2161 + }, + { + "epoch": 0.3113479262672811, + "grad_norm": 4.149674415588379, + "learning_rate": 3.8973779169760716e-05, + "loss": 1.6145, + "step": 2162 + }, + { + "epoch": 0.31149193548387094, + "grad_norm": 0.6238038539886475, + "learning_rate": 3.896439910219292e-05, + "loss": 0.0498, + "step": 2163 + }, + { + "epoch": 0.31163594470046085, + "grad_norm": 1.3702797889709473, + "learning_rate": 3.895501617635964e-05, + "loss": 0.1493, + "step": 2164 + }, + { + "epoch": 0.3117799539170507, + "grad_norm": 4.819758892059326, + "learning_rate": 3.894563039418137e-05, + "loss": 0.3828, + "step": 2165 + }, + { + "epoch": 0.31192396313364057, + "grad_norm": 3.4709882736206055, + "learning_rate": 3.893624175757924e-05, + "loss": 0.4176, + "step": 2166 + }, + { + "epoch": 0.3120679723502304, + "grad_norm": 1.7616279125213623, + "learning_rate": 3.892685026847494e-05, + "loss": 0.1693, + "step": 2167 + }, + { + "epoch": 0.3122119815668203, + "grad_norm": 1.0598535537719727, + "learning_rate": 3.8917455928790714e-05, + "loss": 0.1097, + "step": 2168 + }, + { + "epoch": 0.31235599078341014, + "grad_norm": 1.1785104274749756, + "learning_rate": 3.8908058740449436e-05, + "loss": 0.1352, + "step": 2169 + }, + { + "epoch": 0.3125, + "grad_norm": 1.5080389976501465, + "learning_rate": 3.8898658705374546e-05, + "loss": 0.2492, + "step": 2170 + }, + { + "epoch": 0.31264400921658986, + "grad_norm": 5.522306442260742, + "learning_rate": 3.888925582549006e-05, + "loss": 0.5624, + "step": 2171 + }, + { + "epoch": 0.3127880184331797, + "grad_norm": 3.244730234146118, + "learning_rate": 3.887985010272058e-05, + "loss": 0.3877, + "step": 2172 + }, + { + "epoch": 0.3129320276497696, + "grad_norm": 2.357093572616577, + "learning_rate": 3.8870441538991295e-05, + "loss": 0.1958, + "step": 2173 + }, + { + "epoch": 0.31307603686635943, + "grad_norm": 4.711888790130615, + "learning_rate": 3.886103013622796e-05, + "loss": 0.5424, + "step": 2174 + }, + { + "epoch": 0.3132200460829493, + "grad_norm": 5.715659141540527, + "learning_rate": 3.885161589635694e-05, + "loss": 0.9745, + "step": 2175 + }, + { + "epoch": 0.31336405529953915, + "grad_norm": 2.6397621631622314, + "learning_rate": 3.8842198821305155e-05, + "loss": 0.3161, + "step": 2176 + }, + { + "epoch": 0.31350806451612906, + "grad_norm": 6.365112781524658, + "learning_rate": 3.883277891300011e-05, + "loss": 1.4754, + "step": 2177 + }, + { + "epoch": 0.3136520737327189, + "grad_norm": 2.1221506595611572, + "learning_rate": 3.8823356173369895e-05, + "loss": 0.1706, + "step": 2178 + }, + { + "epoch": 0.3137960829493088, + "grad_norm": 0.867976188659668, + "learning_rate": 3.881393060434319e-05, + "loss": 4.3771, + "step": 2179 + }, + { + "epoch": 0.31394009216589863, + "grad_norm": 10.96361255645752, + "learning_rate": 3.880450220784923e-05, + "loss": 0.9611, + "step": 2180 + }, + { + "epoch": 0.3140841013824885, + "grad_norm": 1.5654730796813965, + "learning_rate": 3.879507098581784e-05, + "loss": 0.1746, + "step": 2181 + }, + { + "epoch": 0.31422811059907835, + "grad_norm": 2.5433669090270996, + "learning_rate": 3.8785636940179434e-05, + "loss": 0.293, + "step": 2182 + }, + { + "epoch": 0.3143721198156682, + "grad_norm": 10.525322914123535, + "learning_rate": 3.877620007286499e-05, + "loss": 1.596, + "step": 2183 + }, + { + "epoch": 0.31451612903225806, + "grad_norm": 2.2990055084228516, + "learning_rate": 3.876676038580606e-05, + "loss": 0.2131, + "step": 2184 + }, + { + "epoch": 0.3146601382488479, + "grad_norm": 4.453908443450928, + "learning_rate": 3.8757317880934786e-05, + "loss": 1.3574, + "step": 2185 + }, + { + "epoch": 0.3148041474654378, + "grad_norm": 1.9981704950332642, + "learning_rate": 3.874787256018388e-05, + "loss": 0.2417, + "step": 2186 + }, + { + "epoch": 0.31494815668202764, + "grad_norm": 5.691637992858887, + "learning_rate": 3.873842442548665e-05, + "loss": 1.9197, + "step": 2187 + }, + { + "epoch": 0.3150921658986175, + "grad_norm": 1.0464750528335571, + "learning_rate": 3.8728973478776945e-05, + "loss": 0.1103, + "step": 2188 + }, + { + "epoch": 0.31523617511520735, + "grad_norm": 4.331569671630859, + "learning_rate": 3.871951972198919e-05, + "loss": 0.2031, + "step": 2189 + }, + { + "epoch": 0.3153801843317972, + "grad_norm": 6.300526142120361, + "learning_rate": 3.871006315705844e-05, + "loss": 2.5363, + "step": 2190 + }, + { + "epoch": 0.3155241935483871, + "grad_norm": 3.158890724182129, + "learning_rate": 3.870060378592026e-05, + "loss": 1.9891, + "step": 2191 + }, + { + "epoch": 0.315668202764977, + "grad_norm": 1.3070265054702759, + "learning_rate": 3.869114161051082e-05, + "loss": 0.1439, + "step": 2192 + }, + { + "epoch": 0.31581221198156684, + "grad_norm": 1.1560536623001099, + "learning_rate": 3.868167663276686e-05, + "loss": 0.1609, + "step": 2193 + }, + { + "epoch": 0.3159562211981567, + "grad_norm": 5.2165045738220215, + "learning_rate": 3.867220885462571e-05, + "loss": 0.8207, + "step": 2194 + }, + { + "epoch": 0.31610023041474655, + "grad_norm": 3.913224935531616, + "learning_rate": 3.866273827802523e-05, + "loss": 0.5724, + "step": 2195 + }, + { + "epoch": 0.3162442396313364, + "grad_norm": 3.726107597351074, + "learning_rate": 3.8653264904903905e-05, + "loss": 0.7006, + "step": 2196 + }, + { + "epoch": 0.31638824884792627, + "grad_norm": 4.4167561531066895, + "learning_rate": 3.864378873720075e-05, + "loss": 0.7759, + "step": 2197 + }, + { + "epoch": 0.3165322580645161, + "grad_norm": 4.856625080108643, + "learning_rate": 3.863430977685537e-05, + "loss": 0.3039, + "step": 2198 + }, + { + "epoch": 0.316676267281106, + "grad_norm": 9.203021049499512, + "learning_rate": 3.862482802580795e-05, + "loss": 0.2971, + "step": 2199 + }, + { + "epoch": 0.31682027649769584, + "grad_norm": 6.557830810546875, + "learning_rate": 3.861534348599922e-05, + "loss": 1.2623, + "step": 2200 + }, + { + "epoch": 0.3169642857142857, + "grad_norm": 1.045134425163269, + "learning_rate": 3.860585615937051e-05, + "loss": 0.1177, + "step": 2201 + }, + { + "epoch": 0.31710829493087556, + "grad_norm": 5.852851390838623, + "learning_rate": 3.859636604786372e-05, + "loss": 2.917, + "step": 2202 + }, + { + "epoch": 0.3172523041474654, + "grad_norm": 3.4671144485473633, + "learning_rate": 3.858687315342129e-05, + "loss": 0.2254, + "step": 2203 + }, + { + "epoch": 0.3173963133640553, + "grad_norm": 2.3404111862182617, + "learning_rate": 3.857737747798624e-05, + "loss": 0.2304, + "step": 2204 + }, + { + "epoch": 0.3175403225806452, + "grad_norm": 1.4969562292099, + "learning_rate": 3.8567879023502186e-05, + "loss": 0.2034, + "step": 2205 + }, + { + "epoch": 0.31768433179723504, + "grad_norm": 2.415008783340454, + "learning_rate": 3.855837779191329e-05, + "loss": 0.4007, + "step": 2206 + }, + { + "epoch": 0.3178283410138249, + "grad_norm": 2.168964147567749, + "learning_rate": 3.854887378516428e-05, + "loss": 0.2457, + "step": 2207 + }, + { + "epoch": 0.31797235023041476, + "grad_norm": 0.9402557015419006, + "learning_rate": 3.853936700520046e-05, + "loss": 0.1321, + "step": 2208 + }, + { + "epoch": 0.3181163594470046, + "grad_norm": 5.1730055809021, + "learning_rate": 3.85298574539677e-05, + "loss": 0.4953, + "step": 2209 + }, + { + "epoch": 0.3182603686635945, + "grad_norm": 4.398864269256592, + "learning_rate": 3.852034513341244e-05, + "loss": 2.7293, + "step": 2210 + }, + { + "epoch": 0.31840437788018433, + "grad_norm": 2.238879919052124, + "learning_rate": 3.851083004548167e-05, + "loss": 0.1687, + "step": 2211 + }, + { + "epoch": 0.3185483870967742, + "grad_norm": 9.396533012390137, + "learning_rate": 3.8501312192122986e-05, + "loss": 1.6418, + "step": 2212 + }, + { + "epoch": 0.31869239631336405, + "grad_norm": 3.5060536861419678, + "learning_rate": 3.84917915752845e-05, + "loss": 1.5003, + "step": 2213 + }, + { + "epoch": 0.3188364055299539, + "grad_norm": 2.996799945831299, + "learning_rate": 3.848226819691493e-05, + "loss": 0.3472, + "step": 2214 + }, + { + "epoch": 0.31898041474654376, + "grad_norm": 7.476945877075195, + "learning_rate": 3.847274205896353e-05, + "loss": 0.5305, + "step": 2215 + }, + { + "epoch": 0.3191244239631336, + "grad_norm": 2.6289596557617188, + "learning_rate": 3.846321316338014e-05, + "loss": 0.2439, + "step": 2216 + }, + { + "epoch": 0.3192684331797235, + "grad_norm": 6.974658012390137, + "learning_rate": 3.845368151211516e-05, + "loss": 0.548, + "step": 2217 + }, + { + "epoch": 0.3194124423963134, + "grad_norm": 1.8961349725723267, + "learning_rate": 3.8444147107119536e-05, + "loss": 0.166, + "step": 2218 + }, + { + "epoch": 0.31955645161290325, + "grad_norm": 1.1015805006027222, + "learning_rate": 3.84346099503448e-05, + "loss": 0.094, + "step": 2219 + }, + { + "epoch": 0.3197004608294931, + "grad_norm": 0.9916687607765198, + "learning_rate": 3.842507004374304e-05, + "loss": 0.1122, + "step": 2220 + }, + { + "epoch": 0.31984447004608296, + "grad_norm": 0.887603759765625, + "learning_rate": 3.841552738926691e-05, + "loss": 0.0714, + "step": 2221 + }, + { + "epoch": 0.3199884792626728, + "grad_norm": 1.029150366783142, + "learning_rate": 3.840598198886963e-05, + "loss": 0.0974, + "step": 2222 + }, + { + "epoch": 0.3201324884792627, + "grad_norm": 1.4034690856933594, + "learning_rate": 3.8396433844504955e-05, + "loss": 0.1885, + "step": 2223 + }, + { + "epoch": 0.32027649769585254, + "grad_norm": 1.2902467250823975, + "learning_rate": 3.838688295812722e-05, + "loss": 0.1853, + "step": 2224 + }, + { + "epoch": 0.3204205069124424, + "grad_norm": 3.063833713531494, + "learning_rate": 3.837732933169135e-05, + "loss": 0.2637, + "step": 2225 + }, + { + "epoch": 0.32056451612903225, + "grad_norm": 0.5953347682952881, + "learning_rate": 3.8367772967152775e-05, + "loss": 0.0592, + "step": 2226 + }, + { + "epoch": 0.3207085253456221, + "grad_norm": 5.732187747955322, + "learning_rate": 3.835821386646753e-05, + "loss": 0.435, + "step": 2227 + }, + { + "epoch": 0.32085253456221197, + "grad_norm": 2.844172239303589, + "learning_rate": 3.834865203159218e-05, + "loss": 0.3647, + "step": 2228 + }, + { + "epoch": 0.3209965437788018, + "grad_norm": 1.5379304885864258, + "learning_rate": 3.833908746448388e-05, + "loss": 0.1566, + "step": 2229 + }, + { + "epoch": 0.3211405529953917, + "grad_norm": 4.683826446533203, + "learning_rate": 3.8329520167100316e-05, + "loss": 0.3732, + "step": 2230 + }, + { + "epoch": 0.32128456221198154, + "grad_norm": 1.8450937271118164, + "learning_rate": 3.831995014139974e-05, + "loss": 0.2152, + "step": 2231 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 5.557022571563721, + "learning_rate": 3.831037738934099e-05, + "loss": 1.2689, + "step": 2232 + }, + { + "epoch": 0.3215725806451613, + "grad_norm": 0.8340703248977661, + "learning_rate": 3.830080191288342e-05, + "loss": 0.0785, + "step": 2233 + }, + { + "epoch": 0.32171658986175117, + "grad_norm": 2.2428171634674072, + "learning_rate": 3.8291223713986955e-05, + "loss": 0.1729, + "step": 2234 + }, + { + "epoch": 0.321860599078341, + "grad_norm": 3.2691686153411865, + "learning_rate": 3.82816427946121e-05, + "loss": 0.4327, + "step": 2235 + }, + { + "epoch": 0.3220046082949309, + "grad_norm": 1.3902626037597656, + "learning_rate": 3.8272059156719896e-05, + "loss": 0.1585, + "step": 2236 + }, + { + "epoch": 0.32214861751152074, + "grad_norm": 2.9699084758758545, + "learning_rate": 3.8262472802271944e-05, + "loss": 0.2306, + "step": 2237 + }, + { + "epoch": 0.3222926267281106, + "grad_norm": 8.011017799377441, + "learning_rate": 3.8252883733230386e-05, + "loss": 1.7537, + "step": 2238 + }, + { + "epoch": 0.32243663594470046, + "grad_norm": 2.654078245162964, + "learning_rate": 3.8243291951557954e-05, + "loss": 0.3188, + "step": 2239 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 1.3809185028076172, + "learning_rate": 3.823369745921791e-05, + "loss": 0.1425, + "step": 2240 + }, + { + "epoch": 0.32272465437788017, + "grad_norm": 0.890509843826294, + "learning_rate": 3.822410025817406e-05, + "loss": 0.1019, + "step": 2241 + }, + { + "epoch": 0.32286866359447003, + "grad_norm": 4.119012832641602, + "learning_rate": 3.8214500350390816e-05, + "loss": 0.3732, + "step": 2242 + }, + { + "epoch": 0.3230126728110599, + "grad_norm": 2.206300973892212, + "learning_rate": 3.8204897737833076e-05, + "loss": 0.4144, + "step": 2243 + }, + { + "epoch": 0.32315668202764974, + "grad_norm": 1.3085981607437134, + "learning_rate": 3.8195292422466344e-05, + "loss": 0.1173, + "step": 2244 + }, + { + "epoch": 0.32330069124423966, + "grad_norm": 0.8388067483901978, + "learning_rate": 3.818568440625666e-05, + "loss": 0.0966, + "step": 2245 + }, + { + "epoch": 0.3234447004608295, + "grad_norm": 7.869523525238037, + "learning_rate": 3.81760736911706e-05, + "loss": 0.6599, + "step": 2246 + }, + { + "epoch": 0.3235887096774194, + "grad_norm": 1.975831151008606, + "learning_rate": 3.816646027917532e-05, + "loss": 0.2368, + "step": 2247 + }, + { + "epoch": 0.32373271889400923, + "grad_norm": 5.3562846183776855, + "learning_rate": 3.815684417223851e-05, + "loss": 0.602, + "step": 2248 + }, + { + "epoch": 0.3238767281105991, + "grad_norm": 1.0286316871643066, + "learning_rate": 3.8147225372328424e-05, + "loss": 0.1134, + "step": 2249 + }, + { + "epoch": 0.32402073732718895, + "grad_norm": 0.8859091401100159, + "learning_rate": 3.813760388141384e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.3241647465437788, + "grad_norm": 2.995488166809082, + "learning_rate": 3.812797970146412e-05, + "loss": 0.3932, + "step": 2251 + }, + { + "epoch": 0.32430875576036866, + "grad_norm": 6.776566505432129, + "learning_rate": 3.811835283444918e-05, + "loss": 1.9584, + "step": 2252 + }, + { + "epoch": 0.3244527649769585, + "grad_norm": 2.5909433364868164, + "learning_rate": 3.8108723282339445e-05, + "loss": 0.207, + "step": 2253 + }, + { + "epoch": 0.3245967741935484, + "grad_norm": 1.6353797912597656, + "learning_rate": 3.8099091047105926e-05, + "loss": 0.2834, + "step": 2254 + }, + { + "epoch": 0.32474078341013823, + "grad_norm": 1.6143486499786377, + "learning_rate": 3.808945613072017e-05, + "loss": 0.1603, + "step": 2255 + }, + { + "epoch": 0.3248847926267281, + "grad_norm": 3.5729053020477295, + "learning_rate": 3.807981853515427e-05, + "loss": 0.3408, + "step": 2256 + }, + { + "epoch": 0.32502880184331795, + "grad_norm": 3.7150933742523193, + "learning_rate": 3.8070178262380876e-05, + "loss": 0.3288, + "step": 2257 + }, + { + "epoch": 0.3251728110599078, + "grad_norm": 5.964656829833984, + "learning_rate": 3.806053531437317e-05, + "loss": 2.6321, + "step": 2258 + }, + { + "epoch": 0.3253168202764977, + "grad_norm": 1.6687166690826416, + "learning_rate": 3.8050889693104904e-05, + "loss": 0.1968, + "step": 2259 + }, + { + "epoch": 0.3254608294930876, + "grad_norm": 4.142490386962891, + "learning_rate": 3.8041241400550364e-05, + "loss": 0.403, + "step": 2260 + }, + { + "epoch": 0.32560483870967744, + "grad_norm": 2.743635892868042, + "learning_rate": 3.80315904386844e-05, + "loss": 0.3869, + "step": 2261 + }, + { + "epoch": 0.3257488479262673, + "grad_norm": 4.500650882720947, + "learning_rate": 3.802193680948236e-05, + "loss": 2.1243, + "step": 2262 + }, + { + "epoch": 0.32589285714285715, + "grad_norm": 1.0147463083267212, + "learning_rate": 3.801228051492019e-05, + "loss": 0.1114, + "step": 2263 + }, + { + "epoch": 0.326036866359447, + "grad_norm": 0.8168612122535706, + "learning_rate": 3.8002621556974367e-05, + "loss": 0.1047, + "step": 2264 + }, + { + "epoch": 0.32618087557603687, + "grad_norm": 3.082470178604126, + "learning_rate": 3.7992959937621896e-05, + "loss": 0.2571, + "step": 2265 + }, + { + "epoch": 0.3263248847926267, + "grad_norm": 1.1566603183746338, + "learning_rate": 3.798329565884036e-05, + "loss": 0.1862, + "step": 2266 + }, + { + "epoch": 0.3264688940092166, + "grad_norm": 1.4830631017684937, + "learning_rate": 3.797362872260785e-05, + "loss": 0.1841, + "step": 2267 + }, + { + "epoch": 0.32661290322580644, + "grad_norm": 2.100132942199707, + "learning_rate": 3.796395913090301e-05, + "loss": 0.238, + "step": 2268 + }, + { + "epoch": 0.3267569124423963, + "grad_norm": 1.8231691122055054, + "learning_rate": 3.795428688570505e-05, + "loss": 0.2509, + "step": 2269 + }, + { + "epoch": 0.32690092165898615, + "grad_norm": 1.9856480360031128, + "learning_rate": 3.7944611988993703e-05, + "loss": 0.2117, + "step": 2270 + }, + { + "epoch": 0.327044930875576, + "grad_norm": 7.361438274383545, + "learning_rate": 3.7934934442749246e-05, + "loss": 0.4983, + "step": 2271 + }, + { + "epoch": 0.3271889400921659, + "grad_norm": 0.8297907114028931, + "learning_rate": 3.79252542489525e-05, + "loss": 0.0436, + "step": 2272 + }, + { + "epoch": 0.3273329493087558, + "grad_norm": 0.59689861536026, + "learning_rate": 3.7915571409584836e-05, + "loss": 0.0542, + "step": 2273 + }, + { + "epoch": 0.32747695852534564, + "grad_norm": 3.582059383392334, + "learning_rate": 3.790588592662816e-05, + "loss": 0.3149, + "step": 2274 + }, + { + "epoch": 0.3276209677419355, + "grad_norm": 1.7889058589935303, + "learning_rate": 3.7896197802064907e-05, + "loss": 0.1616, + "step": 2275 + }, + { + "epoch": 0.32776497695852536, + "grad_norm": 2.726233720779419, + "learning_rate": 3.788650703787808e-05, + "loss": 0.2538, + "step": 2276 + }, + { + "epoch": 0.3279089861751152, + "grad_norm": 8.035685539245605, + "learning_rate": 3.78768136360512e-05, + "loss": 4.0465, + "step": 2277 + }, + { + "epoch": 0.32805299539170507, + "grad_norm": 3.6852827072143555, + "learning_rate": 3.7867117598568336e-05, + "loss": 0.2881, + "step": 2278 + }, + { + "epoch": 0.32819700460829493, + "grad_norm": 2.003971815109253, + "learning_rate": 3.7857418927414094e-05, + "loss": 0.2197, + "step": 2279 + }, + { + "epoch": 0.3283410138248848, + "grad_norm": 4.956406593322754, + "learning_rate": 3.784771762457362e-05, + "loss": 2.7178, + "step": 2280 + }, + { + "epoch": 0.32848502304147464, + "grad_norm": 0.8965150713920593, + "learning_rate": 3.78380136920326e-05, + "loss": 0.114, + "step": 2281 + }, + { + "epoch": 0.3286290322580645, + "grad_norm": 0.7590845227241516, + "learning_rate": 3.7828307131777263e-05, + "loss": 0.0602, + "step": 2282 + }, + { + "epoch": 0.32877304147465436, + "grad_norm": 0.886633574962616, + "learning_rate": 3.781859794579436e-05, + "loss": 0.0772, + "step": 2283 + }, + { + "epoch": 0.3289170506912442, + "grad_norm": 1.9793353080749512, + "learning_rate": 3.78088861360712e-05, + "loss": 0.1858, + "step": 2284 + }, + { + "epoch": 0.3290610599078341, + "grad_norm": 1.034543752670288, + "learning_rate": 3.779917170459561e-05, + "loss": 0.1447, + "step": 2285 + }, + { + "epoch": 0.329205069124424, + "grad_norm": 5.047701835632324, + "learning_rate": 3.7789454653355966e-05, + "loss": 2.7798, + "step": 2286 + }, + { + "epoch": 0.32934907834101385, + "grad_norm": 3.3393890857696533, + "learning_rate": 3.777973498434117e-05, + "loss": 2.7745, + "step": 2287 + }, + { + "epoch": 0.3294930875576037, + "grad_norm": 4.604044437408447, + "learning_rate": 3.777001269954068e-05, + "loss": 3.3798, + "step": 2288 + }, + { + "epoch": 0.32963709677419356, + "grad_norm": 1.4672040939331055, + "learning_rate": 3.776028780094446e-05, + "loss": 0.1285, + "step": 2289 + }, + { + "epoch": 0.3297811059907834, + "grad_norm": 3.89298677444458, + "learning_rate": 3.775056029054304e-05, + "loss": 2.0993, + "step": 2290 + }, + { + "epoch": 0.3299251152073733, + "grad_norm": 3.7345991134643555, + "learning_rate": 3.774083017032746e-05, + "loss": 0.507, + "step": 2291 + }, + { + "epoch": 0.33006912442396313, + "grad_norm": 1.7782729864120483, + "learning_rate": 3.7731097442289306e-05, + "loss": 0.1397, + "step": 2292 + }, + { + "epoch": 0.330213133640553, + "grad_norm": 0.3840826749801636, + "learning_rate": 3.77213621084207e-05, + "loss": 0.0484, + "step": 2293 + }, + { + "epoch": 0.33035714285714285, + "grad_norm": 3.645589590072632, + "learning_rate": 3.771162417071428e-05, + "loss": 0.4362, + "step": 2294 + }, + { + "epoch": 0.3305011520737327, + "grad_norm": 4.654716491699219, + "learning_rate": 3.770188363116324e-05, + "loss": 0.4039, + "step": 2295 + }, + { + "epoch": 0.33064516129032256, + "grad_norm": 0.9561938047409058, + "learning_rate": 3.7692140491761295e-05, + "loss": 0.127, + "step": 2296 + }, + { + "epoch": 0.3307891705069124, + "grad_norm": 4.494529724121094, + "learning_rate": 3.768239475450269e-05, + "loss": 0.3505, + "step": 2297 + }, + { + "epoch": 0.3309331797235023, + "grad_norm": 3.034480571746826, + "learning_rate": 3.767264642138221e-05, + "loss": 0.2105, + "step": 2298 + }, + { + "epoch": 0.3310771889400922, + "grad_norm": 1.1781079769134521, + "learning_rate": 3.7662895494395155e-05, + "loss": 0.1217, + "step": 2299 + }, + { + "epoch": 0.33122119815668205, + "grad_norm": 1.9939104318618774, + "learning_rate": 3.765314197553738e-05, + "loss": 0.1596, + "step": 2300 + }, + { + "epoch": 0.3313652073732719, + "grad_norm": 2.0853118896484375, + "learning_rate": 3.764338586680525e-05, + "loss": 0.3109, + "step": 2301 + }, + { + "epoch": 0.33150921658986177, + "grad_norm": 4.1169657707214355, + "learning_rate": 3.763362717019567e-05, + "loss": 1.2425, + "step": 2302 + }, + { + "epoch": 0.3316532258064516, + "grad_norm": 0.878976583480835, + "learning_rate": 3.7623865887706075e-05, + "loss": 0.1108, + "step": 2303 + }, + { + "epoch": 0.3317972350230415, + "grad_norm": 3.4611918926239014, + "learning_rate": 3.761410202133443e-05, + "loss": 1.0627, + "step": 2304 + }, + { + "epoch": 0.33194124423963134, + "grad_norm": 3.953080415725708, + "learning_rate": 3.760433557307922e-05, + "loss": 1.4199, + "step": 2305 + }, + { + "epoch": 0.3320852534562212, + "grad_norm": 3.160783290863037, + "learning_rate": 3.759456654493946e-05, + "loss": 0.285, + "step": 2306 + }, + { + "epoch": 0.33222926267281105, + "grad_norm": 1.3344035148620605, + "learning_rate": 3.758479493891471e-05, + "loss": 0.1427, + "step": 2307 + }, + { + "epoch": 0.3323732718894009, + "grad_norm": 6.766593933105469, + "learning_rate": 3.757502075700503e-05, + "loss": 0.9743, + "step": 2308 + }, + { + "epoch": 0.33251728110599077, + "grad_norm": 5.661294937133789, + "learning_rate": 3.756524400121104e-05, + "loss": 2.104, + "step": 2309 + }, + { + "epoch": 0.3326612903225806, + "grad_norm": 1.482837438583374, + "learning_rate": 3.7555464673533845e-05, + "loss": 0.1889, + "step": 2310 + }, + { + "epoch": 0.3328052995391705, + "grad_norm": 5.066265106201172, + "learning_rate": 3.754568277597512e-05, + "loss": 0.4399, + "step": 2311 + }, + { + "epoch": 0.33294930875576034, + "grad_norm": 3.5761520862579346, + "learning_rate": 3.7535898310537046e-05, + "loss": 0.4882, + "step": 2312 + }, + { + "epoch": 0.33309331797235026, + "grad_norm": 4.774685382843018, + "learning_rate": 3.752611127922232e-05, + "loss": 0.9379, + "step": 2313 + }, + { + "epoch": 0.3332373271889401, + "grad_norm": 2.11466646194458, + "learning_rate": 3.751632168403417e-05, + "loss": 0.1467, + "step": 2314 + }, + { + "epoch": 0.33338133640552997, + "grad_norm": 0.9029709696769714, + "learning_rate": 3.7506529526976375e-05, + "loss": 0.1299, + "step": 2315 + }, + { + "epoch": 0.33352534562211983, + "grad_norm": 2.119434356689453, + "learning_rate": 3.74967348100532e-05, + "loss": 0.1888, + "step": 2316 + }, + { + "epoch": 0.3336693548387097, + "grad_norm": 5.721355438232422, + "learning_rate": 3.748693753526945e-05, + "loss": 0.4073, + "step": 2317 + }, + { + "epoch": 0.33381336405529954, + "grad_norm": 5.3682990074157715, + "learning_rate": 3.747713770463046e-05, + "loss": 0.5665, + "step": 2318 + }, + { + "epoch": 0.3339573732718894, + "grad_norm": 1.8555505275726318, + "learning_rate": 3.7467335320142074e-05, + "loss": 0.1303, + "step": 2319 + }, + { + "epoch": 0.33410138248847926, + "grad_norm": 5.087815761566162, + "learning_rate": 3.745753038381068e-05, + "loss": 2.3611, + "step": 2320 + }, + { + "epoch": 0.3342453917050691, + "grad_norm": 0.6496595740318298, + "learning_rate": 3.744772289764316e-05, + "loss": 0.0844, + "step": 2321 + }, + { + "epoch": 0.334389400921659, + "grad_norm": 5.606258869171143, + "learning_rate": 3.7437912863646945e-05, + "loss": 2.3895, + "step": 2322 + }, + { + "epoch": 0.33453341013824883, + "grad_norm": 0.6281618475914001, + "learning_rate": 3.742810028382997e-05, + "loss": 0.0708, + "step": 2323 + }, + { + "epoch": 0.3346774193548387, + "grad_norm": 0.8221520781517029, + "learning_rate": 3.7418285160200696e-05, + "loss": 0.085, + "step": 2324 + }, + { + "epoch": 0.33482142857142855, + "grad_norm": 7.590441703796387, + "learning_rate": 3.74084674947681e-05, + "loss": 0.9363, + "step": 2325 + }, + { + "epoch": 0.33496543778801846, + "grad_norm": 1.389952540397644, + "learning_rate": 3.73986472895417e-05, + "loss": 0.132, + "step": 2326 + }, + { + "epoch": 0.3351094470046083, + "grad_norm": 6.717215061187744, + "learning_rate": 3.73888245465315e-05, + "loss": 0.2339, + "step": 2327 + }, + { + "epoch": 0.3352534562211982, + "grad_norm": 4.869823932647705, + "learning_rate": 3.737899926774805e-05, + "loss": 0.2658, + "step": 2328 + }, + { + "epoch": 0.33539746543778803, + "grad_norm": 7.679150104522705, + "learning_rate": 3.736917145520242e-05, + "loss": 1.1496, + "step": 2329 + }, + { + "epoch": 0.3355414746543779, + "grad_norm": 4.080450534820557, + "learning_rate": 3.735934111090617e-05, + "loss": 0.4206, + "step": 2330 + }, + { + "epoch": 0.33568548387096775, + "grad_norm": 4.199003219604492, + "learning_rate": 3.7349508236871416e-05, + "loss": 0.3063, + "step": 2331 + }, + { + "epoch": 0.3358294930875576, + "grad_norm": 6.438695907592773, + "learning_rate": 3.733967283511077e-05, + "loss": 0.7246, + "step": 2332 + }, + { + "epoch": 0.33597350230414746, + "grad_norm": 0.6839125156402588, + "learning_rate": 3.732983490763735e-05, + "loss": 0.0978, + "step": 2333 + }, + { + "epoch": 0.3361175115207373, + "grad_norm": 0.8067770004272461, + "learning_rate": 3.731999445646482e-05, + "loss": 0.0877, + "step": 2334 + }, + { + "epoch": 0.3362615207373272, + "grad_norm": 2.3769943714141846, + "learning_rate": 3.731015148360735e-05, + "loss": 0.1826, + "step": 2335 + }, + { + "epoch": 0.33640552995391704, + "grad_norm": 1.267694115638733, + "learning_rate": 3.730030599107961e-05, + "loss": 0.1533, + "step": 2336 + }, + { + "epoch": 0.3365495391705069, + "grad_norm": 1.3081212043762207, + "learning_rate": 3.7290457980896795e-05, + "loss": 0.1552, + "step": 2337 + }, + { + "epoch": 0.33669354838709675, + "grad_norm": 5.489588737487793, + "learning_rate": 3.7280607455074634e-05, + "loss": 1.3573, + "step": 2338 + }, + { + "epoch": 0.3368375576036866, + "grad_norm": 4.043893814086914, + "learning_rate": 3.7270754415629346e-05, + "loss": 2.188, + "step": 2339 + }, + { + "epoch": 0.3369815668202765, + "grad_norm": 0.6886488199234009, + "learning_rate": 3.726089886457768e-05, + "loss": 0.0796, + "step": 2340 + }, + { + "epoch": 0.3371255760368664, + "grad_norm": 1.7163454294204712, + "learning_rate": 3.7251040803936876e-05, + "loss": 0.208, + "step": 2341 + }, + { + "epoch": 0.33726958525345624, + "grad_norm": 2.0918617248535156, + "learning_rate": 3.7241180235724726e-05, + "loss": 0.1582, + "step": 2342 + }, + { + "epoch": 0.3374135944700461, + "grad_norm": 0.4933851361274719, + "learning_rate": 3.7231317161959507e-05, + "loss": 0.0496, + "step": 2343 + }, + { + "epoch": 0.33755760368663595, + "grad_norm": 1.3471031188964844, + "learning_rate": 3.722145158466001e-05, + "loss": 0.1539, + "step": 2344 + }, + { + "epoch": 0.3377016129032258, + "grad_norm": 2.5899548530578613, + "learning_rate": 3.721158350584556e-05, + "loss": 0.273, + "step": 2345 + }, + { + "epoch": 0.33784562211981567, + "grad_norm": 0.5333763360977173, + "learning_rate": 3.7201712927535954e-05, + "loss": 0.0737, + "step": 2346 + }, + { + "epoch": 0.3379896313364055, + "grad_norm": 0.7828883528709412, + "learning_rate": 3.719183985175154e-05, + "loss": 0.084, + "step": 2347 + }, + { + "epoch": 0.3381336405529954, + "grad_norm": 1.7180287837982178, + "learning_rate": 3.718196428051316e-05, + "loss": 0.3335, + "step": 2348 + }, + { + "epoch": 0.33827764976958524, + "grad_norm": 1.697249174118042, + "learning_rate": 3.717208621584217e-05, + "loss": 0.2756, + "step": 2349 + }, + { + "epoch": 0.3384216589861751, + "grad_norm": 5.968106269836426, + "learning_rate": 3.716220565976043e-05, + "loss": 0.4745, + "step": 2350 + }, + { + "epoch": 0.33856566820276496, + "grad_norm": 6.538610935211182, + "learning_rate": 3.7152322614290316e-05, + "loss": 2.2496, + "step": 2351 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 0.6698985695838928, + "learning_rate": 3.714243708145472e-05, + "loss": 0.0877, + "step": 2352 + }, + { + "epoch": 0.3388536866359447, + "grad_norm": 1.0705537796020508, + "learning_rate": 3.713254906327703e-05, + "loss": 0.1193, + "step": 2353 + }, + { + "epoch": 0.3389976958525346, + "grad_norm": 1.003795862197876, + "learning_rate": 3.7122658561781146e-05, + "loss": 0.1199, + "step": 2354 + }, + { + "epoch": 0.33914170506912444, + "grad_norm": 10.042739868164062, + "learning_rate": 3.7112765578991484e-05, + "loss": 2.1356, + "step": 2355 + }, + { + "epoch": 0.3392857142857143, + "grad_norm": 2.9672162532806396, + "learning_rate": 3.710287011693296e-05, + "loss": 2.0509, + "step": 2356 + }, + { + "epoch": 0.33942972350230416, + "grad_norm": 2.354273557662964, + "learning_rate": 3.7092972177631e-05, + "loss": 0.2311, + "step": 2357 + }, + { + "epoch": 0.339573732718894, + "grad_norm": 1.6122685670852661, + "learning_rate": 3.708307176311153e-05, + "loss": 0.2072, + "step": 2358 + }, + { + "epoch": 0.3397177419354839, + "grad_norm": 1.3859217166900635, + "learning_rate": 3.707316887540101e-05, + "loss": 0.1137, + "step": 2359 + }, + { + "epoch": 0.33986175115207373, + "grad_norm": 5.635464191436768, + "learning_rate": 3.706326351652636e-05, + "loss": 1.7512, + "step": 2360 + }, + { + "epoch": 0.3400057603686636, + "grad_norm": 3.7860653400421143, + "learning_rate": 3.705335568851506e-05, + "loss": 0.341, + "step": 2361 + }, + { + "epoch": 0.34014976958525345, + "grad_norm": 1.546879768371582, + "learning_rate": 3.704344539339504e-05, + "loss": 4.4874, + "step": 2362 + }, + { + "epoch": 0.3402937788018433, + "grad_norm": 4.218993186950684, + "learning_rate": 3.703353263319478e-05, + "loss": 1.5712, + "step": 2363 + }, + { + "epoch": 0.34043778801843316, + "grad_norm": 4.601889610290527, + "learning_rate": 3.702361740994324e-05, + "loss": 1.7149, + "step": 2364 + }, + { + "epoch": 0.340581797235023, + "grad_norm": 0.6380130052566528, + "learning_rate": 3.701369972566989e-05, + "loss": 0.1138, + "step": 2365 + }, + { + "epoch": 0.3407258064516129, + "grad_norm": 0.81846684217453, + "learning_rate": 3.7003779582404705e-05, + "loss": 0.093, + "step": 2366 + }, + { + "epoch": 0.3408698156682028, + "grad_norm": 0.9563940167427063, + "learning_rate": 3.699385698217816e-05, + "loss": 0.0776, + "step": 2367 + }, + { + "epoch": 0.34101382488479265, + "grad_norm": 12.046903610229492, + "learning_rate": 3.6983931927021245e-05, + "loss": 1.9469, + "step": 2368 + }, + { + "epoch": 0.3411578341013825, + "grad_norm": 1.3418408632278442, + "learning_rate": 3.697400441896543e-05, + "loss": 0.1576, + "step": 2369 + }, + { + "epoch": 0.34130184331797236, + "grad_norm": 1.6905614137649536, + "learning_rate": 3.6964074460042726e-05, + "loss": 0.1843, + "step": 2370 + }, + { + "epoch": 0.3414458525345622, + "grad_norm": 2.067955493927002, + "learning_rate": 3.695414205228559e-05, + "loss": 0.1525, + "step": 2371 + }, + { + "epoch": 0.3415898617511521, + "grad_norm": 1.4583888053894043, + "learning_rate": 3.6944207197727024e-05, + "loss": 0.1612, + "step": 2372 + }, + { + "epoch": 0.34173387096774194, + "grad_norm": 0.590279221534729, + "learning_rate": 3.693426989840052e-05, + "loss": 0.0602, + "step": 2373 + }, + { + "epoch": 0.3418778801843318, + "grad_norm": 3.349550485610962, + "learning_rate": 3.692433015634005e-05, + "loss": 0.5664, + "step": 2374 + }, + { + "epoch": 0.34202188940092165, + "grad_norm": 7.818876266479492, + "learning_rate": 3.691438797358013e-05, + "loss": 1.2052, + "step": 2375 + }, + { + "epoch": 0.3421658986175115, + "grad_norm": 2.9690499305725098, + "learning_rate": 3.6904443352155726e-05, + "loss": 0.2345, + "step": 2376 + }, + { + "epoch": 0.34230990783410137, + "grad_norm": 0.8302456736564636, + "learning_rate": 3.689449629410234e-05, + "loss": 0.1293, + "step": 2377 + }, + { + "epoch": 0.3424539170506912, + "grad_norm": 2.3978703022003174, + "learning_rate": 3.6884546801455956e-05, + "loss": 0.2022, + "step": 2378 + }, + { + "epoch": 0.3425979262672811, + "grad_norm": 10.290448188781738, + "learning_rate": 3.687459487625305e-05, + "loss": 2.63, + "step": 2379 + }, + { + "epoch": 0.34274193548387094, + "grad_norm": 4.832192420959473, + "learning_rate": 3.6864640520530615e-05, + "loss": 1.1986, + "step": 2380 + }, + { + "epoch": 0.34288594470046085, + "grad_norm": 1.8463056087493896, + "learning_rate": 3.6854683736326125e-05, + "loss": 0.1721, + "step": 2381 + }, + { + "epoch": 0.3430299539170507, + "grad_norm": 2.670353651046753, + "learning_rate": 3.6844724525677574e-05, + "loss": 0.1187, + "step": 2382 + }, + { + "epoch": 0.34317396313364057, + "grad_norm": 1.7534866333007812, + "learning_rate": 3.6834762890623415e-05, + "loss": 0.1765, + "step": 2383 + }, + { + "epoch": 0.3433179723502304, + "grad_norm": 1.0669370889663696, + "learning_rate": 3.682479883320263e-05, + "loss": 0.1421, + "step": 2384 + }, + { + "epoch": 0.3434619815668203, + "grad_norm": 1.3682060241699219, + "learning_rate": 3.681483235545468e-05, + "loss": 0.1429, + "step": 2385 + }, + { + "epoch": 0.34360599078341014, + "grad_norm": 3.0086333751678467, + "learning_rate": 3.6804863459419526e-05, + "loss": 1.8655, + "step": 2386 + }, + { + "epoch": 0.34375, + "grad_norm": 4.808706283569336, + "learning_rate": 3.679489214713763e-05, + "loss": 0.6436, + "step": 2387 + }, + { + "epoch": 0.34389400921658986, + "grad_norm": 1.0558781623840332, + "learning_rate": 3.678491842064995e-05, + "loss": 0.1245, + "step": 2388 + }, + { + "epoch": 0.3440380184331797, + "grad_norm": 2.713318109512329, + "learning_rate": 3.67749422819979e-05, + "loss": 0.2263, + "step": 2389 + }, + { + "epoch": 0.3441820276497696, + "grad_norm": 3.3478786945343018, + "learning_rate": 3.676496373322346e-05, + "loss": 0.6895, + "step": 2390 + }, + { + "epoch": 0.34432603686635943, + "grad_norm": 0.8439862728118896, + "learning_rate": 3.675498277636905e-05, + "loss": 0.1134, + "step": 2391 + }, + { + "epoch": 0.3444700460829493, + "grad_norm": 1.3433738946914673, + "learning_rate": 3.674499941347757e-05, + "loss": 0.1377, + "step": 2392 + }, + { + "epoch": 0.34461405529953915, + "grad_norm": 0.6892962455749512, + "learning_rate": 3.6735013646592475e-05, + "loss": 0.085, + "step": 2393 + }, + { + "epoch": 0.34475806451612906, + "grad_norm": 0.8488281965255737, + "learning_rate": 3.6725025477757645e-05, + "loss": 0.0958, + "step": 2394 + }, + { + "epoch": 0.3449020737327189, + "grad_norm": 4.22620964050293, + "learning_rate": 3.67150349090175e-05, + "loss": 0.5781, + "step": 2395 + }, + { + "epoch": 0.3450460829493088, + "grad_norm": 2.6592812538146973, + "learning_rate": 3.670504194241692e-05, + "loss": 0.5638, + "step": 2396 + }, + { + "epoch": 0.34519009216589863, + "grad_norm": 1.2554744482040405, + "learning_rate": 3.6695046580001304e-05, + "loss": 4.1849, + "step": 2397 + }, + { + "epoch": 0.3453341013824885, + "grad_norm": 4.193381309509277, + "learning_rate": 3.66850488238165e-05, + "loss": 2.2313, + "step": 2398 + }, + { + "epoch": 0.34547811059907835, + "grad_norm": 0.7223864793777466, + "learning_rate": 3.667504867590891e-05, + "loss": 0.085, + "step": 2399 + }, + { + "epoch": 0.3456221198156682, + "grad_norm": 3.5216948986053467, + "learning_rate": 3.6665046138325354e-05, + "loss": 0.2595, + "step": 2400 + }, + { + "epoch": 0.34576612903225806, + "grad_norm": 2.8520140647888184, + "learning_rate": 3.6655041213113184e-05, + "loss": 0.7308, + "step": 2401 + }, + { + "epoch": 0.3459101382488479, + "grad_norm": 1.8850812911987305, + "learning_rate": 3.664503390232024e-05, + "loss": 0.1426, + "step": 2402 + }, + { + "epoch": 0.3460541474654378, + "grad_norm": 1.384497880935669, + "learning_rate": 3.663502420799483e-05, + "loss": 0.139, + "step": 2403 + }, + { + "epoch": 0.34619815668202764, + "grad_norm": 1.4156855344772339, + "learning_rate": 3.662501213218577e-05, + "loss": 0.1513, + "step": 2404 + }, + { + "epoch": 0.3463421658986175, + "grad_norm": 4.210934162139893, + "learning_rate": 3.6614997676942354e-05, + "loss": 0.2408, + "step": 2405 + }, + { + "epoch": 0.34648617511520735, + "grad_norm": 5.331251621246338, + "learning_rate": 3.6604980844314356e-05, + "loss": 2.8958, + "step": 2406 + }, + { + "epoch": 0.3466301843317972, + "grad_norm": 1.834007740020752, + "learning_rate": 3.659496163635205e-05, + "loss": 0.1361, + "step": 2407 + }, + { + "epoch": 0.3467741935483871, + "grad_norm": 2.0349910259246826, + "learning_rate": 3.6584940055106194e-05, + "loss": 0.1177, + "step": 2408 + }, + { + "epoch": 0.346918202764977, + "grad_norm": 4.808788299560547, + "learning_rate": 3.657491610262802e-05, + "loss": 2.0496, + "step": 2409 + }, + { + "epoch": 0.34706221198156684, + "grad_norm": 0.9064728617668152, + "learning_rate": 3.656488978096926e-05, + "loss": 0.1156, + "step": 2410 + }, + { + "epoch": 0.3472062211981567, + "grad_norm": 1.4305496215820312, + "learning_rate": 3.6554861092182126e-05, + "loss": 0.1152, + "step": 2411 + }, + { + "epoch": 0.34735023041474655, + "grad_norm": 8.921164512634277, + "learning_rate": 3.654483003831931e-05, + "loss": 1.0191, + "step": 2412 + }, + { + "epoch": 0.3474942396313364, + "grad_norm": 1.7453818321228027, + "learning_rate": 3.6534796621433986e-05, + "loss": 0.2485, + "step": 2413 + }, + { + "epoch": 0.34763824884792627, + "grad_norm": 3.1367626190185547, + "learning_rate": 3.652476084357983e-05, + "loss": 0.4361, + "step": 2414 + }, + { + "epoch": 0.3477822580645161, + "grad_norm": 2.361532211303711, + "learning_rate": 3.651472270681097e-05, + "loss": 0.2645, + "step": 2415 + }, + { + "epoch": 0.347926267281106, + "grad_norm": 0.9018645882606506, + "learning_rate": 3.650468221318206e-05, + "loss": 0.1212, + "step": 2416 + }, + { + "epoch": 0.34807027649769584, + "grad_norm": 5.873631000518799, + "learning_rate": 3.6494639364748184e-05, + "loss": 1.4764, + "step": 2417 + }, + { + "epoch": 0.3482142857142857, + "grad_norm": 0.982476532459259, + "learning_rate": 3.648459416356496e-05, + "loss": 0.1468, + "step": 2418 + }, + { + "epoch": 0.34835829493087556, + "grad_norm": 1.7306019067764282, + "learning_rate": 3.6474546611688445e-05, + "loss": 0.1824, + "step": 2419 + }, + { + "epoch": 0.3485023041474654, + "grad_norm": 2.2156147956848145, + "learning_rate": 3.6464496711175204e-05, + "loss": 0.1912, + "step": 2420 + }, + { + "epoch": 0.3486463133640553, + "grad_norm": 2.098072052001953, + "learning_rate": 3.645444446408227e-05, + "loss": 0.1849, + "step": 2421 + }, + { + "epoch": 0.3487903225806452, + "grad_norm": 4.623746871948242, + "learning_rate": 3.644438987246716e-05, + "loss": 0.6154, + "step": 2422 + }, + { + "epoch": 0.34893433179723504, + "grad_norm": 1.3534555435180664, + "learning_rate": 3.6434332938387875e-05, + "loss": 0.1136, + "step": 2423 + }, + { + "epoch": 0.3490783410138249, + "grad_norm": 5.962512969970703, + "learning_rate": 3.642427366390289e-05, + "loss": 0.4365, + "step": 2424 + }, + { + "epoch": 0.34922235023041476, + "grad_norm": 3.6710808277130127, + "learning_rate": 3.641421205107116e-05, + "loss": 0.1908, + "step": 2425 + }, + { + "epoch": 0.3493663594470046, + "grad_norm": 2.5832154750823975, + "learning_rate": 3.640414810195212e-05, + "loss": 0.1929, + "step": 2426 + }, + { + "epoch": 0.3495103686635945, + "grad_norm": 2.3103344440460205, + "learning_rate": 3.639408181860569e-05, + "loss": 0.3715, + "step": 2427 + }, + { + "epoch": 0.34965437788018433, + "grad_norm": 5.471312046051025, + "learning_rate": 3.638401320309224e-05, + "loss": 0.447, + "step": 2428 + }, + { + "epoch": 0.3497983870967742, + "grad_norm": 3.8375375270843506, + "learning_rate": 3.6373942257472655e-05, + "loss": 2.0244, + "step": 2429 + }, + { + "epoch": 0.34994239631336405, + "grad_norm": 2.2739052772521973, + "learning_rate": 3.636386898380827e-05, + "loss": 0.1465, + "step": 2430 + }, + { + "epoch": 0.3500864055299539, + "grad_norm": 6.2880096435546875, + "learning_rate": 3.635379338416091e-05, + "loss": 0.6468, + "step": 2431 + }, + { + "epoch": 0.35023041474654376, + "grad_norm": 1.4346929788589478, + "learning_rate": 3.634371546059288e-05, + "loss": 0.1828, + "step": 2432 + }, + { + "epoch": 0.3503744239631336, + "grad_norm": 2.2654194831848145, + "learning_rate": 3.633363521516693e-05, + "loss": 0.2181, + "step": 2433 + }, + { + "epoch": 0.3505184331797235, + "grad_norm": 4.251484394073486, + "learning_rate": 3.632355264994633e-05, + "loss": 0.3996, + "step": 2434 + }, + { + "epoch": 0.3506624423963134, + "grad_norm": 0.7930424213409424, + "learning_rate": 3.63134677669948e-05, + "loss": 0.1385, + "step": 2435 + }, + { + "epoch": 0.35080645161290325, + "grad_norm": 2.1255946159362793, + "learning_rate": 3.6303380568376517e-05, + "loss": 0.3405, + "step": 2436 + }, + { + "epoch": 0.3509504608294931, + "grad_norm": 8.548701286315918, + "learning_rate": 3.629329105615617e-05, + "loss": 2.133, + "step": 2437 + }, + { + "epoch": 0.35109447004608296, + "grad_norm": 2.174970865249634, + "learning_rate": 3.6283199232398914e-05, + "loss": 0.1861, + "step": 2438 + }, + { + "epoch": 0.3512384792626728, + "grad_norm": 0.9826527237892151, + "learning_rate": 3.627310509917035e-05, + "loss": 0.0931, + "step": 2439 + }, + { + "epoch": 0.3513824884792627, + "grad_norm": 0.7331881523132324, + "learning_rate": 3.626300865853657e-05, + "loss": 0.0764, + "step": 2440 + }, + { + "epoch": 0.35152649769585254, + "grad_norm": 4.85658597946167, + "learning_rate": 3.625290991256414e-05, + "loss": 1.4174, + "step": 2441 + }, + { + "epoch": 0.3516705069124424, + "grad_norm": 1.1030769348144531, + "learning_rate": 3.6242808863320096e-05, + "loss": 0.1028, + "step": 2442 + }, + { + "epoch": 0.35181451612903225, + "grad_norm": 3.144944667816162, + "learning_rate": 3.6232705512871934e-05, + "loss": 2.1987, + "step": 2443 + }, + { + "epoch": 0.3519585253456221, + "grad_norm": 4.191503524780273, + "learning_rate": 3.622259986328765e-05, + "loss": 1.6406, + "step": 2444 + }, + { + "epoch": 0.35210253456221197, + "grad_norm": 1.4062926769256592, + "learning_rate": 3.621249191663567e-05, + "loss": 0.0924, + "step": 2445 + }, + { + "epoch": 0.3522465437788018, + "grad_norm": 2.5268476009368896, + "learning_rate": 3.620238167498493e-05, + "loss": 3.7791, + "step": 2446 + }, + { + "epoch": 0.3523905529953917, + "grad_norm": 1.400687336921692, + "learning_rate": 3.619226914040481e-05, + "loss": 0.1896, + "step": 2447 + }, + { + "epoch": 0.35253456221198154, + "grad_norm": 1.8959547281265259, + "learning_rate": 3.6182154314965164e-05, + "loss": 0.196, + "step": 2448 + }, + { + "epoch": 0.35267857142857145, + "grad_norm": 5.3043389320373535, + "learning_rate": 3.6172037200736325e-05, + "loss": 0.3697, + "step": 2449 + }, + { + "epoch": 0.3528225806451613, + "grad_norm": 2.5624473094940186, + "learning_rate": 3.616191779978907e-05, + "loss": 0.2363, + "step": 2450 + }, + { + "epoch": 0.35296658986175117, + "grad_norm": 3.579014301300049, + "learning_rate": 3.615179611419469e-05, + "loss": 0.6798, + "step": 2451 + }, + { + "epoch": 0.353110599078341, + "grad_norm": 6.138802528381348, + "learning_rate": 3.61416721460249e-05, + "loss": 0.4656, + "step": 2452 + }, + { + "epoch": 0.3532546082949309, + "grad_norm": 1.1916704177856445, + "learning_rate": 3.6131545897351896e-05, + "loss": 0.1138, + "step": 2453 + }, + { + "epoch": 0.35339861751152074, + "grad_norm": 2.2099239826202393, + "learning_rate": 3.6121417370248336e-05, + "loss": 0.2486, + "step": 2454 + }, + { + "epoch": 0.3535426267281106, + "grad_norm": 1.2396140098571777, + "learning_rate": 3.611128656678736e-05, + "loss": 0.1844, + "step": 2455 + }, + { + "epoch": 0.35368663594470046, + "grad_norm": 7.088640213012695, + "learning_rate": 3.610115348904256e-05, + "loss": 0.6054, + "step": 2456 + }, + { + "epoch": 0.3538306451612903, + "grad_norm": 6.01964807510376, + "learning_rate": 3.609101813908801e-05, + "loss": 2.5073, + "step": 2457 + }, + { + "epoch": 0.35397465437788017, + "grad_norm": 7.923009872436523, + "learning_rate": 3.6080880518998216e-05, + "loss": 1.0401, + "step": 2458 + }, + { + "epoch": 0.35411866359447003, + "grad_norm": 3.8559396266937256, + "learning_rate": 3.607074063084818e-05, + "loss": 1.8499, + "step": 2459 + }, + { + "epoch": 0.3542626728110599, + "grad_norm": 1.1142041683197021, + "learning_rate": 3.606059847671336e-05, + "loss": 0.1117, + "step": 2460 + }, + { + "epoch": 0.35440668202764974, + "grad_norm": 1.1718419790267944, + "learning_rate": 3.605045405866968e-05, + "loss": 0.1356, + "step": 2461 + }, + { + "epoch": 0.35455069124423966, + "grad_norm": 3.23195743560791, + "learning_rate": 3.604030737879351e-05, + "loss": 0.1669, + "step": 2462 + }, + { + "epoch": 0.3546947004608295, + "grad_norm": 0.9547437429428101, + "learning_rate": 3.603015843916169e-05, + "loss": 0.0891, + "step": 2463 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 3.8854339122772217, + "learning_rate": 3.602000724185156e-05, + "loss": 2.0314, + "step": 2464 + }, + { + "epoch": 0.35498271889400923, + "grad_norm": 4.2390828132629395, + "learning_rate": 3.600985378894086e-05, + "loss": 0.3556, + "step": 2465 + }, + { + "epoch": 0.3551267281105991, + "grad_norm": 4.622185707092285, + "learning_rate": 3.599969808250784e-05, + "loss": 3.2837, + "step": 2466 + }, + { + "epoch": 0.35527073732718895, + "grad_norm": 2.0559370517730713, + "learning_rate": 3.5989540124631175e-05, + "loss": 0.3023, + "step": 2467 + }, + { + "epoch": 0.3554147465437788, + "grad_norm": 3.3533694744110107, + "learning_rate": 3.5979379917390044e-05, + "loss": 1.9992, + "step": 2468 + }, + { + "epoch": 0.35555875576036866, + "grad_norm": 7.575666904449463, + "learning_rate": 3.596921746286404e-05, + "loss": 1.3557, + "step": 2469 + }, + { + "epoch": 0.3557027649769585, + "grad_norm": 4.988179683685303, + "learning_rate": 3.595905276313325e-05, + "loss": 0.5137, + "step": 2470 + }, + { + "epoch": 0.3558467741935484, + "grad_norm": 1.3269814252853394, + "learning_rate": 3.594888582027821e-05, + "loss": 0.0959, + "step": 2471 + }, + { + "epoch": 0.35599078341013823, + "grad_norm": 2.510369062423706, + "learning_rate": 3.59387166363799e-05, + "loss": 0.2772, + "step": 2472 + }, + { + "epoch": 0.3561347926267281, + "grad_norm": 2.8208227157592773, + "learning_rate": 3.5928545213519784e-05, + "loss": 0.3878, + "step": 2473 + }, + { + "epoch": 0.35627880184331795, + "grad_norm": 4.364611625671387, + "learning_rate": 3.591837155377976e-05, + "loss": 0.555, + "step": 2474 + }, + { + "epoch": 0.3564228110599078, + "grad_norm": 0.764491081237793, + "learning_rate": 3.5908195659242215e-05, + "loss": 0.0951, + "step": 2475 + }, + { + "epoch": 0.3565668202764977, + "grad_norm": 2.6955113410949707, + "learning_rate": 3.589801753198996e-05, + "loss": 1.7007, + "step": 2476 + }, + { + "epoch": 0.3567108294930876, + "grad_norm": 3.011079788208008, + "learning_rate": 3.5887837174106274e-05, + "loss": 0.3388, + "step": 2477 + }, + { + "epoch": 0.35685483870967744, + "grad_norm": 4.906853199005127, + "learning_rate": 3.587765458767491e-05, + "loss": 1.4708, + "step": 2478 + }, + { + "epoch": 0.3569988479262673, + "grad_norm": 2.0341882705688477, + "learning_rate": 3.586746977478006e-05, + "loss": 0.2528, + "step": 2479 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.9994325637817383, + "learning_rate": 3.585728273750637e-05, + "loss": 0.1584, + "step": 2480 + }, + { + "epoch": 0.357286866359447, + "grad_norm": 0.9314089417457581, + "learning_rate": 3.5847093477938956e-05, + "loss": 0.1331, + "step": 2481 + }, + { + "epoch": 0.35743087557603687, + "grad_norm": 0.9331819415092468, + "learning_rate": 3.583690199816336e-05, + "loss": 0.1276, + "step": 2482 + }, + { + "epoch": 0.3575748847926267, + "grad_norm": 5.600116729736328, + "learning_rate": 3.582670830026562e-05, + "loss": 2.0184, + "step": 2483 + }, + { + "epoch": 0.3577188940092166, + "grad_norm": 6.868280410766602, + "learning_rate": 3.581651238633219e-05, + "loss": 0.5232, + "step": 2484 + }, + { + "epoch": 0.35786290322580644, + "grad_norm": 1.132889747619629, + "learning_rate": 3.580631425845e-05, + "loss": 0.1151, + "step": 2485 + }, + { + "epoch": 0.3580069124423963, + "grad_norm": 4.722726345062256, + "learning_rate": 3.5796113918706426e-05, + "loss": 1.5408, + "step": 2486 + }, + { + "epoch": 0.35815092165898615, + "grad_norm": 0.6776606440544128, + "learning_rate": 3.5785911369189294e-05, + "loss": 0.0528, + "step": 2487 + }, + { + "epoch": 0.358294930875576, + "grad_norm": 3.0548951625823975, + "learning_rate": 3.577570661198689e-05, + "loss": 0.3728, + "step": 2488 + }, + { + "epoch": 0.3584389400921659, + "grad_norm": 3.8512449264526367, + "learning_rate": 3.576549964918794e-05, + "loss": 0.3067, + "step": 2489 + }, + { + "epoch": 0.3585829493087558, + "grad_norm": 4.802267551422119, + "learning_rate": 3.575529048288163e-05, + "loss": 0.2617, + "step": 2490 + }, + { + "epoch": 0.35872695852534564, + "grad_norm": 5.658627510070801, + "learning_rate": 3.5745079115157606e-05, + "loss": 0.3197, + "step": 2491 + }, + { + "epoch": 0.3588709677419355, + "grad_norm": 1.5323221683502197, + "learning_rate": 3.573486554810595e-05, + "loss": 0.1537, + "step": 2492 + }, + { + "epoch": 0.35901497695852536, + "grad_norm": 3.649600028991699, + "learning_rate": 3.5724649783817185e-05, + "loss": 0.628, + "step": 2493 + }, + { + "epoch": 0.3591589861751152, + "grad_norm": 0.8649821281433105, + "learning_rate": 3.571443182438232e-05, + "loss": 0.1167, + "step": 2494 + }, + { + "epoch": 0.35930299539170507, + "grad_norm": 0.7073783874511719, + "learning_rate": 3.570421167189277e-05, + "loss": 0.074, + "step": 2495 + }, + { + "epoch": 0.35944700460829493, + "grad_norm": 1.698610544204712, + "learning_rate": 3.569398932844044e-05, + "loss": 0.1784, + "step": 2496 + }, + { + "epoch": 0.3595910138248848, + "grad_norm": 3.6020352840423584, + "learning_rate": 3.5683764796117634e-05, + "loss": 0.3777, + "step": 2497 + }, + { + "epoch": 0.35973502304147464, + "grad_norm": 0.9210937023162842, + "learning_rate": 3.567353807701716e-05, + "loss": 0.1072, + "step": 2498 + }, + { + "epoch": 0.3598790322580645, + "grad_norm": 3.423515558242798, + "learning_rate": 3.566330917323224e-05, + "loss": 1.6176, + "step": 2499 + }, + { + "epoch": 0.36002304147465436, + "grad_norm": 2.281140089035034, + "learning_rate": 3.5653078086856546e-05, + "loss": 0.3516, + "step": 2500 + }, + { + "epoch": 0.3601670506912442, + "grad_norm": 0.9873731732368469, + "learning_rate": 3.5642844819984194e-05, + "loss": 0.1295, + "step": 2501 + }, + { + "epoch": 0.3603110599078341, + "grad_norm": 2.1231088638305664, + "learning_rate": 3.5632609374709764e-05, + "loss": 0.1658, + "step": 2502 + }, + { + "epoch": 0.360455069124424, + "grad_norm": 0.6813572645187378, + "learning_rate": 3.5622371753128266e-05, + "loss": 0.0781, + "step": 2503 + }, + { + "epoch": 0.36059907834101385, + "grad_norm": 5.275638580322266, + "learning_rate": 3.561213195733515e-05, + "loss": 0.3702, + "step": 2504 + }, + { + "epoch": 0.3607430875576037, + "grad_norm": 3.2970142364501953, + "learning_rate": 3.560188998942634e-05, + "loss": 2.502, + "step": 2505 + }, + { + "epoch": 0.36088709677419356, + "grad_norm": 2.7465381622314453, + "learning_rate": 3.5591645851498176e-05, + "loss": 0.2107, + "step": 2506 + }, + { + "epoch": 0.3610311059907834, + "grad_norm": 1.945616602897644, + "learning_rate": 3.558139954564746e-05, + "loss": 0.1313, + "step": 2507 + }, + { + "epoch": 0.3611751152073733, + "grad_norm": 2.6909306049346924, + "learning_rate": 3.557115107397141e-05, + "loss": 2.3602, + "step": 2508 + }, + { + "epoch": 0.36131912442396313, + "grad_norm": 2.090989589691162, + "learning_rate": 3.556090043856773e-05, + "loss": 0.2252, + "step": 2509 + }, + { + "epoch": 0.361463133640553, + "grad_norm": 4.770748615264893, + "learning_rate": 3.555064764153452e-05, + "loss": 1.597, + "step": 2510 + }, + { + "epoch": 0.36160714285714285, + "grad_norm": 2.433443546295166, + "learning_rate": 3.554039268497037e-05, + "loss": 0.258, + "step": 2511 + }, + { + "epoch": 0.3617511520737327, + "grad_norm": 2.4422571659088135, + "learning_rate": 3.5530135570974274e-05, + "loss": 0.2909, + "step": 2512 + }, + { + "epoch": 0.36189516129032256, + "grad_norm": 2.326350212097168, + "learning_rate": 3.5519876301645684e-05, + "loss": 0.1651, + "step": 2513 + }, + { + "epoch": 0.3620391705069124, + "grad_norm": 1.1377015113830566, + "learning_rate": 3.55096148790845e-05, + "loss": 0.136, + "step": 2514 + }, + { + "epoch": 0.3621831797235023, + "grad_norm": 0.7518608570098877, + "learning_rate": 3.5499351305391034e-05, + "loss": 0.0713, + "step": 2515 + }, + { + "epoch": 0.3623271889400922, + "grad_norm": 6.725396156311035, + "learning_rate": 3.548908558266607e-05, + "loss": 2.4623, + "step": 2516 + }, + { + "epoch": 0.36247119815668205, + "grad_norm": 8.0882568359375, + "learning_rate": 3.5478817713010823e-05, + "loss": 0.8615, + "step": 2517 + }, + { + "epoch": 0.3626152073732719, + "grad_norm": 0.421604186296463, + "learning_rate": 3.5468547698526946e-05, + "loss": 0.0463, + "step": 2518 + }, + { + "epoch": 0.36275921658986177, + "grad_norm": 3.6874637603759766, + "learning_rate": 3.5458275541316514e-05, + "loss": 0.1847, + "step": 2519 + }, + { + "epoch": 0.3629032258064516, + "grad_norm": 2.289047956466675, + "learning_rate": 3.544800124348207e-05, + "loss": 0.2126, + "step": 2520 + }, + { + "epoch": 0.3630472350230415, + "grad_norm": 0.6674261093139648, + "learning_rate": 3.543772480712658e-05, + "loss": 0.1167, + "step": 2521 + }, + { + "epoch": 0.36319124423963134, + "grad_norm": 3.027615547180176, + "learning_rate": 3.542744623435344e-05, + "loss": 1.5605, + "step": 2522 + }, + { + "epoch": 0.3633352534562212, + "grad_norm": 1.3190739154815674, + "learning_rate": 3.541716552726651e-05, + "loss": 0.1647, + "step": 2523 + }, + { + "epoch": 0.36347926267281105, + "grad_norm": 3.0657401084899902, + "learning_rate": 3.540688268797005e-05, + "loss": 0.2371, + "step": 2524 + }, + { + "epoch": 0.3636232718894009, + "grad_norm": 4.945165157318115, + "learning_rate": 3.539659771856878e-05, + "loss": 1.8016, + "step": 2525 + }, + { + "epoch": 0.36376728110599077, + "grad_norm": 1.10462486743927, + "learning_rate": 3.5386310621167855e-05, + "loss": 0.1581, + "step": 2526 + }, + { + "epoch": 0.3639112903225806, + "grad_norm": 0.7261354327201843, + "learning_rate": 3.5376021397872855e-05, + "loss": 0.0764, + "step": 2527 + }, + { + "epoch": 0.3640552995391705, + "grad_norm": 0.9490910172462463, + "learning_rate": 3.536573005078981e-05, + "loss": 4.496, + "step": 2528 + }, + { + "epoch": 0.36419930875576034, + "grad_norm": 3.2249865531921387, + "learning_rate": 3.535543658202518e-05, + "loss": 0.2747, + "step": 2529 + }, + { + "epoch": 0.36434331797235026, + "grad_norm": 1.056183099746704, + "learning_rate": 3.5345140993685844e-05, + "loss": 0.1278, + "step": 2530 + }, + { + "epoch": 0.3644873271889401, + "grad_norm": 3.291205406188965, + "learning_rate": 3.533484328787914e-05, + "loss": 2.0416, + "step": 2531 + }, + { + "epoch": 0.36463133640552997, + "grad_norm": 1.2364760637283325, + "learning_rate": 3.532454346671281e-05, + "loss": 0.0907, + "step": 2532 + }, + { + "epoch": 0.36477534562211983, + "grad_norm": 4.744956970214844, + "learning_rate": 3.531424153229506e-05, + "loss": 0.3946, + "step": 2533 + }, + { + "epoch": 0.3649193548387097, + "grad_norm": 1.6715648174285889, + "learning_rate": 3.530393748673451e-05, + "loss": 0.1494, + "step": 2534 + }, + { + "epoch": 0.36506336405529954, + "grad_norm": 1.299487590789795, + "learning_rate": 3.529363133214021e-05, + "loss": 0.1531, + "step": 2535 + }, + { + "epoch": 0.3652073732718894, + "grad_norm": 2.3723137378692627, + "learning_rate": 3.528332307062164e-05, + "loss": 2.1113, + "step": 2536 + }, + { + "epoch": 0.36535138248847926, + "grad_norm": 2.0583531856536865, + "learning_rate": 3.5273012704288745e-05, + "loss": 0.2901, + "step": 2537 + }, + { + "epoch": 0.3654953917050691, + "grad_norm": 1.5458030700683594, + "learning_rate": 3.526270023525186e-05, + "loss": 0.1665, + "step": 2538 + }, + { + "epoch": 0.365639400921659, + "grad_norm": 0.5780789256095886, + "learning_rate": 3.525238566562176e-05, + "loss": 0.0561, + "step": 2539 + }, + { + "epoch": 0.36578341013824883, + "grad_norm": 1.3165934085845947, + "learning_rate": 3.524206899750966e-05, + "loss": 0.1458, + "step": 2540 + }, + { + "epoch": 0.3659274193548387, + "grad_norm": 1.860344648361206, + "learning_rate": 3.523175023302721e-05, + "loss": 0.1747, + "step": 2541 + }, + { + "epoch": 0.36607142857142855, + "grad_norm": 1.0586819648742676, + "learning_rate": 3.522142937428645e-05, + "loss": 0.1486, + "step": 2542 + }, + { + "epoch": 0.36621543778801846, + "grad_norm": 0.7119726538658142, + "learning_rate": 3.521110642339991e-05, + "loss": 0.0876, + "step": 2543 + }, + { + "epoch": 0.3663594470046083, + "grad_norm": 0.5330336689949036, + "learning_rate": 3.520078138248049e-05, + "loss": 0.0476, + "step": 2544 + }, + { + "epoch": 0.3665034562211982, + "grad_norm": 2.240368604660034, + "learning_rate": 3.519045425364156e-05, + "loss": 0.1953, + "step": 2545 + }, + { + "epoch": 0.36664746543778803, + "grad_norm": 1.8106698989868164, + "learning_rate": 3.518012503899689e-05, + "loss": 0.077, + "step": 2546 + }, + { + "epoch": 0.3667914746543779, + "grad_norm": 0.7845739722251892, + "learning_rate": 3.516979374066069e-05, + "loss": 0.0807, + "step": 2547 + }, + { + "epoch": 0.36693548387096775, + "grad_norm": 0.8456618189811707, + "learning_rate": 3.51594603607476e-05, + "loss": 0.1048, + "step": 2548 + }, + { + "epoch": 0.3670794930875576, + "grad_norm": 6.740107536315918, + "learning_rate": 3.5149124901372677e-05, + "loss": 1.7553, + "step": 2549 + }, + { + "epoch": 0.36722350230414746, + "grad_norm": 3.1051125526428223, + "learning_rate": 3.5138787364651405e-05, + "loss": 0.5501, + "step": 2550 + }, + { + "epoch": 0.3673675115207373, + "grad_norm": 3.521313428878784, + "learning_rate": 3.51284477526997e-05, + "loss": 0.165, + "step": 2551 + }, + { + "epoch": 0.3675115207373272, + "grad_norm": 4.0015363693237305, + "learning_rate": 3.511810606763388e-05, + "loss": 1.8526, + "step": 2552 + }, + { + "epoch": 0.36765552995391704, + "grad_norm": 9.012273788452148, + "learning_rate": 3.5107762311570735e-05, + "loss": 2.9905, + "step": 2553 + }, + { + "epoch": 0.3677995391705069, + "grad_norm": 4.9741058349609375, + "learning_rate": 3.509741648662742e-05, + "loss": 1.2698, + "step": 2554 + }, + { + "epoch": 0.36794354838709675, + "grad_norm": 1.2000263929367065, + "learning_rate": 3.5087068594921563e-05, + "loss": 0.1475, + "step": 2555 + }, + { + "epoch": 0.3680875576036866, + "grad_norm": 0.48016607761383057, + "learning_rate": 3.5076718638571185e-05, + "loss": 0.0455, + "step": 2556 + }, + { + "epoch": 0.3682315668202765, + "grad_norm": 5.460827350616455, + "learning_rate": 3.506636661969473e-05, + "loss": 1.3132, + "step": 2557 + }, + { + "epoch": 0.3683755760368664, + "grad_norm": 1.756156325340271, + "learning_rate": 3.505601254041109e-05, + "loss": 0.1314, + "step": 2558 + }, + { + "epoch": 0.36851958525345624, + "grad_norm": 0.7634763717651367, + "learning_rate": 3.5045656402839554e-05, + "loss": 0.0616, + "step": 2559 + }, + { + "epoch": 0.3686635944700461, + "grad_norm": 2.91273832321167, + "learning_rate": 3.503529820909984e-05, + "loss": 0.2944, + "step": 2560 + }, + { + "epoch": 0.36880760368663595, + "grad_norm": 1.6275509595870972, + "learning_rate": 3.50249379613121e-05, + "loss": 0.1743, + "step": 2561 + }, + { + "epoch": 0.3689516129032258, + "grad_norm": 6.887571811676025, + "learning_rate": 3.501457566159687e-05, + "loss": 2.2907, + "step": 2562 + }, + { + "epoch": 0.36909562211981567, + "grad_norm": 12.147695541381836, + "learning_rate": 3.5004211312075143e-05, + "loss": 3.4087, + "step": 2563 + }, + { + "epoch": 0.3692396313364055, + "grad_norm": 0.7412810325622559, + "learning_rate": 3.499384491486832e-05, + "loss": 0.0779, + "step": 2564 + }, + { + "epoch": 0.3693836405529954, + "grad_norm": 3.498094320297241, + "learning_rate": 3.498347647209821e-05, + "loss": 2.7536, + "step": 2565 + }, + { + "epoch": 0.36952764976958524, + "grad_norm": 8.190592765808105, + "learning_rate": 3.497310598588706e-05, + "loss": 1.7067, + "step": 2566 + }, + { + "epoch": 0.3696716589861751, + "grad_norm": 4.088171005249023, + "learning_rate": 3.4962733458357514e-05, + "loss": 0.4808, + "step": 2567 + }, + { + "epoch": 0.36981566820276496, + "grad_norm": 3.4285905361175537, + "learning_rate": 3.495235889163266e-05, + "loss": 0.6393, + "step": 2568 + }, + { + "epoch": 0.3699596774193548, + "grad_norm": 3.3635799884796143, + "learning_rate": 3.4941982287835974e-05, + "loss": 1.4156, + "step": 2569 + }, + { + "epoch": 0.3701036866359447, + "grad_norm": 1.388311505317688, + "learning_rate": 3.4931603649091374e-05, + "loss": 0.128, + "step": 2570 + }, + { + "epoch": 0.3702476958525346, + "grad_norm": 2.827951669692993, + "learning_rate": 3.492122297752317e-05, + "loss": 2.3842, + "step": 2571 + }, + { + "epoch": 0.37039170506912444, + "grad_norm": 2.330766201019287, + "learning_rate": 3.491084027525611e-05, + "loss": 0.2104, + "step": 2572 + }, + { + "epoch": 0.3705357142857143, + "grad_norm": 0.8475162982940674, + "learning_rate": 3.4900455544415356e-05, + "loss": 0.0858, + "step": 2573 + }, + { + "epoch": 0.37067972350230416, + "grad_norm": 2.0854849815368652, + "learning_rate": 3.489006878712647e-05, + "loss": 0.177, + "step": 2574 + }, + { + "epoch": 0.370823732718894, + "grad_norm": 4.7767109870910645, + "learning_rate": 3.487968000551544e-05, + "loss": 2.0264, + "step": 2575 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 1.3137985467910767, + "learning_rate": 3.4869289201708663e-05, + "loss": 0.1088, + "step": 2576 + }, + { + "epoch": 0.37111175115207373, + "grad_norm": 4.545588493347168, + "learning_rate": 3.4858896377832966e-05, + "loss": 2.087, + "step": 2577 + }, + { + "epoch": 0.3712557603686636, + "grad_norm": 1.8702753782272339, + "learning_rate": 3.484850153601556e-05, + "loss": 0.2475, + "step": 2578 + }, + { + "epoch": 0.37139976958525345, + "grad_norm": 1.1830402612686157, + "learning_rate": 3.483810467838409e-05, + "loss": 0.1155, + "step": 2579 + }, + { + "epoch": 0.3715437788018433, + "grad_norm": 5.112494945526123, + "learning_rate": 3.482770580706661e-05, + "loss": 2.1005, + "step": 2580 + }, + { + "epoch": 0.37168778801843316, + "grad_norm": 0.7308192849159241, + "learning_rate": 3.481730492419159e-05, + "loss": 0.0826, + "step": 2581 + }, + { + "epoch": 0.371831797235023, + "grad_norm": 0.967135488986969, + "learning_rate": 3.48069020318879e-05, + "loss": 0.1014, + "step": 2582 + }, + { + "epoch": 0.3719758064516129, + "grad_norm": 3.9077906608581543, + "learning_rate": 3.4796497132284825e-05, + "loss": 1.576, + "step": 2583 + }, + { + "epoch": 0.3721198156682028, + "grad_norm": 1.1266465187072754, + "learning_rate": 3.478609022751207e-05, + "loss": 0.1208, + "step": 2584 + }, + { + "epoch": 0.37226382488479265, + "grad_norm": 2.0777065753936768, + "learning_rate": 3.4775681319699746e-05, + "loss": 0.2761, + "step": 2585 + }, + { + "epoch": 0.3724078341013825, + "grad_norm": 6.533022880554199, + "learning_rate": 3.476527041097836e-05, + "loss": 0.8019, + "step": 2586 + }, + { + "epoch": 0.37255184331797236, + "grad_norm": 0.9148367643356323, + "learning_rate": 3.475485750347886e-05, + "loss": 0.1188, + "step": 2587 + }, + { + "epoch": 0.3726958525345622, + "grad_norm": 1.8766061067581177, + "learning_rate": 3.474444259933257e-05, + "loss": 0.2021, + "step": 2588 + }, + { + "epoch": 0.3728398617511521, + "grad_norm": 1.88285493850708, + "learning_rate": 3.473402570067125e-05, + "loss": 0.2209, + "step": 2589 + }, + { + "epoch": 0.37298387096774194, + "grad_norm": 7.548783779144287, + "learning_rate": 3.472360680962704e-05, + "loss": 2.0173, + "step": 2590 + }, + { + "epoch": 0.3731278801843318, + "grad_norm": 2.75174617767334, + "learning_rate": 3.47131859283325e-05, + "loss": 0.47, + "step": 2591 + }, + { + "epoch": 0.37327188940092165, + "grad_norm": 3.0554516315460205, + "learning_rate": 3.470276305892062e-05, + "loss": 0.7095, + "step": 2592 + }, + { + "epoch": 0.3734158986175115, + "grad_norm": 2.8807766437530518, + "learning_rate": 3.469233820352477e-05, + "loss": 0.289, + "step": 2593 + }, + { + "epoch": 0.37355990783410137, + "grad_norm": 3.526811361312866, + "learning_rate": 3.468191136427872e-05, + "loss": 1.4895, + "step": 2594 + }, + { + "epoch": 0.3737039170506912, + "grad_norm": 0.6929857730865479, + "learning_rate": 3.4671482543316666e-05, + "loss": 0.0829, + "step": 2595 + }, + { + "epoch": 0.3738479262672811, + "grad_norm": 6.531679630279541, + "learning_rate": 3.466105174277321e-05, + "loss": 1.499, + "step": 2596 + }, + { + "epoch": 0.37399193548387094, + "grad_norm": 3.7059526443481445, + "learning_rate": 3.465061896478335e-05, + "loss": 1.2836, + "step": 2597 + }, + { + "epoch": 0.37413594470046085, + "grad_norm": 1.0551663637161255, + "learning_rate": 3.464018421148249e-05, + "loss": 0.136, + "step": 2598 + }, + { + "epoch": 0.3742799539170507, + "grad_norm": 3.9718592166900635, + "learning_rate": 3.4629747485006424e-05, + "loss": 0.2289, + "step": 2599 + }, + { + "epoch": 0.37442396313364057, + "grad_norm": 3.449723482131958, + "learning_rate": 3.4619308787491394e-05, + "loss": 1.3348, + "step": 2600 + }, + { + "epoch": 0.3745679723502304, + "grad_norm": 0.6961051821708679, + "learning_rate": 3.4608868121074e-05, + "loss": 0.0812, + "step": 2601 + }, + { + "epoch": 0.3747119815668203, + "grad_norm": 0.7959226965904236, + "learning_rate": 3.459842548789127e-05, + "loss": 0.0998, + "step": 2602 + }, + { + "epoch": 0.37485599078341014, + "grad_norm": 4.691957473754883, + "learning_rate": 3.458798089008061e-05, + "loss": 0.6014, + "step": 2603 + }, + { + "epoch": 0.375, + "grad_norm": 7.617314338684082, + "learning_rate": 3.457753432977986e-05, + "loss": 1.0371, + "step": 2604 + }, + { + "epoch": 0.37514400921658986, + "grad_norm": 0.4397721290588379, + "learning_rate": 3.456708580912725e-05, + "loss": 0.0692, + "step": 2605 + }, + { + "epoch": 0.3752880184331797, + "grad_norm": 4.82490348815918, + "learning_rate": 3.455663533026139e-05, + "loss": 0.9369, + "step": 2606 + }, + { + "epoch": 0.3754320276497696, + "grad_norm": 0.9806955456733704, + "learning_rate": 3.4546182895321315e-05, + "loss": 0.1292, + "step": 2607 + }, + { + "epoch": 0.37557603686635943, + "grad_norm": 4.280431270599365, + "learning_rate": 3.4535728506446466e-05, + "loss": 1.3099, + "step": 2608 + }, + { + "epoch": 0.3757200460829493, + "grad_norm": 0.8615792989730835, + "learning_rate": 3.452527216577665e-05, + "loss": 0.1139, + "step": 2609 + }, + { + "epoch": 0.37586405529953915, + "grad_norm": 1.0327812433242798, + "learning_rate": 3.4514813875452115e-05, + "loss": 0.1014, + "step": 2610 + }, + { + "epoch": 0.37600806451612906, + "grad_norm": 1.618804693222046, + "learning_rate": 3.450435363761347e-05, + "loss": 0.168, + "step": 2611 + }, + { + "epoch": 0.3761520737327189, + "grad_norm": 2.0365889072418213, + "learning_rate": 3.449389145440175e-05, + "loss": 0.1434, + "step": 2612 + }, + { + "epoch": 0.3762960829493088, + "grad_norm": 4.393850803375244, + "learning_rate": 3.448342732795838e-05, + "loss": 0.2865, + "step": 2613 + }, + { + "epoch": 0.37644009216589863, + "grad_norm": 4.217681407928467, + "learning_rate": 3.4472961260425186e-05, + "loss": 0.4797, + "step": 2614 + }, + { + "epoch": 0.3765841013824885, + "grad_norm": 1.0851399898529053, + "learning_rate": 3.446249325394437e-05, + "loss": 0.1079, + "step": 2615 + }, + { + "epoch": 0.37672811059907835, + "grad_norm": 1.0779836177825928, + "learning_rate": 3.445202331065857e-05, + "loss": 0.1021, + "step": 2616 + }, + { + "epoch": 0.3768721198156682, + "grad_norm": 3.0106918811798096, + "learning_rate": 3.4441551432710784e-05, + "loss": 0.3301, + "step": 2617 + }, + { + "epoch": 0.37701612903225806, + "grad_norm": 0.9944549798965454, + "learning_rate": 3.443107762224442e-05, + "loss": 0.1179, + "step": 2618 + }, + { + "epoch": 0.3771601382488479, + "grad_norm": 5.69107723236084, + "learning_rate": 3.4420601881403284e-05, + "loss": 0.531, + "step": 2619 + }, + { + "epoch": 0.3773041474654378, + "grad_norm": 3.182805061340332, + "learning_rate": 3.441012421233159e-05, + "loss": 0.2115, + "step": 2620 + }, + { + "epoch": 0.37744815668202764, + "grad_norm": 0.9898135662078857, + "learning_rate": 3.4399644617173896e-05, + "loss": 4.363, + "step": 2621 + }, + { + "epoch": 0.3775921658986175, + "grad_norm": 1.4007617235183716, + "learning_rate": 3.438916309807522e-05, + "loss": 0.1219, + "step": 2622 + }, + { + "epoch": 0.37773617511520735, + "grad_norm": 3.2011373043060303, + "learning_rate": 3.437867965718093e-05, + "loss": 0.2208, + "step": 2623 + }, + { + "epoch": 0.3778801843317972, + "grad_norm": 2.0934274196624756, + "learning_rate": 3.436819429663682e-05, + "loss": 0.1796, + "step": 2624 + }, + { + "epoch": 0.3780241935483871, + "grad_norm": 0.43932196497917175, + "learning_rate": 3.4357707018589036e-05, + "loss": 0.038, + "step": 2625 + }, + { + "epoch": 0.378168202764977, + "grad_norm": 2.0114619731903076, + "learning_rate": 3.4347217825184134e-05, + "loss": 0.2847, + "step": 2626 + }, + { + "epoch": 0.37831221198156684, + "grad_norm": 3.233339548110962, + "learning_rate": 3.433672671856909e-05, + "loss": 0.4277, + "step": 2627 + }, + { + "epoch": 0.3784562211981567, + "grad_norm": 0.47524890303611755, + "learning_rate": 3.4326233700891236e-05, + "loss": 4.5046, + "step": 2628 + }, + { + "epoch": 0.37860023041474655, + "grad_norm": 1.5623345375061035, + "learning_rate": 3.43157387742983e-05, + "loss": 0.1739, + "step": 2629 + }, + { + "epoch": 0.3787442396313364, + "grad_norm": 1.000221848487854, + "learning_rate": 3.4305241940938425e-05, + "loss": 0.0894, + "step": 2630 + }, + { + "epoch": 0.37888824884792627, + "grad_norm": 5.744750499725342, + "learning_rate": 3.429474320296011e-05, + "loss": 0.6056, + "step": 2631 + }, + { + "epoch": 0.3790322580645161, + "grad_norm": 7.583759307861328, + "learning_rate": 3.428424256251227e-05, + "loss": 0.5123, + "step": 2632 + }, + { + "epoch": 0.379176267281106, + "grad_norm": 1.2095781564712524, + "learning_rate": 3.42737400217442e-05, + "loss": 0.1005, + "step": 2633 + }, + { + "epoch": 0.37932027649769584, + "grad_norm": 4.44573450088501, + "learning_rate": 3.426323558280558e-05, + "loss": 1.1304, + "step": 2634 + }, + { + "epoch": 0.3794642857142857, + "grad_norm": 1.3708473443984985, + "learning_rate": 3.4252729247846486e-05, + "loss": 4.3781, + "step": 2635 + }, + { + "epoch": 0.37960829493087556, + "grad_norm": 8.761052131652832, + "learning_rate": 3.424222101901738e-05, + "loss": 1.9044, + "step": 2636 + }, + { + "epoch": 0.3797523041474654, + "grad_norm": 0.49964088201522827, + "learning_rate": 3.4231710898469105e-05, + "loss": 0.0832, + "step": 2637 + }, + { + "epoch": 0.3798963133640553, + "grad_norm": 5.16946268081665, + "learning_rate": 3.4221198888352907e-05, + "loss": 0.8849, + "step": 2638 + }, + { + "epoch": 0.3800403225806452, + "grad_norm": 1.8370614051818848, + "learning_rate": 3.42106849908204e-05, + "loss": 0.2678, + "step": 2639 + }, + { + "epoch": 0.38018433179723504, + "grad_norm": 6.683591365814209, + "learning_rate": 3.4200169208023594e-05, + "loss": 1.6463, + "step": 2640 + }, + { + "epoch": 0.3803283410138249, + "grad_norm": 1.1464613676071167, + "learning_rate": 3.4189651542114884e-05, + "loss": 0.1278, + "step": 2641 + }, + { + "epoch": 0.38047235023041476, + "grad_norm": 0.7598159909248352, + "learning_rate": 3.417913199524705e-05, + "loss": 0.114, + "step": 2642 + }, + { + "epoch": 0.3806163594470046, + "grad_norm": 4.627331733703613, + "learning_rate": 3.4168610569573256e-05, + "loss": 0.4429, + "step": 2643 + }, + { + "epoch": 0.3807603686635945, + "grad_norm": 1.442900538444519, + "learning_rate": 3.4158087267247066e-05, + "loss": 0.1659, + "step": 2644 + }, + { + "epoch": 0.38090437788018433, + "grad_norm": 0.576085090637207, + "learning_rate": 3.4147562090422394e-05, + "loss": 0.0658, + "step": 2645 + }, + { + "epoch": 0.3810483870967742, + "grad_norm": 0.714860737323761, + "learning_rate": 3.4137035041253565e-05, + "loss": 4.3624, + "step": 2646 + }, + { + "epoch": 0.38119239631336405, + "grad_norm": 1.7053929567337036, + "learning_rate": 3.412650612189528e-05, + "loss": 0.1653, + "step": 2647 + }, + { + "epoch": 0.3813364055299539, + "grad_norm": 3.170530319213867, + "learning_rate": 3.411597533450262e-05, + "loss": 1.1674, + "step": 2648 + }, + { + "epoch": 0.38148041474654376, + "grad_norm": 3.350327730178833, + "learning_rate": 3.410544268123106e-05, + "loss": 0.2872, + "step": 2649 + }, + { + "epoch": 0.3816244239631336, + "grad_norm": 6.216424942016602, + "learning_rate": 3.4094908164236436e-05, + "loss": 1.3838, + "step": 2650 + }, + { + "epoch": 0.3817684331797235, + "grad_norm": 1.8628824949264526, + "learning_rate": 3.408437178567499e-05, + "loss": 0.2269, + "step": 2651 + }, + { + "epoch": 0.3819124423963134, + "grad_norm": 2.910759687423706, + "learning_rate": 3.407383354770332e-05, + "loss": 0.2618, + "step": 2652 + }, + { + "epoch": 0.38205645161290325, + "grad_norm": 0.921834409236908, + "learning_rate": 3.406329345247842e-05, + "loss": 0.1202, + "step": 2653 + }, + { + "epoch": 0.3822004608294931, + "grad_norm": 2.555323600769043, + "learning_rate": 3.405275150215766e-05, + "loss": 0.2637, + "step": 2654 + }, + { + "epoch": 0.38234447004608296, + "grad_norm": 2.9241535663604736, + "learning_rate": 3.40422076988988e-05, + "loss": 0.2332, + "step": 2655 + }, + { + "epoch": 0.3824884792626728, + "grad_norm": 4.030672073364258, + "learning_rate": 3.403166204485996e-05, + "loss": 0.154, + "step": 2656 + }, + { + "epoch": 0.3826324884792627, + "grad_norm": 2.0740842819213867, + "learning_rate": 3.4021114542199664e-05, + "loss": 0.1468, + "step": 2657 + }, + { + "epoch": 0.38277649769585254, + "grad_norm": 0.7287181615829468, + "learning_rate": 3.4010565193076776e-05, + "loss": 0.0838, + "step": 2658 + }, + { + "epoch": 0.3829205069124424, + "grad_norm": 2.1190266609191895, + "learning_rate": 3.400001399965057e-05, + "loss": 0.2555, + "step": 2659 + }, + { + "epoch": 0.38306451612903225, + "grad_norm": 6.438807964324951, + "learning_rate": 3.3989460964080704e-05, + "loss": 2.8465, + "step": 2660 + }, + { + "epoch": 0.3832085253456221, + "grad_norm": 4.167162895202637, + "learning_rate": 3.397890608852718e-05, + "loss": 0.4986, + "step": 2661 + }, + { + "epoch": 0.38335253456221197, + "grad_norm": 6.952796936035156, + "learning_rate": 3.3968349375150396e-05, + "loss": 1.1014, + "step": 2662 + }, + { + "epoch": 0.3834965437788018, + "grad_norm": 4.8697428703308105, + "learning_rate": 3.395779082611113e-05, + "loss": 1.1559, + "step": 2663 + }, + { + "epoch": 0.3836405529953917, + "grad_norm": 1.4654505252838135, + "learning_rate": 3.3947230443570536e-05, + "loss": 0.1729, + "step": 2664 + }, + { + "epoch": 0.38378456221198154, + "grad_norm": 5.4022746086120605, + "learning_rate": 3.393666822969012e-05, + "loss": 0.807, + "step": 2665 + }, + { + "epoch": 0.38392857142857145, + "grad_norm": 1.7951315641403198, + "learning_rate": 3.3926104186631795e-05, + "loss": 0.2476, + "step": 2666 + }, + { + "epoch": 0.3840725806451613, + "grad_norm": 7.154374599456787, + "learning_rate": 3.391553831655782e-05, + "loss": 1.7027, + "step": 2667 + }, + { + "epoch": 0.38421658986175117, + "grad_norm": 1.2448991537094116, + "learning_rate": 3.3904970621630866e-05, + "loss": 0.1367, + "step": 2668 + }, + { + "epoch": 0.384360599078341, + "grad_norm": 2.361400604248047, + "learning_rate": 3.389440110401393e-05, + "loss": 0.2623, + "step": 2669 + }, + { + "epoch": 0.3845046082949309, + "grad_norm": 1.7407917976379395, + "learning_rate": 3.3883829765870415e-05, + "loss": 0.1779, + "step": 2670 + }, + { + "epoch": 0.38464861751152074, + "grad_norm": 4.457988262176514, + "learning_rate": 3.387325660936409e-05, + "loss": 0.2754, + "step": 2671 + }, + { + "epoch": 0.3847926267281106, + "grad_norm": 0.8829044103622437, + "learning_rate": 3.3862681636659086e-05, + "loss": 0.1138, + "step": 2672 + }, + { + "epoch": 0.38493663594470046, + "grad_norm": 1.744327187538147, + "learning_rate": 3.3852104849919905e-05, + "loss": 0.2387, + "step": 2673 + }, + { + "epoch": 0.3850806451612903, + "grad_norm": 2.4028396606445312, + "learning_rate": 3.384152625131145e-05, + "loss": 0.2984, + "step": 2674 + }, + { + "epoch": 0.38522465437788017, + "grad_norm": 2.011183023452759, + "learning_rate": 3.3830945842998954e-05, + "loss": 0.3251, + "step": 2675 + }, + { + "epoch": 0.38536866359447003, + "grad_norm": 2.014815330505371, + "learning_rate": 3.382036362714805e-05, + "loss": 0.3597, + "step": 2676 + }, + { + "epoch": 0.3855126728110599, + "grad_norm": 1.9204961061477661, + "learning_rate": 3.380977960592473e-05, + "loss": 0.2004, + "step": 2677 + }, + { + "epoch": 0.38565668202764974, + "grad_norm": 1.1803172826766968, + "learning_rate": 3.379919378149535e-05, + "loss": 0.1044, + "step": 2678 + }, + { + "epoch": 0.38580069124423966, + "grad_norm": 3.495753765106201, + "learning_rate": 3.378860615602665e-05, + "loss": 0.7918, + "step": 2679 + }, + { + "epoch": 0.3859447004608295, + "grad_norm": 2.929100275039673, + "learning_rate": 3.377801673168571e-05, + "loss": 0.3603, + "step": 2680 + }, + { + "epoch": 0.3860887096774194, + "grad_norm": 1.639853835105896, + "learning_rate": 3.3767425510640026e-05, + "loss": 0.1115, + "step": 2681 + }, + { + "epoch": 0.38623271889400923, + "grad_norm": 1.6573162078857422, + "learning_rate": 3.3756832495057414e-05, + "loss": 0.1531, + "step": 2682 + }, + { + "epoch": 0.3863767281105991, + "grad_norm": 0.6934017539024353, + "learning_rate": 3.3746237687106086e-05, + "loss": 0.1066, + "step": 2683 + }, + { + "epoch": 0.38652073732718895, + "grad_norm": 3.8039562702178955, + "learning_rate": 3.3735641088954595e-05, + "loss": 0.4806, + "step": 2684 + }, + { + "epoch": 0.3866647465437788, + "grad_norm": 4.230919361114502, + "learning_rate": 3.37250427027719e-05, + "loss": 0.3359, + "step": 2685 + }, + { + "epoch": 0.38680875576036866, + "grad_norm": 3.106762647628784, + "learning_rate": 3.3714442530727296e-05, + "loss": 1.2406, + "step": 2686 + }, + { + "epoch": 0.3869527649769585, + "grad_norm": 1.224800944328308, + "learning_rate": 3.3703840574990444e-05, + "loss": 0.2216, + "step": 2687 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 1.4044371843338013, + "learning_rate": 3.3693236837731383e-05, + "loss": 0.1816, + "step": 2688 + }, + { + "epoch": 0.38724078341013823, + "grad_norm": 1.2684845924377441, + "learning_rate": 3.3682631321120504e-05, + "loss": 0.139, + "step": 2689 + }, + { + "epoch": 0.3873847926267281, + "grad_norm": 9.829960823059082, + "learning_rate": 3.367202402732858e-05, + "loss": 0.8398, + "step": 2690 + }, + { + "epoch": 0.38752880184331795, + "grad_norm": 0.6626307368278503, + "learning_rate": 3.366141495852673e-05, + "loss": 0.0942, + "step": 2691 + }, + { + "epoch": 0.3876728110599078, + "grad_norm": 2.674124240875244, + "learning_rate": 3.365080411688644e-05, + "loss": 0.4344, + "step": 2692 + }, + { + "epoch": 0.3878168202764977, + "grad_norm": 1.7841304540634155, + "learning_rate": 3.364019150457956e-05, + "loss": 0.1716, + "step": 2693 + }, + { + "epoch": 0.3879608294930876, + "grad_norm": 0.6191197633743286, + "learning_rate": 3.3629577123778305e-05, + "loss": 0.0736, + "step": 2694 + }, + { + "epoch": 0.38810483870967744, + "grad_norm": 2.0989935398101807, + "learning_rate": 3.361896097665526e-05, + "loss": 0.3285, + "step": 2695 + }, + { + "epoch": 0.3882488479262673, + "grad_norm": 2.0296709537506104, + "learning_rate": 3.360834306538336e-05, + "loss": 0.2543, + "step": 2696 + }, + { + "epoch": 0.38839285714285715, + "grad_norm": 3.3236746788024902, + "learning_rate": 3.35977233921359e-05, + "loss": 0.3778, + "step": 2697 + }, + { + "epoch": 0.388536866359447, + "grad_norm": 5.08437442779541, + "learning_rate": 3.358710195908653e-05, + "loss": 1.2187, + "step": 2698 + }, + { + "epoch": 0.38868087557603687, + "grad_norm": 0.6429014205932617, + "learning_rate": 3.357647876840928e-05, + "loss": 0.1108, + "step": 2699 + }, + { + "epoch": 0.3888248847926267, + "grad_norm": 2.168095350265503, + "learning_rate": 3.356585382227854e-05, + "loss": 0.1731, + "step": 2700 + }, + { + "epoch": 0.3889688940092166, + "grad_norm": 4.8050312995910645, + "learning_rate": 3.355522712286902e-05, + "loss": 1.0112, + "step": 2701 + }, + { + "epoch": 0.38911290322580644, + "grad_norm": 4.110383987426758, + "learning_rate": 3.354459867235584e-05, + "loss": 0.1131, + "step": 2702 + }, + { + "epoch": 0.3892569124423963, + "grad_norm": 2.935335636138916, + "learning_rate": 3.353396847291446e-05, + "loss": 0.5506, + "step": 2703 + }, + { + "epoch": 0.38940092165898615, + "grad_norm": 6.200320243835449, + "learning_rate": 3.352333652672067e-05, + "loss": 0.7259, + "step": 2704 + }, + { + "epoch": 0.389544930875576, + "grad_norm": 2.701198101043701, + "learning_rate": 3.351270283595066e-05, + "loss": 0.296, + "step": 2705 + }, + { + "epoch": 0.3896889400921659, + "grad_norm": 1.151998519897461, + "learning_rate": 3.350206740278095e-05, + "loss": 0.1558, + "step": 2706 + }, + { + "epoch": 0.3898329493087558, + "grad_norm": 2.1983530521392822, + "learning_rate": 3.349143022938843e-05, + "loss": 0.1406, + "step": 2707 + }, + { + "epoch": 0.38997695852534564, + "grad_norm": 4.375380516052246, + "learning_rate": 3.3480791317950346e-05, + "loss": 0.9946, + "step": 2708 + }, + { + "epoch": 0.3901209677419355, + "grad_norm": 4.256187438964844, + "learning_rate": 3.3470150670644286e-05, + "loss": 0.5298, + "step": 2709 + }, + { + "epoch": 0.39026497695852536, + "grad_norm": 2.7562620639801025, + "learning_rate": 3.34595082896482e-05, + "loss": 0.3598, + "step": 2710 + }, + { + "epoch": 0.3904089861751152, + "grad_norm": 4.732277870178223, + "learning_rate": 3.3448864177140406e-05, + "loss": 0.9926, + "step": 2711 + }, + { + "epoch": 0.39055299539170507, + "grad_norm": 5.534977912902832, + "learning_rate": 3.3438218335299554e-05, + "loss": 1.4106, + "step": 2712 + }, + { + "epoch": 0.39069700460829493, + "grad_norm": 0.787486732006073, + "learning_rate": 3.342757076630467e-05, + "loss": 0.081, + "step": 2713 + }, + { + "epoch": 0.3908410138248848, + "grad_norm": 3.480271577835083, + "learning_rate": 3.3416921472335115e-05, + "loss": 0.3587, + "step": 2714 + }, + { + "epoch": 0.39098502304147464, + "grad_norm": 0.7766627073287964, + "learning_rate": 3.3406270455570616e-05, + "loss": 4.1436, + "step": 2715 + }, + { + "epoch": 0.3911290322580645, + "grad_norm": 3.6750547885894775, + "learning_rate": 3.339561771819125e-05, + "loss": 0.4383, + "step": 2716 + }, + { + "epoch": 0.39127304147465436, + "grad_norm": 7.591723442077637, + "learning_rate": 3.338496326237743e-05, + "loss": 1.869, + "step": 2717 + }, + { + "epoch": 0.3914170506912442, + "grad_norm": 2.5503902435302734, + "learning_rate": 3.337430709030995e-05, + "loss": 0.2168, + "step": 2718 + }, + { + "epoch": 0.3915610599078341, + "grad_norm": 1.6242977380752563, + "learning_rate": 3.3363649204169934e-05, + "loss": 0.31, + "step": 2719 + }, + { + "epoch": 0.391705069124424, + "grad_norm": 1.5715885162353516, + "learning_rate": 3.3352989606138865e-05, + "loss": 0.1418, + "step": 2720 + }, + { + "epoch": 0.39184907834101385, + "grad_norm": 2.222717761993408, + "learning_rate": 3.3342328298398565e-05, + "loss": 0.2022, + "step": 2721 + }, + { + "epoch": 0.3919930875576037, + "grad_norm": 6.834585666656494, + "learning_rate": 3.333166528313123e-05, + "loss": 1.6609, + "step": 2722 + }, + { + "epoch": 0.39213709677419356, + "grad_norm": 2.055112838745117, + "learning_rate": 3.332100056251938e-05, + "loss": 0.3187, + "step": 2723 + }, + { + "epoch": 0.3922811059907834, + "grad_norm": 1.011413335800171, + "learning_rate": 3.33103341387459e-05, + "loss": 0.1343, + "step": 2724 + }, + { + "epoch": 0.3924251152073733, + "grad_norm": 6.101302623748779, + "learning_rate": 3.329966601399401e-05, + "loss": 0.6709, + "step": 2725 + }, + { + "epoch": 0.39256912442396313, + "grad_norm": 1.7338722944259644, + "learning_rate": 3.32889961904473e-05, + "loss": 0.164, + "step": 2726 + }, + { + "epoch": 0.392713133640553, + "grad_norm": 0.9291833639144897, + "learning_rate": 3.327832467028969e-05, + "loss": 0.1042, + "step": 2727 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 0.9566053152084351, + "learning_rate": 3.326765145570544e-05, + "loss": 0.1271, + "step": 2728 + }, + { + "epoch": 0.3930011520737327, + "grad_norm": 2.1914970874786377, + "learning_rate": 3.3256976548879184e-05, + "loss": 0.1993, + "step": 2729 + }, + { + "epoch": 0.39314516129032256, + "grad_norm": 4.801070690155029, + "learning_rate": 3.3246299951995865e-05, + "loss": 0.2967, + "step": 2730 + }, + { + "epoch": 0.3932891705069124, + "grad_norm": 0.8292073011398315, + "learning_rate": 3.323562166724082e-05, + "loss": 0.075, + "step": 2731 + }, + { + "epoch": 0.3934331797235023, + "grad_norm": 1.0867822170257568, + "learning_rate": 3.322494169679969e-05, + "loss": 0.1213, + "step": 2732 + }, + { + "epoch": 0.3935771889400922, + "grad_norm": 2.7340714931488037, + "learning_rate": 3.321426004285848e-05, + "loss": 0.2944, + "step": 2733 + }, + { + "epoch": 0.39372119815668205, + "grad_norm": 0.7783468961715698, + "learning_rate": 3.320357670760352e-05, + "loss": 0.0886, + "step": 2734 + }, + { + "epoch": 0.3938652073732719, + "grad_norm": 4.320103645324707, + "learning_rate": 3.319289169322153e-05, + "loss": 0.4087, + "step": 2735 + }, + { + "epoch": 0.39400921658986177, + "grad_norm": 1.4083789587020874, + "learning_rate": 3.3182205001899525e-05, + "loss": 0.1831, + "step": 2736 + }, + { + "epoch": 0.3941532258064516, + "grad_norm": 2.9892005920410156, + "learning_rate": 3.317151663582488e-05, + "loss": 0.3132, + "step": 2737 + }, + { + "epoch": 0.3942972350230415, + "grad_norm": 2.7503836154937744, + "learning_rate": 3.316082659718532e-05, + "loss": 0.3128, + "step": 2738 + }, + { + "epoch": 0.39444124423963134, + "grad_norm": 4.68699836730957, + "learning_rate": 3.3150134888168905e-05, + "loss": 2.7875, + "step": 2739 + }, + { + "epoch": 0.3945852534562212, + "grad_norm": 0.9361661076545715, + "learning_rate": 3.313944151096404e-05, + "loss": 0.1039, + "step": 2740 + }, + { + "epoch": 0.39472926267281105, + "grad_norm": 0.743615984916687, + "learning_rate": 3.312874646775947e-05, + "loss": 0.0971, + "step": 2741 + }, + { + "epoch": 0.3948732718894009, + "grad_norm": 5.496874809265137, + "learning_rate": 3.311804976074428e-05, + "loss": 1.3336, + "step": 2742 + }, + { + "epoch": 0.39501728110599077, + "grad_norm": 4.148477554321289, + "learning_rate": 3.3107351392107896e-05, + "loss": 2.2771, + "step": 2743 + }, + { + "epoch": 0.3951612903225806, + "grad_norm": 4.437798500061035, + "learning_rate": 3.309665136404009e-05, + "loss": 0.7189, + "step": 2744 + }, + { + "epoch": 0.3953052995391705, + "grad_norm": 3.1266372203826904, + "learning_rate": 3.308594967873095e-05, + "loss": 0.2499, + "step": 2745 + }, + { + "epoch": 0.39544930875576034, + "grad_norm": 2.1577720642089844, + "learning_rate": 3.307524633837095e-05, + "loss": 0.2004, + "step": 2746 + }, + { + "epoch": 0.39559331797235026, + "grad_norm": 9.305359840393066, + "learning_rate": 3.306454134515086e-05, + "loss": 0.5386, + "step": 2747 + }, + { + "epoch": 0.3957373271889401, + "grad_norm": 3.5590052604675293, + "learning_rate": 3.30538347012618e-05, + "loss": 0.3932, + "step": 2748 + }, + { + "epoch": 0.39588133640552997, + "grad_norm": 3.3273673057556152, + "learning_rate": 3.304312640889523e-05, + "loss": 0.3441, + "step": 2749 + }, + { + "epoch": 0.39602534562211983, + "grad_norm": 17.759159088134766, + "learning_rate": 3.303241647024296e-05, + "loss": 3.6117, + "step": 2750 + }, + { + "epoch": 0.3961693548387097, + "grad_norm": 4.61497688293457, + "learning_rate": 3.3021704887497114e-05, + "loss": 1.4846, + "step": 2751 + }, + { + "epoch": 0.39631336405529954, + "grad_norm": 6.041271209716797, + "learning_rate": 3.301099166285017e-05, + "loss": 2.4381, + "step": 2752 + }, + { + "epoch": 0.3964573732718894, + "grad_norm": 1.3148545026779175, + "learning_rate": 3.300027679849492e-05, + "loss": 0.1625, + "step": 2753 + }, + { + "epoch": 0.39660138248847926, + "grad_norm": 2.2704923152923584, + "learning_rate": 3.298956029662453e-05, + "loss": 0.3394, + "step": 2754 + }, + { + "epoch": 0.3967453917050691, + "grad_norm": 1.2594618797302246, + "learning_rate": 3.297884215943246e-05, + "loss": 0.1495, + "step": 2755 + }, + { + "epoch": 0.396889400921659, + "grad_norm": 1.7887327671051025, + "learning_rate": 3.2968122389112544e-05, + "loss": 0.1311, + "step": 2756 + }, + { + "epoch": 0.39703341013824883, + "grad_norm": 1.7456347942352295, + "learning_rate": 3.295740098785891e-05, + "loss": 0.1799, + "step": 2757 + }, + { + "epoch": 0.3971774193548387, + "grad_norm": 0.9053346514701843, + "learning_rate": 3.294667795786604e-05, + "loss": 0.0798, + "step": 2758 + }, + { + "epoch": 0.39732142857142855, + "grad_norm": 4.256875991821289, + "learning_rate": 3.293595330132876e-05, + "loss": 0.5322, + "step": 2759 + }, + { + "epoch": 0.39746543778801846, + "grad_norm": 2.7942299842834473, + "learning_rate": 3.292522702044221e-05, + "loss": 0.3525, + "step": 2760 + }, + { + "epoch": 0.3976094470046083, + "grad_norm": 2.435896635055542, + "learning_rate": 3.2914499117401865e-05, + "loss": 0.3475, + "step": 2761 + }, + { + "epoch": 0.3977534562211982, + "grad_norm": 0.8265592455863953, + "learning_rate": 3.2903769594403545e-05, + "loss": 0.1031, + "step": 2762 + }, + { + "epoch": 0.39789746543778803, + "grad_norm": 3.0940544605255127, + "learning_rate": 3.28930384536434e-05, + "loss": 0.5214, + "step": 2763 + }, + { + "epoch": 0.3980414746543779, + "grad_norm": 0.6167076826095581, + "learning_rate": 3.288230569731789e-05, + "loss": 0.0471, + "step": 2764 + }, + { + "epoch": 0.39818548387096775, + "grad_norm": 0.9050828218460083, + "learning_rate": 3.2871571327623826e-05, + "loss": 0.1133, + "step": 2765 + }, + { + "epoch": 0.3983294930875576, + "grad_norm": 1.5604090690612793, + "learning_rate": 3.286083534675835e-05, + "loss": 0.1685, + "step": 2766 + }, + { + "epoch": 0.39847350230414746, + "grad_norm": 2.2503037452697754, + "learning_rate": 3.285009775691892e-05, + "loss": 0.1951, + "step": 2767 + }, + { + "epoch": 0.3986175115207373, + "grad_norm": 4.580416679382324, + "learning_rate": 3.283935856030334e-05, + "loss": 0.9052, + "step": 2768 + }, + { + "epoch": 0.3987615207373272, + "grad_norm": 0.7201671600341797, + "learning_rate": 3.2828617759109714e-05, + "loss": 0.0428, + "step": 2769 + }, + { + "epoch": 0.39890552995391704, + "grad_norm": 3.4367780685424805, + "learning_rate": 3.281787535553651e-05, + "loss": 0.3394, + "step": 2770 + }, + { + "epoch": 0.3990495391705069, + "grad_norm": 1.2273643016815186, + "learning_rate": 3.2807131351782505e-05, + "loss": 0.1975, + "step": 2771 + }, + { + "epoch": 0.39919354838709675, + "grad_norm": 11.702470779418945, + "learning_rate": 3.279638575004681e-05, + "loss": 2.1901, + "step": 2772 + }, + { + "epoch": 0.3993375576036866, + "grad_norm": 1.7530397176742554, + "learning_rate": 3.278563855252885e-05, + "loss": 0.1871, + "step": 2773 + }, + { + "epoch": 0.3994815668202765, + "grad_norm": 0.7021932005882263, + "learning_rate": 3.2774889761428396e-05, + "loss": 0.0669, + "step": 2774 + }, + { + "epoch": 0.3996255760368664, + "grad_norm": 1.2017234563827515, + "learning_rate": 3.276413937894552e-05, + "loss": 0.1166, + "step": 2775 + }, + { + "epoch": 0.39976958525345624, + "grad_norm": 0.6240705251693726, + "learning_rate": 3.2753387407280656e-05, + "loss": 0.0971, + "step": 2776 + }, + { + "epoch": 0.3999135944700461, + "grad_norm": 5.324913024902344, + "learning_rate": 3.274263384863453e-05, + "loss": 0.2731, + "step": 2777 + }, + { + "epoch": 0.40005760368663595, + "grad_norm": 4.35202169418335, + "learning_rate": 3.273187870520821e-05, + "loss": 0.6194, + "step": 2778 + }, + { + "epoch": 0.4002016129032258, + "grad_norm": 5.535919189453125, + "learning_rate": 3.2721121979203086e-05, + "loss": 1.6973, + "step": 2779 + }, + { + "epoch": 0.40034562211981567, + "grad_norm": 3.4298694133758545, + "learning_rate": 3.271036367282085e-05, + "loss": 0.1941, + "step": 2780 + }, + { + "epoch": 0.4004896313364055, + "grad_norm": 5.272246360778809, + "learning_rate": 3.269960378826357e-05, + "loss": 1.7849, + "step": 2781 + }, + { + "epoch": 0.4006336405529954, + "grad_norm": 4.242159843444824, + "learning_rate": 3.2688842327733574e-05, + "loss": 2.1519, + "step": 2782 + }, + { + "epoch": 0.40077764976958524, + "grad_norm": 3.156977891921997, + "learning_rate": 3.267807929343356e-05, + "loss": 0.257, + "step": 2783 + }, + { + "epoch": 0.4009216589861751, + "grad_norm": 1.4907149076461792, + "learning_rate": 3.266731468756653e-05, + "loss": 0.1528, + "step": 2784 + }, + { + "epoch": 0.40106566820276496, + "grad_norm": 1.3671919107437134, + "learning_rate": 3.265654851233579e-05, + "loss": 0.1851, + "step": 2785 + }, + { + "epoch": 0.4012096774193548, + "grad_norm": 0.8478295803070068, + "learning_rate": 3.264578076994502e-05, + "loss": 0.0815, + "step": 2786 + }, + { + "epoch": 0.4013536866359447, + "grad_norm": 1.2977957725524902, + "learning_rate": 3.2635011462598145e-05, + "loss": 0.1212, + "step": 2787 + }, + { + "epoch": 0.4014976958525346, + "grad_norm": 1.4919143915176392, + "learning_rate": 3.262424059249949e-05, + "loss": 0.1459, + "step": 2788 + }, + { + "epoch": 0.40164170506912444, + "grad_norm": 5.393086910247803, + "learning_rate": 3.2613468161853625e-05, + "loss": 1.5592, + "step": 2789 + }, + { + "epoch": 0.4017857142857143, + "grad_norm": 4.374269485473633, + "learning_rate": 3.260269417286551e-05, + "loss": 0.3057, + "step": 2790 + }, + { + "epoch": 0.40192972350230416, + "grad_norm": 1.0540432929992676, + "learning_rate": 3.259191862774037e-05, + "loss": 0.1549, + "step": 2791 + }, + { + "epoch": 0.402073732718894, + "grad_norm": 1.757839322090149, + "learning_rate": 3.258114152868378e-05, + "loss": 0.1909, + "step": 2792 + }, + { + "epoch": 0.4022177419354839, + "grad_norm": 2.952930212020874, + "learning_rate": 3.2570362877901605e-05, + "loss": 0.2224, + "step": 2793 + }, + { + "epoch": 0.40236175115207373, + "grad_norm": 4.139651775360107, + "learning_rate": 3.255958267760006e-05, + "loss": 2.8855, + "step": 2794 + }, + { + "epoch": 0.4025057603686636, + "grad_norm": 4.971797466278076, + "learning_rate": 3.254880092998566e-05, + "loss": 1.6216, + "step": 2795 + }, + { + "epoch": 0.40264976958525345, + "grad_norm": 3.9819772243499756, + "learning_rate": 3.253801763726523e-05, + "loss": 0.295, + "step": 2796 + }, + { + "epoch": 0.4027937788018433, + "grad_norm": 6.160065174102783, + "learning_rate": 3.2527232801645924e-05, + "loss": 2.1618, + "step": 2797 + }, + { + "epoch": 0.40293778801843316, + "grad_norm": 1.894554853439331, + "learning_rate": 3.25164464253352e-05, + "loss": 0.2219, + "step": 2798 + }, + { + "epoch": 0.403081797235023, + "grad_norm": 2.2907233238220215, + "learning_rate": 3.250565851054086e-05, + "loss": 0.142, + "step": 2799 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 2.4268405437469482, + "learning_rate": 3.2494869059470964e-05, + "loss": 0.2828, + "step": 2800 + }, + { + "epoch": 0.4033698156682028, + "grad_norm": 1.1849952936172485, + "learning_rate": 3.2484078074333954e-05, + "loss": 0.1805, + "step": 2801 + }, + { + "epoch": 0.40351382488479265, + "grad_norm": 0.6039983034133911, + "learning_rate": 3.247328555733854e-05, + "loss": 0.0672, + "step": 2802 + }, + { + "epoch": 0.4036578341013825, + "grad_norm": 0.8026435375213623, + "learning_rate": 3.2462491510693753e-05, + "loss": 0.0884, + "step": 2803 + }, + { + "epoch": 0.40380184331797236, + "grad_norm": 1.4532712697982788, + "learning_rate": 3.2451695936608964e-05, + "loss": 0.16, + "step": 2804 + }, + { + "epoch": 0.4039458525345622, + "grad_norm": 0.9591280817985535, + "learning_rate": 3.2440898837293814e-05, + "loss": 0.1063, + "step": 2805 + }, + { + "epoch": 0.4040898617511521, + "grad_norm": 7.7761125564575195, + "learning_rate": 3.243010021495829e-05, + "loss": 1.0228, + "step": 2806 + }, + { + "epoch": 0.40423387096774194, + "grad_norm": 1.000921607017517, + "learning_rate": 3.241930007181268e-05, + "loss": 0.1385, + "step": 2807 + }, + { + "epoch": 0.4043778801843318, + "grad_norm": 4.151905059814453, + "learning_rate": 3.240849841006758e-05, + "loss": 0.2772, + "step": 2808 + }, + { + "epoch": 0.40452188940092165, + "grad_norm": 3.996457815170288, + "learning_rate": 3.2397695231933894e-05, + "loss": 0.2608, + "step": 2809 + }, + { + "epoch": 0.4046658986175115, + "grad_norm": 1.6677794456481934, + "learning_rate": 3.238689053962284e-05, + "loss": 0.2294, + "step": 2810 + }, + { + "epoch": 0.40480990783410137, + "grad_norm": 3.80912709236145, + "learning_rate": 3.237608433534596e-05, + "loss": 0.2985, + "step": 2811 + }, + { + "epoch": 0.4049539170506912, + "grad_norm": 1.1391092538833618, + "learning_rate": 3.236527662131509e-05, + "loss": 0.1291, + "step": 2812 + }, + { + "epoch": 0.4050979262672811, + "grad_norm": 6.123546123504639, + "learning_rate": 3.235446739974236e-05, + "loss": 0.5562, + "step": 2813 + }, + { + "epoch": 0.40524193548387094, + "grad_norm": 2.0591063499450684, + "learning_rate": 3.234365667284025e-05, + "loss": 0.3357, + "step": 2814 + }, + { + "epoch": 0.40538594470046085, + "grad_norm": 0.9560560584068298, + "learning_rate": 3.233284444282152e-05, + "loss": 0.1107, + "step": 2815 + }, + { + "epoch": 0.4055299539170507, + "grad_norm": 1.118831992149353, + "learning_rate": 3.2322030711899224e-05, + "loss": 0.1408, + "step": 2816 + }, + { + "epoch": 0.40567396313364057, + "grad_norm": 3.4727208614349365, + "learning_rate": 3.231121548228676e-05, + "loss": 1.955, + "step": 2817 + }, + { + "epoch": 0.4058179723502304, + "grad_norm": 0.7244469523429871, + "learning_rate": 3.2300398756197806e-05, + "loss": 0.0797, + "step": 2818 + }, + { + "epoch": 0.4059619815668203, + "grad_norm": 0.8919389843940735, + "learning_rate": 3.2289580535846367e-05, + "loss": 0.0927, + "step": 2819 + }, + { + "epoch": 0.40610599078341014, + "grad_norm": 4.183733940124512, + "learning_rate": 3.2278760823446716e-05, + "loss": 0.4259, + "step": 2820 + }, + { + "epoch": 0.40625, + "grad_norm": 0.6634525656700134, + "learning_rate": 3.2267939621213486e-05, + "loss": 0.0909, + "step": 2821 + }, + { + "epoch": 0.40639400921658986, + "grad_norm": 0.9294559359550476, + "learning_rate": 3.225711693136156e-05, + "loss": 0.1321, + "step": 2822 + }, + { + "epoch": 0.4065380184331797, + "grad_norm": 2.6557958126068115, + "learning_rate": 3.2246292756106164e-05, + "loss": 0.237, + "step": 2823 + }, + { + "epoch": 0.4066820276497696, + "grad_norm": 0.8593855500221252, + "learning_rate": 3.223546709766283e-05, + "loss": 0.1099, + "step": 2824 + }, + { + "epoch": 0.40682603686635943, + "grad_norm": 1.8880170583724976, + "learning_rate": 3.2224639958247346e-05, + "loss": 0.1733, + "step": 2825 + }, + { + "epoch": 0.4069700460829493, + "grad_norm": 3.9962058067321777, + "learning_rate": 3.2213811340075864e-05, + "loss": 0.2718, + "step": 2826 + }, + { + "epoch": 0.40711405529953915, + "grad_norm": 1.9419705867767334, + "learning_rate": 3.2202981245364795e-05, + "loss": 0.1601, + "step": 2827 + }, + { + "epoch": 0.40725806451612906, + "grad_norm": 1.4936254024505615, + "learning_rate": 3.2192149676330865e-05, + "loss": 0.1323, + "step": 2828 + }, + { + "epoch": 0.4074020737327189, + "grad_norm": 5.541792392730713, + "learning_rate": 3.2181316635191125e-05, + "loss": 0.5787, + "step": 2829 + }, + { + "epoch": 0.4075460829493088, + "grad_norm": 0.9027908444404602, + "learning_rate": 3.2170482124162884e-05, + "loss": 0.1183, + "step": 2830 + }, + { + "epoch": 0.40769009216589863, + "grad_norm": 5.06166934967041, + "learning_rate": 3.215964614546379e-05, + "loss": 1.5079, + "step": 2831 + }, + { + "epoch": 0.4078341013824885, + "grad_norm": 4.685650825500488, + "learning_rate": 3.214880870131176e-05, + "loss": 2.4859, + "step": 2832 + }, + { + "epoch": 0.40797811059907835, + "grad_norm": 1.6529935598373413, + "learning_rate": 3.213796979392505e-05, + "loss": 0.1135, + "step": 2833 + }, + { + "epoch": 0.4081221198156682, + "grad_norm": 3.479227304458618, + "learning_rate": 3.212712942552218e-05, + "loss": 2.7105, + "step": 2834 + }, + { + "epoch": 0.40826612903225806, + "grad_norm": 5.73405647277832, + "learning_rate": 3.2116287598321984e-05, + "loss": 1.9706, + "step": 2835 + }, + { + "epoch": 0.4084101382488479, + "grad_norm": 1.3776664733886719, + "learning_rate": 3.2105444314543584e-05, + "loss": 0.1169, + "step": 2836 + }, + { + "epoch": 0.4085541474654378, + "grad_norm": 3.6893832683563232, + "learning_rate": 3.2094599576406415e-05, + "loss": 0.3236, + "step": 2837 + }, + { + "epoch": 0.40869815668202764, + "grad_norm": 0.7944961190223694, + "learning_rate": 3.2083753386130205e-05, + "loss": 0.0927, + "step": 2838 + }, + { + "epoch": 0.4088421658986175, + "grad_norm": 1.0108013153076172, + "learning_rate": 3.207290574593498e-05, + "loss": 4.0907, + "step": 2839 + }, + { + "epoch": 0.40898617511520735, + "grad_norm": 0.3898824453353882, + "learning_rate": 3.2062056658041044e-05, + "loss": 0.0711, + "step": 2840 + }, + { + "epoch": 0.4091301843317972, + "grad_norm": 1.1806929111480713, + "learning_rate": 3.205120612466904e-05, + "loss": 0.1583, + "step": 2841 + }, + { + "epoch": 0.4092741935483871, + "grad_norm": 4.289811611175537, + "learning_rate": 3.204035414803985e-05, + "loss": 2.2215, + "step": 2842 + }, + { + "epoch": 0.409418202764977, + "grad_norm": 0.8954632878303528, + "learning_rate": 3.20295007303747e-05, + "loss": 0.1391, + "step": 2843 + }, + { + "epoch": 0.40956221198156684, + "grad_norm": 0.7557753324508667, + "learning_rate": 3.2018645873895095e-05, + "loss": 0.0792, + "step": 2844 + }, + { + "epoch": 0.4097062211981567, + "grad_norm": 3.1133177280426025, + "learning_rate": 3.200778958082282e-05, + "loss": 2.161, + "step": 2845 + }, + { + "epoch": 0.40985023041474655, + "grad_norm": 1.0371315479278564, + "learning_rate": 3.199693185337997e-05, + "loss": 0.1242, + "step": 2846 + }, + { + "epoch": 0.4099942396313364, + "grad_norm": 6.0896501541137695, + "learning_rate": 3.1986072693788944e-05, + "loss": 2.8159, + "step": 2847 + }, + { + "epoch": 0.41013824884792627, + "grad_norm": 6.946549415588379, + "learning_rate": 3.19752121042724e-05, + "loss": 0.4405, + "step": 2848 + }, + { + "epoch": 0.4102822580645161, + "grad_norm": 3.2633166313171387, + "learning_rate": 3.196435008705332e-05, + "loss": 2.3418, + "step": 2849 + }, + { + "epoch": 0.410426267281106, + "grad_norm": 1.9555137157440186, + "learning_rate": 3.195348664435497e-05, + "loss": 0.195, + "step": 2850 + }, + { + "epoch": 0.41057027649769584, + "grad_norm": 1.0844162702560425, + "learning_rate": 3.194262177840089e-05, + "loss": 4.0132, + "step": 2851 + }, + { + "epoch": 0.4107142857142857, + "grad_norm": 2.0378377437591553, + "learning_rate": 3.1931755491414935e-05, + "loss": 0.3086, + "step": 2852 + }, + { + "epoch": 0.41085829493087556, + "grad_norm": 1.3269392251968384, + "learning_rate": 3.1920887785621235e-05, + "loss": 0.1566, + "step": 2853 + }, + { + "epoch": 0.4110023041474654, + "grad_norm": 2.753197431564331, + "learning_rate": 3.191001866324423e-05, + "loss": 0.3918, + "step": 2854 + }, + { + "epoch": 0.4111463133640553, + "grad_norm": 1.655472993850708, + "learning_rate": 3.1899148126508625e-05, + "loss": 0.1708, + "step": 2855 + }, + { + "epoch": 0.4112903225806452, + "grad_norm": 3.108440399169922, + "learning_rate": 3.188827617763943e-05, + "loss": 0.268, + "step": 2856 + }, + { + "epoch": 0.41143433179723504, + "grad_norm": 4.271183967590332, + "learning_rate": 3.187740281886195e-05, + "loss": 2.0466, + "step": 2857 + }, + { + "epoch": 0.4115783410138249, + "grad_norm": 5.0650105476379395, + "learning_rate": 3.186652805240176e-05, + "loss": 2.8388, + "step": 2858 + }, + { + "epoch": 0.41172235023041476, + "grad_norm": 0.9365291595458984, + "learning_rate": 3.185565188048473e-05, + "loss": 0.108, + "step": 2859 + }, + { + "epoch": 0.4118663594470046, + "grad_norm": 4.181826591491699, + "learning_rate": 3.184477430533703e-05, + "loss": 1.0322, + "step": 2860 + }, + { + "epoch": 0.4120103686635945, + "grad_norm": 1.7864140272140503, + "learning_rate": 3.183389532918509e-05, + "loss": 3.783, + "step": 2861 + }, + { + "epoch": 0.41215437788018433, + "grad_norm": 1.0675859451293945, + "learning_rate": 3.182301495425567e-05, + "loss": 0.1137, + "step": 2862 + }, + { + "epoch": 0.4122983870967742, + "grad_norm": 1.7495768070220947, + "learning_rate": 3.181213318277577e-05, + "loss": 0.1512, + "step": 2863 + }, + { + "epoch": 0.41244239631336405, + "grad_norm": 1.1572892665863037, + "learning_rate": 3.18012500169727e-05, + "loss": 0.1313, + "step": 2864 + }, + { + "epoch": 0.4125864055299539, + "grad_norm": 3.4339799880981445, + "learning_rate": 3.179036545907405e-05, + "loss": 0.7094, + "step": 2865 + }, + { + "epoch": 0.41273041474654376, + "grad_norm": 6.827710151672363, + "learning_rate": 3.17794795113077e-05, + "loss": 0.9159, + "step": 2866 + }, + { + "epoch": 0.4128744239631336, + "grad_norm": 0.8496534824371338, + "learning_rate": 3.1768592175901805e-05, + "loss": 0.0776, + "step": 2867 + }, + { + "epoch": 0.4130184331797235, + "grad_norm": 2.356947422027588, + "learning_rate": 3.1757703455084827e-05, + "loss": 0.249, + "step": 2868 + }, + { + "epoch": 0.4131624423963134, + "grad_norm": 2.5738258361816406, + "learning_rate": 3.1746813351085475e-05, + "loss": 0.3373, + "step": 2869 + }, + { + "epoch": 0.41330645161290325, + "grad_norm": 4.828393459320068, + "learning_rate": 3.173592186613277e-05, + "loss": 0.1967, + "step": 2870 + }, + { + "epoch": 0.4134504608294931, + "grad_norm": 3.109696388244629, + "learning_rate": 3.1725029002456e-05, + "loss": 0.3331, + "step": 2871 + }, + { + "epoch": 0.41359447004608296, + "grad_norm": 7.547399997711182, + "learning_rate": 3.1714134762284755e-05, + "loss": 2.0691, + "step": 2872 + }, + { + "epoch": 0.4137384792626728, + "grad_norm": 1.3058749437332153, + "learning_rate": 3.170323914784889e-05, + "loss": 0.184, + "step": 2873 + }, + { + "epoch": 0.4138824884792627, + "grad_norm": 4.7903523445129395, + "learning_rate": 3.169234216137852e-05, + "loss": 1.1752, + "step": 2874 + }, + { + "epoch": 0.41402649769585254, + "grad_norm": 2.190863847732544, + "learning_rate": 3.16814438051041e-05, + "loss": 0.1334, + "step": 2875 + }, + { + "epoch": 0.4141705069124424, + "grad_norm": 3.6869869232177734, + "learning_rate": 3.167054408125631e-05, + "loss": 2.3859, + "step": 2876 + }, + { + "epoch": 0.41431451612903225, + "grad_norm": 1.055316686630249, + "learning_rate": 3.165964299206614e-05, + "loss": 4.1521, + "step": 2877 + }, + { + "epoch": 0.4144585253456221, + "grad_norm": 1.3395549058914185, + "learning_rate": 3.1648740539764844e-05, + "loss": 0.1579, + "step": 2878 + }, + { + "epoch": 0.41460253456221197, + "grad_norm": 4.9261932373046875, + "learning_rate": 3.1637836726583957e-05, + "loss": 0.3277, + "step": 2879 + }, + { + "epoch": 0.4147465437788018, + "grad_norm": 4.361835956573486, + "learning_rate": 3.162693155475531e-05, + "loss": 0.1831, + "step": 2880 + }, + { + "epoch": 0.4148905529953917, + "grad_norm": 2.0474460124969482, + "learning_rate": 3.161602502651099e-05, + "loss": 0.3366, + "step": 2881 + }, + { + "epoch": 0.41503456221198154, + "grad_norm": 2.09287691116333, + "learning_rate": 3.1605117144083374e-05, + "loss": 0.1848, + "step": 2882 + }, + { + "epoch": 0.41517857142857145, + "grad_norm": 8.717791557312012, + "learning_rate": 3.159420790970511e-05, + "loss": 0.6239, + "step": 2883 + }, + { + "epoch": 0.4153225806451613, + "grad_norm": 5.357721328735352, + "learning_rate": 3.158329732560912e-05, + "loss": 1.7827, + "step": 2884 + }, + { + "epoch": 0.41546658986175117, + "grad_norm": 0.5243618488311768, + "learning_rate": 3.157238539402862e-05, + "loss": 0.0533, + "step": 2885 + }, + { + "epoch": 0.415610599078341, + "grad_norm": 0.61652672290802, + "learning_rate": 3.156147211719708e-05, + "loss": 0.061, + "step": 2886 + }, + { + "epoch": 0.4157546082949309, + "grad_norm": 4.563547134399414, + "learning_rate": 3.155055749734827e-05, + "loss": 0.3075, + "step": 2887 + }, + { + "epoch": 0.41589861751152074, + "grad_norm": 1.6407771110534668, + "learning_rate": 3.153964153671619e-05, + "loss": 0.145, + "step": 2888 + }, + { + "epoch": 0.4160426267281106, + "grad_norm": 1.1397759914398193, + "learning_rate": 3.1528724237535165e-05, + "loss": 0.1602, + "step": 2889 + }, + { + "epoch": 0.41618663594470046, + "grad_norm": 1.9358712434768677, + "learning_rate": 3.151780560203978e-05, + "loss": 0.233, + "step": 2890 + }, + { + "epoch": 0.4163306451612903, + "grad_norm": 1.5185120105743408, + "learning_rate": 3.1506885632464865e-05, + "loss": 0.1323, + "step": 2891 + }, + { + "epoch": 0.41647465437788017, + "grad_norm": 2.201007604598999, + "learning_rate": 3.149596433104556e-05, + "loss": 0.1952, + "step": 2892 + }, + { + "epoch": 0.41661866359447003, + "grad_norm": 1.9377212524414062, + "learning_rate": 3.148504170001726e-05, + "loss": 0.3711, + "step": 2893 + }, + { + "epoch": 0.4167626728110599, + "grad_norm": 1.0782966613769531, + "learning_rate": 3.1474117741615635e-05, + "loss": 0.1255, + "step": 2894 + }, + { + "epoch": 0.41690668202764974, + "grad_norm": 1.4944103956222534, + "learning_rate": 3.1463192458076616e-05, + "loss": 0.1273, + "step": 2895 + }, + { + "epoch": 0.41705069124423966, + "grad_norm": 8.702287673950195, + "learning_rate": 3.1452265851636424e-05, + "loss": 1.9613, + "step": 2896 + }, + { + "epoch": 0.4171947004608295, + "grad_norm": 2.4743034839630127, + "learning_rate": 3.144133792453154e-05, + "loss": 0.3408, + "step": 2897 + }, + { + "epoch": 0.4173387096774194, + "grad_norm": 1.1190133094787598, + "learning_rate": 3.143040867899872e-05, + "loss": 4.4252, + "step": 2898 + }, + { + "epoch": 0.41748271889400923, + "grad_norm": 1.7261478900909424, + "learning_rate": 3.1419478117274984e-05, + "loss": 0.2293, + "step": 2899 + }, + { + "epoch": 0.4176267281105991, + "grad_norm": 3.5880625247955322, + "learning_rate": 3.140854624159763e-05, + "loss": 0.5295, + "step": 2900 + }, + { + "epoch": 0.41777073732718895, + "grad_norm": 0.5406910181045532, + "learning_rate": 3.1397613054204215e-05, + "loss": 0.0772, + "step": 2901 + }, + { + "epoch": 0.4179147465437788, + "grad_norm": 2.220010757446289, + "learning_rate": 3.1386678557332564e-05, + "loss": 0.4049, + "step": 2902 + }, + { + "epoch": 0.41805875576036866, + "grad_norm": 4.765170097351074, + "learning_rate": 3.137574275322078e-05, + "loss": 0.3758, + "step": 2903 + }, + { + "epoch": 0.4182027649769585, + "grad_norm": 0.7564293146133423, + "learning_rate": 3.136480564410724e-05, + "loss": 0.1145, + "step": 2904 + }, + { + "epoch": 0.4183467741935484, + "grad_norm": 0.8879989385604858, + "learning_rate": 3.1353867232230564e-05, + "loss": 0.1509, + "step": 2905 + }, + { + "epoch": 0.41849078341013823, + "grad_norm": 5.189684867858887, + "learning_rate": 3.1342927519829644e-05, + "loss": 1.1785, + "step": 2906 + }, + { + "epoch": 0.4186347926267281, + "grad_norm": 2.2437007427215576, + "learning_rate": 3.1331986509143664e-05, + "loss": 0.2282, + "step": 2907 + }, + { + "epoch": 0.41877880184331795, + "grad_norm": 1.933872103691101, + "learning_rate": 3.132104420241204e-05, + "loss": 0.2779, + "step": 2908 + }, + { + "epoch": 0.4189228110599078, + "grad_norm": 1.4598643779754639, + "learning_rate": 3.1310100601874484e-05, + "loss": 0.1378, + "step": 2909 + }, + { + "epoch": 0.4190668202764977, + "grad_norm": 2.452526330947876, + "learning_rate": 3.129915570977094e-05, + "loss": 0.3295, + "step": 2910 + }, + { + "epoch": 0.4192108294930876, + "grad_norm": 2.9782228469848633, + "learning_rate": 3.128820952834164e-05, + "loss": 0.1548, + "step": 2911 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 2.8175745010375977, + "learning_rate": 3.1277262059827085e-05, + "loss": 0.2495, + "step": 2912 + }, + { + "epoch": 0.4194988479262673, + "grad_norm": 8.374321937561035, + "learning_rate": 3.126631330646802e-05, + "loss": 1.1885, + "step": 2913 + }, + { + "epoch": 0.41964285714285715, + "grad_norm": 1.3171706199645996, + "learning_rate": 3.125536327050546e-05, + "loss": 0.1852, + "step": 2914 + }, + { + "epoch": 0.419786866359447, + "grad_norm": 2.3221852779388428, + "learning_rate": 3.1244411954180676e-05, + "loss": 0.3088, + "step": 2915 + }, + { + "epoch": 0.41993087557603687, + "grad_norm": 5.569929122924805, + "learning_rate": 3.123345935973522e-05, + "loss": 1.1183, + "step": 2916 + }, + { + "epoch": 0.4200748847926267, + "grad_norm": 0.6611135601997375, + "learning_rate": 3.122250548941089e-05, + "loss": 0.0807, + "step": 2917 + }, + { + "epoch": 0.4202188940092166, + "grad_norm": 2.324122905731201, + "learning_rate": 3.121155034544976e-05, + "loss": 0.0937, + "step": 2918 + }, + { + "epoch": 0.42036290322580644, + "grad_norm": 4.076549053192139, + "learning_rate": 3.120059393009414e-05, + "loss": 0.2714, + "step": 2919 + }, + { + "epoch": 0.4205069124423963, + "grad_norm": 0.5927730798721313, + "learning_rate": 3.118963624558662e-05, + "loss": 0.0595, + "step": 2920 + }, + { + "epoch": 0.42065092165898615, + "grad_norm": 3.6254587173461914, + "learning_rate": 3.117867729417004e-05, + "loss": 2.0771, + "step": 2921 + }, + { + "epoch": 0.420794930875576, + "grad_norm": 3.0664498805999756, + "learning_rate": 3.116771707808751e-05, + "loss": 0.171, + "step": 2922 + }, + { + "epoch": 0.4209389400921659, + "grad_norm": 5.205289363861084, + "learning_rate": 3.1156755599582385e-05, + "loss": 0.309, + "step": 2923 + }, + { + "epoch": 0.4210829493087558, + "grad_norm": 3.9386403560638428, + "learning_rate": 3.1145792860898294e-05, + "loss": 0.2964, + "step": 2924 + }, + { + "epoch": 0.42122695852534564, + "grad_norm": 7.335031032562256, + "learning_rate": 3.113482886427911e-05, + "loss": 1.0028, + "step": 2925 + }, + { + "epoch": 0.4213709677419355, + "grad_norm": 5.655906677246094, + "learning_rate": 3.112386361196897e-05, + "loss": 0.631, + "step": 2926 + }, + { + "epoch": 0.42151497695852536, + "grad_norm": 3.4257991313934326, + "learning_rate": 3.111289710621228e-05, + "loss": 0.2519, + "step": 2927 + }, + { + "epoch": 0.4216589861751152, + "grad_norm": 2.3380355834960938, + "learning_rate": 3.110192934925367e-05, + "loss": 0.2074, + "step": 2928 + }, + { + "epoch": 0.42180299539170507, + "grad_norm": 1.9876552820205688, + "learning_rate": 3.109096034333805e-05, + "loss": 0.0762, + "step": 2929 + }, + { + "epoch": 0.42194700460829493, + "grad_norm": 1.090009093284607, + "learning_rate": 3.1079990090710595e-05, + "loss": 0.1768, + "step": 2930 + }, + { + "epoch": 0.4220910138248848, + "grad_norm": 5.296807289123535, + "learning_rate": 3.10690185936167e-05, + "loss": 0.483, + "step": 2931 + }, + { + "epoch": 0.42223502304147464, + "grad_norm": 1.0243682861328125, + "learning_rate": 3.105804585430206e-05, + "loss": 0.1182, + "step": 2932 + }, + { + "epoch": 0.4223790322580645, + "grad_norm": 3.620081663131714, + "learning_rate": 3.104707187501258e-05, + "loss": 1.3138, + "step": 2933 + }, + { + "epoch": 0.42252304147465436, + "grad_norm": 8.372934341430664, + "learning_rate": 3.103609665799445e-05, + "loss": 2.3369, + "step": 2934 + }, + { + "epoch": 0.4226670506912442, + "grad_norm": 0.9716679453849792, + "learning_rate": 3.1025120205494106e-05, + "loss": 0.1098, + "step": 2935 + }, + { + "epoch": 0.4228110599078341, + "grad_norm": 0.8261524438858032, + "learning_rate": 3.101414251975823e-05, + "loss": 0.0992, + "step": 2936 + }, + { + "epoch": 0.422955069124424, + "grad_norm": 1.365081548690796, + "learning_rate": 3.100316360303376e-05, + "loss": 0.1676, + "step": 2937 + }, + { + "epoch": 0.42309907834101385, + "grad_norm": 8.81645679473877, + "learning_rate": 3.099218345756787e-05, + "loss": 1.922, + "step": 2938 + }, + { + "epoch": 0.4232430875576037, + "grad_norm": 0.9748818278312683, + "learning_rate": 3.098120208560803e-05, + "loss": 0.1301, + "step": 2939 + }, + { + "epoch": 0.42338709677419356, + "grad_norm": 1.6284888982772827, + "learning_rate": 3.097021948940192e-05, + "loss": 0.2221, + "step": 2940 + }, + { + "epoch": 0.4235311059907834, + "grad_norm": 7.5001349449157715, + "learning_rate": 3.095923567119748e-05, + "loss": 1.5059, + "step": 2941 + }, + { + "epoch": 0.4236751152073733, + "grad_norm": 1.5309727191925049, + "learning_rate": 3.09482506332429e-05, + "loss": 0.1441, + "step": 2942 + }, + { + "epoch": 0.42381912442396313, + "grad_norm": 1.215211033821106, + "learning_rate": 3.093726437778664e-05, + "loss": 0.0998, + "step": 2943 + }, + { + "epoch": 0.423963133640553, + "grad_norm": 0.9463428258895874, + "learning_rate": 3.092627690707738e-05, + "loss": 0.1047, + "step": 2944 + }, + { + "epoch": 0.42410714285714285, + "grad_norm": 5.1168622970581055, + "learning_rate": 3.091528822336405e-05, + "loss": 1.6425, + "step": 2945 + }, + { + "epoch": 0.4242511520737327, + "grad_norm": 1.1166261434555054, + "learning_rate": 3.090429832889586e-05, + "loss": 0.1134, + "step": 2946 + }, + { + "epoch": 0.42439516129032256, + "grad_norm": 2.4721622467041016, + "learning_rate": 3.0893307225922244e-05, + "loss": 0.2337, + "step": 2947 + }, + { + "epoch": 0.4245391705069124, + "grad_norm": 1.5235751867294312, + "learning_rate": 3.088231491669287e-05, + "loss": 0.2775, + "step": 2948 + }, + { + "epoch": 0.4246831797235023, + "grad_norm": 0.7256106734275818, + "learning_rate": 3.0871321403457684e-05, + "loss": 0.0975, + "step": 2949 + }, + { + "epoch": 0.4248271889400922, + "grad_norm": 5.5581560134887695, + "learning_rate": 3.086032668846686e-05, + "loss": 1.5893, + "step": 2950 + }, + { + "epoch": 0.42497119815668205, + "grad_norm": 1.1378854513168335, + "learning_rate": 3.084933077397081e-05, + "loss": 0.1604, + "step": 2951 + }, + { + "epoch": 0.4251152073732719, + "grad_norm": 1.3895729780197144, + "learning_rate": 3.083833366222023e-05, + "loss": 0.2187, + "step": 2952 + }, + { + "epoch": 0.42525921658986177, + "grad_norm": 3.26011323928833, + "learning_rate": 3.082733535546601e-05, + "loss": 0.2307, + "step": 2953 + }, + { + "epoch": 0.4254032258064516, + "grad_norm": 0.8309813737869263, + "learning_rate": 3.081633585595931e-05, + "loss": 0.1012, + "step": 2954 + }, + { + "epoch": 0.4255472350230415, + "grad_norm": 0.7914798259735107, + "learning_rate": 3.080533516595155e-05, + "loss": 0.0898, + "step": 2955 + }, + { + "epoch": 0.42569124423963134, + "grad_norm": 4.736087799072266, + "learning_rate": 3.0794333287694376e-05, + "loss": 0.4017, + "step": 2956 + }, + { + "epoch": 0.4258352534562212, + "grad_norm": 3.6145694255828857, + "learning_rate": 3.078333022343966e-05, + "loss": 0.3202, + "step": 2957 + }, + { + "epoch": 0.42597926267281105, + "grad_norm": 8.353148460388184, + "learning_rate": 3.077232597543954e-05, + "loss": 2.7896, + "step": 2958 + }, + { + "epoch": 0.4261232718894009, + "grad_norm": 0.7520623207092285, + "learning_rate": 3.076132054594641e-05, + "loss": 0.1074, + "step": 2959 + }, + { + "epoch": 0.42626728110599077, + "grad_norm": 2.4107422828674316, + "learning_rate": 3.075031393721285e-05, + "loss": 0.1767, + "step": 2960 + }, + { + "epoch": 0.4264112903225806, + "grad_norm": 2.845876932144165, + "learning_rate": 3.073930615149174e-05, + "loss": 0.7437, + "step": 2961 + }, + { + "epoch": 0.4265552995391705, + "grad_norm": 1.618788480758667, + "learning_rate": 3.072829719103619e-05, + "loss": 0.1604, + "step": 2962 + }, + { + "epoch": 0.42669930875576034, + "grad_norm": 4.20764684677124, + "learning_rate": 3.0717287058099524e-05, + "loss": 2.3902, + "step": 2963 + }, + { + "epoch": 0.42684331797235026, + "grad_norm": 2.0781853199005127, + "learning_rate": 3.070627575493533e-05, + "loss": 0.2798, + "step": 2964 + }, + { + "epoch": 0.4269873271889401, + "grad_norm": 1.9363151788711548, + "learning_rate": 3.069526328379742e-05, + "loss": 0.2075, + "step": 2965 + }, + { + "epoch": 0.42713133640552997, + "grad_norm": 4.110360622406006, + "learning_rate": 3.068424964693985e-05, + "loss": 0.3428, + "step": 2966 + }, + { + "epoch": 0.42727534562211983, + "grad_norm": 3.2413556575775146, + "learning_rate": 3.067323484661693e-05, + "loss": 2.8219, + "step": 2967 + }, + { + "epoch": 0.4274193548387097, + "grad_norm": 1.011242151260376, + "learning_rate": 3.066221888508318e-05, + "loss": 0.1372, + "step": 2968 + }, + { + "epoch": 0.42756336405529954, + "grad_norm": 1.466403841972351, + "learning_rate": 3.065120176459338e-05, + "loss": 4.297, + "step": 2969 + }, + { + "epoch": 0.4277073732718894, + "grad_norm": 4.7306013107299805, + "learning_rate": 3.064018348740253e-05, + "loss": 2.4122, + "step": 2970 + }, + { + "epoch": 0.42785138248847926, + "grad_norm": 3.1197595596313477, + "learning_rate": 3.0629164055765894e-05, + "loss": 1.7768, + "step": 2971 + }, + { + "epoch": 0.4279953917050691, + "grad_norm": 1.942121982574463, + "learning_rate": 3.061814347193894e-05, + "loss": 0.1683, + "step": 2972 + }, + { + "epoch": 0.428139400921659, + "grad_norm": 3.200483560562134, + "learning_rate": 3.0607121738177394e-05, + "loss": 0.2754, + "step": 2973 + }, + { + "epoch": 0.42828341013824883, + "grad_norm": 11.342204093933105, + "learning_rate": 3.0596098856737205e-05, + "loss": 2.9068, + "step": 2974 + }, + { + "epoch": 0.4284274193548387, + "grad_norm": 2.140300750732422, + "learning_rate": 3.058507482987457e-05, + "loss": 0.1621, + "step": 2975 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.8843597769737244, + "learning_rate": 3.05740496598459e-05, + "loss": 0.1036, + "step": 2976 + }, + { + "epoch": 0.42871543778801846, + "grad_norm": 0.9488966464996338, + "learning_rate": 3.056302334890786e-05, + "loss": 0.0702, + "step": 2977 + }, + { + "epoch": 0.4288594470046083, + "grad_norm": 3.3038735389709473, + "learning_rate": 3.055199589931735e-05, + "loss": 0.3259, + "step": 2978 + }, + { + "epoch": 0.4290034562211982, + "grad_norm": 2.882412910461426, + "learning_rate": 3.054096731333147e-05, + "loss": 2.0886, + "step": 2979 + }, + { + "epoch": 0.42914746543778803, + "grad_norm": 11.271259307861328, + "learning_rate": 3.05299375932076e-05, + "loss": 2.2838, + "step": 2980 + }, + { + "epoch": 0.4292914746543779, + "grad_norm": 0.5817174911499023, + "learning_rate": 3.0518906741203316e-05, + "loss": 0.0515, + "step": 2981 + }, + { + "epoch": 0.42943548387096775, + "grad_norm": 3.40474009513855, + "learning_rate": 3.0507874759576438e-05, + "loss": 2.1706, + "step": 2982 + }, + { + "epoch": 0.4295794930875576, + "grad_norm": 3.497828245162964, + "learning_rate": 3.0496841650585022e-05, + "loss": 0.3261, + "step": 2983 + }, + { + "epoch": 0.42972350230414746, + "grad_norm": 5.227865695953369, + "learning_rate": 3.0485807416487348e-05, + "loss": 0.5213, + "step": 2984 + }, + { + "epoch": 0.4298675115207373, + "grad_norm": 2.224905014038086, + "learning_rate": 3.0474772059541935e-05, + "loss": 0.3196, + "step": 2985 + }, + { + "epoch": 0.4300115207373272, + "grad_norm": 3.1307973861694336, + "learning_rate": 3.046373558200752e-05, + "loss": 0.1653, + "step": 2986 + }, + { + "epoch": 0.43015552995391704, + "grad_norm": 9.25662612915039, + "learning_rate": 3.0452697986143068e-05, + "loss": 1.2039, + "step": 2987 + }, + { + "epoch": 0.4302995391705069, + "grad_norm": 3.0332729816436768, + "learning_rate": 3.0441659274207796e-05, + "loss": 2.6859, + "step": 2988 + }, + { + "epoch": 0.43044354838709675, + "grad_norm": 1.3235626220703125, + "learning_rate": 3.0430619448461118e-05, + "loss": 0.1443, + "step": 2989 + }, + { + "epoch": 0.4305875576036866, + "grad_norm": 1.5147907733917236, + "learning_rate": 3.0419578511162695e-05, + "loss": 0.17, + "step": 2990 + }, + { + "epoch": 0.4307315668202765, + "grad_norm": 1.0381648540496826, + "learning_rate": 3.0408536464572412e-05, + "loss": 0.1015, + "step": 2991 + }, + { + "epoch": 0.4308755760368664, + "grad_norm": 3.104469060897827, + "learning_rate": 3.039749331095038e-05, + "loss": 0.4007, + "step": 2992 + }, + { + "epoch": 0.43101958525345624, + "grad_norm": 1.54116690158844, + "learning_rate": 3.0386449052556943e-05, + "loss": 0.1613, + "step": 2993 + }, + { + "epoch": 0.4311635944700461, + "grad_norm": 0.6364186406135559, + "learning_rate": 3.037540369165266e-05, + "loss": 0.0797, + "step": 2994 + }, + { + "epoch": 0.43130760368663595, + "grad_norm": 3.876330852508545, + "learning_rate": 3.0364357230498325e-05, + "loss": 1.471, + "step": 2995 + }, + { + "epoch": 0.4314516129032258, + "grad_norm": 2.1037542819976807, + "learning_rate": 3.0353309671354947e-05, + "loss": 0.1662, + "step": 2996 + }, + { + "epoch": 0.43159562211981567, + "grad_norm": 3.6105337142944336, + "learning_rate": 3.034226101648377e-05, + "loss": 0.3381, + "step": 2997 + }, + { + "epoch": 0.4317396313364055, + "grad_norm": 0.9902567267417908, + "learning_rate": 3.033121126814626e-05, + "loss": 0.0918, + "step": 2998 + }, + { + "epoch": 0.4318836405529954, + "grad_norm": 0.668506383895874, + "learning_rate": 3.03201604286041e-05, + "loss": 0.0896, + "step": 2999 + }, + { + "epoch": 0.43202764976958524, + "grad_norm": 0.4114518463611603, + "learning_rate": 3.0309108500119205e-05, + "loss": 0.0676, + "step": 3000 + }, + { + "epoch": 0.4321716589861751, + "grad_norm": 1.2540347576141357, + "learning_rate": 3.029805548495371e-05, + "loss": 0.126, + "step": 3001 + }, + { + "epoch": 0.43231566820276496, + "grad_norm": 1.585508942604065, + "learning_rate": 3.0287001385369968e-05, + "loss": 0.1407, + "step": 3002 + }, + { + "epoch": 0.4324596774193548, + "grad_norm": 0.736254870891571, + "learning_rate": 3.0275946203630558e-05, + "loss": 0.0708, + "step": 3003 + }, + { + "epoch": 0.4326036866359447, + "grad_norm": 1.451836347579956, + "learning_rate": 3.0264889941998285e-05, + "loss": 0.1188, + "step": 3004 + }, + { + "epoch": 0.4327476958525346, + "grad_norm": 1.6151028871536255, + "learning_rate": 3.0253832602736166e-05, + "loss": 0.1284, + "step": 3005 + }, + { + "epoch": 0.43289170506912444, + "grad_norm": 4.330977916717529, + "learning_rate": 3.0242774188107437e-05, + "loss": 0.2047, + "step": 3006 + }, + { + "epoch": 0.4330357142857143, + "grad_norm": 0.6102902293205261, + "learning_rate": 3.0231714700375568e-05, + "loss": 0.0798, + "step": 3007 + }, + { + "epoch": 0.43317972350230416, + "grad_norm": 3.6513490676879883, + "learning_rate": 3.022065414180425e-05, + "loss": 0.9681, + "step": 3008 + }, + { + "epoch": 0.433323732718894, + "grad_norm": 4.9689836502075195, + "learning_rate": 3.0209592514657365e-05, + "loss": 0.2744, + "step": 3009 + }, + { + "epoch": 0.4334677419354839, + "grad_norm": 8.258262634277344, + "learning_rate": 3.019852982119904e-05, + "loss": 1.6771, + "step": 3010 + }, + { + "epoch": 0.43361175115207373, + "grad_norm": 3.9401330947875977, + "learning_rate": 3.0187466063693614e-05, + "loss": 1.7286, + "step": 3011 + }, + { + "epoch": 0.4337557603686636, + "grad_norm": 1.0709456205368042, + "learning_rate": 3.0176401244405645e-05, + "loss": 0.1457, + "step": 3012 + }, + { + "epoch": 0.43389976958525345, + "grad_norm": 2.2759933471679688, + "learning_rate": 3.0165335365599894e-05, + "loss": 0.198, + "step": 3013 + }, + { + "epoch": 0.4340437788018433, + "grad_norm": 1.6982011795043945, + "learning_rate": 3.0154268429541364e-05, + "loss": 0.1833, + "step": 3014 + }, + { + "epoch": 0.43418778801843316, + "grad_norm": 1.2538584470748901, + "learning_rate": 3.0143200438495255e-05, + "loss": 0.143, + "step": 3015 + }, + { + "epoch": 0.434331797235023, + "grad_norm": 1.6701797246932983, + "learning_rate": 3.0132131394726993e-05, + "loss": 0.1689, + "step": 3016 + }, + { + "epoch": 0.4344758064516129, + "grad_norm": 5.097164630889893, + "learning_rate": 3.0121061300502213e-05, + "loss": 1.7025, + "step": 3017 + }, + { + "epoch": 0.4346198156682028, + "grad_norm": 1.0293242931365967, + "learning_rate": 3.0109990158086764e-05, + "loss": 0.1269, + "step": 3018 + }, + { + "epoch": 0.43476382488479265, + "grad_norm": 1.3947205543518066, + "learning_rate": 3.009891796974671e-05, + "loss": 0.1752, + "step": 3019 + }, + { + "epoch": 0.4349078341013825, + "grad_norm": 1.7699730396270752, + "learning_rate": 3.0087844737748344e-05, + "loss": 0.1565, + "step": 3020 + }, + { + "epoch": 0.43505184331797236, + "grad_norm": 0.9494741559028625, + "learning_rate": 3.007677046435815e-05, + "loss": 0.0933, + "step": 3021 + }, + { + "epoch": 0.4351958525345622, + "grad_norm": 1.8424553871154785, + "learning_rate": 3.006569515184285e-05, + "loss": 0.1418, + "step": 3022 + }, + { + "epoch": 0.4353398617511521, + "grad_norm": 3.3565762042999268, + "learning_rate": 3.005461880246935e-05, + "loss": 0.3309, + "step": 3023 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 2.0615224838256836, + "learning_rate": 3.0043541418504783e-05, + "loss": 0.1935, + "step": 3024 + }, + { + "epoch": 0.4356278801843318, + "grad_norm": 2.8094818592071533, + "learning_rate": 3.0032463002216505e-05, + "loss": 0.2215, + "step": 3025 + }, + { + "epoch": 0.43577188940092165, + "grad_norm": 7.9687395095825195, + "learning_rate": 3.0021383555872064e-05, + "loss": 2.0196, + "step": 3026 + }, + { + "epoch": 0.4359158986175115, + "grad_norm": 4.063091278076172, + "learning_rate": 3.0010303081739226e-05, + "loss": 2.4492, + "step": 3027 + }, + { + "epoch": 0.43605990783410137, + "grad_norm": 1.437224268913269, + "learning_rate": 2.9999221582085974e-05, + "loss": 0.1598, + "step": 3028 + }, + { + "epoch": 0.4362039170506912, + "grad_norm": 1.637597918510437, + "learning_rate": 2.9988139059180486e-05, + "loss": 0.1635, + "step": 3029 + }, + { + "epoch": 0.4363479262672811, + "grad_norm": 0.7799956202507019, + "learning_rate": 2.9977055515291164e-05, + "loss": 0.1007, + "step": 3030 + }, + { + "epoch": 0.43649193548387094, + "grad_norm": 1.250326156616211, + "learning_rate": 2.9965970952686618e-05, + "loss": 0.1637, + "step": 3031 + }, + { + "epoch": 0.43663594470046085, + "grad_norm": 0.7015475630760193, + "learning_rate": 2.9954885373635655e-05, + "loss": 0.1073, + "step": 3032 + }, + { + "epoch": 0.4367799539170507, + "grad_norm": 0.5551050901412964, + "learning_rate": 2.9943798780407288e-05, + "loss": 0.0565, + "step": 3033 + }, + { + "epoch": 0.43692396313364057, + "grad_norm": 5.249487400054932, + "learning_rate": 2.9932711175270767e-05, + "loss": 1.7439, + "step": 3034 + }, + { + "epoch": 0.4370679723502304, + "grad_norm": 1.6456241607666016, + "learning_rate": 2.992162256049552e-05, + "loss": 0.1622, + "step": 3035 + }, + { + "epoch": 0.4372119815668203, + "grad_norm": 3.3386127948760986, + "learning_rate": 2.991053293835119e-05, + "loss": 1.292, + "step": 3036 + }, + { + "epoch": 0.43735599078341014, + "grad_norm": 1.3208701610565186, + "learning_rate": 2.9899442311107617e-05, + "loss": 0.1558, + "step": 3037 + }, + { + "epoch": 0.4375, + "grad_norm": 0.7576304078102112, + "learning_rate": 2.9888350681034872e-05, + "loss": 0.0781, + "step": 3038 + }, + { + "epoch": 0.43764400921658986, + "grad_norm": 1.063991904258728, + "learning_rate": 2.9877258050403212e-05, + "loss": 0.1364, + "step": 3039 + }, + { + "epoch": 0.4377880184331797, + "grad_norm": 4.900160312652588, + "learning_rate": 2.986616442148309e-05, + "loss": 2.1095, + "step": 3040 + }, + { + "epoch": 0.4379320276497696, + "grad_norm": 3.5518083572387695, + "learning_rate": 2.9855069796545186e-05, + "loss": 3.4047, + "step": 3041 + }, + { + "epoch": 0.43807603686635943, + "grad_norm": 1.2436045408248901, + "learning_rate": 2.9843974177860378e-05, + "loss": 0.1671, + "step": 3042 + }, + { + "epoch": 0.4382200460829493, + "grad_norm": 1.0098066329956055, + "learning_rate": 2.9832877567699734e-05, + "loss": 0.1233, + "step": 3043 + }, + { + "epoch": 0.43836405529953915, + "grad_norm": 3.3906381130218506, + "learning_rate": 2.9821779968334535e-05, + "loss": 0.3309, + "step": 3044 + }, + { + "epoch": 0.43850806451612906, + "grad_norm": 3.7985544204711914, + "learning_rate": 2.9810681382036264e-05, + "loss": 2.0146, + "step": 3045 + }, + { + "epoch": 0.4386520737327189, + "grad_norm": 1.3706923723220825, + "learning_rate": 2.9799581811076605e-05, + "loss": 0.1065, + "step": 3046 + }, + { + "epoch": 0.4387960829493088, + "grad_norm": 5.184993267059326, + "learning_rate": 2.9788481257727446e-05, + "loss": 1.7778, + "step": 3047 + }, + { + "epoch": 0.43894009216589863, + "grad_norm": 0.7748806476593018, + "learning_rate": 2.9777379724260875e-05, + "loss": 0.0929, + "step": 3048 + }, + { + "epoch": 0.4390841013824885, + "grad_norm": 3.4992268085479736, + "learning_rate": 2.9766277212949172e-05, + "loss": 0.2703, + "step": 3049 + }, + { + "epoch": 0.43922811059907835, + "grad_norm": 2.31775164604187, + "learning_rate": 2.9755173726064834e-05, + "loss": 0.2899, + "step": 3050 + }, + { + "epoch": 0.4393721198156682, + "grad_norm": 0.4908515214920044, + "learning_rate": 2.9744069265880546e-05, + "loss": 0.0593, + "step": 3051 + }, + { + "epoch": 0.43951612903225806, + "grad_norm": 0.7165913581848145, + "learning_rate": 2.973296383466919e-05, + "loss": 0.0758, + "step": 3052 + }, + { + "epoch": 0.4396601382488479, + "grad_norm": 17.16750717163086, + "learning_rate": 2.9721857434703858e-05, + "loss": 3.0691, + "step": 3053 + }, + { + "epoch": 0.4398041474654378, + "grad_norm": 2.5723650455474854, + "learning_rate": 2.971075006825783e-05, + "loss": 0.1925, + "step": 3054 + }, + { + "epoch": 0.43994815668202764, + "grad_norm": 3.5252020359039307, + "learning_rate": 2.9699641737604583e-05, + "loss": 2.642, + "step": 3055 + }, + { + "epoch": 0.4400921658986175, + "grad_norm": 1.5501271486282349, + "learning_rate": 2.96885324450178e-05, + "loss": 0.216, + "step": 3056 + }, + { + "epoch": 0.44023617511520735, + "grad_norm": 0.6642268896102905, + "learning_rate": 2.9677422192771365e-05, + "loss": 0.0603, + "step": 3057 + }, + { + "epoch": 0.4403801843317972, + "grad_norm": 0.7640119194984436, + "learning_rate": 2.9666310983139332e-05, + "loss": 0.0901, + "step": 3058 + }, + { + "epoch": 0.4405241935483871, + "grad_norm": 2.245894193649292, + "learning_rate": 2.9655198818395985e-05, + "loss": 0.198, + "step": 3059 + }, + { + "epoch": 0.440668202764977, + "grad_norm": 4.993640422821045, + "learning_rate": 2.9644085700815777e-05, + "loss": 0.2773, + "step": 3060 + }, + { + "epoch": 0.44081221198156684, + "grad_norm": 4.285893440246582, + "learning_rate": 2.9632971632673374e-05, + "loss": 1.2581, + "step": 3061 + }, + { + "epoch": 0.4409562211981567, + "grad_norm": 1.9293829202651978, + "learning_rate": 2.9621856616243626e-05, + "loss": 0.148, + "step": 3062 + }, + { + "epoch": 0.44110023041474655, + "grad_norm": 6.8861165046691895, + "learning_rate": 2.9610740653801585e-05, + "loss": 1.79, + "step": 3063 + }, + { + "epoch": 0.4412442396313364, + "grad_norm": 2.371258020401001, + "learning_rate": 2.959962374762248e-05, + "loss": 2.0086, + "step": 3064 + }, + { + "epoch": 0.44138824884792627, + "grad_norm": 8.62067699432373, + "learning_rate": 2.9588505899981756e-05, + "loss": 1.7604, + "step": 3065 + }, + { + "epoch": 0.4415322580645161, + "grad_norm": 1.2272520065307617, + "learning_rate": 2.9577387113155037e-05, + "loss": 0.1477, + "step": 3066 + }, + { + "epoch": 0.441676267281106, + "grad_norm": 3.1977226734161377, + "learning_rate": 2.9566267389418144e-05, + "loss": 0.2335, + "step": 3067 + }, + { + "epoch": 0.44182027649769584, + "grad_norm": 1.0967620611190796, + "learning_rate": 2.955514673104708e-05, + "loss": 0.134, + "step": 3068 + }, + { + "epoch": 0.4419642857142857, + "grad_norm": 1.8520376682281494, + "learning_rate": 2.9544025140318054e-05, + "loss": 0.2636, + "step": 3069 + }, + { + "epoch": 0.44210829493087556, + "grad_norm": 1.1609982252120972, + "learning_rate": 2.9532902619507462e-05, + "loss": 0.1797, + "step": 3070 + }, + { + "epoch": 0.4422523041474654, + "grad_norm": 0.858529269695282, + "learning_rate": 2.9521779170891877e-05, + "loss": 0.1209, + "step": 3071 + }, + { + "epoch": 0.4423963133640553, + "grad_norm": 1.14321768283844, + "learning_rate": 2.9510654796748077e-05, + "loss": 4.1814, + "step": 3072 + }, + { + "epoch": 0.4425403225806452, + "grad_norm": 1.0592501163482666, + "learning_rate": 2.9499529499353024e-05, + "loss": 0.166, + "step": 3073 + }, + { + "epoch": 0.44268433179723504, + "grad_norm": 0.647680401802063, + "learning_rate": 2.9488403280983873e-05, + "loss": 0.0877, + "step": 3074 + }, + { + "epoch": 0.4428283410138249, + "grad_norm": 5.094755172729492, + "learning_rate": 2.9477276143917966e-05, + "loss": 0.2428, + "step": 3075 + }, + { + "epoch": 0.44297235023041476, + "grad_norm": 1.9048644304275513, + "learning_rate": 2.9466148090432822e-05, + "loss": 0.1743, + "step": 3076 + }, + { + "epoch": 0.4431163594470046, + "grad_norm": 3.712114095687866, + "learning_rate": 2.945501912280616e-05, + "loss": 0.3988, + "step": 3077 + }, + { + "epoch": 0.4432603686635945, + "grad_norm": 1.0624492168426514, + "learning_rate": 2.9443889243315887e-05, + "loss": 4.3362, + "step": 3078 + }, + { + "epoch": 0.44340437788018433, + "grad_norm": 3.2534990310668945, + "learning_rate": 2.9432758454240096e-05, + "loss": 0.1933, + "step": 3079 + }, + { + "epoch": 0.4435483870967742, + "grad_norm": 4.648531436920166, + "learning_rate": 2.9421626757857045e-05, + "loss": 2.13, + "step": 3080 + }, + { + "epoch": 0.44369239631336405, + "grad_norm": 3.524319648742676, + "learning_rate": 2.9410494156445216e-05, + "loss": 0.2267, + "step": 3081 + }, + { + "epoch": 0.4438364055299539, + "grad_norm": 1.0372263193130493, + "learning_rate": 2.9399360652283243e-05, + "loss": 0.1397, + "step": 3082 + }, + { + "epoch": 0.44398041474654376, + "grad_norm": 0.870536208152771, + "learning_rate": 2.9388226247649962e-05, + "loss": 0.1323, + "step": 3083 + }, + { + "epoch": 0.4441244239631336, + "grad_norm": 4.115156650543213, + "learning_rate": 2.9377090944824388e-05, + "loss": 1.3667, + "step": 3084 + }, + { + "epoch": 0.4442684331797235, + "grad_norm": 0.7922481298446655, + "learning_rate": 2.9365954746085723e-05, + "loss": 0.1231, + "step": 3085 + }, + { + "epoch": 0.4444124423963134, + "grad_norm": 2.5806045532226562, + "learning_rate": 2.935481765371334e-05, + "loss": 0.1424, + "step": 3086 + }, + { + "epoch": 0.44455645161290325, + "grad_norm": 0.7562891244888306, + "learning_rate": 2.9343679669986813e-05, + "loss": 0.0694, + "step": 3087 + }, + { + "epoch": 0.4447004608294931, + "grad_norm": 6.6359333992004395, + "learning_rate": 2.9332540797185892e-05, + "loss": 1.5455, + "step": 3088 + }, + { + "epoch": 0.44484447004608296, + "grad_norm": 0.6410256028175354, + "learning_rate": 2.9321401037590502e-05, + "loss": 0.0617, + "step": 3089 + }, + { + "epoch": 0.4449884792626728, + "grad_norm": 4.400673866271973, + "learning_rate": 2.931026039348076e-05, + "loss": 0.9431, + "step": 3090 + }, + { + "epoch": 0.4451324884792627, + "grad_norm": 0.7613173723220825, + "learning_rate": 2.9299118867136954e-05, + "loss": 0.0891, + "step": 3091 + }, + { + "epoch": 0.44527649769585254, + "grad_norm": 3.1846792697906494, + "learning_rate": 2.928797646083956e-05, + "loss": 0.3448, + "step": 3092 + }, + { + "epoch": 0.4454205069124424, + "grad_norm": 3.529188632965088, + "learning_rate": 2.9276833176869235e-05, + "loss": 2.2484, + "step": 3093 + }, + { + "epoch": 0.44556451612903225, + "grad_norm": 0.8708030581474304, + "learning_rate": 2.9265689017506802e-05, + "loss": 0.1101, + "step": 3094 + }, + { + "epoch": 0.4457085253456221, + "grad_norm": 1.2380704879760742, + "learning_rate": 2.925454398503328e-05, + "loss": 0.1261, + "step": 3095 + }, + { + "epoch": 0.44585253456221197, + "grad_norm": 2.8646717071533203, + "learning_rate": 2.924339808172986e-05, + "loss": 0.1808, + "step": 3096 + }, + { + "epoch": 0.4459965437788018, + "grad_norm": 7.989163398742676, + "learning_rate": 2.923225130987791e-05, + "loss": 2.1111, + "step": 3097 + }, + { + "epoch": 0.4461405529953917, + "grad_norm": 0.6103109121322632, + "learning_rate": 2.9221103671758983e-05, + "loss": 0.0633, + "step": 3098 + }, + { + "epoch": 0.44628456221198154, + "grad_norm": 0.9490913152694702, + "learning_rate": 2.9209955169654784e-05, + "loss": 0.1287, + "step": 3099 + }, + { + "epoch": 0.44642857142857145, + "grad_norm": 4.0956926345825195, + "learning_rate": 2.919880580584724e-05, + "loss": 1.7006, + "step": 3100 + }, + { + "epoch": 0.4465725806451613, + "grad_norm": 2.816807270050049, + "learning_rate": 2.918765558261841e-05, + "loss": 0.1711, + "step": 3101 + }, + { + "epoch": 0.44671658986175117, + "grad_norm": 1.2043403387069702, + "learning_rate": 2.9176504502250563e-05, + "loss": 0.133, + "step": 3102 + }, + { + "epoch": 0.446860599078341, + "grad_norm": 1.625562310218811, + "learning_rate": 2.916535256702611e-05, + "loss": 0.1865, + "step": 3103 + }, + { + "epoch": 0.4470046082949309, + "grad_norm": 2.1241776943206787, + "learning_rate": 2.915419977922767e-05, + "loss": 0.206, + "step": 3104 + }, + { + "epoch": 0.44714861751152074, + "grad_norm": 4.503981113433838, + "learning_rate": 2.9143046141138015e-05, + "loss": 2.6745, + "step": 3105 + }, + { + "epoch": 0.4472926267281106, + "grad_norm": 1.520530104637146, + "learning_rate": 2.9131891655040096e-05, + "loss": 0.1127, + "step": 3106 + }, + { + "epoch": 0.44743663594470046, + "grad_norm": 0.7353809475898743, + "learning_rate": 2.9120736323217035e-05, + "loss": 0.086, + "step": 3107 + }, + { + "epoch": 0.4475806451612903, + "grad_norm": 4.498667240142822, + "learning_rate": 2.910958014795214e-05, + "loss": 2.0339, + "step": 3108 + }, + { + "epoch": 0.44772465437788017, + "grad_norm": 4.710301399230957, + "learning_rate": 2.909842313152888e-05, + "loss": 1.2213, + "step": 3109 + }, + { + "epoch": 0.44786866359447003, + "grad_norm": 5.039255619049072, + "learning_rate": 2.90872652762309e-05, + "loss": 0.4264, + "step": 3110 + }, + { + "epoch": 0.4480126728110599, + "grad_norm": 2.4138617515563965, + "learning_rate": 2.9076106584342017e-05, + "loss": 0.2237, + "step": 3111 + }, + { + "epoch": 0.44815668202764974, + "grad_norm": 1.0182509422302246, + "learning_rate": 2.906494705814621e-05, + "loss": 0.1106, + "step": 3112 + }, + { + "epoch": 0.44830069124423966, + "grad_norm": 1.3969157934188843, + "learning_rate": 2.9053786699927642e-05, + "loss": 0.1235, + "step": 3113 + }, + { + "epoch": 0.4484447004608295, + "grad_norm": 0.7732139825820923, + "learning_rate": 2.9042625511970644e-05, + "loss": 0.0821, + "step": 3114 + }, + { + "epoch": 0.4485887096774194, + "grad_norm": 6.1920485496521, + "learning_rate": 2.9031463496559706e-05, + "loss": 1.2678, + "step": 3115 + }, + { + "epoch": 0.44873271889400923, + "grad_norm": 1.648795247077942, + "learning_rate": 2.9020300655979503e-05, + "loss": 0.263, + "step": 3116 + }, + { + "epoch": 0.4488767281105991, + "grad_norm": 5.5696001052856445, + "learning_rate": 2.9009136992514862e-05, + "loss": 2.0336, + "step": 3117 + }, + { + "epoch": 0.44902073732718895, + "grad_norm": 5.6018805503845215, + "learning_rate": 2.8997972508450794e-05, + "loss": 0.3601, + "step": 3118 + }, + { + "epoch": 0.4491647465437788, + "grad_norm": 1.040076494216919, + "learning_rate": 2.8986807206072475e-05, + "loss": 0.1476, + "step": 3119 + }, + { + "epoch": 0.44930875576036866, + "grad_norm": 0.7100153565406799, + "learning_rate": 2.8975641087665233e-05, + "loss": 0.0983, + "step": 3120 + }, + { + "epoch": 0.4494527649769585, + "grad_norm": 2.1937105655670166, + "learning_rate": 2.8964474155514588e-05, + "loss": 1.6794, + "step": 3121 + }, + { + "epoch": 0.4495967741935484, + "grad_norm": 4.887200355529785, + "learning_rate": 2.8953306411906206e-05, + "loss": 1.506, + "step": 3122 + }, + { + "epoch": 0.44974078341013823, + "grad_norm": 1.5473949909210205, + "learning_rate": 2.8942137859125928e-05, + "loss": 0.1897, + "step": 3123 + }, + { + "epoch": 0.4498847926267281, + "grad_norm": 5.155910968780518, + "learning_rate": 2.893096849945976e-05, + "loss": 0.3926, + "step": 3124 + }, + { + "epoch": 0.45002880184331795, + "grad_norm": 4.065741539001465, + "learning_rate": 2.891979833519387e-05, + "loss": 1.4155, + "step": 3125 + }, + { + "epoch": 0.4501728110599078, + "grad_norm": 1.4745142459869385, + "learning_rate": 2.89086273686146e-05, + "loss": 0.1689, + "step": 3126 + }, + { + "epoch": 0.4503168202764977, + "grad_norm": 2.5287153720855713, + "learning_rate": 2.889745560200844e-05, + "loss": 0.2092, + "step": 3127 + }, + { + "epoch": 0.4504608294930876, + "grad_norm": 6.471814155578613, + "learning_rate": 2.8886283037662048e-05, + "loss": 2.6135, + "step": 3128 + }, + { + "epoch": 0.45060483870967744, + "grad_norm": 1.1659313440322876, + "learning_rate": 2.8875109677862272e-05, + "loss": 0.1181, + "step": 3129 + }, + { + "epoch": 0.4507488479262673, + "grad_norm": 1.3761802911758423, + "learning_rate": 2.886393552489608e-05, + "loss": 0.1528, + "step": 3130 + }, + { + "epoch": 0.45089285714285715, + "grad_norm": 1.442962408065796, + "learning_rate": 2.8852760581050643e-05, + "loss": 0.1286, + "step": 3131 + }, + { + "epoch": 0.451036866359447, + "grad_norm": 1.8815516233444214, + "learning_rate": 2.884158484861325e-05, + "loss": 0.2125, + "step": 3132 + }, + { + "epoch": 0.45118087557603687, + "grad_norm": 6.005934715270996, + "learning_rate": 2.88304083298714e-05, + "loss": 2.3253, + "step": 3133 + }, + { + "epoch": 0.4513248847926267, + "grad_norm": 3.3886215686798096, + "learning_rate": 2.8819231027112713e-05, + "loss": 0.2707, + "step": 3134 + }, + { + "epoch": 0.4514688940092166, + "grad_norm": 0.8527829647064209, + "learning_rate": 2.880805294262499e-05, + "loss": 0.106, + "step": 3135 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 1.5637965202331543, + "learning_rate": 2.8796874078696185e-05, + "loss": 0.1672, + "step": 3136 + }, + { + "epoch": 0.4517569124423963, + "grad_norm": 3.4235498905181885, + "learning_rate": 2.878569443761442e-05, + "loss": 2.4704, + "step": 3137 + }, + { + "epoch": 0.45190092165898615, + "grad_norm": 0.8761069178581238, + "learning_rate": 2.8774514021667965e-05, + "loss": 0.0949, + "step": 3138 + }, + { + "epoch": 0.452044930875576, + "grad_norm": 2.20910382270813, + "learning_rate": 2.876333283314525e-05, + "loss": 0.0742, + "step": 3139 + }, + { + "epoch": 0.4521889400921659, + "grad_norm": 3.2221388816833496, + "learning_rate": 2.875215087433487e-05, + "loss": 1.5187, + "step": 3140 + }, + { + "epoch": 0.4523329493087558, + "grad_norm": 3.2433929443359375, + "learning_rate": 2.874096814752557e-05, + "loss": 0.3111, + "step": 3141 + }, + { + "epoch": 0.45247695852534564, + "grad_norm": 4.874188423156738, + "learning_rate": 2.872978465500627e-05, + "loss": 1.6649, + "step": 3142 + }, + { + "epoch": 0.4526209677419355, + "grad_norm": 1.2779287099838257, + "learning_rate": 2.8718600399066027e-05, + "loss": 0.1423, + "step": 3143 + }, + { + "epoch": 0.45276497695852536, + "grad_norm": 1.0352801084518433, + "learning_rate": 2.870741538199405e-05, + "loss": 0.1098, + "step": 3144 + }, + { + "epoch": 0.4529089861751152, + "grad_norm": 0.8396167755126953, + "learning_rate": 2.8696229606079722e-05, + "loss": 0.0984, + "step": 3145 + }, + { + "epoch": 0.45305299539170507, + "grad_norm": 2.752439260482788, + "learning_rate": 2.868504307361258e-05, + "loss": 0.2595, + "step": 3146 + }, + { + "epoch": 0.45319700460829493, + "grad_norm": 5.984889507293701, + "learning_rate": 2.8673855786882292e-05, + "loss": 2.0437, + "step": 3147 + }, + { + "epoch": 0.4533410138248848, + "grad_norm": 0.8174430727958679, + "learning_rate": 2.866266774817872e-05, + "loss": 0.1127, + "step": 3148 + }, + { + "epoch": 0.45348502304147464, + "grad_norm": 0.5211092233657837, + "learning_rate": 2.8651478959791835e-05, + "loss": 0.0706, + "step": 3149 + }, + { + "epoch": 0.4536290322580645, + "grad_norm": 0.7935987710952759, + "learning_rate": 2.8640289424011796e-05, + "loss": 0.128, + "step": 3150 + }, + { + "epoch": 0.45377304147465436, + "grad_norm": 3.141660213470459, + "learning_rate": 2.8629099143128907e-05, + "loss": 2.1677, + "step": 3151 + }, + { + "epoch": 0.4539170506912442, + "grad_norm": 1.0555862188339233, + "learning_rate": 2.8617908119433612e-05, + "loss": 0.1258, + "step": 3152 + }, + { + "epoch": 0.4540610599078341, + "grad_norm": 0.7374940514564514, + "learning_rate": 2.8606716355216523e-05, + "loss": 0.1167, + "step": 3153 + }, + { + "epoch": 0.454205069124424, + "grad_norm": 1.245011806488037, + "learning_rate": 2.8595523852768384e-05, + "loss": 0.0971, + "step": 3154 + }, + { + "epoch": 0.45434907834101385, + "grad_norm": 1.1278491020202637, + "learning_rate": 2.858433061438011e-05, + "loss": 0.1411, + "step": 3155 + }, + { + "epoch": 0.4544930875576037, + "grad_norm": 0.6215878129005432, + "learning_rate": 2.8573136642342768e-05, + "loss": 0.0694, + "step": 3156 + }, + { + "epoch": 0.45463709677419356, + "grad_norm": 1.1350764036178589, + "learning_rate": 2.8561941938947556e-05, + "loss": 0.1551, + "step": 3157 + }, + { + "epoch": 0.4547811059907834, + "grad_norm": 8.58501148223877, + "learning_rate": 2.855074650648583e-05, + "loss": 2.0011, + "step": 3158 + }, + { + "epoch": 0.4549251152073733, + "grad_norm": 3.6336820125579834, + "learning_rate": 2.8539550347249105e-05, + "loss": 0.1635, + "step": 3159 + }, + { + "epoch": 0.45506912442396313, + "grad_norm": 3.9015862941741943, + "learning_rate": 2.8528353463529027e-05, + "loss": 1.3083, + "step": 3160 + }, + { + "epoch": 0.455213133640553, + "grad_norm": 1.483136773109436, + "learning_rate": 2.8517155857617405e-05, + "loss": 0.1998, + "step": 3161 + }, + { + "epoch": 0.45535714285714285, + "grad_norm": 2.1231882572174072, + "learning_rate": 2.8505957531806194e-05, + "loss": 0.2472, + "step": 3162 + }, + { + "epoch": 0.4555011520737327, + "grad_norm": 1.3022242784500122, + "learning_rate": 2.849475848838749e-05, + "loss": 0.1401, + "step": 3163 + }, + { + "epoch": 0.45564516129032256, + "grad_norm": 0.9076325297355652, + "learning_rate": 2.8483558729653535e-05, + "loss": 0.0971, + "step": 3164 + }, + { + "epoch": 0.4557891705069124, + "grad_norm": 3.1788878440856934, + "learning_rate": 2.8472358257896732e-05, + "loss": 0.588, + "step": 3165 + }, + { + "epoch": 0.4559331797235023, + "grad_norm": 4.158955097198486, + "learning_rate": 2.8461157075409612e-05, + "loss": 1.4664, + "step": 3166 + }, + { + "epoch": 0.4560771889400922, + "grad_norm": 1.1459623575210571, + "learning_rate": 2.8449955184484854e-05, + "loss": 0.1626, + "step": 3167 + }, + { + "epoch": 0.45622119815668205, + "grad_norm": 0.6358112692832947, + "learning_rate": 2.843875258741529e-05, + "loss": 0.0804, + "step": 3168 + }, + { + "epoch": 0.4563652073732719, + "grad_norm": 0.9961100816726685, + "learning_rate": 2.8427549286493904e-05, + "loss": 0.1188, + "step": 3169 + }, + { + "epoch": 0.45650921658986177, + "grad_norm": 0.3636772334575653, + "learning_rate": 2.8416345284013807e-05, + "loss": 0.0392, + "step": 3170 + }, + { + "epoch": 0.4566532258064516, + "grad_norm": 7.494146823883057, + "learning_rate": 2.840514058226826e-05, + "loss": 1.6366, + "step": 3171 + }, + { + "epoch": 0.4567972350230415, + "grad_norm": 5.8882551193237305, + "learning_rate": 2.8393935183550662e-05, + "loss": 2.4616, + "step": 3172 + }, + { + "epoch": 0.45694124423963134, + "grad_norm": 0.6772561073303223, + "learning_rate": 2.8382729090154563e-05, + "loss": 0.0958, + "step": 3173 + }, + { + "epoch": 0.4570852534562212, + "grad_norm": 0.8593619465827942, + "learning_rate": 2.837152230437366e-05, + "loss": 0.1027, + "step": 3174 + }, + { + "epoch": 0.45722926267281105, + "grad_norm": 4.559393405914307, + "learning_rate": 2.8360314828501772e-05, + "loss": 0.4525, + "step": 3175 + }, + { + "epoch": 0.4573732718894009, + "grad_norm": 0.8788166642189026, + "learning_rate": 2.834910666483288e-05, + "loss": 0.0929, + "step": 3176 + }, + { + "epoch": 0.45751728110599077, + "grad_norm": 4.324271202087402, + "learning_rate": 2.833789781566109e-05, + "loss": 1.0453, + "step": 3177 + }, + { + "epoch": 0.4576612903225806, + "grad_norm": 1.1646126508712769, + "learning_rate": 2.832668828328066e-05, + "loss": 0.1325, + "step": 3178 + }, + { + "epoch": 0.4578052995391705, + "grad_norm": 2.2175345420837402, + "learning_rate": 2.831547806998598e-05, + "loss": 0.2355, + "step": 3179 + }, + { + "epoch": 0.45794930875576034, + "grad_norm": 1.4867053031921387, + "learning_rate": 2.8304267178071587e-05, + "loss": 0.1866, + "step": 3180 + }, + { + "epoch": 0.45809331797235026, + "grad_norm": 0.7841525673866272, + "learning_rate": 2.8293055609832147e-05, + "loss": 0.111, + "step": 3181 + }, + { + "epoch": 0.4582373271889401, + "grad_norm": 0.5195282697677612, + "learning_rate": 2.8281843367562465e-05, + "loss": 0.0621, + "step": 3182 + }, + { + "epoch": 0.45838133640552997, + "grad_norm": 0.602841317653656, + "learning_rate": 2.8270630453557502e-05, + "loss": 0.0916, + "step": 3183 + }, + { + "epoch": 0.45852534562211983, + "grad_norm": 1.294477105140686, + "learning_rate": 2.825941687011233e-05, + "loss": 0.1313, + "step": 3184 + }, + { + "epoch": 0.4586693548387097, + "grad_norm": 3.9680092334747314, + "learning_rate": 2.8248202619522192e-05, + "loss": 1.1293, + "step": 3185 + }, + { + "epoch": 0.45881336405529954, + "grad_norm": 0.9355124235153198, + "learning_rate": 2.8236987704082417e-05, + "loss": 0.1085, + "step": 3186 + }, + { + "epoch": 0.4589573732718894, + "grad_norm": 3.660435676574707, + "learning_rate": 2.822577212608852e-05, + "loss": 0.2337, + "step": 3187 + }, + { + "epoch": 0.45910138248847926, + "grad_norm": 1.456449270248413, + "learning_rate": 2.8214555887836136e-05, + "loss": 0.1649, + "step": 3188 + }, + { + "epoch": 0.4592453917050691, + "grad_norm": 3.1152093410491943, + "learning_rate": 2.8203338991621016e-05, + "loss": 2.2513, + "step": 3189 + }, + { + "epoch": 0.459389400921659, + "grad_norm": 0.9674474596977234, + "learning_rate": 2.819212143973906e-05, + "loss": 0.1569, + "step": 3190 + }, + { + "epoch": 0.45953341013824883, + "grad_norm": 1.3099212646484375, + "learning_rate": 2.818090323448631e-05, + "loss": 0.1306, + "step": 3191 + }, + { + "epoch": 0.4596774193548387, + "grad_norm": 5.576354026794434, + "learning_rate": 2.816968437815894e-05, + "loss": 1.0864, + "step": 3192 + }, + { + "epoch": 0.45982142857142855, + "grad_norm": 3.6845457553863525, + "learning_rate": 2.8158464873053237e-05, + "loss": 0.4995, + "step": 3193 + }, + { + "epoch": 0.45996543778801846, + "grad_norm": 0.9292340874671936, + "learning_rate": 2.8147244721465636e-05, + "loss": 0.1626, + "step": 3194 + }, + { + "epoch": 0.4601094470046083, + "grad_norm": 0.9120882749557495, + "learning_rate": 2.8136023925692712e-05, + "loss": 0.1048, + "step": 3195 + }, + { + "epoch": 0.4602534562211982, + "grad_norm": 3.497121810913086, + "learning_rate": 2.8124802488031166e-05, + "loss": 0.342, + "step": 3196 + }, + { + "epoch": 0.46039746543778803, + "grad_norm": 0.5631290674209595, + "learning_rate": 2.8113580410777823e-05, + "loss": 0.0824, + "step": 3197 + }, + { + "epoch": 0.4605414746543779, + "grad_norm": 0.6889315843582153, + "learning_rate": 2.810235769622964e-05, + "loss": 0.0834, + "step": 3198 + }, + { + "epoch": 0.46068548387096775, + "grad_norm": 1.6514265537261963, + "learning_rate": 2.8091134346683713e-05, + "loss": 0.1818, + "step": 3199 + }, + { + "epoch": 0.4608294930875576, + "grad_norm": 0.8820537328720093, + "learning_rate": 2.8079910364437263e-05, + "loss": 0.0756, + "step": 3200 + }, + { + "epoch": 0.46097350230414746, + "grad_norm": 3.7798171043395996, + "learning_rate": 2.8068685751787636e-05, + "loss": 2.6314, + "step": 3201 + }, + { + "epoch": 0.4611175115207373, + "grad_norm": 1.8690224885940552, + "learning_rate": 2.805746051103232e-05, + "loss": 0.2593, + "step": 3202 + }, + { + "epoch": 0.4612615207373272, + "grad_norm": 0.7925488948822021, + "learning_rate": 2.804623464446891e-05, + "loss": 4.3092, + "step": 3203 + }, + { + "epoch": 0.46140552995391704, + "grad_norm": 1.0224591493606567, + "learning_rate": 2.803500815439516e-05, + "loss": 0.1189, + "step": 3204 + }, + { + "epoch": 0.4615495391705069, + "grad_norm": 3.4491326808929443, + "learning_rate": 2.802378104310892e-05, + "loss": 0.1549, + "step": 3205 + }, + { + "epoch": 0.46169354838709675, + "grad_norm": 0.7828453779220581, + "learning_rate": 2.8012553312908185e-05, + "loss": 0.1233, + "step": 3206 + }, + { + "epoch": 0.4618375576036866, + "grad_norm": 0.8659581542015076, + "learning_rate": 2.8001324966091076e-05, + "loss": 0.0932, + "step": 3207 + }, + { + "epoch": 0.4619815668202765, + "grad_norm": 3.499912738800049, + "learning_rate": 2.7990096004955828e-05, + "loss": 0.3237, + "step": 3208 + }, + { + "epoch": 0.4621255760368664, + "grad_norm": 0.6839143633842468, + "learning_rate": 2.7978866431800816e-05, + "loss": 0.0779, + "step": 3209 + }, + { + "epoch": 0.46226958525345624, + "grad_norm": 1.990738034248352, + "learning_rate": 2.796763624892454e-05, + "loss": 0.1657, + "step": 3210 + }, + { + "epoch": 0.4624135944700461, + "grad_norm": 1.363872766494751, + "learning_rate": 2.7956405458625616e-05, + "loss": 0.0987, + "step": 3211 + }, + { + "epoch": 0.46255760368663595, + "grad_norm": 1.1712000370025635, + "learning_rate": 2.794517406320279e-05, + "loss": 0.173, + "step": 3212 + }, + { + "epoch": 0.4627016129032258, + "grad_norm": 3.7722463607788086, + "learning_rate": 2.7933942064954927e-05, + "loss": 0.2442, + "step": 3213 + }, + { + "epoch": 0.46284562211981567, + "grad_norm": 2.98152232170105, + "learning_rate": 2.792270946618102e-05, + "loss": 0.5351, + "step": 3214 + }, + { + "epoch": 0.4629896313364055, + "grad_norm": 0.7359346151351929, + "learning_rate": 2.7911476269180182e-05, + "loss": 0.0513, + "step": 3215 + }, + { + "epoch": 0.4631336405529954, + "grad_norm": 1.1834733486175537, + "learning_rate": 2.7900242476251646e-05, + "loss": 0.1844, + "step": 3216 + }, + { + "epoch": 0.46327764976958524, + "grad_norm": 4.2965407371521, + "learning_rate": 2.788900808969478e-05, + "loss": 3.1031, + "step": 3217 + }, + { + "epoch": 0.4634216589861751, + "grad_norm": 3.7810912132263184, + "learning_rate": 2.787777311180906e-05, + "loss": 1.3902, + "step": 3218 + }, + { + "epoch": 0.46356566820276496, + "grad_norm": 0.9642695188522339, + "learning_rate": 2.7866537544894082e-05, + "loss": 0.1124, + "step": 3219 + }, + { + "epoch": 0.4637096774193548, + "grad_norm": 0.7531198263168335, + "learning_rate": 2.7855301391249577e-05, + "loss": 0.0853, + "step": 3220 + }, + { + "epoch": 0.4638536866359447, + "grad_norm": 0.9329513311386108, + "learning_rate": 2.7844064653175378e-05, + "loss": 0.1455, + "step": 3221 + }, + { + "epoch": 0.4639976958525346, + "grad_norm": 1.2432924509048462, + "learning_rate": 2.783282733297145e-05, + "loss": 0.143, + "step": 3222 + }, + { + "epoch": 0.46414170506912444, + "grad_norm": 0.7505854368209839, + "learning_rate": 2.7821589432937873e-05, + "loss": 0.0707, + "step": 3223 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.9225820302963257, + "learning_rate": 2.7810350955374852e-05, + "loss": 0.1872, + "step": 3224 + }, + { + "epoch": 0.46442972350230416, + "grad_norm": 3.0628743171691895, + "learning_rate": 2.7799111902582696e-05, + "loss": 0.3098, + "step": 3225 + }, + { + "epoch": 0.464573732718894, + "grad_norm": 5.118321418762207, + "learning_rate": 2.7787872276861855e-05, + "loss": 0.4143, + "step": 3226 + }, + { + "epoch": 0.4647177419354839, + "grad_norm": 4.451242923736572, + "learning_rate": 2.777663208051286e-05, + "loss": 2.0334, + "step": 3227 + }, + { + "epoch": 0.46486175115207373, + "grad_norm": 2.8838906288146973, + "learning_rate": 2.7765391315836396e-05, + "loss": 0.3971, + "step": 3228 + }, + { + "epoch": 0.4650057603686636, + "grad_norm": 0.4744962751865387, + "learning_rate": 2.7754149985133243e-05, + "loss": 0.0447, + "step": 3229 + }, + { + "epoch": 0.46514976958525345, + "grad_norm": 1.003417730331421, + "learning_rate": 2.7742908090704306e-05, + "loss": 0.0816, + "step": 3230 + }, + { + "epoch": 0.4652937788018433, + "grad_norm": 3.4246442317962646, + "learning_rate": 2.77316656348506e-05, + "loss": 0.7946, + "step": 3231 + }, + { + "epoch": 0.46543778801843316, + "grad_norm": 2.453580856323242, + "learning_rate": 2.7720422619873253e-05, + "loss": 0.2222, + "step": 3232 + }, + { + "epoch": 0.465581797235023, + "grad_norm": 0.7540020942687988, + "learning_rate": 2.770917904807352e-05, + "loss": 0.0803, + "step": 3233 + }, + { + "epoch": 0.4657258064516129, + "grad_norm": 2.341265916824341, + "learning_rate": 2.7697934921752753e-05, + "loss": 0.1973, + "step": 3234 + }, + { + "epoch": 0.4658698156682028, + "grad_norm": 1.4922566413879395, + "learning_rate": 2.7686690243212432e-05, + "loss": 0.1758, + "step": 3235 + }, + { + "epoch": 0.46601382488479265, + "grad_norm": 0.6736044883728027, + "learning_rate": 2.767544501475413e-05, + "loss": 0.0935, + "step": 3236 + }, + { + "epoch": 0.4661578341013825, + "grad_norm": 1.591460108757019, + "learning_rate": 2.7664199238679565e-05, + "loss": 0.1547, + "step": 3237 + }, + { + "epoch": 0.46630184331797236, + "grad_norm": 2.8018031120300293, + "learning_rate": 2.7652952917290542e-05, + "loss": 0.2455, + "step": 3238 + }, + { + "epoch": 0.4664458525345622, + "grad_norm": 1.5759328603744507, + "learning_rate": 2.7641706052888984e-05, + "loss": 0.1384, + "step": 3239 + }, + { + "epoch": 0.4665898617511521, + "grad_norm": 0.8221271634101868, + "learning_rate": 2.7630458647776918e-05, + "loss": 0.097, + "step": 3240 + }, + { + "epoch": 0.46673387096774194, + "grad_norm": 5.059264659881592, + "learning_rate": 2.76192107042565e-05, + "loss": 0.509, + "step": 3241 + }, + { + "epoch": 0.4668778801843318, + "grad_norm": 0.7917280793190002, + "learning_rate": 2.760796222462998e-05, + "loss": 0.085, + "step": 3242 + }, + { + "epoch": 0.46702188940092165, + "grad_norm": 0.8827596306800842, + "learning_rate": 2.7596713211199722e-05, + "loss": 0.1381, + "step": 3243 + }, + { + "epoch": 0.4671658986175115, + "grad_norm": 0.3923715054988861, + "learning_rate": 2.7585463666268196e-05, + "loss": 0.0568, + "step": 3244 + }, + { + "epoch": 0.46730990783410137, + "grad_norm": 2.8155972957611084, + "learning_rate": 2.7574213592137992e-05, + "loss": 0.3234, + "step": 3245 + }, + { + "epoch": 0.4674539170506912, + "grad_norm": 0.9515361785888672, + "learning_rate": 2.75629629911118e-05, + "loss": 0.1011, + "step": 3246 + }, + { + "epoch": 0.4675979262672811, + "grad_norm": 2.0706608295440674, + "learning_rate": 2.7551711865492413e-05, + "loss": 0.1551, + "step": 3247 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 5.35781192779541, + "learning_rate": 2.7540460217582743e-05, + "loss": 1.5498, + "step": 3248 + }, + { + "epoch": 0.46788594470046085, + "grad_norm": 0.701564610004425, + "learning_rate": 2.7529208049685807e-05, + "loss": 0.0823, + "step": 3249 + }, + { + "epoch": 0.4680299539170507, + "grad_norm": 5.432807922363281, + "learning_rate": 2.751795536410472e-05, + "loss": 0.515, + "step": 3250 + }, + { + "epoch": 0.46817396313364057, + "grad_norm": 3.864283561706543, + "learning_rate": 2.7506702163142707e-05, + "loss": 0.3719, + "step": 3251 + }, + { + "epoch": 0.4683179723502304, + "grad_norm": 0.5571586489677429, + "learning_rate": 2.7495448449103102e-05, + "loss": 0.0706, + "step": 3252 + }, + { + "epoch": 0.4684619815668203, + "grad_norm": 0.8965988159179688, + "learning_rate": 2.7484194224289334e-05, + "loss": 0.1254, + "step": 3253 + }, + { + "epoch": 0.46860599078341014, + "grad_norm": 0.967717170715332, + "learning_rate": 2.747293949100495e-05, + "loss": 0.1332, + "step": 3254 + }, + { + "epoch": 0.46875, + "grad_norm": 1.9727321863174438, + "learning_rate": 2.7461684251553598e-05, + "loss": 0.1334, + "step": 3255 + }, + { + "epoch": 0.46889400921658986, + "grad_norm": 0.9432382583618164, + "learning_rate": 2.7450428508239024e-05, + "loss": 0.1111, + "step": 3256 + }, + { + "epoch": 0.4690380184331797, + "grad_norm": 1.7070541381835938, + "learning_rate": 2.7439172263365064e-05, + "loss": 0.1581, + "step": 3257 + }, + { + "epoch": 0.4691820276497696, + "grad_norm": 0.6854557991027832, + "learning_rate": 2.7427915519235696e-05, + "loss": 0.0894, + "step": 3258 + }, + { + "epoch": 0.46932603686635943, + "grad_norm": 0.8450667858123779, + "learning_rate": 2.7416658278154967e-05, + "loss": 4.1529, + "step": 3259 + }, + { + "epoch": 0.4694700460829493, + "grad_norm": 2.047111749649048, + "learning_rate": 2.7405400542427035e-05, + "loss": 0.2855, + "step": 3260 + }, + { + "epoch": 0.46961405529953915, + "grad_norm": 4.808757781982422, + "learning_rate": 2.7394142314356157e-05, + "loss": 0.7787, + "step": 3261 + }, + { + "epoch": 0.46975806451612906, + "grad_norm": 1.340011477470398, + "learning_rate": 2.73828835962467e-05, + "loss": 0.1625, + "step": 3262 + }, + { + "epoch": 0.4699020737327189, + "grad_norm": 4.0242815017700195, + "learning_rate": 2.7371624390403116e-05, + "loss": 1.2862, + "step": 3263 + }, + { + "epoch": 0.4700460829493088, + "grad_norm": 1.783811330795288, + "learning_rate": 2.736036469912997e-05, + "loss": 0.2068, + "step": 3264 + }, + { + "epoch": 0.47019009216589863, + "grad_norm": 0.7973631024360657, + "learning_rate": 2.7349104524731916e-05, + "loss": 0.0904, + "step": 3265 + }, + { + "epoch": 0.4703341013824885, + "grad_norm": 0.5556083917617798, + "learning_rate": 2.733784386951372e-05, + "loss": 0.0813, + "step": 3266 + }, + { + "epoch": 0.47047811059907835, + "grad_norm": 0.5359264612197876, + "learning_rate": 2.7326582735780236e-05, + "loss": 0.0834, + "step": 3267 + }, + { + "epoch": 0.4706221198156682, + "grad_norm": 2.151885986328125, + "learning_rate": 2.7315321125836417e-05, + "loss": 0.2247, + "step": 3268 + }, + { + "epoch": 0.47076612903225806, + "grad_norm": 4.385164260864258, + "learning_rate": 2.7304059041987324e-05, + "loss": 2.7088, + "step": 3269 + }, + { + "epoch": 0.4709101382488479, + "grad_norm": 1.1753969192504883, + "learning_rate": 2.7292796486538093e-05, + "loss": 0.1541, + "step": 3270 + }, + { + "epoch": 0.4710541474654378, + "grad_norm": 0.7100812792778015, + "learning_rate": 2.728153346179398e-05, + "loss": 0.065, + "step": 3271 + }, + { + "epoch": 0.47119815668202764, + "grad_norm": 2.316009044647217, + "learning_rate": 2.727026997006032e-05, + "loss": 0.2667, + "step": 3272 + }, + { + "epoch": 0.4713421658986175, + "grad_norm": 0.8970161080360413, + "learning_rate": 2.7259006013642557e-05, + "loss": 0.1025, + "step": 3273 + }, + { + "epoch": 0.47148617511520735, + "grad_norm": 0.987435519695282, + "learning_rate": 2.724774159484622e-05, + "loss": 0.079, + "step": 3274 + }, + { + "epoch": 0.4716301843317972, + "grad_norm": 2.9480020999908447, + "learning_rate": 2.7236476715976937e-05, + "loss": 0.3247, + "step": 3275 + }, + { + "epoch": 0.4717741935483871, + "grad_norm": 1.1996971368789673, + "learning_rate": 2.722521137934043e-05, + "loss": 0.1951, + "step": 3276 + }, + { + "epoch": 0.471918202764977, + "grad_norm": 2.6111457347869873, + "learning_rate": 2.7213945587242508e-05, + "loss": 0.2179, + "step": 3277 + }, + { + "epoch": 0.47206221198156684, + "grad_norm": 0.7549729347229004, + "learning_rate": 2.720267934198909e-05, + "loss": 4.6258, + "step": 3278 + }, + { + "epoch": 0.4722062211981567, + "grad_norm": 0.8815680742263794, + "learning_rate": 2.719141264588617e-05, + "loss": 0.1131, + "step": 3279 + }, + { + "epoch": 0.47235023041474655, + "grad_norm": 4.079017639160156, + "learning_rate": 2.7180145501239845e-05, + "loss": 0.6234, + "step": 3280 + }, + { + "epoch": 0.4724942396313364, + "grad_norm": 1.4893211126327515, + "learning_rate": 2.71688779103563e-05, + "loss": 0.1528, + "step": 3281 + }, + { + "epoch": 0.47263824884792627, + "grad_norm": 0.9464835524559021, + "learning_rate": 2.7157609875541806e-05, + "loss": 0.0929, + "step": 3282 + }, + { + "epoch": 0.4727822580645161, + "grad_norm": 5.096100807189941, + "learning_rate": 2.7146341399102738e-05, + "loss": 2.2908, + "step": 3283 + }, + { + "epoch": 0.472926267281106, + "grad_norm": 0.5781572461128235, + "learning_rate": 2.7135072483345552e-05, + "loss": 0.0597, + "step": 3284 + }, + { + "epoch": 0.47307027649769584, + "grad_norm": 1.4638749361038208, + "learning_rate": 2.712380313057679e-05, + "loss": 0.2335, + "step": 3285 + }, + { + "epoch": 0.4732142857142857, + "grad_norm": 1.5556292533874512, + "learning_rate": 2.7112533343103098e-05, + "loss": 0.144, + "step": 3286 + }, + { + "epoch": 0.47335829493087556, + "grad_norm": 0.5611196756362915, + "learning_rate": 2.710126312323119e-05, + "loss": 0.0692, + "step": 3287 + }, + { + "epoch": 0.4735023041474654, + "grad_norm": 0.9072846174240112, + "learning_rate": 2.7089992473267894e-05, + "loss": 0.1263, + "step": 3288 + }, + { + "epoch": 0.4736463133640553, + "grad_norm": 4.33627462387085, + "learning_rate": 2.7078721395520106e-05, + "loss": 2.3942, + "step": 3289 + }, + { + "epoch": 0.4737903225806452, + "grad_norm": 1.2580726146697998, + "learning_rate": 2.7067449892294812e-05, + "loss": 3.8982, + "step": 3290 + }, + { + "epoch": 0.47393433179723504, + "grad_norm": 2.4493789672851562, + "learning_rate": 2.7056177965899097e-05, + "loss": 0.397, + "step": 3291 + }, + { + "epoch": 0.4740783410138249, + "grad_norm": 0.9040077328681946, + "learning_rate": 2.7044905618640125e-05, + "loss": 0.1192, + "step": 3292 + }, + { + "epoch": 0.47422235023041476, + "grad_norm": 0.7847512364387512, + "learning_rate": 2.703363285282514e-05, + "loss": 0.1074, + "step": 3293 + }, + { + "epoch": 0.4743663594470046, + "grad_norm": 0.587628960609436, + "learning_rate": 2.7022359670761486e-05, + "loss": 0.0929, + "step": 3294 + }, + { + "epoch": 0.4745103686635945, + "grad_norm": 3.17063570022583, + "learning_rate": 2.7011086074756575e-05, + "loss": 0.2051, + "step": 3295 + }, + { + "epoch": 0.47465437788018433, + "grad_norm": 1.7609199285507202, + "learning_rate": 2.699981206711792e-05, + "loss": 0.2218, + "step": 3296 + }, + { + "epoch": 0.4747983870967742, + "grad_norm": 6.128162384033203, + "learning_rate": 2.6988537650153107e-05, + "loss": 2.566, + "step": 3297 + }, + { + "epoch": 0.47494239631336405, + "grad_norm": 0.7898577451705933, + "learning_rate": 2.6977262826169807e-05, + "loss": 0.0858, + "step": 3298 + }, + { + "epoch": 0.4750864055299539, + "grad_norm": 3.4697794914245605, + "learning_rate": 2.6965987597475784e-05, + "loss": 0.2475, + "step": 3299 + }, + { + "epoch": 0.47523041474654376, + "grad_norm": 1.4297587871551514, + "learning_rate": 2.6954711966378874e-05, + "loss": 0.1241, + "step": 3300 + }, + { + "epoch": 0.4753744239631336, + "grad_norm": 0.9694703221321106, + "learning_rate": 2.6943435935187e-05, + "loss": 0.1451, + "step": 3301 + }, + { + "epoch": 0.4755184331797235, + "grad_norm": 3.4357872009277344, + "learning_rate": 2.6932159506208164e-05, + "loss": 0.1601, + "step": 3302 + }, + { + "epoch": 0.4756624423963134, + "grad_norm": 0.8839846849441528, + "learning_rate": 2.692088268175046e-05, + "loss": 0.0993, + "step": 3303 + }, + { + "epoch": 0.47580645161290325, + "grad_norm": 4.084830284118652, + "learning_rate": 2.6909605464122035e-05, + "loss": 0.8724, + "step": 3304 + }, + { + "epoch": 0.4759504608294931, + "grad_norm": 0.8402591943740845, + "learning_rate": 2.6898327855631155e-05, + "loss": 0.1078, + "step": 3305 + }, + { + "epoch": 0.47609447004608296, + "grad_norm": 3.6732499599456787, + "learning_rate": 2.6887049858586144e-05, + "loss": 1.667, + "step": 3306 + }, + { + "epoch": 0.4762384792626728, + "grad_norm": 1.2204047441482544, + "learning_rate": 2.6875771475295403e-05, + "loss": 4.0966, + "step": 3307 + }, + { + "epoch": 0.4763824884792627, + "grad_norm": 1.9958568811416626, + "learning_rate": 2.6864492708067422e-05, + "loss": 0.1939, + "step": 3308 + }, + { + "epoch": 0.47652649769585254, + "grad_norm": 3.5907607078552246, + "learning_rate": 2.685321355921076e-05, + "loss": 0.8956, + "step": 3309 + }, + { + "epoch": 0.4766705069124424, + "grad_norm": 1.9453588724136353, + "learning_rate": 2.6841934031034065e-05, + "loss": 0.249, + "step": 3310 + }, + { + "epoch": 0.47681451612903225, + "grad_norm": 0.7978777885437012, + "learning_rate": 2.6830654125846055e-05, + "loss": 0.1013, + "step": 3311 + }, + { + "epoch": 0.4769585253456221, + "grad_norm": 0.7201615571975708, + "learning_rate": 2.6819373845955527e-05, + "loss": 0.0851, + "step": 3312 + }, + { + "epoch": 0.47710253456221197, + "grad_norm": 0.6832061409950256, + "learning_rate": 2.6808093193671345e-05, + "loss": 0.1028, + "step": 3313 + }, + { + "epoch": 0.4772465437788018, + "grad_norm": 3.821071147918701, + "learning_rate": 2.6796812171302476e-05, + "loss": 1.4708, + "step": 3314 + }, + { + "epoch": 0.4773905529953917, + "grad_norm": 2.1050589084625244, + "learning_rate": 2.6785530781157936e-05, + "loss": 0.2442, + "step": 3315 + }, + { + "epoch": 0.47753456221198154, + "grad_norm": 0.6370155811309814, + "learning_rate": 2.677424902554683e-05, + "loss": 0.0918, + "step": 3316 + }, + { + "epoch": 0.47767857142857145, + "grad_norm": 0.6561856865882874, + "learning_rate": 2.676296690677833e-05, + "loss": 0.078, + "step": 3317 + }, + { + "epoch": 0.4778225806451613, + "grad_norm": 3.1700687408447266, + "learning_rate": 2.6751684427161683e-05, + "loss": 1.4743, + "step": 3318 + }, + { + "epoch": 0.47796658986175117, + "grad_norm": 0.8901135921478271, + "learning_rate": 2.674040158900622e-05, + "loss": 0.154, + "step": 3319 + }, + { + "epoch": 0.478110599078341, + "grad_norm": 0.8281459212303162, + "learning_rate": 2.6729118394621338e-05, + "loss": 0.0893, + "step": 3320 + }, + { + "epoch": 0.4782546082949309, + "grad_norm": 0.6490416526794434, + "learning_rate": 2.671783484631651e-05, + "loss": 0.0654, + "step": 3321 + }, + { + "epoch": 0.47839861751152074, + "grad_norm": 2.2562479972839355, + "learning_rate": 2.670655094640127e-05, + "loss": 0.1565, + "step": 3322 + }, + { + "epoch": 0.4785426267281106, + "grad_norm": 2.3444511890411377, + "learning_rate": 2.6695266697185238e-05, + "loss": 0.2116, + "step": 3323 + }, + { + "epoch": 0.47868663594470046, + "grad_norm": 3.5250558853149414, + "learning_rate": 2.66839821009781e-05, + "loss": 0.6469, + "step": 3324 + }, + { + "epoch": 0.4788306451612903, + "grad_norm": 1.109283208847046, + "learning_rate": 2.667269716008961e-05, + "loss": 0.0961, + "step": 3325 + }, + { + "epoch": 0.47897465437788017, + "grad_norm": 0.4872848689556122, + "learning_rate": 2.6661411876829596e-05, + "loss": 0.0662, + "step": 3326 + }, + { + "epoch": 0.47911866359447003, + "grad_norm": 0.5412502288818359, + "learning_rate": 2.665012625350796e-05, + "loss": 0.0548, + "step": 3327 + }, + { + "epoch": 0.4792626728110599, + "grad_norm": 2.870983362197876, + "learning_rate": 2.663884029243467e-05, + "loss": 0.1293, + "step": 3328 + }, + { + "epoch": 0.47940668202764974, + "grad_norm": 16.86970329284668, + "learning_rate": 2.6627553995919764e-05, + "loss": 3.1067, + "step": 3329 + }, + { + "epoch": 0.47955069124423966, + "grad_norm": 1.1247137784957886, + "learning_rate": 2.6616267366273334e-05, + "loss": 0.142, + "step": 3330 + }, + { + "epoch": 0.4796947004608295, + "grad_norm": 1.2720658779144287, + "learning_rate": 2.6604980405805562e-05, + "loss": 0.1117, + "step": 3331 + }, + { + "epoch": 0.4798387096774194, + "grad_norm": 1.5002855062484741, + "learning_rate": 2.6593693116826694e-05, + "loss": 0.1187, + "step": 3332 + }, + { + "epoch": 0.47998271889400923, + "grad_norm": 0.6330550909042358, + "learning_rate": 2.658240550164704e-05, + "loss": 0.0546, + "step": 3333 + }, + { + "epoch": 0.4801267281105991, + "grad_norm": 1.0957367420196533, + "learning_rate": 2.6571117562576963e-05, + "loss": 0.1396, + "step": 3334 + }, + { + "epoch": 0.48027073732718895, + "grad_norm": 0.8774569630622864, + "learning_rate": 2.6559829301926915e-05, + "loss": 0.0949, + "step": 3335 + }, + { + "epoch": 0.4804147465437788, + "grad_norm": 1.197573184967041, + "learning_rate": 2.65485407220074e-05, + "loss": 0.1161, + "step": 3336 + }, + { + "epoch": 0.48055875576036866, + "grad_norm": 1.9916743040084839, + "learning_rate": 2.6537251825128984e-05, + "loss": 0.1873, + "step": 3337 + }, + { + "epoch": 0.4807027649769585, + "grad_norm": 0.6677519679069519, + "learning_rate": 2.6525962613602318e-05, + "loss": 0.0855, + "step": 3338 + }, + { + "epoch": 0.4808467741935484, + "grad_norm": 0.7551817893981934, + "learning_rate": 2.651467308973809e-05, + "loss": 0.0977, + "step": 3339 + }, + { + "epoch": 0.48099078341013823, + "grad_norm": 2.6984848976135254, + "learning_rate": 2.6503383255847075e-05, + "loss": 0.1704, + "step": 3340 + }, + { + "epoch": 0.4811347926267281, + "grad_norm": 7.559457778930664, + "learning_rate": 2.64920931142401e-05, + "loss": 0.2825, + "step": 3341 + }, + { + "epoch": 0.48127880184331795, + "grad_norm": 0.8448072671890259, + "learning_rate": 2.6480802667228054e-05, + "loss": 0.0974, + "step": 3342 + }, + { + "epoch": 0.4814228110599078, + "grad_norm": 2.1441779136657715, + "learning_rate": 2.6469511917121896e-05, + "loss": 0.191, + "step": 3343 + }, + { + "epoch": 0.4815668202764977, + "grad_norm": 5.673854827880859, + "learning_rate": 2.6458220866232648e-05, + "loss": 2.1655, + "step": 3344 + }, + { + "epoch": 0.4817108294930876, + "grad_norm": 1.109179973602295, + "learning_rate": 2.6446929516871365e-05, + "loss": 0.1669, + "step": 3345 + }, + { + "epoch": 0.48185483870967744, + "grad_norm": 3.727886438369751, + "learning_rate": 2.6435637871349216e-05, + "loss": 1.6471, + "step": 3346 + }, + { + "epoch": 0.4819988479262673, + "grad_norm": 1.9728912115097046, + "learning_rate": 2.642434593197739e-05, + "loss": 0.1611, + "step": 3347 + }, + { + "epoch": 0.48214285714285715, + "grad_norm": 0.9186723232269287, + "learning_rate": 2.6413053701067142e-05, + "loss": 0.0972, + "step": 3348 + }, + { + "epoch": 0.482286866359447, + "grad_norm": 1.4119352102279663, + "learning_rate": 2.6401761180929797e-05, + "loss": 0.1463, + "step": 3349 + }, + { + "epoch": 0.48243087557603687, + "grad_norm": 4.640384197235107, + "learning_rate": 2.639046837387673e-05, + "loss": 2.3198, + "step": 3350 + }, + { + "epoch": 0.4825748847926267, + "grad_norm": 0.8744766116142273, + "learning_rate": 2.637917528221939e-05, + "loss": 0.0774, + "step": 3351 + }, + { + "epoch": 0.4827188940092166, + "grad_norm": 0.6010303497314453, + "learning_rate": 2.6367881908269255e-05, + "loss": 0.0603, + "step": 3352 + }, + { + "epoch": 0.48286290322580644, + "grad_norm": 1.9814013242721558, + "learning_rate": 2.6356588254337893e-05, + "loss": 0.2283, + "step": 3353 + }, + { + "epoch": 0.4830069124423963, + "grad_norm": 1.439127802848816, + "learning_rate": 2.6345294322736914e-05, + "loss": 0.2393, + "step": 3354 + }, + { + "epoch": 0.48315092165898615, + "grad_norm": 1.5646750926971436, + "learning_rate": 2.6334000115777978e-05, + "loss": 0.1561, + "step": 3355 + }, + { + "epoch": 0.483294930875576, + "grad_norm": 0.3861731290817261, + "learning_rate": 2.6322705635772815e-05, + "loss": 0.0474, + "step": 3356 + }, + { + "epoch": 0.4834389400921659, + "grad_norm": 0.9134849905967712, + "learning_rate": 2.6311410885033204e-05, + "loss": 0.1143, + "step": 3357 + }, + { + "epoch": 0.4835829493087558, + "grad_norm": 2.0950682163238525, + "learning_rate": 2.6300115865870977e-05, + "loss": 0.2298, + "step": 3358 + }, + { + "epoch": 0.48372695852534564, + "grad_norm": 2.2625598907470703, + "learning_rate": 2.6288820580598035e-05, + "loss": 0.2618, + "step": 3359 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 1.3987911939620972, + "learning_rate": 2.6277525031526318e-05, + "loss": 0.1512, + "step": 3360 + }, + { + "epoch": 0.48401497695852536, + "grad_norm": 3.2141923904418945, + "learning_rate": 2.6266229220967818e-05, + "loss": 0.2577, + "step": 3361 + }, + { + "epoch": 0.4841589861751152, + "grad_norm": 0.8169183135032654, + "learning_rate": 2.62549331512346e-05, + "loss": 0.0991, + "step": 3362 + }, + { + "epoch": 0.48430299539170507, + "grad_norm": 1.2545981407165527, + "learning_rate": 2.624363682463876e-05, + "loss": 0.1406, + "step": 3363 + }, + { + "epoch": 0.48444700460829493, + "grad_norm": 4.395308017730713, + "learning_rate": 2.6232340243492464e-05, + "loss": 0.5865, + "step": 3364 + }, + { + "epoch": 0.4845910138248848, + "grad_norm": 1.4019044637680054, + "learning_rate": 2.6221043410107914e-05, + "loss": 0.1499, + "step": 3365 + }, + { + "epoch": 0.48473502304147464, + "grad_norm": 7.883999347686768, + "learning_rate": 2.6209746326797373e-05, + "loss": 2.0995, + "step": 3366 + }, + { + "epoch": 0.4848790322580645, + "grad_norm": 2.547405242919922, + "learning_rate": 2.6198448995873164e-05, + "loss": 0.1744, + "step": 3367 + }, + { + "epoch": 0.48502304147465436, + "grad_norm": 0.49822404980659485, + "learning_rate": 2.6187151419647642e-05, + "loss": 0.0685, + "step": 3368 + }, + { + "epoch": 0.4851670506912442, + "grad_norm": 0.6719576120376587, + "learning_rate": 2.617585360043322e-05, + "loss": 0.0807, + "step": 3369 + }, + { + "epoch": 0.4853110599078341, + "grad_norm": 0.625924289226532, + "learning_rate": 2.6164555540542366e-05, + "loss": 0.0831, + "step": 3370 + }, + { + "epoch": 0.485455069124424, + "grad_norm": 5.064886569976807, + "learning_rate": 2.6153257242287593e-05, + "loss": 1.3008, + "step": 3371 + }, + { + "epoch": 0.48559907834101385, + "grad_norm": 5.887736797332764, + "learning_rate": 2.6141958707981457e-05, + "loss": 0.9684, + "step": 3372 + }, + { + "epoch": 0.4857430875576037, + "grad_norm": 0.7390227317810059, + "learning_rate": 2.6130659939936576e-05, + "loss": 0.097, + "step": 3373 + }, + { + "epoch": 0.48588709677419356, + "grad_norm": 1.1735531091690063, + "learning_rate": 2.6119360940465604e-05, + "loss": 0.12, + "step": 3374 + }, + { + "epoch": 0.4860311059907834, + "grad_norm": 0.6214638948440552, + "learning_rate": 2.610806171188125e-05, + "loss": 0.0671, + "step": 3375 + }, + { + "epoch": 0.4861751152073733, + "grad_norm": 1.785412073135376, + "learning_rate": 2.609676225649626e-05, + "loss": 0.123, + "step": 3376 + }, + { + "epoch": 0.48631912442396313, + "grad_norm": 0.9128004312515259, + "learning_rate": 2.608546257662343e-05, + "loss": 0.1013, + "step": 3377 + }, + { + "epoch": 0.486463133640553, + "grad_norm": 3.3891632556915283, + "learning_rate": 2.607416267457562e-05, + "loss": 0.6921, + "step": 3378 + }, + { + "epoch": 0.48660714285714285, + "grad_norm": 1.173492193222046, + "learning_rate": 2.6062862552665708e-05, + "loss": 0.1596, + "step": 3379 + }, + { + "epoch": 0.4867511520737327, + "grad_norm": 0.9014896750450134, + "learning_rate": 2.6051562213206632e-05, + "loss": 0.1056, + "step": 3380 + }, + { + "epoch": 0.48689516129032256, + "grad_norm": 1.1795252561569214, + "learning_rate": 2.6040261658511367e-05, + "loss": 0.1572, + "step": 3381 + }, + { + "epoch": 0.4870391705069124, + "grad_norm": 1.118417501449585, + "learning_rate": 2.6028960890892945e-05, + "loss": 0.1425, + "step": 3382 + }, + { + "epoch": 0.4871831797235023, + "grad_norm": 1.2608473300933838, + "learning_rate": 2.6017659912664426e-05, + "loss": 0.0929, + "step": 3383 + }, + { + "epoch": 0.4873271889400922, + "grad_norm": 0.9694226980209351, + "learning_rate": 2.600635872613893e-05, + "loss": 0.0907, + "step": 3384 + }, + { + "epoch": 0.48747119815668205, + "grad_norm": 4.000730514526367, + "learning_rate": 2.599505733362959e-05, + "loss": 0.394, + "step": 3385 + }, + { + "epoch": 0.4876152073732719, + "grad_norm": 1.149048089981079, + "learning_rate": 2.5983755737449622e-05, + "loss": 0.1304, + "step": 3386 + }, + { + "epoch": 0.48775921658986177, + "grad_norm": 1.483581304550171, + "learning_rate": 2.5972453939912255e-05, + "loss": 0.1508, + "step": 3387 + }, + { + "epoch": 0.4879032258064516, + "grad_norm": 1.8968478441238403, + "learning_rate": 2.596115194333077e-05, + "loss": 0.1189, + "step": 3388 + }, + { + "epoch": 0.4880472350230415, + "grad_norm": 0.911567211151123, + "learning_rate": 2.5949849750018484e-05, + "loss": 0.1254, + "step": 3389 + }, + { + "epoch": 0.48819124423963134, + "grad_norm": 0.9346846342086792, + "learning_rate": 2.5938547362288752e-05, + "loss": 0.0938, + "step": 3390 + }, + { + "epoch": 0.4883352534562212, + "grad_norm": 0.49232393503189087, + "learning_rate": 2.5927244782454978e-05, + "loss": 0.0657, + "step": 3391 + }, + { + "epoch": 0.48847926267281105, + "grad_norm": 4.615867614746094, + "learning_rate": 2.5915942012830596e-05, + "loss": 2.5444, + "step": 3392 + }, + { + "epoch": 0.4886232718894009, + "grad_norm": 0.47827011346817017, + "learning_rate": 2.5904639055729092e-05, + "loss": 0.0658, + "step": 3393 + }, + { + "epoch": 0.48876728110599077, + "grad_norm": 0.8425323963165283, + "learning_rate": 2.5893335913463967e-05, + "loss": 0.1059, + "step": 3394 + }, + { + "epoch": 0.4889112903225806, + "grad_norm": 3.047060489654541, + "learning_rate": 2.5882032588348775e-05, + "loss": 2.4029, + "step": 3395 + }, + { + "epoch": 0.4890552995391705, + "grad_norm": 0.555214524269104, + "learning_rate": 2.5870729082697126e-05, + "loss": 0.0486, + "step": 3396 + }, + { + "epoch": 0.48919930875576034, + "grad_norm": 6.94074010848999, + "learning_rate": 2.5859425398822634e-05, + "loss": 1.5739, + "step": 3397 + }, + { + "epoch": 0.48934331797235026, + "grad_norm": 1.1142168045043945, + "learning_rate": 2.5848121539038962e-05, + "loss": 0.112, + "step": 3398 + }, + { + "epoch": 0.4894873271889401, + "grad_norm": 1.1413624286651611, + "learning_rate": 2.583681750565981e-05, + "loss": 0.1263, + "step": 3399 + }, + { + "epoch": 0.48963133640552997, + "grad_norm": 1.5502761602401733, + "learning_rate": 2.5825513300998922e-05, + "loss": 0.099, + "step": 3400 + }, + { + "epoch": 0.48977534562211983, + "grad_norm": 1.8340809345245361, + "learning_rate": 2.5814208927370058e-05, + "loss": 0.1196, + "step": 3401 + }, + { + "epoch": 0.4899193548387097, + "grad_norm": 1.3990120887756348, + "learning_rate": 2.5802904387087034e-05, + "loss": 0.1504, + "step": 3402 + }, + { + "epoch": 0.49006336405529954, + "grad_norm": 0.5931093096733093, + "learning_rate": 2.579159968246368e-05, + "loss": 0.065, + "step": 3403 + }, + { + "epoch": 0.4902073732718894, + "grad_norm": 1.9483040571212769, + "learning_rate": 2.5780294815813872e-05, + "loss": 0.1745, + "step": 3404 + }, + { + "epoch": 0.49035138248847926, + "grad_norm": 1.470651388168335, + "learning_rate": 2.576898978945152e-05, + "loss": 0.1234, + "step": 3405 + }, + { + "epoch": 0.4904953917050691, + "grad_norm": 0.8947877883911133, + "learning_rate": 2.575768460569056e-05, + "loss": 0.0861, + "step": 3406 + }, + { + "epoch": 0.490639400921659, + "grad_norm": 0.5502840280532837, + "learning_rate": 2.574637926684496e-05, + "loss": 0.0854, + "step": 3407 + }, + { + "epoch": 0.49078341013824883, + "grad_norm": 0.4016472101211548, + "learning_rate": 2.573507377522873e-05, + "loss": 0.0705, + "step": 3408 + }, + { + "epoch": 0.4909274193548387, + "grad_norm": 1.1245472431182861, + "learning_rate": 2.5723768133155895e-05, + "loss": 0.1266, + "step": 3409 + }, + { + "epoch": 0.49107142857142855, + "grad_norm": 1.0722486972808838, + "learning_rate": 2.571246234294053e-05, + "loss": 0.1449, + "step": 3410 + }, + { + "epoch": 0.49121543778801846, + "grad_norm": 1.5643419027328491, + "learning_rate": 2.5701156406896725e-05, + "loss": 0.1641, + "step": 3411 + }, + { + "epoch": 0.4913594470046083, + "grad_norm": 1.4280754327774048, + "learning_rate": 2.5689850327338606e-05, + "loss": 0.1264, + "step": 3412 + }, + { + "epoch": 0.4915034562211982, + "grad_norm": 0.5215620994567871, + "learning_rate": 2.5678544106580328e-05, + "loss": 0.047, + "step": 3413 + }, + { + "epoch": 0.49164746543778803, + "grad_norm": 0.8462010622024536, + "learning_rate": 2.5667237746936067e-05, + "loss": 0.1066, + "step": 3414 + }, + { + "epoch": 0.4917914746543779, + "grad_norm": 0.8146925568580627, + "learning_rate": 2.5655931250720046e-05, + "loss": 0.0994, + "step": 3415 + }, + { + "epoch": 0.49193548387096775, + "grad_norm": 5.454751968383789, + "learning_rate": 2.56446246202465e-05, + "loss": 2.0246, + "step": 3416 + }, + { + "epoch": 0.4920794930875576, + "grad_norm": 0.5505945086479187, + "learning_rate": 2.5633317857829697e-05, + "loss": 0.1005, + "step": 3417 + }, + { + "epoch": 0.49222350230414746, + "grad_norm": 1.7952357530593872, + "learning_rate": 2.562201096578393e-05, + "loss": 0.2176, + "step": 3418 + }, + { + "epoch": 0.4923675115207373, + "grad_norm": 1.086435079574585, + "learning_rate": 2.5610703946423526e-05, + "loss": 0.1465, + "step": 3419 + }, + { + "epoch": 0.4925115207373272, + "grad_norm": 5.358489990234375, + "learning_rate": 2.559939680206282e-05, + "loss": 2.3058, + "step": 3420 + }, + { + "epoch": 0.49265552995391704, + "grad_norm": 9.24312973022461, + "learning_rate": 2.558808953501619e-05, + "loss": 1.9861, + "step": 3421 + }, + { + "epoch": 0.4927995391705069, + "grad_norm": 1.275498628616333, + "learning_rate": 2.557678214759804e-05, + "loss": 0.1321, + "step": 3422 + }, + { + "epoch": 0.49294354838709675, + "grad_norm": 2.2042396068573, + "learning_rate": 2.5565474642122782e-05, + "loss": 0.2267, + "step": 3423 + }, + { + "epoch": 0.4930875576036866, + "grad_norm": 0.9270612001419067, + "learning_rate": 2.5554167020904868e-05, + "loss": 0.1073, + "step": 3424 + }, + { + "epoch": 0.4932315668202765, + "grad_norm": 0.4819442927837372, + "learning_rate": 2.554285928625877e-05, + "loss": 0.0612, + "step": 3425 + }, + { + "epoch": 0.4933755760368664, + "grad_norm": 3.549428939819336, + "learning_rate": 2.553155144049897e-05, + "loss": 2.6969, + "step": 3426 + }, + { + "epoch": 0.49351958525345624, + "grad_norm": 6.440474987030029, + "learning_rate": 2.5520243485939997e-05, + "loss": 1.33, + "step": 3427 + }, + { + "epoch": 0.4936635944700461, + "grad_norm": 0.642499566078186, + "learning_rate": 2.5508935424896387e-05, + "loss": 0.0947, + "step": 3428 + }, + { + "epoch": 0.49380760368663595, + "grad_norm": 1.2833855152130127, + "learning_rate": 2.5497627259682695e-05, + "loss": 0.1324, + "step": 3429 + }, + { + "epoch": 0.4939516129032258, + "grad_norm": 1.111699104309082, + "learning_rate": 2.5486318992613506e-05, + "loss": 4.0369, + "step": 3430 + }, + { + "epoch": 0.49409562211981567, + "grad_norm": 2.133898973464966, + "learning_rate": 2.547501062600342e-05, + "loss": 0.355, + "step": 3431 + }, + { + "epoch": 0.4942396313364055, + "grad_norm": 0.6454287767410278, + "learning_rate": 2.5463702162167064e-05, + "loss": 0.0603, + "step": 3432 + }, + { + "epoch": 0.4943836405529954, + "grad_norm": 4.377740859985352, + "learning_rate": 2.5452393603419077e-05, + "loss": 1.798, + "step": 3433 + }, + { + "epoch": 0.49452764976958524, + "grad_norm": 0.7503531575202942, + "learning_rate": 2.544108495207412e-05, + "loss": 0.1027, + "step": 3434 + }, + { + "epoch": 0.4946716589861751, + "grad_norm": 1.2306658029556274, + "learning_rate": 2.5429776210446877e-05, + "loss": 0.1417, + "step": 3435 + }, + { + "epoch": 0.49481566820276496, + "grad_norm": 4.007768154144287, + "learning_rate": 2.541846738085204e-05, + "loss": 0.2564, + "step": 3436 + }, + { + "epoch": 0.4949596774193548, + "grad_norm": 6.009570598602295, + "learning_rate": 2.5407158465604343e-05, + "loss": 0.4112, + "step": 3437 + }, + { + "epoch": 0.4951036866359447, + "grad_norm": 2.6674435138702393, + "learning_rate": 2.5395849467018503e-05, + "loss": 0.3224, + "step": 3438 + }, + { + "epoch": 0.4952476958525346, + "grad_norm": 0.5600869059562683, + "learning_rate": 2.538454038740928e-05, + "loss": 0.0631, + "step": 3439 + }, + { + "epoch": 0.49539170506912444, + "grad_norm": 0.5590953230857849, + "learning_rate": 2.5373231229091432e-05, + "loss": 0.055, + "step": 3440 + }, + { + "epoch": 0.4955357142857143, + "grad_norm": 0.625600278377533, + "learning_rate": 2.5361921994379762e-05, + "loss": 0.075, + "step": 3441 + }, + { + "epoch": 0.49567972350230416, + "grad_norm": 2.716132402420044, + "learning_rate": 2.535061268558906e-05, + "loss": 0.1943, + "step": 3442 + }, + { + "epoch": 0.495823732718894, + "grad_norm": 0.8270092010498047, + "learning_rate": 2.5339303305034147e-05, + "loss": 0.0906, + "step": 3443 + }, + { + "epoch": 0.4959677419354839, + "grad_norm": 6.396216869354248, + "learning_rate": 2.5327993855029846e-05, + "loss": 2.3108, + "step": 3444 + }, + { + "epoch": 0.49611175115207373, + "grad_norm": 2.624830484390259, + "learning_rate": 2.5316684337891005e-05, + "loss": 0.2135, + "step": 3445 + }, + { + "epoch": 0.4962557603686636, + "grad_norm": 0.7406347990036011, + "learning_rate": 2.5305374755932482e-05, + "loss": 0.0912, + "step": 3446 + }, + { + "epoch": 0.49639976958525345, + "grad_norm": 0.9809740781784058, + "learning_rate": 2.5294065111469146e-05, + "loss": 0.1589, + "step": 3447 + }, + { + "epoch": 0.4965437788018433, + "grad_norm": 1.0424245595932007, + "learning_rate": 2.5282755406815882e-05, + "loss": 0.1349, + "step": 3448 + }, + { + "epoch": 0.49668778801843316, + "grad_norm": 0.9567108154296875, + "learning_rate": 2.5271445644287588e-05, + "loss": 0.113, + "step": 3449 + }, + { + "epoch": 0.496831797235023, + "grad_norm": 0.6222454309463501, + "learning_rate": 2.5260135826199177e-05, + "loss": 0.1082, + "step": 3450 + }, + { + "epoch": 0.4969758064516129, + "grad_norm": 3.0976462364196777, + "learning_rate": 2.5248825954865564e-05, + "loss": 0.1098, + "step": 3451 + }, + { + "epoch": 0.4971198156682028, + "grad_norm": 2.2813220024108887, + "learning_rate": 2.5237516032601675e-05, + "loss": 0.1869, + "step": 3452 + }, + { + "epoch": 0.49726382488479265, + "grad_norm": 4.739388942718506, + "learning_rate": 2.5226206061722453e-05, + "loss": 2.2985, + "step": 3453 + }, + { + "epoch": 0.4974078341013825, + "grad_norm": 0.562228262424469, + "learning_rate": 2.521489604454285e-05, + "loss": 0.0783, + "step": 3454 + }, + { + "epoch": 0.49755184331797236, + "grad_norm": 1.3386988639831543, + "learning_rate": 2.5203585983377838e-05, + "loss": 3.9115, + "step": 3455 + }, + { + "epoch": 0.4976958525345622, + "grad_norm": 2.7840592861175537, + "learning_rate": 2.5192275880542364e-05, + "loss": 2.3299, + "step": 3456 + }, + { + "epoch": 0.4978398617511521, + "grad_norm": 1.1554197072982788, + "learning_rate": 2.518096573835143e-05, + "loss": 0.1343, + "step": 3457 + }, + { + "epoch": 0.49798387096774194, + "grad_norm": 4.753363132476807, + "learning_rate": 2.5169655559120002e-05, + "loss": 0.8931, + "step": 3458 + }, + { + "epoch": 0.4981278801843318, + "grad_norm": 1.8443394899368286, + "learning_rate": 2.5158345345163086e-05, + "loss": 0.1411, + "step": 3459 + }, + { + "epoch": 0.49827188940092165, + "grad_norm": 1.967227578163147, + "learning_rate": 2.514703509879568e-05, + "loss": 0.1925, + "step": 3460 + }, + { + "epoch": 0.4984158986175115, + "grad_norm": 0.49334120750427246, + "learning_rate": 2.513572482233279e-05, + "loss": 0.0827, + "step": 3461 + }, + { + "epoch": 0.49855990783410137, + "grad_norm": 0.9086175560951233, + "learning_rate": 2.5124414518089428e-05, + "loss": 0.0915, + "step": 3462 + }, + { + "epoch": 0.4987039170506912, + "grad_norm": 1.0022859573364258, + "learning_rate": 2.5113104188380615e-05, + "loss": 0.1174, + "step": 3463 + }, + { + "epoch": 0.4988479262672811, + "grad_norm": 5.870418548583984, + "learning_rate": 2.510179383552137e-05, + "loss": 0.9392, + "step": 3464 + }, + { + "epoch": 0.49899193548387094, + "grad_norm": 0.658474326133728, + "learning_rate": 2.5090483461826737e-05, + "loss": 0.0566, + "step": 3465 + }, + { + "epoch": 0.49913594470046085, + "grad_norm": 1.284199595451355, + "learning_rate": 2.5079173069611734e-05, + "loss": 0.1363, + "step": 3466 + }, + { + "epoch": 0.4992799539170507, + "grad_norm": 2.471308708190918, + "learning_rate": 2.5067862661191398e-05, + "loss": 0.2274, + "step": 3467 + }, + { + "epoch": 0.49942396313364057, + "grad_norm": 0.9316946268081665, + "learning_rate": 2.5056552238880783e-05, + "loss": 0.1175, + "step": 3468 + }, + { + "epoch": 0.4995679723502304, + "grad_norm": 0.8511991500854492, + "learning_rate": 2.5045241804994923e-05, + "loss": 0.0827, + "step": 3469 + }, + { + "epoch": 0.4997119815668203, + "grad_norm": 0.8394210934638977, + "learning_rate": 2.5033931361848866e-05, + "loss": 4.4514, + "step": 3470 + }, + { + "epoch": 0.49985599078341014, + "grad_norm": 0.434726357460022, + "learning_rate": 2.5022620911757667e-05, + "loss": 0.0603, + "step": 3471 + }, + { + "epoch": 0.5, + "grad_norm": 1.406632900238037, + "learning_rate": 2.501131045703636e-05, + "loss": 3.7748, + "step": 3472 + }, + { + "epoch": 0.5001440092165899, + "grad_norm": 2.522080659866333, + "learning_rate": 2.5e-05, + "loss": 0.1949, + "step": 3473 + }, + { + "epoch": 0.5002880184331797, + "grad_norm": 3.587692975997925, + "learning_rate": 2.4988689542963647e-05, + "loss": 0.0983, + "step": 3474 + }, + { + "epoch": 0.5004320276497696, + "grad_norm": 1.4649560451507568, + "learning_rate": 2.4977379088242342e-05, + "loss": 0.1722, + "step": 3475 + }, + { + "epoch": 0.5005760368663594, + "grad_norm": 0.8514876961708069, + "learning_rate": 2.4966068638151137e-05, + "loss": 0.086, + "step": 3476 + }, + { + "epoch": 0.5007200460829493, + "grad_norm": 0.8922461271286011, + "learning_rate": 2.4954758195005083e-05, + "loss": 0.0848, + "step": 3477 + }, + { + "epoch": 0.5008640552995391, + "grad_norm": 2.0422070026397705, + "learning_rate": 2.4943447761119223e-05, + "loss": 0.1659, + "step": 3478 + }, + { + "epoch": 0.501008064516129, + "grad_norm": 5.08585262298584, + "learning_rate": 2.4932137338808608e-05, + "loss": 1.7043, + "step": 3479 + }, + { + "epoch": 0.5011520737327189, + "grad_norm": 6.896489143371582, + "learning_rate": 2.492082693038828e-05, + "loss": 0.4441, + "step": 3480 + }, + { + "epoch": 0.5012960829493087, + "grad_norm": 0.6664833426475525, + "learning_rate": 2.490951653817328e-05, + "loss": 0.0761, + "step": 3481 + }, + { + "epoch": 0.5014400921658986, + "grad_norm": 6.0540666580200195, + "learning_rate": 2.4898206164478638e-05, + "loss": 2.1016, + "step": 3482 + }, + { + "epoch": 0.5015841013824884, + "grad_norm": 5.596919536590576, + "learning_rate": 2.4886895811619398e-05, + "loss": 2.3997, + "step": 3483 + }, + { + "epoch": 0.5017281105990783, + "grad_norm": 0.6174638271331787, + "learning_rate": 2.4875585481910585e-05, + "loss": 0.0529, + "step": 3484 + }, + { + "epoch": 0.5018721198156681, + "grad_norm": 0.3695104718208313, + "learning_rate": 2.4864275177667224e-05, + "loss": 0.0476, + "step": 3485 + }, + { + "epoch": 0.5020161290322581, + "grad_norm": 1.1789277791976929, + "learning_rate": 2.4852964901204332e-05, + "loss": 0.1924, + "step": 3486 + }, + { + "epoch": 0.502160138248848, + "grad_norm": 4.147573947906494, + "learning_rate": 2.4841654654836913e-05, + "loss": 2.3175, + "step": 3487 + }, + { + "epoch": 0.5023041474654378, + "grad_norm": 1.2321789264678955, + "learning_rate": 2.4830344440879997e-05, + "loss": 0.1262, + "step": 3488 + }, + { + "epoch": 0.5024481566820277, + "grad_norm": 0.31048545241355896, + "learning_rate": 2.4819034261648573e-05, + "loss": 0.0564, + "step": 3489 + }, + { + "epoch": 0.5025921658986175, + "grad_norm": 1.2202868461608887, + "learning_rate": 2.480772411945763e-05, + "loss": 0.1115, + "step": 3490 + }, + { + "epoch": 0.5027361751152074, + "grad_norm": 5.298651218414307, + "learning_rate": 2.4796414016622165e-05, + "loss": 0.7949, + "step": 3491 + }, + { + "epoch": 0.5028801843317973, + "grad_norm": 4.307665824890137, + "learning_rate": 2.4785103955457148e-05, + "loss": 1.4719, + "step": 3492 + }, + { + "epoch": 0.5030241935483871, + "grad_norm": 1.7000360488891602, + "learning_rate": 2.477379393827755e-05, + "loss": 0.0756, + "step": 3493 + }, + { + "epoch": 0.503168202764977, + "grad_norm": 0.7223773002624512, + "learning_rate": 2.476248396739833e-05, + "loss": 0.1072, + "step": 3494 + }, + { + "epoch": 0.5033122119815668, + "grad_norm": 5.178288459777832, + "learning_rate": 2.4751174045134442e-05, + "loss": 1.5401, + "step": 3495 + }, + { + "epoch": 0.5034562211981567, + "grad_norm": 3.1704962253570557, + "learning_rate": 2.473986417380083e-05, + "loss": 1.6199, + "step": 3496 + }, + { + "epoch": 0.5036002304147466, + "grad_norm": 0.36327454447746277, + "learning_rate": 2.4728554355712414e-05, + "loss": 0.0333, + "step": 3497 + }, + { + "epoch": 0.5037442396313364, + "grad_norm": 0.5874791741371155, + "learning_rate": 2.471724459318412e-05, + "loss": 0.0712, + "step": 3498 + }, + { + "epoch": 0.5038882488479263, + "grad_norm": 2.3379180431365967, + "learning_rate": 2.470593488853086e-05, + "loss": 0.2296, + "step": 3499 + }, + { + "epoch": 0.5040322580645161, + "grad_norm": 2.186164140701294, + "learning_rate": 2.4694625244067527e-05, + "loss": 0.2963, + "step": 3500 + }, + { + "epoch": 0.504176267281106, + "grad_norm": 1.4451954364776611, + "learning_rate": 2.4683315662109e-05, + "loss": 0.1373, + "step": 3501 + }, + { + "epoch": 0.5043202764976958, + "grad_norm": 0.819680392742157, + "learning_rate": 2.467200614497016e-05, + "loss": 0.0989, + "step": 3502 + }, + { + "epoch": 0.5044642857142857, + "grad_norm": 1.4498411417007446, + "learning_rate": 2.466069669496586e-05, + "loss": 0.1344, + "step": 3503 + }, + { + "epoch": 0.5046082949308756, + "grad_norm": 2.792268753051758, + "learning_rate": 2.4649387314410945e-05, + "loss": 0.5553, + "step": 3504 + }, + { + "epoch": 0.5047523041474654, + "grad_norm": 4.712468147277832, + "learning_rate": 2.4638078005620243e-05, + "loss": 1.8836, + "step": 3505 + }, + { + "epoch": 0.5048963133640553, + "grad_norm": 2.5596864223480225, + "learning_rate": 2.4626768770908574e-05, + "loss": 0.4024, + "step": 3506 + }, + { + "epoch": 0.5050403225806451, + "grad_norm": 2.2221014499664307, + "learning_rate": 2.4615459612590734e-05, + "loss": 0.3636, + "step": 3507 + }, + { + "epoch": 0.505184331797235, + "grad_norm": 0.48532232642173767, + "learning_rate": 2.4604150532981513e-05, + "loss": 0.0572, + "step": 3508 + }, + { + "epoch": 0.5053283410138248, + "grad_norm": 3.8436460494995117, + "learning_rate": 2.4592841534395673e-05, + "loss": 1.2301, + "step": 3509 + }, + { + "epoch": 0.5054723502304147, + "grad_norm": 2.21529221534729, + "learning_rate": 2.4581532619147968e-05, + "loss": 0.2739, + "step": 3510 + }, + { + "epoch": 0.5056163594470046, + "grad_norm": 4.1084418296813965, + "learning_rate": 2.4570223789553136e-05, + "loss": 1.8438, + "step": 3511 + }, + { + "epoch": 0.5057603686635944, + "grad_norm": 0.7923262119293213, + "learning_rate": 2.455891504792589e-05, + "loss": 0.1122, + "step": 3512 + }, + { + "epoch": 0.5059043778801844, + "grad_norm": 1.196223258972168, + "learning_rate": 2.4547606396580936e-05, + "loss": 0.1093, + "step": 3513 + }, + { + "epoch": 0.5060483870967742, + "grad_norm": 0.7425452470779419, + "learning_rate": 2.4536297837832935e-05, + "loss": 0.0781, + "step": 3514 + }, + { + "epoch": 0.5061923963133641, + "grad_norm": 1.9631444215774536, + "learning_rate": 2.452498937399658e-05, + "loss": 0.1286, + "step": 3515 + }, + { + "epoch": 0.506336405529954, + "grad_norm": 0.9047209620475769, + "learning_rate": 2.4513681007386493e-05, + "loss": 0.1209, + "step": 3516 + }, + { + "epoch": 0.5064804147465438, + "grad_norm": 3.083616256713867, + "learning_rate": 2.4502372740317307e-05, + "loss": 0.3661, + "step": 3517 + }, + { + "epoch": 0.5066244239631337, + "grad_norm": 3.717301368713379, + "learning_rate": 2.4491064575103616e-05, + "loss": 0.7628, + "step": 3518 + }, + { + "epoch": 0.5067684331797235, + "grad_norm": 1.9321660995483398, + "learning_rate": 2.4479756514060005e-05, + "loss": 0.3204, + "step": 3519 + }, + { + "epoch": 0.5069124423963134, + "grad_norm": 0.6963633894920349, + "learning_rate": 2.4468448559501033e-05, + "loss": 0.1278, + "step": 3520 + }, + { + "epoch": 0.5070564516129032, + "grad_norm": 1.303788423538208, + "learning_rate": 2.4457140713741237e-05, + "loss": 0.1403, + "step": 3521 + }, + { + "epoch": 0.5072004608294931, + "grad_norm": 1.062169075012207, + "learning_rate": 2.4445832979095138e-05, + "loss": 0.1249, + "step": 3522 + }, + { + "epoch": 0.507344470046083, + "grad_norm": 1.9940041303634644, + "learning_rate": 2.4434525357877224e-05, + "loss": 0.1762, + "step": 3523 + }, + { + "epoch": 0.5074884792626728, + "grad_norm": 3.029087781906128, + "learning_rate": 2.4423217852401967e-05, + "loss": 0.4065, + "step": 3524 + }, + { + "epoch": 0.5076324884792627, + "grad_norm": 1.114717960357666, + "learning_rate": 2.4411910464983815e-05, + "loss": 0.1794, + "step": 3525 + }, + { + "epoch": 0.5077764976958525, + "grad_norm": 1.866121768951416, + "learning_rate": 2.4400603197937186e-05, + "loss": 0.2518, + "step": 3526 + }, + { + "epoch": 0.5079205069124424, + "grad_norm": 5.271446228027344, + "learning_rate": 2.4389296053576483e-05, + "loss": 0.2753, + "step": 3527 + }, + { + "epoch": 0.5080645161290323, + "grad_norm": 9.44665241241455, + "learning_rate": 2.437798903421607e-05, + "loss": 2.2193, + "step": 3528 + }, + { + "epoch": 0.5082085253456221, + "grad_norm": 2.5920188426971436, + "learning_rate": 2.436668214217031e-05, + "loss": 0.4358, + "step": 3529 + }, + { + "epoch": 0.508352534562212, + "grad_norm": 1.4543732404708862, + "learning_rate": 2.4355375379753502e-05, + "loss": 0.2361, + "step": 3530 + }, + { + "epoch": 0.5084965437788018, + "grad_norm": 3.8593363761901855, + "learning_rate": 2.434406874927996e-05, + "loss": 0.3046, + "step": 3531 + }, + { + "epoch": 0.5086405529953917, + "grad_norm": 2.480738401412964, + "learning_rate": 2.433276225306394e-05, + "loss": 0.2455, + "step": 3532 + }, + { + "epoch": 0.5087845622119815, + "grad_norm": 0.5220252871513367, + "learning_rate": 2.4321455893419678e-05, + "loss": 0.0693, + "step": 3533 + }, + { + "epoch": 0.5089285714285714, + "grad_norm": 0.6870394349098206, + "learning_rate": 2.4310149672661397e-05, + "loss": 0.066, + "step": 3534 + }, + { + "epoch": 0.5090725806451613, + "grad_norm": 1.744707465171814, + "learning_rate": 2.429884359310328e-05, + "loss": 0.144, + "step": 3535 + }, + { + "epoch": 0.5092165898617511, + "grad_norm": 0.8854445815086365, + "learning_rate": 2.428753765705947e-05, + "loss": 0.1181, + "step": 3536 + }, + { + "epoch": 0.509360599078341, + "grad_norm": 1.5685019493103027, + "learning_rate": 2.4276231866844107e-05, + "loss": 0.1936, + "step": 3537 + }, + { + "epoch": 0.5095046082949308, + "grad_norm": 6.526317596435547, + "learning_rate": 2.426492622477128e-05, + "loss": 1.8929, + "step": 3538 + }, + { + "epoch": 0.5096486175115207, + "grad_norm": 0.42200911045074463, + "learning_rate": 2.425362073315505e-05, + "loss": 0.0544, + "step": 3539 + }, + { + "epoch": 0.5097926267281107, + "grad_norm": 1.0528591871261597, + "learning_rate": 2.4242315394309447e-05, + "loss": 0.162, + "step": 3540 + }, + { + "epoch": 0.5099366359447005, + "grad_norm": 4.477654933929443, + "learning_rate": 2.4231010210548484e-05, + "loss": 2.3291, + "step": 3541 + }, + { + "epoch": 0.5100806451612904, + "grad_norm": 5.768682956695557, + "learning_rate": 2.4219705184186127e-05, + "loss": 1.6061, + "step": 3542 + }, + { + "epoch": 0.5102246543778802, + "grad_norm": 0.7580983638763428, + "learning_rate": 2.420840031753632e-05, + "loss": 0.084, + "step": 3543 + }, + { + "epoch": 0.5103686635944701, + "grad_norm": 2.4813432693481445, + "learning_rate": 2.419709561291297e-05, + "loss": 0.2365, + "step": 3544 + }, + { + "epoch": 0.5105126728110599, + "grad_norm": 1.4559041261672974, + "learning_rate": 2.4185791072629945e-05, + "loss": 0.11, + "step": 3545 + }, + { + "epoch": 0.5106566820276498, + "grad_norm": 0.5866481065750122, + "learning_rate": 2.4174486699001084e-05, + "loss": 0.0982, + "step": 3546 + }, + { + "epoch": 0.5108006912442397, + "grad_norm": 0.6090421676635742, + "learning_rate": 2.4163182494340192e-05, + "loss": 0.0837, + "step": 3547 + }, + { + "epoch": 0.5109447004608295, + "grad_norm": 1.2265868186950684, + "learning_rate": 2.4151878460961044e-05, + "loss": 0.1377, + "step": 3548 + }, + { + "epoch": 0.5110887096774194, + "grad_norm": 3.3332102298736572, + "learning_rate": 2.4140574601177375e-05, + "loss": 1.4463, + "step": 3549 + }, + { + "epoch": 0.5112327188940092, + "grad_norm": 0.429324746131897, + "learning_rate": 2.412927091730288e-05, + "loss": 0.0579, + "step": 3550 + }, + { + "epoch": 0.5113767281105991, + "grad_norm": 0.5421316623687744, + "learning_rate": 2.4117967411651228e-05, + "loss": 0.0522, + "step": 3551 + }, + { + "epoch": 0.511520737327189, + "grad_norm": 1.3207100629806519, + "learning_rate": 2.410666408653604e-05, + "loss": 0.1436, + "step": 3552 + }, + { + "epoch": 0.5116647465437788, + "grad_norm": 4.161920547485352, + "learning_rate": 2.4095360944270917e-05, + "loss": 2.151, + "step": 3553 + }, + { + "epoch": 0.5118087557603687, + "grad_norm": 6.076672554016113, + "learning_rate": 2.408405798716941e-05, + "loss": 0.2824, + "step": 3554 + }, + { + "epoch": 0.5119527649769585, + "grad_norm": 0.625930666923523, + "learning_rate": 2.4072755217545028e-05, + "loss": 0.0625, + "step": 3555 + }, + { + "epoch": 0.5120967741935484, + "grad_norm": 5.121299743652344, + "learning_rate": 2.406145263771125e-05, + "loss": 1.0638, + "step": 3556 + }, + { + "epoch": 0.5122407834101382, + "grad_norm": 0.7942451238632202, + "learning_rate": 2.4050150249981522e-05, + "loss": 0.0989, + "step": 3557 + }, + { + "epoch": 0.5123847926267281, + "grad_norm": 1.9830760955810547, + "learning_rate": 2.4038848056669234e-05, + "loss": 0.2112, + "step": 3558 + }, + { + "epoch": 0.512528801843318, + "grad_norm": 0.9655758142471313, + "learning_rate": 2.4027546060087747e-05, + "loss": 0.0977, + "step": 3559 + }, + { + "epoch": 0.5126728110599078, + "grad_norm": 1.276794195175171, + "learning_rate": 2.4016244262550384e-05, + "loss": 3.9272, + "step": 3560 + }, + { + "epoch": 0.5128168202764977, + "grad_norm": 0.9434391856193542, + "learning_rate": 2.4004942666370414e-05, + "loss": 0.1377, + "step": 3561 + }, + { + "epoch": 0.5129608294930875, + "grad_norm": 3.6890196800231934, + "learning_rate": 2.3993641273861085e-05, + "loss": 0.4385, + "step": 3562 + }, + { + "epoch": 0.5131048387096774, + "grad_norm": 1.3045710325241089, + "learning_rate": 2.3982340087335584e-05, + "loss": 0.1601, + "step": 3563 + }, + { + "epoch": 0.5132488479262672, + "grad_norm": 0.7824480533599854, + "learning_rate": 2.3971039109107064e-05, + "loss": 0.106, + "step": 3564 + }, + { + "epoch": 0.5133928571428571, + "grad_norm": 1.3748512268066406, + "learning_rate": 2.3959738341488642e-05, + "loss": 0.1818, + "step": 3565 + }, + { + "epoch": 0.513536866359447, + "grad_norm": 0.8065641522407532, + "learning_rate": 2.3948437786793377e-05, + "loss": 0.0621, + "step": 3566 + }, + { + "epoch": 0.5136808755760369, + "grad_norm": 0.9929618239402771, + "learning_rate": 2.39371374473343e-05, + "loss": 0.1357, + "step": 3567 + }, + { + "epoch": 0.5138248847926268, + "grad_norm": 5.177657604217529, + "learning_rate": 2.3925837325424385e-05, + "loss": 2.5258, + "step": 3568 + }, + { + "epoch": 0.5139688940092166, + "grad_norm": 0.7608471512794495, + "learning_rate": 2.391453742337657e-05, + "loss": 0.0776, + "step": 3569 + }, + { + "epoch": 0.5141129032258065, + "grad_norm": 0.5017921328544617, + "learning_rate": 2.390323774350375e-05, + "loss": 0.0775, + "step": 3570 + }, + { + "epoch": 0.5142569124423964, + "grad_norm": 4.127809524536133, + "learning_rate": 2.3891938288118753e-05, + "loss": 0.3668, + "step": 3571 + }, + { + "epoch": 0.5144009216589862, + "grad_norm": 0.8400297164916992, + "learning_rate": 2.3880639059534395e-05, + "loss": 0.1239, + "step": 3572 + }, + { + "epoch": 0.5145449308755761, + "grad_norm": 0.6088792681694031, + "learning_rate": 2.3869340060063426e-05, + "loss": 0.0825, + "step": 3573 + }, + { + "epoch": 0.5146889400921659, + "grad_norm": 3.526916265487671, + "learning_rate": 2.3858041292018542e-05, + "loss": 1.854, + "step": 3574 + }, + { + "epoch": 0.5148329493087558, + "grad_norm": 0.5611253380775452, + "learning_rate": 2.3846742757712413e-05, + "loss": 0.0767, + "step": 3575 + }, + { + "epoch": 0.5149769585253456, + "grad_norm": 4.490814208984375, + "learning_rate": 2.3835444459457636e-05, + "loss": 1.792, + "step": 3576 + }, + { + "epoch": 0.5151209677419355, + "grad_norm": 1.249788522720337, + "learning_rate": 2.3824146399566787e-05, + "loss": 0.1342, + "step": 3577 + }, + { + "epoch": 0.5152649769585254, + "grad_norm": 0.6983827948570251, + "learning_rate": 2.3812848580352364e-05, + "loss": 0.0736, + "step": 3578 + }, + { + "epoch": 0.5154089861751152, + "grad_norm": 0.9331589341163635, + "learning_rate": 2.380155100412684e-05, + "loss": 0.09, + "step": 3579 + }, + { + "epoch": 0.5155529953917051, + "grad_norm": 0.948403000831604, + "learning_rate": 2.379025367320263e-05, + "loss": 0.1313, + "step": 3580 + }, + { + "epoch": 0.5156970046082949, + "grad_norm": 0.9575824737548828, + "learning_rate": 2.377895658989209e-05, + "loss": 0.0918, + "step": 3581 + }, + { + "epoch": 0.5158410138248848, + "grad_norm": 0.6635643839836121, + "learning_rate": 2.3767659756507542e-05, + "loss": 0.0806, + "step": 3582 + }, + { + "epoch": 0.5159850230414746, + "grad_norm": 1.8366076946258545, + "learning_rate": 2.3756363175361242e-05, + "loss": 0.141, + "step": 3583 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 1.300750970840454, + "learning_rate": 2.3745066848765405e-05, + "loss": 0.1176, + "step": 3584 + }, + { + "epoch": 0.5162730414746544, + "grad_norm": 3.4044768810272217, + "learning_rate": 2.3733770779032184e-05, + "loss": 2.4409, + "step": 3585 + }, + { + "epoch": 0.5164170506912442, + "grad_norm": 6.960483551025391, + "learning_rate": 2.372247496847369e-05, + "loss": 3.1785, + "step": 3586 + }, + { + "epoch": 0.5165610599078341, + "grad_norm": 0.8980383276939392, + "learning_rate": 2.371117941940197e-05, + "loss": 0.0857, + "step": 3587 + }, + { + "epoch": 0.5167050691244239, + "grad_norm": 0.5571449398994446, + "learning_rate": 2.369988413412903e-05, + "loss": 0.0773, + "step": 3588 + }, + { + "epoch": 0.5168490783410138, + "grad_norm": 2.8089561462402344, + "learning_rate": 2.3688589114966805e-05, + "loss": 0.2083, + "step": 3589 + }, + { + "epoch": 0.5169930875576036, + "grad_norm": 1.0858371257781982, + "learning_rate": 2.3677294364227194e-05, + "loss": 0.1454, + "step": 3590 + }, + { + "epoch": 0.5171370967741935, + "grad_norm": 0.9825716018676758, + "learning_rate": 2.3665999884222035e-05, + "loss": 0.1141, + "step": 3591 + }, + { + "epoch": 0.5172811059907834, + "grad_norm": 14.31699275970459, + "learning_rate": 2.3654705677263102e-05, + "loss": 3.7241, + "step": 3592 + }, + { + "epoch": 0.5174251152073732, + "grad_norm": 4.066608428955078, + "learning_rate": 2.3643411745662116e-05, + "loss": 0.5757, + "step": 3593 + }, + { + "epoch": 0.5175691244239631, + "grad_norm": 1.8192814588546753, + "learning_rate": 2.3632118091730754e-05, + "loss": 0.1691, + "step": 3594 + }, + { + "epoch": 0.517713133640553, + "grad_norm": 1.2467838525772095, + "learning_rate": 2.3620824717780624e-05, + "loss": 0.0724, + "step": 3595 + }, + { + "epoch": 0.5178571428571429, + "grad_norm": 3.0277626514434814, + "learning_rate": 2.3609531626123264e-05, + "loss": 0.6642, + "step": 3596 + }, + { + "epoch": 0.5180011520737328, + "grad_norm": 4.1760945320129395, + "learning_rate": 2.3598238819070202e-05, + "loss": 0.2296, + "step": 3597 + }, + { + "epoch": 0.5181451612903226, + "grad_norm": 0.562084436416626, + "learning_rate": 2.3586946298932857e-05, + "loss": 0.0546, + "step": 3598 + }, + { + "epoch": 0.5182891705069125, + "grad_norm": 0.5587400794029236, + "learning_rate": 2.357565406802261e-05, + "loss": 0.081, + "step": 3599 + }, + { + "epoch": 0.5184331797235023, + "grad_norm": 2.4957504272460938, + "learning_rate": 2.3564362128650783e-05, + "loss": 0.3224, + "step": 3600 + }, + { + "epoch": 0.5185771889400922, + "grad_norm": 0.6383301019668579, + "learning_rate": 2.355307048312863e-05, + "loss": 0.0755, + "step": 3601 + }, + { + "epoch": 0.518721198156682, + "grad_norm": 1.6651309728622437, + "learning_rate": 2.354177913376736e-05, + "loss": 0.1366, + "step": 3602 + }, + { + "epoch": 0.5188652073732719, + "grad_norm": 1.6030209064483643, + "learning_rate": 2.3530488082878106e-05, + "loss": 0.2205, + "step": 3603 + }, + { + "epoch": 0.5190092165898618, + "grad_norm": 3.252220392227173, + "learning_rate": 2.351919733277195e-05, + "loss": 1.3095, + "step": 3604 + }, + { + "epoch": 0.5191532258064516, + "grad_norm": 4.441350936889648, + "learning_rate": 2.3507906885759906e-05, + "loss": 2.1822, + "step": 3605 + }, + { + "epoch": 0.5192972350230415, + "grad_norm": 0.3574712872505188, + "learning_rate": 2.349661674415293e-05, + "loss": 0.0424, + "step": 3606 + }, + { + "epoch": 0.5194412442396313, + "grad_norm": 0.35701608657836914, + "learning_rate": 2.3485326910261915e-05, + "loss": 0.0419, + "step": 3607 + }, + { + "epoch": 0.5195852534562212, + "grad_norm": 1.4232630729675293, + "learning_rate": 2.347403738639769e-05, + "loss": 0.1168, + "step": 3608 + }, + { + "epoch": 0.519729262672811, + "grad_norm": 0.8090313673019409, + "learning_rate": 2.3462748174871022e-05, + "loss": 0.1285, + "step": 3609 + }, + { + "epoch": 0.5198732718894009, + "grad_norm": 3.7857348918914795, + "learning_rate": 2.345145927799261e-05, + "loss": 1.8034, + "step": 3610 + }, + { + "epoch": 0.5200172811059908, + "grad_norm": 0.43926316499710083, + "learning_rate": 2.344017069807309e-05, + "loss": 0.0716, + "step": 3611 + }, + { + "epoch": 0.5201612903225806, + "grad_norm": 0.7892157435417175, + "learning_rate": 2.3428882437423043e-05, + "loss": 0.1188, + "step": 3612 + }, + { + "epoch": 0.5203052995391705, + "grad_norm": 0.7945453524589539, + "learning_rate": 2.3417594498352967e-05, + "loss": 0.1097, + "step": 3613 + }, + { + "epoch": 0.5204493087557603, + "grad_norm": 1.1994671821594238, + "learning_rate": 2.340630688317331e-05, + "loss": 0.1325, + "step": 3614 + }, + { + "epoch": 0.5205933179723502, + "grad_norm": 0.7012181878089905, + "learning_rate": 2.3395019594194443e-05, + "loss": 0.0813, + "step": 3615 + }, + { + "epoch": 0.5207373271889401, + "grad_norm": 0.6279650330543518, + "learning_rate": 2.3383732633726675e-05, + "loss": 0.0686, + "step": 3616 + }, + { + "epoch": 0.5208813364055299, + "grad_norm": 5.621333122253418, + "learning_rate": 2.3372446004080252e-05, + "loss": 0.2101, + "step": 3617 + }, + { + "epoch": 0.5210253456221198, + "grad_norm": 1.2292455434799194, + "learning_rate": 2.3361159707565337e-05, + "loss": 0.1165, + "step": 3618 + }, + { + "epoch": 0.5211693548387096, + "grad_norm": 3.789994716644287, + "learning_rate": 2.334987374649205e-05, + "loss": 2.9552, + "step": 3619 + }, + { + "epoch": 0.5213133640552995, + "grad_norm": 0.7808021306991577, + "learning_rate": 2.3338588123170413e-05, + "loss": 0.0753, + "step": 3620 + }, + { + "epoch": 0.5214573732718893, + "grad_norm": 0.9936549067497253, + "learning_rate": 2.3327302839910405e-05, + "loss": 0.0858, + "step": 3621 + }, + { + "epoch": 0.5216013824884793, + "grad_norm": 1.1507350206375122, + "learning_rate": 2.3316017899021913e-05, + "loss": 0.1808, + "step": 3622 + }, + { + "epoch": 0.5217453917050692, + "grad_norm": 5.581299304962158, + "learning_rate": 2.3304733302814764e-05, + "loss": 2.4232, + "step": 3623 + }, + { + "epoch": 0.521889400921659, + "grad_norm": 1.5621949434280396, + "learning_rate": 2.329344905359873e-05, + "loss": 0.1351, + "step": 3624 + }, + { + "epoch": 0.5220334101382489, + "grad_norm": 2.04944109916687, + "learning_rate": 2.3282165153683493e-05, + "loss": 0.1539, + "step": 3625 + }, + { + "epoch": 0.5221774193548387, + "grad_norm": 1.7890310287475586, + "learning_rate": 2.3270881605378658e-05, + "loss": 0.2489, + "step": 3626 + }, + { + "epoch": 0.5223214285714286, + "grad_norm": 0.932475209236145, + "learning_rate": 2.3259598410993777e-05, + "loss": 0.1336, + "step": 3627 + }, + { + "epoch": 0.5224654377880185, + "grad_norm": 0.8946047425270081, + "learning_rate": 2.3248315572838316e-05, + "loss": 0.1244, + "step": 3628 + }, + { + "epoch": 0.5226094470046083, + "grad_norm": 4.304471492767334, + "learning_rate": 2.3237033093221673e-05, + "loss": 2.1861, + "step": 3629 + }, + { + "epoch": 0.5227534562211982, + "grad_norm": 0.9590106010437012, + "learning_rate": 2.3225750974453174e-05, + "loss": 4.2602, + "step": 3630 + }, + { + "epoch": 0.522897465437788, + "grad_norm": 0.8082183003425598, + "learning_rate": 2.3214469218842066e-05, + "loss": 0.089, + "step": 3631 + }, + { + "epoch": 0.5230414746543779, + "grad_norm": 0.49469712376594543, + "learning_rate": 2.320318782869753e-05, + "loss": 0.0589, + "step": 3632 + }, + { + "epoch": 0.5231854838709677, + "grad_norm": 3.002227544784546, + "learning_rate": 2.3191906806328657e-05, + "loss": 0.1936, + "step": 3633 + }, + { + "epoch": 0.5233294930875576, + "grad_norm": 0.976767897605896, + "learning_rate": 2.3180626154044482e-05, + "loss": 0.1039, + "step": 3634 + }, + { + "epoch": 0.5234735023041475, + "grad_norm": 0.7569838762283325, + "learning_rate": 2.316934587415395e-05, + "loss": 0.1078, + "step": 3635 + }, + { + "epoch": 0.5236175115207373, + "grad_norm": 2.651902198791504, + "learning_rate": 2.315806596896594e-05, + "loss": 0.4233, + "step": 3636 + }, + { + "epoch": 0.5237615207373272, + "grad_norm": 1.5799187421798706, + "learning_rate": 2.3146786440789246e-05, + "loss": 0.1258, + "step": 3637 + }, + { + "epoch": 0.523905529953917, + "grad_norm": 0.54477459192276, + "learning_rate": 2.3135507291932583e-05, + "loss": 0.061, + "step": 3638 + }, + { + "epoch": 0.5240495391705069, + "grad_norm": 2.970926284790039, + "learning_rate": 2.31242285247046e-05, + "loss": 0.2882, + "step": 3639 + }, + { + "epoch": 0.5241935483870968, + "grad_norm": 0.9316942691802979, + "learning_rate": 2.3112950141413862e-05, + "loss": 0.1032, + "step": 3640 + }, + { + "epoch": 0.5243375576036866, + "grad_norm": 0.903565526008606, + "learning_rate": 2.310167214436885e-05, + "loss": 0.0956, + "step": 3641 + }, + { + "epoch": 0.5244815668202765, + "grad_norm": 0.7124109864234924, + "learning_rate": 2.309039453587797e-05, + "loss": 4.3383, + "step": 3642 + }, + { + "epoch": 0.5246255760368663, + "grad_norm": 1.1272518634796143, + "learning_rate": 2.307911731824955e-05, + "loss": 0.0736, + "step": 3643 + }, + { + "epoch": 0.5247695852534562, + "grad_norm": 4.09959602355957, + "learning_rate": 2.3067840493791842e-05, + "loss": 1.5255, + "step": 3644 + }, + { + "epoch": 0.524913594470046, + "grad_norm": 1.3922523260116577, + "learning_rate": 2.305656406481301e-05, + "loss": 0.1852, + "step": 3645 + }, + { + "epoch": 0.5250576036866359, + "grad_norm": 0.5770824551582336, + "learning_rate": 2.3045288033621135e-05, + "loss": 0.0785, + "step": 3646 + }, + { + "epoch": 0.5252016129032258, + "grad_norm": 0.9654541015625, + "learning_rate": 2.3034012402524225e-05, + "loss": 0.1196, + "step": 3647 + }, + { + "epoch": 0.5253456221198156, + "grad_norm": 1.2614401578903198, + "learning_rate": 2.3022737173830202e-05, + "loss": 0.148, + "step": 3648 + }, + { + "epoch": 0.5254896313364056, + "grad_norm": 1.1674377918243408, + "learning_rate": 2.3011462349846905e-05, + "loss": 0.1195, + "step": 3649 + }, + { + "epoch": 0.5256336405529954, + "grad_norm": 1.9096781015396118, + "learning_rate": 2.300018793288208e-05, + "loss": 0.2707, + "step": 3650 + }, + { + "epoch": 0.5257776497695853, + "grad_norm": 0.7377606630325317, + "learning_rate": 2.2988913925243424e-05, + "loss": 0.1066, + "step": 3651 + }, + { + "epoch": 0.5259216589861752, + "grad_norm": 0.6880693435668945, + "learning_rate": 2.2977640329238516e-05, + "loss": 0.1012, + "step": 3652 + }, + { + "epoch": 0.526065668202765, + "grad_norm": 0.7293221354484558, + "learning_rate": 2.296636714717486e-05, + "loss": 0.0976, + "step": 3653 + }, + { + "epoch": 0.5262096774193549, + "grad_norm": 1.0485872030258179, + "learning_rate": 2.2955094381359878e-05, + "loss": 0.1518, + "step": 3654 + }, + { + "epoch": 0.5263536866359447, + "grad_norm": 17.90790557861328, + "learning_rate": 2.2943822034100905e-05, + "loss": 3.1056, + "step": 3655 + }, + { + "epoch": 0.5264976958525346, + "grad_norm": 1.2211956977844238, + "learning_rate": 2.293255010770519e-05, + "loss": 0.1291, + "step": 3656 + }, + { + "epoch": 0.5266417050691244, + "grad_norm": 0.881229043006897, + "learning_rate": 2.2921278604479903e-05, + "loss": 0.0658, + "step": 3657 + }, + { + "epoch": 0.5267857142857143, + "grad_norm": 0.6690089106559753, + "learning_rate": 2.2910007526732112e-05, + "loss": 0.0637, + "step": 3658 + }, + { + "epoch": 0.5269297235023042, + "grad_norm": 0.5358933806419373, + "learning_rate": 2.2898736876768815e-05, + "loss": 0.0485, + "step": 3659 + }, + { + "epoch": 0.527073732718894, + "grad_norm": 2.171555280685425, + "learning_rate": 2.288746665689691e-05, + "loss": 0.2552, + "step": 3660 + }, + { + "epoch": 0.5272177419354839, + "grad_norm": 0.6372612118721008, + "learning_rate": 2.2876196869423215e-05, + "loss": 0.068, + "step": 3661 + }, + { + "epoch": 0.5273617511520737, + "grad_norm": 0.4056011140346527, + "learning_rate": 2.2864927516654454e-05, + "loss": 0.0644, + "step": 3662 + }, + { + "epoch": 0.5275057603686636, + "grad_norm": 0.8507561683654785, + "learning_rate": 2.2853658600897268e-05, + "loss": 0.1513, + "step": 3663 + }, + { + "epoch": 0.5276497695852534, + "grad_norm": 5.093796730041504, + "learning_rate": 2.28423901244582e-05, + "loss": 0.8064, + "step": 3664 + }, + { + "epoch": 0.5277937788018433, + "grad_norm": 2.0555548667907715, + "learning_rate": 2.283112208964371e-05, + "loss": 0.2114, + "step": 3665 + }, + { + "epoch": 0.5279377880184332, + "grad_norm": 0.7676950097084045, + "learning_rate": 2.281985449876016e-05, + "loss": 0.1004, + "step": 3666 + }, + { + "epoch": 0.528081797235023, + "grad_norm": 0.9676557779312134, + "learning_rate": 2.2808587354113835e-05, + "loss": 0.1393, + "step": 3667 + }, + { + "epoch": 0.5282258064516129, + "grad_norm": 5.641715049743652, + "learning_rate": 2.279732065801092e-05, + "loss": 2.7895, + "step": 3668 + }, + { + "epoch": 0.5283698156682027, + "grad_norm": 2.0706112384796143, + "learning_rate": 2.2786054412757498e-05, + "loss": 0.1262, + "step": 3669 + }, + { + "epoch": 0.5285138248847926, + "grad_norm": 3.6454107761383057, + "learning_rate": 2.2774788620659582e-05, + "loss": 0.3353, + "step": 3670 + }, + { + "epoch": 0.5286578341013825, + "grad_norm": 0.7557421326637268, + "learning_rate": 2.2763523284023076e-05, + "loss": 4.3457, + "step": 3671 + }, + { + "epoch": 0.5288018433179723, + "grad_norm": 3.513695478439331, + "learning_rate": 2.2752258405153783e-05, + "loss": 2.2679, + "step": 3672 + }, + { + "epoch": 0.5289458525345622, + "grad_norm": 1.0354517698287964, + "learning_rate": 2.274099398635745e-05, + "loss": 0.0985, + "step": 3673 + }, + { + "epoch": 0.529089861751152, + "grad_norm": 0.8062444925308228, + "learning_rate": 2.2729730029939683e-05, + "loss": 0.116, + "step": 3674 + }, + { + "epoch": 0.5292338709677419, + "grad_norm": 3.815720319747925, + "learning_rate": 2.2718466538206025e-05, + "loss": 2.5366, + "step": 3675 + }, + { + "epoch": 0.5293778801843319, + "grad_norm": 1.6371448040008545, + "learning_rate": 2.2707203513461913e-05, + "loss": 0.161, + "step": 3676 + }, + { + "epoch": 0.5295218894009217, + "grad_norm": 0.7306329011917114, + "learning_rate": 2.2695940958012678e-05, + "loss": 0.1162, + "step": 3677 + }, + { + "epoch": 0.5296658986175116, + "grad_norm": 0.7356336116790771, + "learning_rate": 2.268467887416358e-05, + "loss": 0.0842, + "step": 3678 + }, + { + "epoch": 0.5298099078341014, + "grad_norm": 0.7444694638252258, + "learning_rate": 2.2673417264219766e-05, + "loss": 0.0878, + "step": 3679 + }, + { + "epoch": 0.5299539170506913, + "grad_norm": 0.6385572552680969, + "learning_rate": 2.266215613048628e-05, + "loss": 0.0791, + "step": 3680 + }, + { + "epoch": 0.5300979262672811, + "grad_norm": 2.0453667640686035, + "learning_rate": 2.2650895475268086e-05, + "loss": 0.2387, + "step": 3681 + }, + { + "epoch": 0.530241935483871, + "grad_norm": 4.3076653480529785, + "learning_rate": 2.2639635300870038e-05, + "loss": 0.2404, + "step": 3682 + }, + { + "epoch": 0.5303859447004609, + "grad_norm": 1.013556957244873, + "learning_rate": 2.262837560959689e-05, + "loss": 0.1427, + "step": 3683 + }, + { + "epoch": 0.5305299539170507, + "grad_norm": 2.544694662094116, + "learning_rate": 2.2617116403753306e-05, + "loss": 0.2531, + "step": 3684 + }, + { + "epoch": 0.5306739631336406, + "grad_norm": 1.2561023235321045, + "learning_rate": 2.2605857685643845e-05, + "loss": 0.1564, + "step": 3685 + }, + { + "epoch": 0.5308179723502304, + "grad_norm": 0.881093442440033, + "learning_rate": 2.2594599457572967e-05, + "loss": 0.0902, + "step": 3686 + }, + { + "epoch": 0.5309619815668203, + "grad_norm": 0.9969978332519531, + "learning_rate": 2.2583341721845035e-05, + "loss": 0.1255, + "step": 3687 + }, + { + "epoch": 0.5311059907834101, + "grad_norm": 1.2375259399414062, + "learning_rate": 2.2572084480764307e-05, + "loss": 0.1481, + "step": 3688 + }, + { + "epoch": 0.53125, + "grad_norm": 0.7807781100273132, + "learning_rate": 2.2560827736634942e-05, + "loss": 0.0983, + "step": 3689 + }, + { + "epoch": 0.5313940092165899, + "grad_norm": 4.755069732666016, + "learning_rate": 2.2549571491760986e-05, + "loss": 0.3164, + "step": 3690 + }, + { + "epoch": 0.5315380184331797, + "grad_norm": 0.8422949314117432, + "learning_rate": 2.2538315748446405e-05, + "loss": 0.1424, + "step": 3691 + }, + { + "epoch": 0.5316820276497696, + "grad_norm": 3.409174680709839, + "learning_rate": 2.2527060508995055e-05, + "loss": 1.5954, + "step": 3692 + }, + { + "epoch": 0.5318260368663594, + "grad_norm": 1.0952035188674927, + "learning_rate": 2.251580577571067e-05, + "loss": 0.2511, + "step": 3693 + }, + { + "epoch": 0.5319700460829493, + "grad_norm": 4.415782928466797, + "learning_rate": 2.2504551550896907e-05, + "loss": 1.668, + "step": 3694 + }, + { + "epoch": 0.5321140552995391, + "grad_norm": 1.1833215951919556, + "learning_rate": 2.24932978368573e-05, + "loss": 0.1058, + "step": 3695 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 0.7132855653762817, + "learning_rate": 2.2482044635895287e-05, + "loss": 0.1019, + "step": 3696 + }, + { + "epoch": 0.5324020737327189, + "grad_norm": 0.6502566933631897, + "learning_rate": 2.24707919503142e-05, + "loss": 0.0994, + "step": 3697 + }, + { + "epoch": 0.5325460829493087, + "grad_norm": 3.82806396484375, + "learning_rate": 2.245953978241726e-05, + "loss": 1.8191, + "step": 3698 + }, + { + "epoch": 0.5326900921658986, + "grad_norm": 3.3552050590515137, + "learning_rate": 2.2448288134507596e-05, + "loss": 2.2795, + "step": 3699 + }, + { + "epoch": 0.5328341013824884, + "grad_norm": 0.39731693267822266, + "learning_rate": 2.243703700888821e-05, + "loss": 0.0802, + "step": 3700 + }, + { + "epoch": 0.5329781105990783, + "grad_norm": 2.1411097049713135, + "learning_rate": 2.242578640786202e-05, + "loss": 0.2622, + "step": 3701 + }, + { + "epoch": 0.5331221198156681, + "grad_norm": 0.5989755988121033, + "learning_rate": 2.2414536333731817e-05, + "loss": 0.0585, + "step": 3702 + }, + { + "epoch": 0.5332661290322581, + "grad_norm": 3.023967742919922, + "learning_rate": 2.2403286788800294e-05, + "loss": 2.1987, + "step": 3703 + }, + { + "epoch": 0.533410138248848, + "grad_norm": 0.5290563702583313, + "learning_rate": 2.239203777537003e-05, + "loss": 0.0739, + "step": 3704 + }, + { + "epoch": 0.5335541474654378, + "grad_norm": 0.696944534778595, + "learning_rate": 2.2380789295743506e-05, + "loss": 0.1062, + "step": 3705 + }, + { + "epoch": 0.5336981566820277, + "grad_norm": 1.4737954139709473, + "learning_rate": 2.2369541352223085e-05, + "loss": 0.1424, + "step": 3706 + }, + { + "epoch": 0.5338421658986175, + "grad_norm": 0.46030092239379883, + "learning_rate": 2.235829394711102e-05, + "loss": 0.0545, + "step": 3707 + }, + { + "epoch": 0.5339861751152074, + "grad_norm": 0.7441407442092896, + "learning_rate": 2.2347047082709464e-05, + "loss": 0.0968, + "step": 3708 + }, + { + "epoch": 0.5341301843317973, + "grad_norm": 0.5602573156356812, + "learning_rate": 2.2335800761320434e-05, + "loss": 0.0719, + "step": 3709 + }, + { + "epoch": 0.5342741935483871, + "grad_norm": 1.0664502382278442, + "learning_rate": 2.232455498524587e-05, + "loss": 0.1096, + "step": 3710 + }, + { + "epoch": 0.534418202764977, + "grad_norm": 0.48005086183547974, + "learning_rate": 2.2313309756787577e-05, + "loss": 0.058, + "step": 3711 + }, + { + "epoch": 0.5345622119815668, + "grad_norm": 1.5229668617248535, + "learning_rate": 2.2302065078247252e-05, + "loss": 0.1683, + "step": 3712 + }, + { + "epoch": 0.5347062211981567, + "grad_norm": 0.29090559482574463, + "learning_rate": 2.2290820951926487e-05, + "loss": 0.0522, + "step": 3713 + }, + { + "epoch": 0.5348502304147466, + "grad_norm": 0.7705280780792236, + "learning_rate": 2.227957738012675e-05, + "loss": 0.1223, + "step": 3714 + }, + { + "epoch": 0.5349942396313364, + "grad_norm": 0.4127512574195862, + "learning_rate": 2.2268334365149403e-05, + "loss": 0.0688, + "step": 3715 + }, + { + "epoch": 0.5351382488479263, + "grad_norm": 4.6301469802856445, + "learning_rate": 2.2257091909295696e-05, + "loss": 1.3189, + "step": 3716 + }, + { + "epoch": 0.5352822580645161, + "grad_norm": 1.9649407863616943, + "learning_rate": 2.224585001486676e-05, + "loss": 0.1704, + "step": 3717 + }, + { + "epoch": 0.535426267281106, + "grad_norm": 3.291447639465332, + "learning_rate": 2.2234608684163606e-05, + "loss": 1.794, + "step": 3718 + }, + { + "epoch": 0.5355702764976958, + "grad_norm": 4.182746887207031, + "learning_rate": 2.2223367919487144e-05, + "loss": 0.3509, + "step": 3719 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 0.8784071207046509, + "learning_rate": 2.2212127723138154e-05, + "loss": 0.0782, + "step": 3720 + }, + { + "epoch": 0.5358582949308756, + "grad_norm": 3.4279279708862305, + "learning_rate": 2.2200888097417307e-05, + "loss": 1.7936, + "step": 3721 + }, + { + "epoch": 0.5360023041474654, + "grad_norm": 3.1109516620635986, + "learning_rate": 2.2189649044625154e-05, + "loss": 0.5241, + "step": 3722 + }, + { + "epoch": 0.5361463133640553, + "grad_norm": 0.8299001455307007, + "learning_rate": 2.2178410567062132e-05, + "loss": 0.1832, + "step": 3723 + }, + { + "epoch": 0.5362903225806451, + "grad_norm": 1.022714614868164, + "learning_rate": 2.216717266702856e-05, + "loss": 0.1878, + "step": 3724 + }, + { + "epoch": 0.536434331797235, + "grad_norm": 2.0838735103607178, + "learning_rate": 2.215593534682463e-05, + "loss": 0.1174, + "step": 3725 + }, + { + "epoch": 0.5365783410138248, + "grad_norm": 0.5931451320648193, + "learning_rate": 2.2144698608750436e-05, + "loss": 0.0605, + "step": 3726 + }, + { + "epoch": 0.5367223502304147, + "grad_norm": 1.028098464012146, + "learning_rate": 2.213346245510593e-05, + "loss": 0.155, + "step": 3727 + }, + { + "epoch": 0.5368663594470046, + "grad_norm": 1.6645783185958862, + "learning_rate": 2.2122226888190953e-05, + "loss": 0.205, + "step": 3728 + }, + { + "epoch": 0.5370103686635944, + "grad_norm": 1.4723045825958252, + "learning_rate": 2.2110991910305232e-05, + "loss": 0.1639, + "step": 3729 + }, + { + "epoch": 0.5371543778801844, + "grad_norm": 0.7581170201301575, + "learning_rate": 2.2099757523748363e-05, + "loss": 0.1134, + "step": 3730 + }, + { + "epoch": 0.5372983870967742, + "grad_norm": 4.271656513214111, + "learning_rate": 2.208852373081982e-05, + "loss": 1.2021, + "step": 3731 + }, + { + "epoch": 0.5374423963133641, + "grad_norm": 0.9703299403190613, + "learning_rate": 2.207729053381898e-05, + "loss": 0.12, + "step": 3732 + }, + { + "epoch": 0.537586405529954, + "grad_norm": 1.4140650033950806, + "learning_rate": 2.2066057935045072e-05, + "loss": 0.1279, + "step": 3733 + }, + { + "epoch": 0.5377304147465438, + "grad_norm": 1.931113839149475, + "learning_rate": 2.205482593679721e-05, + "loss": 0.1395, + "step": 3734 + }, + { + "epoch": 0.5378744239631337, + "grad_norm": 3.2427451610565186, + "learning_rate": 2.2043594541374383e-05, + "loss": 0.2775, + "step": 3735 + }, + { + "epoch": 0.5380184331797235, + "grad_norm": 1.301924705505371, + "learning_rate": 2.203236375107546e-05, + "loss": 0.1217, + "step": 3736 + }, + { + "epoch": 0.5381624423963134, + "grad_norm": 5.484457015991211, + "learning_rate": 2.2021133568199183e-05, + "loss": 1.6064, + "step": 3737 + }, + { + "epoch": 0.5383064516129032, + "grad_norm": 8.759614944458008, + "learning_rate": 2.2009903995044175e-05, + "loss": 2.7848, + "step": 3738 + }, + { + "epoch": 0.5384504608294931, + "grad_norm": 1.699892520904541, + "learning_rate": 2.1998675033908933e-05, + "loss": 0.2346, + "step": 3739 + }, + { + "epoch": 0.538594470046083, + "grad_norm": 4.366125106811523, + "learning_rate": 2.1987446687091824e-05, + "loss": 2.8744, + "step": 3740 + }, + { + "epoch": 0.5387384792626728, + "grad_norm": 8.649024963378906, + "learning_rate": 2.197621895689109e-05, + "loss": 2.1179, + "step": 3741 + }, + { + "epoch": 0.5388824884792627, + "grad_norm": 0.8190634250640869, + "learning_rate": 2.1964991845604846e-05, + "loss": 0.0837, + "step": 3742 + }, + { + "epoch": 0.5390264976958525, + "grad_norm": 2.6229147911071777, + "learning_rate": 2.1953765355531093e-05, + "loss": 0.2458, + "step": 3743 + }, + { + "epoch": 0.5391705069124424, + "grad_norm": 4.463267803192139, + "learning_rate": 2.1942539488967687e-05, + "loss": 1.4851, + "step": 3744 + }, + { + "epoch": 0.5393145161290323, + "grad_norm": 10.600301742553711, + "learning_rate": 2.1931314248212366e-05, + "loss": 2.6539, + "step": 3745 + }, + { + "epoch": 0.5394585253456221, + "grad_norm": 0.7593082189559937, + "learning_rate": 2.1920089635562743e-05, + "loss": 0.0777, + "step": 3746 + }, + { + "epoch": 0.539602534562212, + "grad_norm": 4.784037113189697, + "learning_rate": 2.190886565331629e-05, + "loss": 1.873, + "step": 3747 + }, + { + "epoch": 0.5397465437788018, + "grad_norm": 0.6588318347930908, + "learning_rate": 2.1897642303770365e-05, + "loss": 0.0904, + "step": 3748 + }, + { + "epoch": 0.5398905529953917, + "grad_norm": 3.428464889526367, + "learning_rate": 2.1886419589222186e-05, + "loss": 1.7433, + "step": 3749 + }, + { + "epoch": 0.5400345622119815, + "grad_norm": 4.8177289962768555, + "learning_rate": 2.187519751196884e-05, + "loss": 2.0925, + "step": 3750 + }, + { + "epoch": 0.5401785714285714, + "grad_norm": 1.2341814041137695, + "learning_rate": 2.186397607430729e-05, + "loss": 0.1694, + "step": 3751 + }, + { + "epoch": 0.5403225806451613, + "grad_norm": 0.7716875076293945, + "learning_rate": 2.185275527853437e-05, + "loss": 0.1404, + "step": 3752 + }, + { + "epoch": 0.5404665898617511, + "grad_norm": 0.824848473072052, + "learning_rate": 2.1841535126946776e-05, + "loss": 0.1194, + "step": 3753 + }, + { + "epoch": 0.540610599078341, + "grad_norm": 0.6916657090187073, + "learning_rate": 2.1830315621841074e-05, + "loss": 0.0837, + "step": 3754 + }, + { + "epoch": 0.5407546082949308, + "grad_norm": 0.6991837620735168, + "learning_rate": 2.18190967655137e-05, + "loss": 0.0646, + "step": 3755 + }, + { + "epoch": 0.5408986175115207, + "grad_norm": 2.507221221923828, + "learning_rate": 2.180787856026095e-05, + "loss": 1.8352, + "step": 3756 + }, + { + "epoch": 0.5410426267281107, + "grad_norm": 0.9939058423042297, + "learning_rate": 2.1796661008378996e-05, + "loss": 0.103, + "step": 3757 + }, + { + "epoch": 0.5411866359447005, + "grad_norm": 2.0970706939697266, + "learning_rate": 2.1785444112163863e-05, + "loss": 0.2193, + "step": 3758 + }, + { + "epoch": 0.5413306451612904, + "grad_norm": 0.8063115477561951, + "learning_rate": 2.1774227873911474e-05, + "loss": 0.0889, + "step": 3759 + }, + { + "epoch": 0.5414746543778802, + "grad_norm": 0.7265112400054932, + "learning_rate": 2.1763012295917578e-05, + "loss": 0.0845, + "step": 3760 + }, + { + "epoch": 0.5416186635944701, + "grad_norm": 2.6958584785461426, + "learning_rate": 2.175179738047781e-05, + "loss": 0.2198, + "step": 3761 + }, + { + "epoch": 0.5417626728110599, + "grad_norm": 5.068443298339844, + "learning_rate": 2.1740583129887664e-05, + "loss": 1.8526, + "step": 3762 + }, + { + "epoch": 0.5419066820276498, + "grad_norm": 3.107957363128662, + "learning_rate": 2.17293695464425e-05, + "loss": 1.7241, + "step": 3763 + }, + { + "epoch": 0.5420506912442397, + "grad_norm": 0.5796893239021301, + "learning_rate": 2.1718156632437537e-05, + "loss": 0.0746, + "step": 3764 + }, + { + "epoch": 0.5421947004608295, + "grad_norm": 0.8749552369117737, + "learning_rate": 2.170694439016786e-05, + "loss": 0.1221, + "step": 3765 + }, + { + "epoch": 0.5423387096774194, + "grad_norm": 1.1342405080795288, + "learning_rate": 2.169573282192842e-05, + "loss": 0.1504, + "step": 3766 + }, + { + "epoch": 0.5424827188940092, + "grad_norm": 3.5341885089874268, + "learning_rate": 2.1684521930014024e-05, + "loss": 1.9887, + "step": 3767 + }, + { + "epoch": 0.5426267281105991, + "grad_norm": 1.066881537437439, + "learning_rate": 2.1673311716719346e-05, + "loss": 0.0963, + "step": 3768 + }, + { + "epoch": 0.542770737327189, + "grad_norm": 3.1160194873809814, + "learning_rate": 2.1662102184338916e-05, + "loss": 1.186, + "step": 3769 + }, + { + "epoch": 0.5429147465437788, + "grad_norm": 10.985671043395996, + "learning_rate": 2.1650893335167126e-05, + "loss": 1.1064, + "step": 3770 + }, + { + "epoch": 0.5430587557603687, + "grad_norm": 1.3439298868179321, + "learning_rate": 2.163968517149823e-05, + "loss": 0.1879, + "step": 3771 + }, + { + "epoch": 0.5432027649769585, + "grad_norm": 7.968015193939209, + "learning_rate": 2.1628477695626345e-05, + "loss": 0.8469, + "step": 3772 + }, + { + "epoch": 0.5433467741935484, + "grad_norm": 2.854787588119507, + "learning_rate": 2.161727090984544e-05, + "loss": 1.5787, + "step": 3773 + }, + { + "epoch": 0.5434907834101382, + "grad_norm": 0.6829860210418701, + "learning_rate": 2.1606064816449347e-05, + "loss": 0.0783, + "step": 3774 + }, + { + "epoch": 0.5436347926267281, + "grad_norm": 0.9219316244125366, + "learning_rate": 2.1594859417731747e-05, + "loss": 0.0612, + "step": 3775 + }, + { + "epoch": 0.543778801843318, + "grad_norm": 0.6907545328140259, + "learning_rate": 2.15836547159862e-05, + "loss": 0.0695, + "step": 3776 + }, + { + "epoch": 0.5439228110599078, + "grad_norm": 0.7059688568115234, + "learning_rate": 2.1572450713506098e-05, + "loss": 0.0658, + "step": 3777 + }, + { + "epoch": 0.5440668202764977, + "grad_norm": 2.2570388317108154, + "learning_rate": 2.1561247412584712e-05, + "loss": 0.159, + "step": 3778 + }, + { + "epoch": 0.5442108294930875, + "grad_norm": 1.0048706531524658, + "learning_rate": 2.1550044815515155e-05, + "loss": 0.14, + "step": 3779 + }, + { + "epoch": 0.5443548387096774, + "grad_norm": 0.89215087890625, + "learning_rate": 2.1538842924590404e-05, + "loss": 0.0957, + "step": 3780 + }, + { + "epoch": 0.5444988479262672, + "grad_norm": 0.5478269457817078, + "learning_rate": 2.152764174210328e-05, + "loss": 0.0721, + "step": 3781 + }, + { + "epoch": 0.5446428571428571, + "grad_norm": 0.7202975749969482, + "learning_rate": 2.1516441270346474e-05, + "loss": 0.0931, + "step": 3782 + }, + { + "epoch": 0.544786866359447, + "grad_norm": 5.372132301330566, + "learning_rate": 2.1505241511612522e-05, + "loss": 0.5006, + "step": 3783 + }, + { + "epoch": 0.5449308755760369, + "grad_norm": 0.5179041028022766, + "learning_rate": 2.1494042468193815e-05, + "loss": 0.061, + "step": 3784 + }, + { + "epoch": 0.5450748847926268, + "grad_norm": 0.7913549542427063, + "learning_rate": 2.1482844142382594e-05, + "loss": 0.103, + "step": 3785 + }, + { + "epoch": 0.5452188940092166, + "grad_norm": 2.0052590370178223, + "learning_rate": 2.1471646536470976e-05, + "loss": 0.1713, + "step": 3786 + }, + { + "epoch": 0.5453629032258065, + "grad_norm": 0.892223596572876, + "learning_rate": 2.1460449652750897e-05, + "loss": 0.1304, + "step": 3787 + }, + { + "epoch": 0.5455069124423964, + "grad_norm": 1.014614462852478, + "learning_rate": 2.1449253493514168e-05, + "loss": 0.1121, + "step": 3788 + }, + { + "epoch": 0.5456509216589862, + "grad_norm": 2.5235400199890137, + "learning_rate": 2.1438058061052443e-05, + "loss": 0.2019, + "step": 3789 + }, + { + "epoch": 0.5457949308755761, + "grad_norm": 1.4905385971069336, + "learning_rate": 2.142686335765723e-05, + "loss": 0.2168, + "step": 3790 + }, + { + "epoch": 0.5459389400921659, + "grad_norm": 0.7898442149162292, + "learning_rate": 2.1415669385619885e-05, + "loss": 0.1181, + "step": 3791 + }, + { + "epoch": 0.5460829493087558, + "grad_norm": 3.5370357036590576, + "learning_rate": 2.140447614723162e-05, + "loss": 2.196, + "step": 3792 + }, + { + "epoch": 0.5462269585253456, + "grad_norm": 0.6133771538734436, + "learning_rate": 2.1393283644783486e-05, + "loss": 0.0907, + "step": 3793 + }, + { + "epoch": 0.5463709677419355, + "grad_norm": 0.9949449300765991, + "learning_rate": 2.1382091880566394e-05, + "loss": 0.1132, + "step": 3794 + }, + { + "epoch": 0.5465149769585254, + "grad_norm": 3.113323450088501, + "learning_rate": 2.13709008568711e-05, + "loss": 0.3681, + "step": 3795 + }, + { + "epoch": 0.5466589861751152, + "grad_norm": 1.121996283531189, + "learning_rate": 2.1359710575988207e-05, + "loss": 0.1793, + "step": 3796 + }, + { + "epoch": 0.5468029953917051, + "grad_norm": 0.44304975867271423, + "learning_rate": 2.134852104020817e-05, + "loss": 0.0553, + "step": 3797 + }, + { + "epoch": 0.5469470046082949, + "grad_norm": 1.3662805557250977, + "learning_rate": 2.133733225182129e-05, + "loss": 3.8927, + "step": 3798 + }, + { + "epoch": 0.5470910138248848, + "grad_norm": 3.1780214309692383, + "learning_rate": 2.132614421311771e-05, + "loss": 0.1704, + "step": 3799 + }, + { + "epoch": 0.5472350230414746, + "grad_norm": 0.920243501663208, + "learning_rate": 2.131495692638743e-05, + "loss": 0.1065, + "step": 3800 + }, + { + "epoch": 0.5473790322580645, + "grad_norm": 3.5170207023620605, + "learning_rate": 2.1303770393920284e-05, + "loss": 0.2834, + "step": 3801 + }, + { + "epoch": 0.5475230414746544, + "grad_norm": 1.8735231161117554, + "learning_rate": 2.1292584618005955e-05, + "loss": 0.1914, + "step": 3802 + }, + { + "epoch": 0.5476670506912442, + "grad_norm": 3.968855381011963, + "learning_rate": 2.1281399600933982e-05, + "loss": 1.0546, + "step": 3803 + }, + { + "epoch": 0.5478110599078341, + "grad_norm": 1.3081183433532715, + "learning_rate": 2.1270215344993734e-05, + "loss": 0.1539, + "step": 3804 + }, + { + "epoch": 0.5479550691244239, + "grad_norm": 0.6945744752883911, + "learning_rate": 2.125903185247443e-05, + "loss": 0.0961, + "step": 3805 + }, + { + "epoch": 0.5480990783410138, + "grad_norm": 1.006156325340271, + "learning_rate": 2.1247849125665138e-05, + "loss": 0.1089, + "step": 3806 + }, + { + "epoch": 0.5482430875576036, + "grad_norm": 0.6664422750473022, + "learning_rate": 2.1236667166854763e-05, + "loss": 0.0652, + "step": 3807 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 0.8161289095878601, + "learning_rate": 2.122548597833205e-05, + "loss": 0.0901, + "step": 3808 + }, + { + "epoch": 0.5485311059907834, + "grad_norm": 1.3042902946472168, + "learning_rate": 2.1214305562385592e-05, + "loss": 0.1276, + "step": 3809 + }, + { + "epoch": 0.5486751152073732, + "grad_norm": 0.9009382128715515, + "learning_rate": 2.1203125921303817e-05, + "loss": 0.1244, + "step": 3810 + }, + { + "epoch": 0.5488191244239631, + "grad_norm": 1.245247483253479, + "learning_rate": 2.1191947057375018e-05, + "loss": 0.1507, + "step": 3811 + }, + { + "epoch": 0.548963133640553, + "grad_norm": 3.1481857299804688, + "learning_rate": 2.1180768972887293e-05, + "loss": 1.7465, + "step": 3812 + }, + { + "epoch": 0.5491071428571429, + "grad_norm": 0.8385176658630371, + "learning_rate": 2.1169591670128602e-05, + "loss": 0.1092, + "step": 3813 + }, + { + "epoch": 0.5492511520737328, + "grad_norm": 2.6261465549468994, + "learning_rate": 2.1158415151386744e-05, + "loss": 0.2469, + "step": 3814 + }, + { + "epoch": 0.5493951612903226, + "grad_norm": 13.013408660888672, + "learning_rate": 2.114723941894936e-05, + "loss": 2.2118, + "step": 3815 + }, + { + "epoch": 0.5495391705069125, + "grad_norm": 4.988786697387695, + "learning_rate": 2.1136064475103918e-05, + "loss": 0.6741, + "step": 3816 + }, + { + "epoch": 0.5496831797235023, + "grad_norm": 6.414894104003906, + "learning_rate": 2.112489032213773e-05, + "loss": 1.8962, + "step": 3817 + }, + { + "epoch": 0.5498271889400922, + "grad_norm": 0.8505078554153442, + "learning_rate": 2.111371696233795e-05, + "loss": 0.0888, + "step": 3818 + }, + { + "epoch": 0.549971198156682, + "grad_norm": 2.9798223972320557, + "learning_rate": 2.1102544397991566e-05, + "loss": 0.2715, + "step": 3819 + }, + { + "epoch": 0.5501152073732719, + "grad_norm": 7.15915584564209, + "learning_rate": 2.1091372631385406e-05, + "loss": 2.0352, + "step": 3820 + }, + { + "epoch": 0.5502592165898618, + "grad_norm": 0.587480902671814, + "learning_rate": 2.1080201664806133e-05, + "loss": 0.0771, + "step": 3821 + }, + { + "epoch": 0.5504032258064516, + "grad_norm": 0.5551830530166626, + "learning_rate": 2.106903150054024e-05, + "loss": 0.0644, + "step": 3822 + }, + { + "epoch": 0.5505472350230415, + "grad_norm": 0.6849237084388733, + "learning_rate": 2.1057862140874078e-05, + "loss": 0.0758, + "step": 3823 + }, + { + "epoch": 0.5506912442396313, + "grad_norm": 3.635728597640991, + "learning_rate": 2.10466935880938e-05, + "loss": 3.2494, + "step": 3824 + }, + { + "epoch": 0.5508352534562212, + "grad_norm": 1.3227003812789917, + "learning_rate": 2.1035525844485415e-05, + "loss": 0.1396, + "step": 3825 + }, + { + "epoch": 0.550979262672811, + "grad_norm": 0.7583706378936768, + "learning_rate": 2.1024358912334773e-05, + "loss": 0.0903, + "step": 3826 + }, + { + "epoch": 0.5511232718894009, + "grad_norm": 0.781561017036438, + "learning_rate": 2.1013192793927534e-05, + "loss": 0.0912, + "step": 3827 + }, + { + "epoch": 0.5512672811059908, + "grad_norm": 1.4161802530288696, + "learning_rate": 2.100202749154921e-05, + "loss": 0.1535, + "step": 3828 + }, + { + "epoch": 0.5514112903225806, + "grad_norm": 0.6397818326950073, + "learning_rate": 2.099086300748514e-05, + "loss": 0.0933, + "step": 3829 + }, + { + "epoch": 0.5515552995391705, + "grad_norm": 4.439366817474365, + "learning_rate": 2.0979699344020503e-05, + "loss": 1.9813, + "step": 3830 + }, + { + "epoch": 0.5516993087557603, + "grad_norm": 1.454534888267517, + "learning_rate": 2.09685365034403e-05, + "loss": 0.1643, + "step": 3831 + }, + { + "epoch": 0.5518433179723502, + "grad_norm": 1.2428836822509766, + "learning_rate": 2.095737448802936e-05, + "loss": 0.2087, + "step": 3832 + }, + { + "epoch": 0.5519873271889401, + "grad_norm": 3.1033074855804443, + "learning_rate": 2.0946213300072364e-05, + "loss": 0.1851, + "step": 3833 + }, + { + "epoch": 0.5521313364055299, + "grad_norm": 0.4344506561756134, + "learning_rate": 2.0935052941853797e-05, + "loss": 0.0472, + "step": 3834 + }, + { + "epoch": 0.5522753456221198, + "grad_norm": 0.8569513559341431, + "learning_rate": 2.0923893415657992e-05, + "loss": 0.0822, + "step": 3835 + }, + { + "epoch": 0.5524193548387096, + "grad_norm": 0.9593029022216797, + "learning_rate": 2.0912734723769105e-05, + "loss": 4.169, + "step": 3836 + }, + { + "epoch": 0.5525633640552995, + "grad_norm": 0.7301090955734253, + "learning_rate": 2.0901576868471125e-05, + "loss": 0.1245, + "step": 3837 + }, + { + "epoch": 0.5527073732718893, + "grad_norm": 0.7527263164520264, + "learning_rate": 2.0890419852047864e-05, + "loss": 0.0871, + "step": 3838 + }, + { + "epoch": 0.5528513824884793, + "grad_norm": 0.9047887325286865, + "learning_rate": 2.0879263676782974e-05, + "loss": 0.0957, + "step": 3839 + }, + { + "epoch": 0.5529953917050692, + "grad_norm": 3.22353458404541, + "learning_rate": 2.0868108344959914e-05, + "loss": 1.7923, + "step": 3840 + }, + { + "epoch": 0.553139400921659, + "grad_norm": 1.866532564163208, + "learning_rate": 2.0856953858861995e-05, + "loss": 0.217, + "step": 3841 + }, + { + "epoch": 0.5532834101382489, + "grad_norm": 5.787978649139404, + "learning_rate": 2.0845800220772334e-05, + "loss": 0.132, + "step": 3842 + }, + { + "epoch": 0.5534274193548387, + "grad_norm": 5.054624080657959, + "learning_rate": 2.0834647432973895e-05, + "loss": 1.8226, + "step": 3843 + }, + { + "epoch": 0.5535714285714286, + "grad_norm": 3.3744735717773438, + "learning_rate": 2.0823495497749446e-05, + "loss": 0.3114, + "step": 3844 + }, + { + "epoch": 0.5537154377880185, + "grad_norm": 0.5704648494720459, + "learning_rate": 2.0812344417381595e-05, + "loss": 0.0702, + "step": 3845 + }, + { + "epoch": 0.5538594470046083, + "grad_norm": 1.4833672046661377, + "learning_rate": 2.080119419415277e-05, + "loss": 0.1701, + "step": 3846 + }, + { + "epoch": 0.5540034562211982, + "grad_norm": 0.377855122089386, + "learning_rate": 2.0790044830345222e-05, + "loss": 0.0524, + "step": 3847 + }, + { + "epoch": 0.554147465437788, + "grad_norm": 7.3521647453308105, + "learning_rate": 2.0778896328241023e-05, + "loss": 1.7579, + "step": 3848 + }, + { + "epoch": 0.5542914746543779, + "grad_norm": 2.2419464588165283, + "learning_rate": 2.0767748690122095e-05, + "loss": 0.2808, + "step": 3849 + }, + { + "epoch": 0.5544354838709677, + "grad_norm": 1.0800530910491943, + "learning_rate": 2.0756601918270143e-05, + "loss": 0.1161, + "step": 3850 + }, + { + "epoch": 0.5545794930875576, + "grad_norm": 6.355402946472168, + "learning_rate": 2.0745456014966723e-05, + "loss": 2.3324, + "step": 3851 + }, + { + "epoch": 0.5547235023041475, + "grad_norm": 1.8788472414016724, + "learning_rate": 2.0734310982493204e-05, + "loss": 0.2053, + "step": 3852 + }, + { + "epoch": 0.5548675115207373, + "grad_norm": 0.9694936275482178, + "learning_rate": 2.0723166823130774e-05, + "loss": 0.1368, + "step": 3853 + }, + { + "epoch": 0.5550115207373272, + "grad_norm": 1.5543544292449951, + "learning_rate": 2.0712023539160442e-05, + "loss": 0.1576, + "step": 3854 + }, + { + "epoch": 0.555155529953917, + "grad_norm": 0.36340081691741943, + "learning_rate": 2.0700881132863052e-05, + "loss": 0.071, + "step": 3855 + }, + { + "epoch": 0.5552995391705069, + "grad_norm": 4.406770706176758, + "learning_rate": 2.0689739606519246e-05, + "loss": 2.422, + "step": 3856 + }, + { + "epoch": 0.5554435483870968, + "grad_norm": 0.8128178715705872, + "learning_rate": 2.0678598962409504e-05, + "loss": 0.115, + "step": 3857 + }, + { + "epoch": 0.5555875576036866, + "grad_norm": 0.8002925515174866, + "learning_rate": 2.0667459202814117e-05, + "loss": 0.1036, + "step": 3858 + }, + { + "epoch": 0.5557315668202765, + "grad_norm": 5.049229145050049, + "learning_rate": 2.0656320330013193e-05, + "loss": 1.2123, + "step": 3859 + }, + { + "epoch": 0.5558755760368663, + "grad_norm": 2.1789915561676025, + "learning_rate": 2.064518234628667e-05, + "loss": 0.1673, + "step": 3860 + }, + { + "epoch": 0.5560195852534562, + "grad_norm": 0.7046225666999817, + "learning_rate": 2.063404525391429e-05, + "loss": 0.0753, + "step": 3861 + }, + { + "epoch": 0.556163594470046, + "grad_norm": 1.0106332302093506, + "learning_rate": 2.062290905517562e-05, + "loss": 0.1579, + "step": 3862 + }, + { + "epoch": 0.5563076036866359, + "grad_norm": 3.440310001373291, + "learning_rate": 2.0611773752350047e-05, + "loss": 0.2397, + "step": 3863 + }, + { + "epoch": 0.5564516129032258, + "grad_norm": 0.39301353693008423, + "learning_rate": 2.0600639347716766e-05, + "loss": 0.0497, + "step": 3864 + }, + { + "epoch": 0.5565956221198156, + "grad_norm": 2.2586143016815186, + "learning_rate": 2.0589505843554797e-05, + "loss": 0.1387, + "step": 3865 + }, + { + "epoch": 0.5567396313364056, + "grad_norm": 0.37845584750175476, + "learning_rate": 2.057837324214296e-05, + "loss": 0.0472, + "step": 3866 + }, + { + "epoch": 0.5568836405529954, + "grad_norm": 0.6843501329421997, + "learning_rate": 2.0567241545759907e-05, + "loss": 0.0929, + "step": 3867 + }, + { + "epoch": 0.5570276497695853, + "grad_norm": 0.6881590485572815, + "learning_rate": 2.0556110756684112e-05, + "loss": 0.0853, + "step": 3868 + }, + { + "epoch": 0.5571716589861752, + "grad_norm": 1.6571568250656128, + "learning_rate": 2.0544980877193838e-05, + "loss": 0.2225, + "step": 3869 + }, + { + "epoch": 0.557315668202765, + "grad_norm": 1.6453518867492676, + "learning_rate": 2.053385190956718e-05, + "loss": 0.1729, + "step": 3870 + }, + { + "epoch": 0.5574596774193549, + "grad_norm": 1.1172221899032593, + "learning_rate": 2.0522723856082036e-05, + "loss": 0.1031, + "step": 3871 + }, + { + "epoch": 0.5576036866359447, + "grad_norm": 0.5264525413513184, + "learning_rate": 2.0511596719016126e-05, + "loss": 0.0702, + "step": 3872 + }, + { + "epoch": 0.5577476958525346, + "grad_norm": 3.141103506088257, + "learning_rate": 2.0500470500646978e-05, + "loss": 0.1569, + "step": 3873 + }, + { + "epoch": 0.5578917050691244, + "grad_norm": 1.252568244934082, + "learning_rate": 2.048934520325193e-05, + "loss": 0.0961, + "step": 3874 + }, + { + "epoch": 0.5580357142857143, + "grad_norm": 2.3028900623321533, + "learning_rate": 2.047822082910813e-05, + "loss": 0.2212, + "step": 3875 + }, + { + "epoch": 0.5581797235023042, + "grad_norm": 0.7850832939147949, + "learning_rate": 2.0467097380492544e-05, + "loss": 0.0989, + "step": 3876 + }, + { + "epoch": 0.558323732718894, + "grad_norm": 1.5505011081695557, + "learning_rate": 2.045597485968195e-05, + "loss": 0.181, + "step": 3877 + }, + { + "epoch": 0.5584677419354839, + "grad_norm": 0.618544340133667, + "learning_rate": 2.0444853268952923e-05, + "loss": 0.0797, + "step": 3878 + }, + { + "epoch": 0.5586117511520737, + "grad_norm": 1.1005275249481201, + "learning_rate": 2.0433732610581862e-05, + "loss": 0.1014, + "step": 3879 + }, + { + "epoch": 0.5587557603686636, + "grad_norm": 5.492246627807617, + "learning_rate": 2.0422612886844966e-05, + "loss": 2.9455, + "step": 3880 + }, + { + "epoch": 0.5588997695852534, + "grad_norm": 1.0642427206039429, + "learning_rate": 2.0411494100018246e-05, + "loss": 0.1174, + "step": 3881 + }, + { + "epoch": 0.5590437788018433, + "grad_norm": 0.5990496277809143, + "learning_rate": 2.0400376252377522e-05, + "loss": 0.054, + "step": 3882 + }, + { + "epoch": 0.5591877880184332, + "grad_norm": 1.490015983581543, + "learning_rate": 2.0389259346198425e-05, + "loss": 0.1016, + "step": 3883 + }, + { + "epoch": 0.559331797235023, + "grad_norm": 1.2819855213165283, + "learning_rate": 2.037814338375638e-05, + "loss": 0.1275, + "step": 3884 + }, + { + "epoch": 0.5594758064516129, + "grad_norm": 4.644863605499268, + "learning_rate": 2.0367028367326632e-05, + "loss": 0.9496, + "step": 3885 + }, + { + "epoch": 0.5596198156682027, + "grad_norm": 0.7876660823822021, + "learning_rate": 2.0355914299184232e-05, + "loss": 0.08, + "step": 3886 + }, + { + "epoch": 0.5597638248847926, + "grad_norm": 4.310342788696289, + "learning_rate": 2.0344801181604025e-05, + "loss": 0.9275, + "step": 3887 + }, + { + "epoch": 0.5599078341013825, + "grad_norm": 4.820072174072266, + "learning_rate": 2.0333689016860677e-05, + "loss": 1.6365, + "step": 3888 + }, + { + "epoch": 0.5600518433179723, + "grad_norm": 0.9796759486198425, + "learning_rate": 2.0322577807228648e-05, + "loss": 0.1241, + "step": 3889 + }, + { + "epoch": 0.5601958525345622, + "grad_norm": 0.718421220779419, + "learning_rate": 2.0311467554982208e-05, + "loss": 0.1284, + "step": 3890 + }, + { + "epoch": 0.560339861751152, + "grad_norm": 0.7602603435516357, + "learning_rate": 2.0300358262395426e-05, + "loss": 0.1094, + "step": 3891 + }, + { + "epoch": 0.5604838709677419, + "grad_norm": 1.401784896850586, + "learning_rate": 2.028924993174218e-05, + "loss": 0.1801, + "step": 3892 + }, + { + "epoch": 0.5606278801843319, + "grad_norm": 1.1309372186660767, + "learning_rate": 2.027814256529615e-05, + "loss": 0.125, + "step": 3893 + }, + { + "epoch": 0.5607718894009217, + "grad_norm": 0.5292521715164185, + "learning_rate": 2.026703616533081e-05, + "loss": 0.0578, + "step": 3894 + }, + { + "epoch": 0.5609158986175116, + "grad_norm": 1.9118586778640747, + "learning_rate": 2.0255930734119456e-05, + "loss": 0.1669, + "step": 3895 + }, + { + "epoch": 0.5610599078341014, + "grad_norm": 0.7062826752662659, + "learning_rate": 2.0244826273935162e-05, + "loss": 0.0751, + "step": 3896 + }, + { + "epoch": 0.5612039170506913, + "grad_norm": 1.6489425897598267, + "learning_rate": 2.0233722787050827e-05, + "loss": 0.1386, + "step": 3897 + }, + { + "epoch": 0.5613479262672811, + "grad_norm": 2.9034199714660645, + "learning_rate": 2.0222620275739128e-05, + "loss": 1.0286, + "step": 3898 + }, + { + "epoch": 0.561491935483871, + "grad_norm": 4.457592010498047, + "learning_rate": 2.0211518742272557e-05, + "loss": 2.6199, + "step": 3899 + }, + { + "epoch": 0.5616359447004609, + "grad_norm": 0.4819483757019043, + "learning_rate": 2.0200418188923397e-05, + "loss": 0.0707, + "step": 3900 + }, + { + "epoch": 0.5617799539170507, + "grad_norm": 0.5938711762428284, + "learning_rate": 2.018931861796374e-05, + "loss": 0.0745, + "step": 3901 + }, + { + "epoch": 0.5619239631336406, + "grad_norm": 3.7775166034698486, + "learning_rate": 2.017822003166547e-05, + "loss": 0.2889, + "step": 3902 + }, + { + "epoch": 0.5620679723502304, + "grad_norm": 2.9675631523132324, + "learning_rate": 2.0167122432300272e-05, + "loss": 0.2858, + "step": 3903 + }, + { + "epoch": 0.5622119815668203, + "grad_norm": 2.316577196121216, + "learning_rate": 2.0156025822139628e-05, + "loss": 0.1269, + "step": 3904 + }, + { + "epoch": 0.5623559907834101, + "grad_norm": 4.522067546844482, + "learning_rate": 2.0144930203454816e-05, + "loss": 1.5617, + "step": 3905 + }, + { + "epoch": 0.5625, + "grad_norm": 0.8036180734634399, + "learning_rate": 2.0133835578516912e-05, + "loss": 0.096, + "step": 3906 + }, + { + "epoch": 0.5626440092165899, + "grad_norm": 1.114175796508789, + "learning_rate": 2.0122741949596797e-05, + "loss": 0.1012, + "step": 3907 + }, + { + "epoch": 0.5627880184331797, + "grad_norm": 0.5954682230949402, + "learning_rate": 2.011164931896513e-05, + "loss": 0.0798, + "step": 3908 + }, + { + "epoch": 0.5629320276497696, + "grad_norm": 2.0787341594696045, + "learning_rate": 2.0100557688892385e-05, + "loss": 0.1491, + "step": 3909 + }, + { + "epoch": 0.5630760368663594, + "grad_norm": 10.877297401428223, + "learning_rate": 2.008946706164882e-05, + "loss": 1.5685, + "step": 3910 + }, + { + "epoch": 0.5632200460829493, + "grad_norm": 0.7234711050987244, + "learning_rate": 2.0078377439504486e-05, + "loss": 0.1015, + "step": 3911 + }, + { + "epoch": 0.5633640552995391, + "grad_norm": 0.7051416039466858, + "learning_rate": 2.006728882472924e-05, + "loss": 0.0816, + "step": 3912 + }, + { + "epoch": 0.563508064516129, + "grad_norm": 0.4759959578514099, + "learning_rate": 2.0056201219592714e-05, + "loss": 0.0509, + "step": 3913 + }, + { + "epoch": 0.5636520737327189, + "grad_norm": 0.3010523319244385, + "learning_rate": 2.0045114626364358e-05, + "loss": 0.0379, + "step": 3914 + }, + { + "epoch": 0.5637960829493087, + "grad_norm": 0.6735109090805054, + "learning_rate": 2.0034029047313395e-05, + "loss": 0.1096, + "step": 3915 + }, + { + "epoch": 0.5639400921658986, + "grad_norm": 0.6918171048164368, + "learning_rate": 2.0022944484708846e-05, + "loss": 0.0927, + "step": 3916 + }, + { + "epoch": 0.5640841013824884, + "grad_norm": 3.6920504570007324, + "learning_rate": 2.0011860940819523e-05, + "loss": 0.7466, + "step": 3917 + }, + { + "epoch": 0.5642281105990783, + "grad_norm": 0.8114519715309143, + "learning_rate": 2.000077841791404e-05, + "loss": 0.0991, + "step": 3918 + }, + { + "epoch": 0.5643721198156681, + "grad_norm": 1.1243529319763184, + "learning_rate": 1.9989696918260786e-05, + "loss": 4.3534, + "step": 3919 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 0.6969679594039917, + "learning_rate": 1.997861644412795e-05, + "loss": 0.0736, + "step": 3920 + }, + { + "epoch": 0.564660138248848, + "grad_norm": 1.1757069826126099, + "learning_rate": 1.9967536997783494e-05, + "loss": 0.1375, + "step": 3921 + }, + { + "epoch": 0.5648041474654378, + "grad_norm": 2.8421857357025146, + "learning_rate": 1.9956458581495216e-05, + "loss": 1.0015, + "step": 3922 + }, + { + "epoch": 0.5649481566820277, + "grad_norm": 1.0202020406723022, + "learning_rate": 1.9945381197530653e-05, + "loss": 0.0885, + "step": 3923 + }, + { + "epoch": 0.5650921658986175, + "grad_norm": 3.802048444747925, + "learning_rate": 1.9934304848157154e-05, + "loss": 1.9339, + "step": 3924 + }, + { + "epoch": 0.5652361751152074, + "grad_norm": 2.0819990634918213, + "learning_rate": 1.992322953564185e-05, + "loss": 0.228, + "step": 3925 + }, + { + "epoch": 0.5653801843317973, + "grad_norm": 0.9598188996315002, + "learning_rate": 1.991215526225166e-05, + "loss": 0.0897, + "step": 3926 + }, + { + "epoch": 0.5655241935483871, + "grad_norm": 1.5829124450683594, + "learning_rate": 1.9901082030253292e-05, + "loss": 0.1564, + "step": 3927 + }, + { + "epoch": 0.565668202764977, + "grad_norm": 0.5977948307991028, + "learning_rate": 1.9890009841913242e-05, + "loss": 0.1059, + "step": 3928 + }, + { + "epoch": 0.5658122119815668, + "grad_norm": 0.6414699554443359, + "learning_rate": 1.9878938699497796e-05, + "loss": 0.0789, + "step": 3929 + }, + { + "epoch": 0.5659562211981567, + "grad_norm": 0.43521782755851746, + "learning_rate": 1.986786860527301e-05, + "loss": 0.0683, + "step": 3930 + }, + { + "epoch": 0.5661002304147466, + "grad_norm": 3.7641496658325195, + "learning_rate": 1.9856799561504748e-05, + "loss": 0.8741, + "step": 3931 + }, + { + "epoch": 0.5662442396313364, + "grad_norm": 1.9275656938552856, + "learning_rate": 1.9845731570458638e-05, + "loss": 0.2327, + "step": 3932 + }, + { + "epoch": 0.5663882488479263, + "grad_norm": 0.9109424352645874, + "learning_rate": 1.9834664634400108e-05, + "loss": 0.1063, + "step": 3933 + }, + { + "epoch": 0.5665322580645161, + "grad_norm": 1.9056732654571533, + "learning_rate": 1.9823598755594364e-05, + "loss": 0.1946, + "step": 3934 + }, + { + "epoch": 0.566676267281106, + "grad_norm": 4.559300422668457, + "learning_rate": 1.9812533936306392e-05, + "loss": 1.5931, + "step": 3935 + }, + { + "epoch": 0.5668202764976958, + "grad_norm": 0.7752812504768372, + "learning_rate": 1.9801470178800965e-05, + "loss": 0.1058, + "step": 3936 + }, + { + "epoch": 0.5669642857142857, + "grad_norm": 1.4463495016098022, + "learning_rate": 1.979040748534264e-05, + "loss": 0.105, + "step": 3937 + }, + { + "epoch": 0.5671082949308756, + "grad_norm": 1.0084097385406494, + "learning_rate": 1.977934585819576e-05, + "loss": 0.1355, + "step": 3938 + }, + { + "epoch": 0.5672523041474654, + "grad_norm": 0.5479863286018372, + "learning_rate": 1.9768285299624435e-05, + "loss": 0.0848, + "step": 3939 + }, + { + "epoch": 0.5673963133640553, + "grad_norm": 1.0796864032745361, + "learning_rate": 1.975722581189257e-05, + "loss": 0.1224, + "step": 3940 + }, + { + "epoch": 0.5675403225806451, + "grad_norm": 4.747697353363037, + "learning_rate": 1.9746167397263847e-05, + "loss": 2.0189, + "step": 3941 + }, + { + "epoch": 0.567684331797235, + "grad_norm": 3.723198652267456, + "learning_rate": 1.9735110058001727e-05, + "loss": 3.2418, + "step": 3942 + }, + { + "epoch": 0.5678283410138248, + "grad_norm": 3.801663637161255, + "learning_rate": 1.972405379636945e-05, + "loss": 0.3047, + "step": 3943 + }, + { + "epoch": 0.5679723502304147, + "grad_norm": 0.5685513615608215, + "learning_rate": 1.9712998614630045e-05, + "loss": 0.0595, + "step": 3944 + }, + { + "epoch": 0.5681163594470046, + "grad_norm": 2.920783758163452, + "learning_rate": 1.9701944515046304e-05, + "loss": 0.1002, + "step": 3945 + }, + { + "epoch": 0.5682603686635944, + "grad_norm": 0.6729159951210022, + "learning_rate": 1.9690891499880804e-05, + "loss": 0.0813, + "step": 3946 + }, + { + "epoch": 0.5684043778801844, + "grad_norm": 1.8507195711135864, + "learning_rate": 1.967983957139591e-05, + "loss": 0.1614, + "step": 3947 + }, + { + "epoch": 0.5685483870967742, + "grad_norm": 3.884801149368286, + "learning_rate": 1.966878873185374e-05, + "loss": 0.2094, + "step": 3948 + }, + { + "epoch": 0.5686923963133641, + "grad_norm": 10.95729923248291, + "learning_rate": 1.9657738983516227e-05, + "loss": 0.6812, + "step": 3949 + }, + { + "epoch": 0.568836405529954, + "grad_norm": 1.7630525827407837, + "learning_rate": 1.9646690328645052e-05, + "loss": 0.183, + "step": 3950 + }, + { + "epoch": 0.5689804147465438, + "grad_norm": 1.4152859449386597, + "learning_rate": 1.9635642769501674e-05, + "loss": 0.1234, + "step": 3951 + }, + { + "epoch": 0.5691244239631337, + "grad_norm": 3.492441415786743, + "learning_rate": 1.9624596308347336e-05, + "loss": 0.222, + "step": 3952 + }, + { + "epoch": 0.5692684331797235, + "grad_norm": 0.4381289482116699, + "learning_rate": 1.9613550947443056e-05, + "loss": 0.0438, + "step": 3953 + }, + { + "epoch": 0.5694124423963134, + "grad_norm": 0.603961169719696, + "learning_rate": 1.960250668904962e-05, + "loss": 0.0862, + "step": 3954 + }, + { + "epoch": 0.5695564516129032, + "grad_norm": 0.7779719233512878, + "learning_rate": 1.959146353542759e-05, + "loss": 0.1154, + "step": 3955 + }, + { + "epoch": 0.5697004608294931, + "grad_norm": 0.4731806516647339, + "learning_rate": 1.958042148883731e-05, + "loss": 0.0629, + "step": 3956 + }, + { + "epoch": 0.569844470046083, + "grad_norm": 1.0933325290679932, + "learning_rate": 1.956938055153889e-05, + "loss": 0.1173, + "step": 3957 + }, + { + "epoch": 0.5699884792626728, + "grad_norm": 0.636060893535614, + "learning_rate": 1.9558340725792214e-05, + "loss": 0.0728, + "step": 3958 + }, + { + "epoch": 0.5701324884792627, + "grad_norm": 0.6584190130233765, + "learning_rate": 1.9547302013856934e-05, + "loss": 0.0751, + "step": 3959 + }, + { + "epoch": 0.5702764976958525, + "grad_norm": 4.968941688537598, + "learning_rate": 1.9536264417992487e-05, + "loss": 0.8644, + "step": 3960 + }, + { + "epoch": 0.5704205069124424, + "grad_norm": 1.1246427297592163, + "learning_rate": 1.9525227940458067e-05, + "loss": 0.1216, + "step": 3961 + }, + { + "epoch": 0.5705645161290323, + "grad_norm": 2.410125494003296, + "learning_rate": 1.9514192583512654e-05, + "loss": 0.1574, + "step": 3962 + }, + { + "epoch": 0.5707085253456221, + "grad_norm": 9.85913372039795, + "learning_rate": 1.9503158349414984e-05, + "loss": 2.378, + "step": 3963 + }, + { + "epoch": 0.570852534562212, + "grad_norm": 0.9391957521438599, + "learning_rate": 1.949212524042357e-05, + "loss": 0.1324, + "step": 3964 + }, + { + "epoch": 0.5709965437788018, + "grad_norm": 2.3829503059387207, + "learning_rate": 1.9481093258796697e-05, + "loss": 0.305, + "step": 3965 + }, + { + "epoch": 0.5711405529953917, + "grad_norm": 1.2626090049743652, + "learning_rate": 1.9470062406792412e-05, + "loss": 0.1288, + "step": 3966 + }, + { + "epoch": 0.5712845622119815, + "grad_norm": 0.6382114291191101, + "learning_rate": 1.945903268666853e-05, + "loss": 0.0759, + "step": 3967 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.6413066983222961, + "learning_rate": 1.944800410068266e-05, + "loss": 0.0675, + "step": 3968 + }, + { + "epoch": 0.5715725806451613, + "grad_norm": 0.5665306448936462, + "learning_rate": 1.9436976651092144e-05, + "loss": 0.0805, + "step": 3969 + }, + { + "epoch": 0.5717165898617511, + "grad_norm": 1.2222808599472046, + "learning_rate": 1.9425950340154107e-05, + "loss": 0.1808, + "step": 3970 + }, + { + "epoch": 0.571860599078341, + "grad_norm": 2.083467483520508, + "learning_rate": 1.941492517012544e-05, + "loss": 0.161, + "step": 3971 + }, + { + "epoch": 0.5720046082949308, + "grad_norm": 0.9020915627479553, + "learning_rate": 1.94039011432628e-05, + "loss": 0.0844, + "step": 3972 + }, + { + "epoch": 0.5721486175115207, + "grad_norm": 0.914097785949707, + "learning_rate": 1.9392878261822616e-05, + "loss": 0.1458, + "step": 3973 + }, + { + "epoch": 0.5722926267281107, + "grad_norm": 1.1065387725830078, + "learning_rate": 1.9381856528061073e-05, + "loss": 0.1624, + "step": 3974 + }, + { + "epoch": 0.5724366359447005, + "grad_norm": 4.016314506530762, + "learning_rate": 1.937083594423411e-05, + "loss": 1.7041, + "step": 3975 + }, + { + "epoch": 0.5725806451612904, + "grad_norm": 0.5692209005355835, + "learning_rate": 1.9359816512597473e-05, + "loss": 0.0395, + "step": 3976 + }, + { + "epoch": 0.5727246543778802, + "grad_norm": 4.495973110198975, + "learning_rate": 1.934879823540663e-05, + "loss": 1.0574, + "step": 3977 + }, + { + "epoch": 0.5728686635944701, + "grad_norm": 0.7732425928115845, + "learning_rate": 1.933778111491683e-05, + "loss": 0.1042, + "step": 3978 + }, + { + "epoch": 0.5730126728110599, + "grad_norm": 1.8454821109771729, + "learning_rate": 1.9326765153383078e-05, + "loss": 0.1879, + "step": 3979 + }, + { + "epoch": 0.5731566820276498, + "grad_norm": 0.6951848864555359, + "learning_rate": 1.9315750353060153e-05, + "loss": 0.097, + "step": 3980 + }, + { + "epoch": 0.5733006912442397, + "grad_norm": 3.251659870147705, + "learning_rate": 1.9304736716202586e-05, + "loss": 1.8027, + "step": 3981 + }, + { + "epoch": 0.5734447004608295, + "grad_norm": 0.6380454301834106, + "learning_rate": 1.9293724245064677e-05, + "loss": 0.0725, + "step": 3982 + }, + { + "epoch": 0.5735887096774194, + "grad_norm": 0.7550536394119263, + "learning_rate": 1.928271294190048e-05, + "loss": 0.0799, + "step": 3983 + }, + { + "epoch": 0.5737327188940092, + "grad_norm": 2.0223569869995117, + "learning_rate": 1.9271702808963813e-05, + "loss": 0.2106, + "step": 3984 + }, + { + "epoch": 0.5738767281105991, + "grad_norm": 1.8518507480621338, + "learning_rate": 1.926069384850826e-05, + "loss": 0.1778, + "step": 3985 + }, + { + "epoch": 0.574020737327189, + "grad_norm": 1.2188143730163574, + "learning_rate": 1.9249686062787152e-05, + "loss": 0.1303, + "step": 3986 + }, + { + "epoch": 0.5741647465437788, + "grad_norm": 1.1371163129806519, + "learning_rate": 1.9238679454053606e-05, + "loss": 0.1343, + "step": 3987 + }, + { + "epoch": 0.5743087557603687, + "grad_norm": 0.9450805187225342, + "learning_rate": 1.9227674024560463e-05, + "loss": 0.0894, + "step": 3988 + }, + { + "epoch": 0.5744527649769585, + "grad_norm": 0.626236617565155, + "learning_rate": 1.921666977656035e-05, + "loss": 0.0722, + "step": 3989 + }, + { + "epoch": 0.5745967741935484, + "grad_norm": 1.080952763557434, + "learning_rate": 1.920566671230563e-05, + "loss": 0.1264, + "step": 3990 + }, + { + "epoch": 0.5747407834101382, + "grad_norm": 0.7530115246772766, + "learning_rate": 1.9194664834048446e-05, + "loss": 0.0794, + "step": 3991 + }, + { + "epoch": 0.5748847926267281, + "grad_norm": 1.0502924919128418, + "learning_rate": 1.918366414404069e-05, + "loss": 0.1121, + "step": 3992 + }, + { + "epoch": 0.575028801843318, + "grad_norm": 0.9840311408042908, + "learning_rate": 1.9172664644534e-05, + "loss": 0.1285, + "step": 3993 + }, + { + "epoch": 0.5751728110599078, + "grad_norm": 0.975468099117279, + "learning_rate": 1.9161666337779782e-05, + "loss": 0.1259, + "step": 3994 + }, + { + "epoch": 0.5753168202764977, + "grad_norm": 1.0631914138793945, + "learning_rate": 1.9150669226029195e-05, + "loss": 0.1009, + "step": 3995 + }, + { + "epoch": 0.5754608294930875, + "grad_norm": 1.1193510293960571, + "learning_rate": 1.9139673311533153e-05, + "loss": 0.1495, + "step": 3996 + }, + { + "epoch": 0.5756048387096774, + "grad_norm": 3.086751699447632, + "learning_rate": 1.9128678596542328e-05, + "loss": 1.0456, + "step": 3997 + }, + { + "epoch": 0.5757488479262672, + "grad_norm": 1.0476152896881104, + "learning_rate": 1.911768508330714e-05, + "loss": 0.111, + "step": 3998 + }, + { + "epoch": 0.5758928571428571, + "grad_norm": 1.0447733402252197, + "learning_rate": 1.9106692774077772e-05, + "loss": 0.117, + "step": 3999 + }, + { + "epoch": 0.576036866359447, + "grad_norm": 7.253108978271484, + "learning_rate": 1.909570167110415e-05, + "loss": 2.0823, + "step": 4000 + }, + { + "epoch": 0.5761808755760369, + "grad_norm": 4.933205604553223, + "learning_rate": 1.9084711776635958e-05, + "loss": 1.2068, + "step": 4001 + }, + { + "epoch": 0.5763248847926268, + "grad_norm": 0.7195201516151428, + "learning_rate": 1.907372309292263e-05, + "loss": 0.0803, + "step": 4002 + }, + { + "epoch": 0.5764688940092166, + "grad_norm": 0.6379591226577759, + "learning_rate": 1.9062735622213366e-05, + "loss": 0.0787, + "step": 4003 + }, + { + "epoch": 0.5766129032258065, + "grad_norm": 0.6953241229057312, + "learning_rate": 1.90517493667571e-05, + "loss": 0.0888, + "step": 4004 + }, + { + "epoch": 0.5767569124423964, + "grad_norm": 0.6023262143135071, + "learning_rate": 1.904076432880252e-05, + "loss": 0.0795, + "step": 4005 + }, + { + "epoch": 0.5769009216589862, + "grad_norm": 1.290687084197998, + "learning_rate": 1.902978051059808e-05, + "loss": 0.1453, + "step": 4006 + }, + { + "epoch": 0.5770449308755761, + "grad_norm": 1.0248733758926392, + "learning_rate": 1.901879791439197e-05, + "loss": 0.0986, + "step": 4007 + }, + { + "epoch": 0.5771889400921659, + "grad_norm": 0.6521994471549988, + "learning_rate": 1.900781654243213e-05, + "loss": 0.0545, + "step": 4008 + }, + { + "epoch": 0.5773329493087558, + "grad_norm": 0.5822210907936096, + "learning_rate": 1.899683639696625e-05, + "loss": 0.0701, + "step": 4009 + }, + { + "epoch": 0.5774769585253456, + "grad_norm": 0.7882620096206665, + "learning_rate": 1.8985857480241775e-05, + "loss": 0.1183, + "step": 4010 + }, + { + "epoch": 0.5776209677419355, + "grad_norm": 2.697627067565918, + "learning_rate": 1.8974879794505896e-05, + "loss": 0.1843, + "step": 4011 + }, + { + "epoch": 0.5777649769585254, + "grad_norm": 0.6858093738555908, + "learning_rate": 1.8963903342005553e-05, + "loss": 0.0996, + "step": 4012 + }, + { + "epoch": 0.5779089861751152, + "grad_norm": 6.599998474121094, + "learning_rate": 1.8952928124987422e-05, + "loss": 1.5378, + "step": 4013 + }, + { + "epoch": 0.5780529953917051, + "grad_norm": 1.6779776811599731, + "learning_rate": 1.8941954145697948e-05, + "loss": 0.2025, + "step": 4014 + }, + { + "epoch": 0.5781970046082949, + "grad_norm": 0.9975787997245789, + "learning_rate": 1.89309814063833e-05, + "loss": 0.0876, + "step": 4015 + }, + { + "epoch": 0.5783410138248848, + "grad_norm": 0.8232687711715698, + "learning_rate": 1.8920009909289415e-05, + "loss": 0.103, + "step": 4016 + }, + { + "epoch": 0.5784850230414746, + "grad_norm": 0.6475475430488586, + "learning_rate": 1.890903965666195e-05, + "loss": 0.0829, + "step": 4017 + }, + { + "epoch": 0.5786290322580645, + "grad_norm": 0.7422717809677124, + "learning_rate": 1.889807065074634e-05, + "loss": 0.1017, + "step": 4018 + }, + { + "epoch": 0.5787730414746544, + "grad_norm": 0.7100384831428528, + "learning_rate": 1.888710289378773e-05, + "loss": 0.1105, + "step": 4019 + }, + { + "epoch": 0.5789170506912442, + "grad_norm": 6.1035685539245605, + "learning_rate": 1.887613638803103e-05, + "loss": 2.0723, + "step": 4020 + }, + { + "epoch": 0.5790610599078341, + "grad_norm": 0.9739629626274109, + "learning_rate": 1.8865171135720893e-05, + "loss": 4.3366, + "step": 4021 + }, + { + "epoch": 0.5792050691244239, + "grad_norm": 2.490973949432373, + "learning_rate": 1.885420713910171e-05, + "loss": 0.1033, + "step": 4022 + }, + { + "epoch": 0.5793490783410138, + "grad_norm": 0.5513819456100464, + "learning_rate": 1.8843244400417624e-05, + "loss": 0.0585, + "step": 4023 + }, + { + "epoch": 0.5794930875576036, + "grad_norm": 1.1753944158554077, + "learning_rate": 1.8832282921912503e-05, + "loss": 0.0944, + "step": 4024 + }, + { + "epoch": 0.5796370967741935, + "grad_norm": 0.5306431651115417, + "learning_rate": 1.8821322705829972e-05, + "loss": 0.0547, + "step": 4025 + }, + { + "epoch": 0.5797811059907834, + "grad_norm": 4.130974292755127, + "learning_rate": 1.8810363754413392e-05, + "loss": 0.6616, + "step": 4026 + }, + { + "epoch": 0.5799251152073732, + "grad_norm": 1.3806089162826538, + "learning_rate": 1.879940606990587e-05, + "loss": 0.1971, + "step": 4027 + }, + { + "epoch": 0.5800691244239631, + "grad_norm": 0.6588627696037292, + "learning_rate": 1.878844965455025e-05, + "loss": 0.0604, + "step": 4028 + }, + { + "epoch": 0.580213133640553, + "grad_norm": 0.8022257685661316, + "learning_rate": 1.8777494510589117e-05, + "loss": 0.084, + "step": 4029 + }, + { + "epoch": 0.5803571428571429, + "grad_norm": 1.492052435874939, + "learning_rate": 1.8766540640264778e-05, + "loss": 0.2122, + "step": 4030 + }, + { + "epoch": 0.5805011520737328, + "grad_norm": 0.7647203207015991, + "learning_rate": 1.8755588045819327e-05, + "loss": 0.0821, + "step": 4031 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.8515807390213013, + "learning_rate": 1.8744636729494548e-05, + "loss": 0.0972, + "step": 4032 + }, + { + "epoch": 0.5807891705069125, + "grad_norm": 1.6974910497665405, + "learning_rate": 1.8733686693531985e-05, + "loss": 0.1834, + "step": 4033 + }, + { + "epoch": 0.5809331797235023, + "grad_norm": 0.5313750505447388, + "learning_rate": 1.8722737940172914e-05, + "loss": 0.0551, + "step": 4034 + }, + { + "epoch": 0.5810771889400922, + "grad_norm": 4.3776421546936035, + "learning_rate": 1.871179047165836e-05, + "loss": 3.02, + "step": 4035 + }, + { + "epoch": 0.581221198156682, + "grad_norm": 0.8753734827041626, + "learning_rate": 1.8700844290229062e-05, + "loss": 0.1281, + "step": 4036 + }, + { + "epoch": 0.5813652073732719, + "grad_norm": 3.7001349925994873, + "learning_rate": 1.8689899398125525e-05, + "loss": 1.7891, + "step": 4037 + }, + { + "epoch": 0.5815092165898618, + "grad_norm": 8.15339469909668, + "learning_rate": 1.8678955797587964e-05, + "loss": 1.9685, + "step": 4038 + }, + { + "epoch": 0.5816532258064516, + "grad_norm": 0.8525282144546509, + "learning_rate": 1.8668013490856342e-05, + "loss": 0.0938, + "step": 4039 + }, + { + "epoch": 0.5817972350230415, + "grad_norm": 0.9780722260475159, + "learning_rate": 1.865707248017036e-05, + "loss": 0.0919, + "step": 4040 + }, + { + "epoch": 0.5819412442396313, + "grad_norm": 4.176339626312256, + "learning_rate": 1.8646132767769446e-05, + "loss": 1.1762, + "step": 4041 + }, + { + "epoch": 0.5820852534562212, + "grad_norm": 5.9283647537231445, + "learning_rate": 1.8635194355892766e-05, + "loss": 2.6826, + "step": 4042 + }, + { + "epoch": 0.582229262672811, + "grad_norm": 2.105571985244751, + "learning_rate": 1.862425724677922e-05, + "loss": 0.2269, + "step": 4043 + }, + { + "epoch": 0.5823732718894009, + "grad_norm": 0.697034478187561, + "learning_rate": 1.8613321442667442e-05, + "loss": 0.06, + "step": 4044 + }, + { + "epoch": 0.5825172811059908, + "grad_norm": 0.9479572176933289, + "learning_rate": 1.860238694579579e-05, + "loss": 0.0808, + "step": 4045 + }, + { + "epoch": 0.5826612903225806, + "grad_norm": 0.6722093820571899, + "learning_rate": 1.859145375840238e-05, + "loss": 0.0907, + "step": 4046 + }, + { + "epoch": 0.5828052995391705, + "grad_norm": 1.5580651760101318, + "learning_rate": 1.8580521882725022e-05, + "loss": 0.1939, + "step": 4047 + }, + { + "epoch": 0.5829493087557603, + "grad_norm": 1.086091160774231, + "learning_rate": 1.8569591321001283e-05, + "loss": 0.123, + "step": 4048 + }, + { + "epoch": 0.5830933179723502, + "grad_norm": 1.3703362941741943, + "learning_rate": 1.8558662075468466e-05, + "loss": 0.145, + "step": 4049 + }, + { + "epoch": 0.5832373271889401, + "grad_norm": 0.9436412453651428, + "learning_rate": 1.8547734148363582e-05, + "loss": 0.1057, + "step": 4050 + }, + { + "epoch": 0.5833813364055299, + "grad_norm": 3.454482078552246, + "learning_rate": 1.8536807541923397e-05, + "loss": 2.0474, + "step": 4051 + }, + { + "epoch": 0.5835253456221198, + "grad_norm": 1.3829585313796997, + "learning_rate": 1.8525882258384377e-05, + "loss": 0.1539, + "step": 4052 + }, + { + "epoch": 0.5836693548387096, + "grad_norm": 2.707672595977783, + "learning_rate": 1.851495829998275e-05, + "loss": 0.1493, + "step": 4053 + }, + { + "epoch": 0.5838133640552995, + "grad_norm": 0.32165125012397766, + "learning_rate": 1.8504035668954448e-05, + "loss": 0.0631, + "step": 4054 + }, + { + "epoch": 0.5839573732718893, + "grad_norm": 0.9112032055854797, + "learning_rate": 1.849311436753514e-05, + "loss": 0.0927, + "step": 4055 + }, + { + "epoch": 0.5841013824884793, + "grad_norm": 0.6237892508506775, + "learning_rate": 1.848219439796023e-05, + "loss": 0.0826, + "step": 4056 + }, + { + "epoch": 0.5842453917050692, + "grad_norm": 0.5143719911575317, + "learning_rate": 1.8471275762464828e-05, + "loss": 0.0741, + "step": 4057 + }, + { + "epoch": 0.584389400921659, + "grad_norm": 4.402255058288574, + "learning_rate": 1.8460358463283812e-05, + "loss": 2.3674, + "step": 4058 + }, + { + "epoch": 0.5845334101382489, + "grad_norm": 4.426423072814941, + "learning_rate": 1.8449442502651738e-05, + "loss": 2.8297, + "step": 4059 + }, + { + "epoch": 0.5846774193548387, + "grad_norm": 0.7723106145858765, + "learning_rate": 1.8438527882802915e-05, + "loss": 0.0842, + "step": 4060 + }, + { + "epoch": 0.5848214285714286, + "grad_norm": 4.302999973297119, + "learning_rate": 1.842761460597138e-05, + "loss": 1.7975, + "step": 4061 + }, + { + "epoch": 0.5849654377880185, + "grad_norm": 3.166254758834839, + "learning_rate": 1.841670267439088e-05, + "loss": 2.1285, + "step": 4062 + }, + { + "epoch": 0.5851094470046083, + "grad_norm": 0.6640121936798096, + "learning_rate": 1.8405792090294892e-05, + "loss": 0.0935, + "step": 4063 + }, + { + "epoch": 0.5852534562211982, + "grad_norm": 0.8238686919212341, + "learning_rate": 1.839488285591663e-05, + "loss": 0.114, + "step": 4064 + }, + { + "epoch": 0.585397465437788, + "grad_norm": 2.9954049587249756, + "learning_rate": 1.838397497348901e-05, + "loss": 2.1354, + "step": 4065 + }, + { + "epoch": 0.5855414746543779, + "grad_norm": 0.915939450263977, + "learning_rate": 1.8373068445244696e-05, + "loss": 0.0659, + "step": 4066 + }, + { + "epoch": 0.5856854838709677, + "grad_norm": 1.1949113607406616, + "learning_rate": 1.8362163273416046e-05, + "loss": 0.144, + "step": 4067 + }, + { + "epoch": 0.5858294930875576, + "grad_norm": 1.0060582160949707, + "learning_rate": 1.8351259460235165e-05, + "loss": 0.0966, + "step": 4068 + }, + { + "epoch": 0.5859735023041475, + "grad_norm": 0.7836282849311829, + "learning_rate": 1.8340357007933867e-05, + "loss": 0.1101, + "step": 4069 + }, + { + "epoch": 0.5861175115207373, + "grad_norm": 0.6916611194610596, + "learning_rate": 1.8329455918743693e-05, + "loss": 0.0532, + "step": 4070 + }, + { + "epoch": 0.5862615207373272, + "grad_norm": 0.6619588136672974, + "learning_rate": 1.831855619489591e-05, + "loss": 0.1031, + "step": 4071 + }, + { + "epoch": 0.586405529953917, + "grad_norm": 1.0727113485336304, + "learning_rate": 1.8307657838621483e-05, + "loss": 0.1458, + "step": 4072 + }, + { + "epoch": 0.5865495391705069, + "grad_norm": 0.6927089691162109, + "learning_rate": 1.8296760852151125e-05, + "loss": 0.1071, + "step": 4073 + }, + { + "epoch": 0.5866935483870968, + "grad_norm": 1.2728301286697388, + "learning_rate": 1.8285865237715248e-05, + "loss": 0.1953, + "step": 4074 + }, + { + "epoch": 0.5868375576036866, + "grad_norm": 2.366572618484497, + "learning_rate": 1.8274970997544005e-05, + "loss": 0.2193, + "step": 4075 + }, + { + "epoch": 0.5869815668202765, + "grad_norm": 0.554145872592926, + "learning_rate": 1.8264078133867242e-05, + "loss": 0.0607, + "step": 4076 + }, + { + "epoch": 0.5871255760368663, + "grad_norm": 0.9165964722633362, + "learning_rate": 1.8253186648914535e-05, + "loss": 0.0932, + "step": 4077 + }, + { + "epoch": 0.5872695852534562, + "grad_norm": 1.1068191528320312, + "learning_rate": 1.824229654491519e-05, + "loss": 0.1802, + "step": 4078 + }, + { + "epoch": 0.587413594470046, + "grad_norm": 0.8778085112571716, + "learning_rate": 1.82314078240982e-05, + "loss": 0.0679, + "step": 4079 + }, + { + "epoch": 0.5875576036866359, + "grad_norm": 1.0525513887405396, + "learning_rate": 1.8220520488692316e-05, + "loss": 0.1495, + "step": 4080 + }, + { + "epoch": 0.5877016129032258, + "grad_norm": 3.4906744956970215, + "learning_rate": 1.8209634540925966e-05, + "loss": 0.2077, + "step": 4081 + }, + { + "epoch": 0.5878456221198156, + "grad_norm": 3.6204636096954346, + "learning_rate": 1.819874998302732e-05, + "loss": 0.3102, + "step": 4082 + }, + { + "epoch": 0.5879896313364056, + "grad_norm": 4.5211501121521, + "learning_rate": 1.8187866817224248e-05, + "loss": 0.8734, + "step": 4083 + }, + { + "epoch": 0.5881336405529954, + "grad_norm": 0.6562371850013733, + "learning_rate": 1.8176985045744334e-05, + "loss": 0.0916, + "step": 4084 + }, + { + "epoch": 0.5882776497695853, + "grad_norm": 3.9123573303222656, + "learning_rate": 1.8166104670814905e-05, + "loss": 1.7201, + "step": 4085 + }, + { + "epoch": 0.5884216589861752, + "grad_norm": 2.7714600563049316, + "learning_rate": 1.815522569466297e-05, + "loss": 0.1932, + "step": 4086 + }, + { + "epoch": 0.588565668202765, + "grad_norm": 0.9228522181510925, + "learning_rate": 1.8144348119515268e-05, + "loss": 0.0954, + "step": 4087 + }, + { + "epoch": 0.5887096774193549, + "grad_norm": 14.757681846618652, + "learning_rate": 1.813347194759824e-05, + "loss": 2.2028, + "step": 4088 + }, + { + "epoch": 0.5888536866359447, + "grad_norm": 3.1436851024627686, + "learning_rate": 1.812259718113805e-05, + "loss": 2.7567, + "step": 4089 + }, + { + "epoch": 0.5889976958525346, + "grad_norm": 0.6458501815795898, + "learning_rate": 1.8111723822360566e-05, + "loss": 0.0633, + "step": 4090 + }, + { + "epoch": 0.5891417050691244, + "grad_norm": 1.8515808582305908, + "learning_rate": 1.8100851873491377e-05, + "loss": 0.0988, + "step": 4091 + }, + { + "epoch": 0.5892857142857143, + "grad_norm": 0.5329076647758484, + "learning_rate": 1.8089981336755772e-05, + "loss": 0.0697, + "step": 4092 + }, + { + "epoch": 0.5894297235023042, + "grad_norm": 0.8149493932723999, + "learning_rate": 1.8079112214378768e-05, + "loss": 0.0807, + "step": 4093 + }, + { + "epoch": 0.589573732718894, + "grad_norm": 9.828343391418457, + "learning_rate": 1.8068244508585075e-05, + "loss": 3.3798, + "step": 4094 + }, + { + "epoch": 0.5897177419354839, + "grad_norm": 2.2219512462615967, + "learning_rate": 1.805737822159912e-05, + "loss": 0.2442, + "step": 4095 + }, + { + "epoch": 0.5898617511520737, + "grad_norm": 0.976884126663208, + "learning_rate": 1.8046513355645038e-05, + "loss": 0.0948, + "step": 4096 + }, + { + "epoch": 0.5900057603686636, + "grad_norm": 0.6435165405273438, + "learning_rate": 1.8035649912946684e-05, + "loss": 0.0665, + "step": 4097 + }, + { + "epoch": 0.5901497695852534, + "grad_norm": 0.8769043684005737, + "learning_rate": 1.8024787895727603e-05, + "loss": 0.1049, + "step": 4098 + }, + { + "epoch": 0.5902937788018433, + "grad_norm": 1.3146461248397827, + "learning_rate": 1.8013927306211058e-05, + "loss": 0.247, + "step": 4099 + }, + { + "epoch": 0.5904377880184332, + "grad_norm": 5.005465984344482, + "learning_rate": 1.8003068146620027e-05, + "loss": 2.0895, + "step": 4100 + }, + { + "epoch": 0.590581797235023, + "grad_norm": 2.823737382888794, + "learning_rate": 1.7992210419177186e-05, + "loss": 0.317, + "step": 4101 + }, + { + "epoch": 0.5907258064516129, + "grad_norm": 0.6582417488098145, + "learning_rate": 1.7981354126104914e-05, + "loss": 0.0944, + "step": 4102 + }, + { + "epoch": 0.5908698156682027, + "grad_norm": 0.7440763711929321, + "learning_rate": 1.7970499269625306e-05, + "loss": 0.0868, + "step": 4103 + }, + { + "epoch": 0.5910138248847926, + "grad_norm": 1.1424676179885864, + "learning_rate": 1.795964585196016e-05, + "loss": 0.1058, + "step": 4104 + }, + { + "epoch": 0.5911578341013825, + "grad_norm": 1.1372548341751099, + "learning_rate": 1.7948793875330977e-05, + "loss": 0.1387, + "step": 4105 + }, + { + "epoch": 0.5913018433179723, + "grad_norm": 0.5622557401657104, + "learning_rate": 1.793794334195896e-05, + "loss": 0.0827, + "step": 4106 + }, + { + "epoch": 0.5914458525345622, + "grad_norm": 3.479327917098999, + "learning_rate": 1.792709425406503e-05, + "loss": 0.7776, + "step": 4107 + }, + { + "epoch": 0.591589861751152, + "grad_norm": 2.223386764526367, + "learning_rate": 1.79162466138698e-05, + "loss": 0.2455, + "step": 4108 + }, + { + "epoch": 0.5917338709677419, + "grad_norm": 0.8429241180419922, + "learning_rate": 1.790540042359359e-05, + "loss": 0.1179, + "step": 4109 + }, + { + "epoch": 0.5918778801843319, + "grad_norm": 6.106538772583008, + "learning_rate": 1.7894555685456425e-05, + "loss": 1.6883, + "step": 4110 + }, + { + "epoch": 0.5920218894009217, + "grad_norm": 1.970025897026062, + "learning_rate": 1.7883712401678022e-05, + "loss": 0.1937, + "step": 4111 + }, + { + "epoch": 0.5921658986175116, + "grad_norm": 6.915515422821045, + "learning_rate": 1.787287057447782e-05, + "loss": 1.9023, + "step": 4112 + }, + { + "epoch": 0.5923099078341014, + "grad_norm": 0.5269663333892822, + "learning_rate": 1.786203020607495e-05, + "loss": 0.0727, + "step": 4113 + }, + { + "epoch": 0.5924539170506913, + "grad_norm": 0.9759679436683655, + "learning_rate": 1.7851191298688237e-05, + "loss": 0.0642, + "step": 4114 + }, + { + "epoch": 0.5925979262672811, + "grad_norm": 0.5539023876190186, + "learning_rate": 1.7840353854536217e-05, + "loss": 0.063, + "step": 4115 + }, + { + "epoch": 0.592741935483871, + "grad_norm": 0.6054211258888245, + "learning_rate": 1.782951787583712e-05, + "loss": 0.0539, + "step": 4116 + }, + { + "epoch": 0.5928859447004609, + "grad_norm": 1.46781587600708, + "learning_rate": 1.7818683364808884e-05, + "loss": 0.1849, + "step": 4117 + }, + { + "epoch": 0.5930299539170507, + "grad_norm": 1.0785983800888062, + "learning_rate": 1.7807850323669137e-05, + "loss": 0.1492, + "step": 4118 + }, + { + "epoch": 0.5931739631336406, + "grad_norm": 4.214503765106201, + "learning_rate": 1.7797018754635214e-05, + "loss": 1.2391, + "step": 4119 + }, + { + "epoch": 0.5933179723502304, + "grad_norm": 0.4165908098220825, + "learning_rate": 1.7786188659924148e-05, + "loss": 0.0515, + "step": 4120 + }, + { + "epoch": 0.5934619815668203, + "grad_norm": 1.5070879459381104, + "learning_rate": 1.777536004175266e-05, + "loss": 0.1555, + "step": 4121 + }, + { + "epoch": 0.5936059907834101, + "grad_norm": 2.9680747985839844, + "learning_rate": 1.7764532902337182e-05, + "loss": 1.0149, + "step": 4122 + }, + { + "epoch": 0.59375, + "grad_norm": 0.8843550086021423, + "learning_rate": 1.7753707243893835e-05, + "loss": 0.1043, + "step": 4123 + }, + { + "epoch": 0.5938940092165899, + "grad_norm": 5.205761909484863, + "learning_rate": 1.7742883068638447e-05, + "loss": 1.8342, + "step": 4124 + }, + { + "epoch": 0.5940380184331797, + "grad_norm": 0.8945352435112, + "learning_rate": 1.773206037878652e-05, + "loss": 0.0961, + "step": 4125 + }, + { + "epoch": 0.5941820276497696, + "grad_norm": 0.6280040740966797, + "learning_rate": 1.7721239176553283e-05, + "loss": 0.0848, + "step": 4126 + }, + { + "epoch": 0.5943260368663594, + "grad_norm": 0.6635971665382385, + "learning_rate": 1.7710419464153643e-05, + "loss": 0.0587, + "step": 4127 + }, + { + "epoch": 0.5944700460829493, + "grad_norm": 0.6757796406745911, + "learning_rate": 1.7699601243802196e-05, + "loss": 0.0993, + "step": 4128 + }, + { + "epoch": 0.5946140552995391, + "grad_norm": 1.2040592432022095, + "learning_rate": 1.7688784517713248e-05, + "loss": 0.1384, + "step": 4129 + }, + { + "epoch": 0.594758064516129, + "grad_norm": 0.9913089871406555, + "learning_rate": 1.7677969288100782e-05, + "loss": 0.169, + "step": 4130 + }, + { + "epoch": 0.5949020737327189, + "grad_norm": 0.8842601180076599, + "learning_rate": 1.7667155557178492e-05, + "loss": 0.1105, + "step": 4131 + }, + { + "epoch": 0.5950460829493087, + "grad_norm": 4.166893482208252, + "learning_rate": 1.7656343327159754e-05, + "loss": 0.2999, + "step": 4132 + }, + { + "epoch": 0.5951900921658986, + "grad_norm": 4.301052570343018, + "learning_rate": 1.764553260025764e-05, + "loss": 1.1759, + "step": 4133 + }, + { + "epoch": 0.5953341013824884, + "grad_norm": 0.6635058522224426, + "learning_rate": 1.763472337868492e-05, + "loss": 0.0762, + "step": 4134 + }, + { + "epoch": 0.5954781105990783, + "grad_norm": 0.7652328610420227, + "learning_rate": 1.7623915664654045e-05, + "loss": 0.0996, + "step": 4135 + }, + { + "epoch": 0.5956221198156681, + "grad_norm": 2.185361385345459, + "learning_rate": 1.7613109460377163e-05, + "loss": 0.1796, + "step": 4136 + }, + { + "epoch": 0.5957661290322581, + "grad_norm": 1.9674577713012695, + "learning_rate": 1.760230476806612e-05, + "loss": 0.1967, + "step": 4137 + }, + { + "epoch": 0.595910138248848, + "grad_norm": 0.3266462981700897, + "learning_rate": 1.7591501589932426e-05, + "loss": 0.0481, + "step": 4138 + }, + { + "epoch": 0.5960541474654378, + "grad_norm": 1.544317603111267, + "learning_rate": 1.7580699928187326e-05, + "loss": 0.161, + "step": 4139 + }, + { + "epoch": 0.5961981566820277, + "grad_norm": 1.7782069444656372, + "learning_rate": 1.7569899785041713e-05, + "loss": 0.1543, + "step": 4140 + }, + { + "epoch": 0.5963421658986175, + "grad_norm": 0.8489818572998047, + "learning_rate": 1.755910116270619e-05, + "loss": 0.1079, + "step": 4141 + }, + { + "epoch": 0.5964861751152074, + "grad_norm": 2.085949420928955, + "learning_rate": 1.7548304063391045e-05, + "loss": 0.2342, + "step": 4142 + }, + { + "epoch": 0.5966301843317973, + "grad_norm": 1.1557905673980713, + "learning_rate": 1.7537508489306242e-05, + "loss": 0.1783, + "step": 4143 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 6.509414196014404, + "learning_rate": 1.7526714442661462e-05, + "loss": 2.468, + "step": 4144 + }, + { + "epoch": 0.596918202764977, + "grad_norm": 0.9565730094909668, + "learning_rate": 1.7515921925666052e-05, + "loss": 0.1122, + "step": 4145 + }, + { + "epoch": 0.5970622119815668, + "grad_norm": 0.6365829110145569, + "learning_rate": 1.7505130940529035e-05, + "loss": 0.0901, + "step": 4146 + }, + { + "epoch": 0.5972062211981567, + "grad_norm": 0.5486305952072144, + "learning_rate": 1.7494341489459152e-05, + "loss": 0.0522, + "step": 4147 + }, + { + "epoch": 0.5973502304147466, + "grad_norm": 0.7274585962295532, + "learning_rate": 1.74835535746648e-05, + "loss": 0.103, + "step": 4148 + }, + { + "epoch": 0.5974942396313364, + "grad_norm": 0.7618743181228638, + "learning_rate": 1.7472767198354086e-05, + "loss": 0.101, + "step": 4149 + }, + { + "epoch": 0.5976382488479263, + "grad_norm": 1.0390945672988892, + "learning_rate": 1.7461982362734776e-05, + "loss": 0.096, + "step": 4150 + }, + { + "epoch": 0.5977822580645161, + "grad_norm": 0.9978783130645752, + "learning_rate": 1.7451199070014345e-05, + "loss": 0.1241, + "step": 4151 + }, + { + "epoch": 0.597926267281106, + "grad_norm": 1.0287456512451172, + "learning_rate": 1.7440417322399943e-05, + "loss": 0.1347, + "step": 4152 + }, + { + "epoch": 0.5980702764976958, + "grad_norm": 0.8806313276290894, + "learning_rate": 1.7429637122098398e-05, + "loss": 0.0861, + "step": 4153 + }, + { + "epoch": 0.5982142857142857, + "grad_norm": 0.6169323325157166, + "learning_rate": 1.741885847131623e-05, + "loss": 0.0946, + "step": 4154 + }, + { + "epoch": 0.5983582949308756, + "grad_norm": 1.3919135332107544, + "learning_rate": 1.7408081372259632e-05, + "loss": 0.1638, + "step": 4155 + }, + { + "epoch": 0.5985023041474654, + "grad_norm": 0.781689465045929, + "learning_rate": 1.7397305827134497e-05, + "loss": 0.1012, + "step": 4156 + }, + { + "epoch": 0.5986463133640553, + "grad_norm": 1.32510244846344, + "learning_rate": 1.7386531838146377e-05, + "loss": 0.1758, + "step": 4157 + }, + { + "epoch": 0.5987903225806451, + "grad_norm": 2.8539726734161377, + "learning_rate": 1.7375759407500526e-05, + "loss": 0.3832, + "step": 4158 + }, + { + "epoch": 0.598934331797235, + "grad_norm": 1.6500605344772339, + "learning_rate": 1.736498853740186e-05, + "loss": 0.1738, + "step": 4159 + }, + { + "epoch": 0.5990783410138248, + "grad_norm": 1.3512732982635498, + "learning_rate": 1.7354219230054998e-05, + "loss": 0.1352, + "step": 4160 + }, + { + "epoch": 0.5992223502304147, + "grad_norm": 0.7110257148742676, + "learning_rate": 1.7343451487664214e-05, + "loss": 0.0959, + "step": 4161 + }, + { + "epoch": 0.5993663594470046, + "grad_norm": 4.440972805023193, + "learning_rate": 1.7332685312433483e-05, + "loss": 2.3038, + "step": 4162 + }, + { + "epoch": 0.5995103686635944, + "grad_norm": 1.460269570350647, + "learning_rate": 1.7321920706566447e-05, + "loss": 0.1522, + "step": 4163 + }, + { + "epoch": 0.5996543778801844, + "grad_norm": 0.8945279717445374, + "learning_rate": 1.7311157672266432e-05, + "loss": 0.0832, + "step": 4164 + }, + { + "epoch": 0.5997983870967742, + "grad_norm": 0.7102258205413818, + "learning_rate": 1.730039621173643e-05, + "loss": 0.0765, + "step": 4165 + }, + { + "epoch": 0.5999423963133641, + "grad_norm": 0.4796365201473236, + "learning_rate": 1.7289636327179144e-05, + "loss": 0.057, + "step": 4166 + }, + { + "epoch": 0.600086405529954, + "grad_norm": 3.7058067321777344, + "learning_rate": 1.7278878020796917e-05, + "loss": 0.2688, + "step": 4167 + }, + { + "epoch": 0.6002304147465438, + "grad_norm": 0.8081400990486145, + "learning_rate": 1.7268121294791788e-05, + "loss": 0.1393, + "step": 4168 + }, + { + "epoch": 0.6003744239631337, + "grad_norm": 0.7528298497200012, + "learning_rate": 1.7257366151365467e-05, + "loss": 0.0717, + "step": 4169 + }, + { + "epoch": 0.6005184331797235, + "grad_norm": 0.9862465262413025, + "learning_rate": 1.7246612592719346e-05, + "loss": 3.7589, + "step": 4170 + }, + { + "epoch": 0.6006624423963134, + "grad_norm": 0.4491156339645386, + "learning_rate": 1.7235860621054477e-05, + "loss": 0.0513, + "step": 4171 + }, + { + "epoch": 0.6008064516129032, + "grad_norm": 0.4850119948387146, + "learning_rate": 1.7225110238571613e-05, + "loss": 0.0702, + "step": 4172 + }, + { + "epoch": 0.6009504608294931, + "grad_norm": 0.9735264778137207, + "learning_rate": 1.7214361447471157e-05, + "loss": 0.1551, + "step": 4173 + }, + { + "epoch": 0.601094470046083, + "grad_norm": 0.8442066311836243, + "learning_rate": 1.72036142499532e-05, + "loss": 0.1075, + "step": 4174 + }, + { + "epoch": 0.6012384792626728, + "grad_norm": 1.2327992916107178, + "learning_rate": 1.71928686482175e-05, + "loss": 0.1279, + "step": 4175 + }, + { + "epoch": 0.6013824884792627, + "grad_norm": 0.7992380857467651, + "learning_rate": 1.7182124644463495e-05, + "loss": 0.1024, + "step": 4176 + }, + { + "epoch": 0.6015264976958525, + "grad_norm": 0.6504772901535034, + "learning_rate": 1.7171382240890292e-05, + "loss": 0.0833, + "step": 4177 + }, + { + "epoch": 0.6016705069124424, + "grad_norm": 0.8956130743026733, + "learning_rate": 1.716064143969667e-05, + "loss": 0.1224, + "step": 4178 + }, + { + "epoch": 0.6018145161290323, + "grad_norm": 0.9780241847038269, + "learning_rate": 1.7149902243081084e-05, + "loss": 0.0841, + "step": 4179 + }, + { + "epoch": 0.6019585253456221, + "grad_norm": 0.8085782527923584, + "learning_rate": 1.7139164653241653e-05, + "loss": 0.1263, + "step": 4180 + }, + { + "epoch": 0.602102534562212, + "grad_norm": 1.3669511079788208, + "learning_rate": 1.712842867237618e-05, + "loss": 0.1441, + "step": 4181 + }, + { + "epoch": 0.6022465437788018, + "grad_norm": 0.9604008197784424, + "learning_rate": 1.7117694302682115e-05, + "loss": 0.1235, + "step": 4182 + }, + { + "epoch": 0.6023905529953917, + "grad_norm": 0.5447611212730408, + "learning_rate": 1.7106961546356608e-05, + "loss": 0.0705, + "step": 4183 + }, + { + "epoch": 0.6025345622119815, + "grad_norm": 3.495393991470337, + "learning_rate": 1.7096230405596458e-05, + "loss": 2.1691, + "step": 4184 + }, + { + "epoch": 0.6026785714285714, + "grad_norm": 1.0501583814620972, + "learning_rate": 1.7085500882598144e-05, + "loss": 0.1315, + "step": 4185 + }, + { + "epoch": 0.6028225806451613, + "grad_norm": 1.7582844495773315, + "learning_rate": 1.7074772979557802e-05, + "loss": 4.1518, + "step": 4186 + }, + { + "epoch": 0.6029665898617511, + "grad_norm": 0.901436448097229, + "learning_rate": 1.7064046698671254e-05, + "loss": 0.0708, + "step": 4187 + }, + { + "epoch": 0.603110599078341, + "grad_norm": 0.9901793003082275, + "learning_rate": 1.7053322042133972e-05, + "loss": 0.1203, + "step": 4188 + }, + { + "epoch": 0.6032546082949308, + "grad_norm": 0.6367722749710083, + "learning_rate": 1.70425990121411e-05, + "loss": 0.0754, + "step": 4189 + }, + { + "epoch": 0.6033986175115207, + "grad_norm": 3.952779531478882, + "learning_rate": 1.703187761088747e-05, + "loss": 1.8471, + "step": 4190 + }, + { + "epoch": 0.6035426267281107, + "grad_norm": 0.8805313110351562, + "learning_rate": 1.7021157840567546e-05, + "loss": 0.1094, + "step": 4191 + }, + { + "epoch": 0.6036866359447005, + "grad_norm": 5.346199035644531, + "learning_rate": 1.701043970337547e-05, + "loss": 1.0708, + "step": 4192 + }, + { + "epoch": 0.6038306451612904, + "grad_norm": 1.123875617980957, + "learning_rate": 1.6999723201505078e-05, + "loss": 0.1181, + "step": 4193 + }, + { + "epoch": 0.6039746543778802, + "grad_norm": 0.9853358268737793, + "learning_rate": 1.6989008337149838e-05, + "loss": 0.09, + "step": 4194 + }, + { + "epoch": 0.6041186635944701, + "grad_norm": 8.626018524169922, + "learning_rate": 1.697829511250289e-05, + "loss": 2.7402, + "step": 4195 + }, + { + "epoch": 0.6042626728110599, + "grad_norm": 0.7851576209068298, + "learning_rate": 1.696758352975704e-05, + "loss": 0.0865, + "step": 4196 + }, + { + "epoch": 0.6044066820276498, + "grad_norm": 0.8346116542816162, + "learning_rate": 1.6956873591104768e-05, + "loss": 0.1067, + "step": 4197 + }, + { + "epoch": 0.6045506912442397, + "grad_norm": 0.938254177570343, + "learning_rate": 1.6946165298738205e-05, + "loss": 0.1278, + "step": 4198 + }, + { + "epoch": 0.6046947004608295, + "grad_norm": 6.298842430114746, + "learning_rate": 1.6935458654849146e-05, + "loss": 1.951, + "step": 4199 + }, + { + "epoch": 0.6048387096774194, + "grad_norm": 6.654256343841553, + "learning_rate": 1.692475366162905e-05, + "loss": 1.4684, + "step": 4200 + }, + { + "epoch": 0.6049827188940092, + "grad_norm": 1.0382823944091797, + "learning_rate": 1.6914050321269047e-05, + "loss": 0.1087, + "step": 4201 + }, + { + "epoch": 0.6051267281105991, + "grad_norm": 5.176774501800537, + "learning_rate": 1.690334863595992e-05, + "loss": 2.8398, + "step": 4202 + }, + { + "epoch": 0.605270737327189, + "grad_norm": 0.7601309418678284, + "learning_rate": 1.689264860789211e-05, + "loss": 0.0811, + "step": 4203 + }, + { + "epoch": 0.6054147465437788, + "grad_norm": 1.7236088514328003, + "learning_rate": 1.6881950239255727e-05, + "loss": 0.1507, + "step": 4204 + }, + { + "epoch": 0.6055587557603687, + "grad_norm": 0.8792561888694763, + "learning_rate": 1.6871253532240535e-05, + "loss": 0.1161, + "step": 4205 + }, + { + "epoch": 0.6057027649769585, + "grad_norm": 1.4317069053649902, + "learning_rate": 1.6860558489035967e-05, + "loss": 0.0951, + "step": 4206 + }, + { + "epoch": 0.6058467741935484, + "grad_norm": 1.856698989868164, + "learning_rate": 1.6849865111831097e-05, + "loss": 0.1935, + "step": 4207 + }, + { + "epoch": 0.6059907834101382, + "grad_norm": 0.6402552723884583, + "learning_rate": 1.6839173402814683e-05, + "loss": 0.1048, + "step": 4208 + }, + { + "epoch": 0.6061347926267281, + "grad_norm": 8.398944854736328, + "learning_rate": 1.6828483364175128e-05, + "loss": 1.622, + "step": 4209 + }, + { + "epoch": 0.606278801843318, + "grad_norm": 0.7325151562690735, + "learning_rate": 1.6817794998100484e-05, + "loss": 0.0847, + "step": 4210 + }, + { + "epoch": 0.6064228110599078, + "grad_norm": 0.8264970183372498, + "learning_rate": 1.6807108306778473e-05, + "loss": 0.151, + "step": 4211 + }, + { + "epoch": 0.6065668202764977, + "grad_norm": 0.6442265510559082, + "learning_rate": 1.679642329239648e-05, + "loss": 0.0809, + "step": 4212 + }, + { + "epoch": 0.6067108294930875, + "grad_norm": 3.3278937339782715, + "learning_rate": 1.6785739957141532e-05, + "loss": 2.0095, + "step": 4213 + }, + { + "epoch": 0.6068548387096774, + "grad_norm": 0.6795005798339844, + "learning_rate": 1.677505830320032e-05, + "loss": 0.0733, + "step": 4214 + }, + { + "epoch": 0.6069988479262672, + "grad_norm": 4.550412654876709, + "learning_rate": 1.676437833275919e-05, + "loss": 1.1307, + "step": 4215 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 0.9931883811950684, + "learning_rate": 1.675370004800414e-05, + "loss": 0.1513, + "step": 4216 + }, + { + "epoch": 0.607286866359447, + "grad_norm": 4.348437786102295, + "learning_rate": 1.6743023451120832e-05, + "loss": 1.609, + "step": 4217 + }, + { + "epoch": 0.6074308755760369, + "grad_norm": 1.0177361965179443, + "learning_rate": 1.673234854429457e-05, + "loss": 0.1166, + "step": 4218 + }, + { + "epoch": 0.6075748847926268, + "grad_norm": 0.7789782285690308, + "learning_rate": 1.6721675329710313e-05, + "loss": 0.0872, + "step": 4219 + }, + { + "epoch": 0.6077188940092166, + "grad_norm": 2.1478970050811768, + "learning_rate": 1.6711003809552696e-05, + "loss": 0.3992, + "step": 4220 + }, + { + "epoch": 0.6078629032258065, + "grad_norm": 8.544866561889648, + "learning_rate": 1.6700333986005985e-05, + "loss": 3.8372, + "step": 4221 + }, + { + "epoch": 0.6080069124423964, + "grad_norm": 0.6821381449699402, + "learning_rate": 1.66896658612541e-05, + "loss": 0.0765, + "step": 4222 + }, + { + "epoch": 0.6081509216589862, + "grad_norm": 0.9244568943977356, + "learning_rate": 1.667899943748062e-05, + "loss": 0.1347, + "step": 4223 + }, + { + "epoch": 0.6082949308755761, + "grad_norm": 0.5576269030570984, + "learning_rate": 1.666833471686877e-05, + "loss": 0.0516, + "step": 4224 + }, + { + "epoch": 0.6084389400921659, + "grad_norm": 0.5707087516784668, + "learning_rate": 1.6657671701601434e-05, + "loss": 0.0663, + "step": 4225 + }, + { + "epoch": 0.6085829493087558, + "grad_norm": 0.949215829372406, + "learning_rate": 1.664701039386114e-05, + "loss": 0.1108, + "step": 4226 + }, + { + "epoch": 0.6087269585253456, + "grad_norm": 0.571419358253479, + "learning_rate": 1.663635079583007e-05, + "loss": 0.0809, + "step": 4227 + }, + { + "epoch": 0.6088709677419355, + "grad_norm": 0.8633070588111877, + "learning_rate": 1.6625692909690055e-05, + "loss": 0.1425, + "step": 4228 + }, + { + "epoch": 0.6090149769585254, + "grad_norm": 0.981619656085968, + "learning_rate": 1.6615036737622573e-05, + "loss": 0.1545, + "step": 4229 + }, + { + "epoch": 0.6091589861751152, + "grad_norm": 1.3522409200668335, + "learning_rate": 1.660438228180876e-05, + "loss": 0.1394, + "step": 4230 + }, + { + "epoch": 0.6093029953917051, + "grad_norm": 0.5602099299430847, + "learning_rate": 1.6593729544429386e-05, + "loss": 0.0744, + "step": 4231 + }, + { + "epoch": 0.6094470046082949, + "grad_norm": 0.970034658908844, + "learning_rate": 1.6583078527664887e-05, + "loss": 0.1109, + "step": 4232 + }, + { + "epoch": 0.6095910138248848, + "grad_norm": 0.8735780119895935, + "learning_rate": 1.6572429233695337e-05, + "loss": 0.16, + "step": 4233 + }, + { + "epoch": 0.6097350230414746, + "grad_norm": 4.970635414123535, + "learning_rate": 1.6561781664700448e-05, + "loss": 1.532, + "step": 4234 + }, + { + "epoch": 0.6098790322580645, + "grad_norm": 1.1359355449676514, + "learning_rate": 1.6551135822859597e-05, + "loss": 0.1126, + "step": 4235 + }, + { + "epoch": 0.6100230414746544, + "grad_norm": 3.7955853939056396, + "learning_rate": 1.65404917103518e-05, + "loss": 1.2427, + "step": 4236 + }, + { + "epoch": 0.6101670506912442, + "grad_norm": 0.3633475601673126, + "learning_rate": 1.652984932935572e-05, + "loss": 0.0526, + "step": 4237 + }, + { + "epoch": 0.6103110599078341, + "grad_norm": 0.4144364595413208, + "learning_rate": 1.651920868204966e-05, + "loss": 0.0464, + "step": 4238 + }, + { + "epoch": 0.6104550691244239, + "grad_norm": 0.993714451789856, + "learning_rate": 1.650856977061157e-05, + "loss": 0.1277, + "step": 4239 + }, + { + "epoch": 0.6105990783410138, + "grad_norm": 1.3987923860549927, + "learning_rate": 1.6497932597219052e-05, + "loss": 0.1792, + "step": 4240 + }, + { + "epoch": 0.6107430875576036, + "grad_norm": 1.0905985832214355, + "learning_rate": 1.648729716404935e-05, + "loss": 0.1057, + "step": 4241 + }, + { + "epoch": 0.6108870967741935, + "grad_norm": 0.9544569849967957, + "learning_rate": 1.6476663473279337e-05, + "loss": 0.1116, + "step": 4242 + }, + { + "epoch": 0.6110311059907834, + "grad_norm": 1.0463560819625854, + "learning_rate": 1.6466031527085553e-05, + "loss": 0.1404, + "step": 4243 + }, + { + "epoch": 0.6111751152073732, + "grad_norm": 1.3440603017807007, + "learning_rate": 1.645540132764416e-05, + "loss": 0.1488, + "step": 4244 + }, + { + "epoch": 0.6113191244239631, + "grad_norm": 0.7335097193717957, + "learning_rate": 1.644477287713098e-05, + "loss": 0.0933, + "step": 4245 + }, + { + "epoch": 0.611463133640553, + "grad_norm": 0.8257714509963989, + "learning_rate": 1.643414617772147e-05, + "loss": 0.1227, + "step": 4246 + }, + { + "epoch": 0.6116071428571429, + "grad_norm": 1.3310894966125488, + "learning_rate": 1.6423521231590717e-05, + "loss": 0.1369, + "step": 4247 + }, + { + "epoch": 0.6117511520737328, + "grad_norm": 1.0594704151153564, + "learning_rate": 1.641289804091347e-05, + "loss": 0.1238, + "step": 4248 + }, + { + "epoch": 0.6118951612903226, + "grad_norm": 0.9842399954795837, + "learning_rate": 1.640227660786411e-05, + "loss": 0.0878, + "step": 4249 + }, + { + "epoch": 0.6120391705069125, + "grad_norm": 1.2404168844223022, + "learning_rate": 1.6391656934616646e-05, + "loss": 0.1547, + "step": 4250 + }, + { + "epoch": 0.6121831797235023, + "grad_norm": 1.254879117012024, + "learning_rate": 1.638103902334474e-05, + "loss": 0.1362, + "step": 4251 + }, + { + "epoch": 0.6123271889400922, + "grad_norm": 0.9624782204627991, + "learning_rate": 1.6370422876221694e-05, + "loss": 0.1015, + "step": 4252 + }, + { + "epoch": 0.612471198156682, + "grad_norm": 0.7168964743614197, + "learning_rate": 1.6359808495420444e-05, + "loss": 0.0967, + "step": 4253 + }, + { + "epoch": 0.6126152073732719, + "grad_norm": 0.6980259418487549, + "learning_rate": 1.6349195883113565e-05, + "loss": 0.086, + "step": 4254 + }, + { + "epoch": 0.6127592165898618, + "grad_norm": 0.6837056279182434, + "learning_rate": 1.6338585041473276e-05, + "loss": 0.0804, + "step": 4255 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 0.6651586294174194, + "learning_rate": 1.6327975972671422e-05, + "loss": 0.0872, + "step": 4256 + }, + { + "epoch": 0.6130472350230415, + "grad_norm": 1.076112151145935, + "learning_rate": 1.6317368678879495e-05, + "loss": 0.168, + "step": 4257 + }, + { + "epoch": 0.6131912442396313, + "grad_norm": 1.3136539459228516, + "learning_rate": 1.6306763162268622e-05, + "loss": 0.163, + "step": 4258 + }, + { + "epoch": 0.6133352534562212, + "grad_norm": 1.3242791891098022, + "learning_rate": 1.6296159425009562e-05, + "loss": 0.1499, + "step": 4259 + }, + { + "epoch": 0.613479262672811, + "grad_norm": 0.5664209127426147, + "learning_rate": 1.628555746927271e-05, + "loss": 0.0669, + "step": 4260 + }, + { + "epoch": 0.6136232718894009, + "grad_norm": 10.667801856994629, + "learning_rate": 1.6274957297228105e-05, + "loss": 1.5923, + "step": 4261 + }, + { + "epoch": 0.6137672811059908, + "grad_norm": 5.1706109046936035, + "learning_rate": 1.6264358911045407e-05, + "loss": 0.2072, + "step": 4262 + }, + { + "epoch": 0.6139112903225806, + "grad_norm": 0.6398152112960815, + "learning_rate": 1.6253762312893923e-05, + "loss": 0.0809, + "step": 4263 + }, + { + "epoch": 0.6140552995391705, + "grad_norm": 0.8224712610244751, + "learning_rate": 1.624316750494259e-05, + "loss": 0.0948, + "step": 4264 + }, + { + "epoch": 0.6141993087557603, + "grad_norm": 3.963853359222412, + "learning_rate": 1.623257448935998e-05, + "loss": 1.1947, + "step": 4265 + }, + { + "epoch": 0.6143433179723502, + "grad_norm": 1.2770277261734009, + "learning_rate": 1.622198326831429e-05, + "loss": 0.2206, + "step": 4266 + }, + { + "epoch": 0.6144873271889401, + "grad_norm": 0.938423216342926, + "learning_rate": 1.621139384397336e-05, + "loss": 0.076, + "step": 4267 + }, + { + "epoch": 0.6146313364055299, + "grad_norm": 0.6767153143882751, + "learning_rate": 1.6200806218504657e-05, + "loss": 0.1038, + "step": 4268 + }, + { + "epoch": 0.6147753456221198, + "grad_norm": 2.7241783142089844, + "learning_rate": 1.619022039407528e-05, + "loss": 0.3345, + "step": 4269 + }, + { + "epoch": 0.6149193548387096, + "grad_norm": 0.5284458994865417, + "learning_rate": 1.6179636372851952e-05, + "loss": 0.0806, + "step": 4270 + }, + { + "epoch": 0.6150633640552995, + "grad_norm": 0.6302849650382996, + "learning_rate": 1.616905415700105e-05, + "loss": 4.2483, + "step": 4271 + }, + { + "epoch": 0.6152073732718893, + "grad_norm": 1.6822980642318726, + "learning_rate": 1.6158473748688557e-05, + "loss": 0.1725, + "step": 4272 + }, + { + "epoch": 0.6153513824884793, + "grad_norm": 0.6352470517158508, + "learning_rate": 1.61478951500801e-05, + "loss": 0.1049, + "step": 4273 + }, + { + "epoch": 0.6154953917050692, + "grad_norm": 0.64036625623703, + "learning_rate": 1.6137318363340923e-05, + "loss": 0.0785, + "step": 4274 + }, + { + "epoch": 0.615639400921659, + "grad_norm": 0.8494387865066528, + "learning_rate": 1.612674339063592e-05, + "loss": 0.1637, + "step": 4275 + }, + { + "epoch": 0.6157834101382489, + "grad_norm": 0.6655952334403992, + "learning_rate": 1.6116170234129584e-05, + "loss": 0.085, + "step": 4276 + }, + { + "epoch": 0.6159274193548387, + "grad_norm": 0.9891579747200012, + "learning_rate": 1.610559889598607e-05, + "loss": 0.0932, + "step": 4277 + }, + { + "epoch": 0.6160714285714286, + "grad_norm": 7.653783321380615, + "learning_rate": 1.6095029378369137e-05, + "loss": 3.1276, + "step": 4278 + }, + { + "epoch": 0.6162154377880185, + "grad_norm": 1.0584348440170288, + "learning_rate": 1.6084461683442176e-05, + "loss": 0.1305, + "step": 4279 + }, + { + "epoch": 0.6163594470046083, + "grad_norm": 1.3417972326278687, + "learning_rate": 1.607389581336821e-05, + "loss": 0.1367, + "step": 4280 + }, + { + "epoch": 0.6165034562211982, + "grad_norm": 3.458892583847046, + "learning_rate": 1.6063331770309886e-05, + "loss": 0.1743, + "step": 4281 + }, + { + "epoch": 0.616647465437788, + "grad_norm": 0.6661435961723328, + "learning_rate": 1.605276955642947e-05, + "loss": 0.0909, + "step": 4282 + }, + { + "epoch": 0.6167914746543779, + "grad_norm": 1.267917513847351, + "learning_rate": 1.604220917388887e-05, + "loss": 0.1602, + "step": 4283 + }, + { + "epoch": 0.6169354838709677, + "grad_norm": 1.4455842971801758, + "learning_rate": 1.6031650624849603e-05, + "loss": 0.0905, + "step": 4284 + }, + { + "epoch": 0.6170794930875576, + "grad_norm": 0.8986014127731323, + "learning_rate": 1.6021093911472824e-05, + "loss": 0.1129, + "step": 4285 + }, + { + "epoch": 0.6172235023041475, + "grad_norm": 0.8266485929489136, + "learning_rate": 1.60105390359193e-05, + "loss": 0.0917, + "step": 4286 + }, + { + "epoch": 0.6173675115207373, + "grad_norm": 0.8708195090293884, + "learning_rate": 1.599998600034943e-05, + "loss": 0.1006, + "step": 4287 + }, + { + "epoch": 0.6175115207373272, + "grad_norm": 0.48601192235946655, + "learning_rate": 1.5989434806923233e-05, + "loss": 0.0643, + "step": 4288 + }, + { + "epoch": 0.617655529953917, + "grad_norm": 0.7569682598114014, + "learning_rate": 1.5978885457800345e-05, + "loss": 0.0937, + "step": 4289 + }, + { + "epoch": 0.6177995391705069, + "grad_norm": 4.599602699279785, + "learning_rate": 1.596833795514004e-05, + "loss": 1.1969, + "step": 4290 + }, + { + "epoch": 0.6179435483870968, + "grad_norm": 4.333366870880127, + "learning_rate": 1.5957792301101205e-05, + "loss": 2.1333, + "step": 4291 + }, + { + "epoch": 0.6180875576036866, + "grad_norm": 0.833163857460022, + "learning_rate": 1.5947248497842344e-05, + "loss": 0.085, + "step": 4292 + }, + { + "epoch": 0.6182315668202765, + "grad_norm": 1.062033772468567, + "learning_rate": 1.593670654752159e-05, + "loss": 0.1525, + "step": 4293 + }, + { + "epoch": 0.6183755760368663, + "grad_norm": 1.0507497787475586, + "learning_rate": 1.5926166452296692e-05, + "loss": 0.1256, + "step": 4294 + }, + { + "epoch": 0.6185195852534562, + "grad_norm": 0.7858744263648987, + "learning_rate": 1.5915628214325025e-05, + "loss": 0.0931, + "step": 4295 + }, + { + "epoch": 0.618663594470046, + "grad_norm": 0.6162979602813721, + "learning_rate": 1.5905091835763574e-05, + "loss": 0.07, + "step": 4296 + }, + { + "epoch": 0.6188076036866359, + "grad_norm": 0.6781107187271118, + "learning_rate": 1.5894557318768948e-05, + "loss": 0.0719, + "step": 4297 + }, + { + "epoch": 0.6189516129032258, + "grad_norm": 4.20004415512085, + "learning_rate": 1.588402466549739e-05, + "loss": 1.0625, + "step": 4298 + }, + { + "epoch": 0.6190956221198156, + "grad_norm": 1.119418740272522, + "learning_rate": 1.5873493878104735e-05, + "loss": 0.1067, + "step": 4299 + }, + { + "epoch": 0.6192396313364056, + "grad_norm": 0.7827768921852112, + "learning_rate": 1.5862964958746448e-05, + "loss": 0.067, + "step": 4300 + }, + { + "epoch": 0.6193836405529954, + "grad_norm": 0.522551953792572, + "learning_rate": 1.585243790957761e-05, + "loss": 0.0746, + "step": 4301 + }, + { + "epoch": 0.6195276497695853, + "grad_norm": 1.143159031867981, + "learning_rate": 1.584191273275294e-05, + "loss": 0.117, + "step": 4302 + }, + { + "epoch": 0.6196716589861752, + "grad_norm": 0.7812302708625793, + "learning_rate": 1.583138943042674e-05, + "loss": 0.121, + "step": 4303 + }, + { + "epoch": 0.619815668202765, + "grad_norm": 1.3507620096206665, + "learning_rate": 1.5820868004752955e-05, + "loss": 0.1797, + "step": 4304 + }, + { + "epoch": 0.6199596774193549, + "grad_norm": 2.5188755989074707, + "learning_rate": 1.581034845788512e-05, + "loss": 0.2429, + "step": 4305 + }, + { + "epoch": 0.6201036866359447, + "grad_norm": 0.9104918837547302, + "learning_rate": 1.579983079197641e-05, + "loss": 0.103, + "step": 4306 + }, + { + "epoch": 0.6202476958525346, + "grad_norm": 2.7695229053497314, + "learning_rate": 1.5789315009179607e-05, + "loss": 0.2565, + "step": 4307 + }, + { + "epoch": 0.6203917050691244, + "grad_norm": 4.372103691101074, + "learning_rate": 1.57788011116471e-05, + "loss": 0.1673, + "step": 4308 + }, + { + "epoch": 0.6205357142857143, + "grad_norm": 0.6460058689117432, + "learning_rate": 1.5768289101530898e-05, + "loss": 0.0696, + "step": 4309 + }, + { + "epoch": 0.6206797235023042, + "grad_norm": 0.8943524360656738, + "learning_rate": 1.5757778980982626e-05, + "loss": 0.1118, + "step": 4310 + }, + { + "epoch": 0.620823732718894, + "grad_norm": 5.57053279876709, + "learning_rate": 1.574727075215352e-05, + "loss": 2.3128, + "step": 4311 + }, + { + "epoch": 0.6209677419354839, + "grad_norm": 0.7345547676086426, + "learning_rate": 1.5736764417194426e-05, + "loss": 0.0846, + "step": 4312 + }, + { + "epoch": 0.6211117511520737, + "grad_norm": 5.900229454040527, + "learning_rate": 1.572625997825581e-05, + "loss": 2.0631, + "step": 4313 + }, + { + "epoch": 0.6212557603686636, + "grad_norm": 1.4884957075119019, + "learning_rate": 1.5715757437487735e-05, + "loss": 0.1261, + "step": 4314 + }, + { + "epoch": 0.6213997695852534, + "grad_norm": 0.6901214718818665, + "learning_rate": 1.5705256797039897e-05, + "loss": 0.0715, + "step": 4315 + }, + { + "epoch": 0.6215437788018433, + "grad_norm": 0.4702761769294739, + "learning_rate": 1.569475805906158e-05, + "loss": 0.0648, + "step": 4316 + }, + { + "epoch": 0.6216877880184332, + "grad_norm": 0.9173063039779663, + "learning_rate": 1.5684261225701702e-05, + "loss": 0.1301, + "step": 4317 + }, + { + "epoch": 0.621831797235023, + "grad_norm": 3.4670488834381104, + "learning_rate": 1.5673766299108773e-05, + "loss": 0.781, + "step": 4318 + }, + { + "epoch": 0.6219758064516129, + "grad_norm": 4.682861804962158, + "learning_rate": 1.5663273281430914e-05, + "loss": 1.5726, + "step": 4319 + }, + { + "epoch": 0.6221198156682027, + "grad_norm": 0.9778193235397339, + "learning_rate": 1.5652782174815868e-05, + "loss": 0.108, + "step": 4320 + }, + { + "epoch": 0.6222638248847926, + "grad_norm": 0.7238790392875671, + "learning_rate": 1.5642292981410976e-05, + "loss": 0.1143, + "step": 4321 + }, + { + "epoch": 0.6224078341013825, + "grad_norm": 1.0496461391448975, + "learning_rate": 1.563180570336319e-05, + "loss": 0.132, + "step": 4322 + }, + { + "epoch": 0.6225518433179723, + "grad_norm": 1.2293541431427002, + "learning_rate": 1.5621320342819073e-05, + "loss": 0.1838, + "step": 4323 + }, + { + "epoch": 0.6226958525345622, + "grad_norm": 7.041621208190918, + "learning_rate": 1.561083690192479e-05, + "loss": 1.5763, + "step": 4324 + }, + { + "epoch": 0.622839861751152, + "grad_norm": 0.989898145198822, + "learning_rate": 1.5600355382826116e-05, + "loss": 0.0942, + "step": 4325 + }, + { + "epoch": 0.6229838709677419, + "grad_norm": 0.8177875280380249, + "learning_rate": 1.558987578766843e-05, + "loss": 0.0812, + "step": 4326 + }, + { + "epoch": 0.6231278801843319, + "grad_norm": 0.9312479496002197, + "learning_rate": 1.557939811859672e-05, + "loss": 0.1249, + "step": 4327 + }, + { + "epoch": 0.6232718894009217, + "grad_norm": 0.43787920475006104, + "learning_rate": 1.556892237775558e-05, + "loss": 0.0553, + "step": 4328 + }, + { + "epoch": 0.6234158986175116, + "grad_norm": 0.4662344455718994, + "learning_rate": 1.5558448567289218e-05, + "loss": 0.054, + "step": 4329 + }, + { + "epoch": 0.6235599078341014, + "grad_norm": 1.142421841621399, + "learning_rate": 1.5547976689341432e-05, + "loss": 0.1279, + "step": 4330 + }, + { + "epoch": 0.6237039170506913, + "grad_norm": 0.732511043548584, + "learning_rate": 1.5537506746055627e-05, + "loss": 0.0613, + "step": 4331 + }, + { + "epoch": 0.6238479262672811, + "grad_norm": 1.0665754079818726, + "learning_rate": 1.5527038739574817e-05, + "loss": 0.203, + "step": 4332 + }, + { + "epoch": 0.623991935483871, + "grad_norm": 4.351269721984863, + "learning_rate": 1.5516572672041622e-05, + "loss": 1.0283, + "step": 4333 + }, + { + "epoch": 0.6241359447004609, + "grad_norm": 3.1156508922576904, + "learning_rate": 1.5506108545598254e-05, + "loss": 1.9165, + "step": 4334 + }, + { + "epoch": 0.6242799539170507, + "grad_norm": 6.480345249176025, + "learning_rate": 1.5495646362386533e-05, + "loss": 1.6454, + "step": 4335 + }, + { + "epoch": 0.6244239631336406, + "grad_norm": 3.7103848457336426, + "learning_rate": 1.5485186124547894e-05, + "loss": 1.6594, + "step": 4336 + }, + { + "epoch": 0.6245679723502304, + "grad_norm": 0.675586462020874, + "learning_rate": 1.5474727834223356e-05, + "loss": 0.1003, + "step": 4337 + }, + { + "epoch": 0.6247119815668203, + "grad_norm": 0.7034206390380859, + "learning_rate": 1.5464271493553546e-05, + "loss": 0.0689, + "step": 4338 + }, + { + "epoch": 0.6248559907834101, + "grad_norm": 0.596135139465332, + "learning_rate": 1.5453817104678687e-05, + "loss": 0.0629, + "step": 4339 + }, + { + "epoch": 0.625, + "grad_norm": 0.8596541285514832, + "learning_rate": 1.5443364669738618e-05, + "loss": 0.1071, + "step": 4340 + }, + { + "epoch": 0.6251440092165899, + "grad_norm": 0.5172322988510132, + "learning_rate": 1.5432914190872757e-05, + "loss": 0.0864, + "step": 4341 + }, + { + "epoch": 0.6252880184331797, + "grad_norm": 1.1223424673080444, + "learning_rate": 1.5422465670220143e-05, + "loss": 0.1333, + "step": 4342 + }, + { + "epoch": 0.6254320276497696, + "grad_norm": 0.5564063191413879, + "learning_rate": 1.5412019109919394e-05, + "loss": 0.0634, + "step": 4343 + }, + { + "epoch": 0.6255760368663594, + "grad_norm": 1.050742745399475, + "learning_rate": 1.540157451210874e-05, + "loss": 0.1116, + "step": 4344 + }, + { + "epoch": 0.6257200460829493, + "grad_norm": 1.4054676294326782, + "learning_rate": 1.5391131878926005e-05, + "loss": 0.1499, + "step": 4345 + }, + { + "epoch": 0.6258640552995391, + "grad_norm": 0.720617949962616, + "learning_rate": 1.5380691212508612e-05, + "loss": 0.0886, + "step": 4346 + }, + { + "epoch": 0.626008064516129, + "grad_norm": 0.8105195760726929, + "learning_rate": 1.537025251499358e-05, + "loss": 0.0991, + "step": 4347 + }, + { + "epoch": 0.6261520737327189, + "grad_norm": 2.0732169151306152, + "learning_rate": 1.5359815788517525e-05, + "loss": 0.1869, + "step": 4348 + }, + { + "epoch": 0.6262960829493087, + "grad_norm": 1.084280014038086, + "learning_rate": 1.5349381035216666e-05, + "loss": 0.1196, + "step": 4349 + }, + { + "epoch": 0.6264400921658986, + "grad_norm": 0.5017781853675842, + "learning_rate": 1.53389482572268e-05, + "loss": 0.0624, + "step": 4350 + }, + { + "epoch": 0.6265841013824884, + "grad_norm": 1.2616435289382935, + "learning_rate": 1.5328517456683343e-05, + "loss": 0.1205, + "step": 4351 + }, + { + "epoch": 0.6267281105990783, + "grad_norm": 0.6460614800453186, + "learning_rate": 1.5318088635721295e-05, + "loss": 0.0641, + "step": 4352 + }, + { + "epoch": 0.6268721198156681, + "grad_norm": 1.3713542222976685, + "learning_rate": 1.5307661796475247e-05, + "loss": 0.1164, + "step": 4353 + }, + { + "epoch": 0.6270161290322581, + "grad_norm": 4.39506196975708, + "learning_rate": 1.5297236941079386e-05, + "loss": 1.4809, + "step": 4354 + }, + { + "epoch": 0.627160138248848, + "grad_norm": 1.1227036714553833, + "learning_rate": 1.5286814071667494e-05, + "loss": 0.1102, + "step": 4355 + }, + { + "epoch": 0.6273041474654378, + "grad_norm": 0.7127211689949036, + "learning_rate": 1.527639319037296e-05, + "loss": 0.0989, + "step": 4356 + }, + { + "epoch": 0.6274481566820277, + "grad_norm": 4.509676456451416, + "learning_rate": 1.526597429932875e-05, + "loss": 1.5911, + "step": 4357 + }, + { + "epoch": 0.6275921658986175, + "grad_norm": 0.94413161277771, + "learning_rate": 1.5255557400667425e-05, + "loss": 0.1536, + "step": 4358 + }, + { + "epoch": 0.6277361751152074, + "grad_norm": 0.65097576379776, + "learning_rate": 1.5245142496521136e-05, + "loss": 0.077, + "step": 4359 + }, + { + "epoch": 0.6278801843317973, + "grad_norm": 0.7578818798065186, + "learning_rate": 1.5234729589021635e-05, + "loss": 0.1065, + "step": 4360 + }, + { + "epoch": 0.6280241935483871, + "grad_norm": 1.1333914995193481, + "learning_rate": 1.522431868030026e-05, + "loss": 0.1209, + "step": 4361 + }, + { + "epoch": 0.628168202764977, + "grad_norm": 0.8006284832954407, + "learning_rate": 1.5213909772487934e-05, + "loss": 4.384, + "step": 4362 + }, + { + "epoch": 0.6283122119815668, + "grad_norm": 0.8091951608657837, + "learning_rate": 1.5203502867715181e-05, + "loss": 0.0911, + "step": 4363 + }, + { + "epoch": 0.6284562211981567, + "grad_norm": 0.707855761051178, + "learning_rate": 1.5193097968112108e-05, + "loss": 0.0987, + "step": 4364 + }, + { + "epoch": 0.6286002304147466, + "grad_norm": 1.609222173690796, + "learning_rate": 1.5182695075808418e-05, + "loss": 0.1503, + "step": 4365 + }, + { + "epoch": 0.6287442396313364, + "grad_norm": 0.6035845279693604, + "learning_rate": 1.5172294192933393e-05, + "loss": 0.0822, + "step": 4366 + }, + { + "epoch": 0.6288882488479263, + "grad_norm": 1.5254621505737305, + "learning_rate": 1.5161895321615916e-05, + "loss": 0.0811, + "step": 4367 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 0.5355373620986938, + "learning_rate": 1.5151498463984445e-05, + "loss": 0.0603, + "step": 4368 + }, + { + "epoch": 0.629176267281106, + "grad_norm": 0.4714439809322357, + "learning_rate": 1.5141103622167041e-05, + "loss": 0.0676, + "step": 4369 + }, + { + "epoch": 0.6293202764976958, + "grad_norm": 2.7587544918060303, + "learning_rate": 1.5130710798291337e-05, + "loss": 0.451, + "step": 4370 + }, + { + "epoch": 0.6294642857142857, + "grad_norm": 0.9638227224349976, + "learning_rate": 1.5120319994484563e-05, + "loss": 4.1237, + "step": 4371 + }, + { + "epoch": 0.6296082949308756, + "grad_norm": 4.8481574058532715, + "learning_rate": 1.5109931212873534e-05, + "loss": 2.0754, + "step": 4372 + }, + { + "epoch": 0.6297523041474654, + "grad_norm": 1.33567214012146, + "learning_rate": 1.5099544455584652e-05, + "loss": 0.1466, + "step": 4373 + }, + { + "epoch": 0.6298963133640553, + "grad_norm": 0.48897597193717957, + "learning_rate": 1.5089159724743896e-05, + "loss": 0.0535, + "step": 4374 + }, + { + "epoch": 0.6300403225806451, + "grad_norm": 0.7489674687385559, + "learning_rate": 1.5078777022476842e-05, + "loss": 0.0889, + "step": 4375 + }, + { + "epoch": 0.630184331797235, + "grad_norm": 0.8931747674942017, + "learning_rate": 1.5068396350908642e-05, + "loss": 0.1179, + "step": 4376 + }, + { + "epoch": 0.6303283410138248, + "grad_norm": 0.9964835047721863, + "learning_rate": 1.5058017712164035e-05, + "loss": 0.0797, + "step": 4377 + }, + { + "epoch": 0.6304723502304147, + "grad_norm": 2.557960033416748, + "learning_rate": 1.5047641108367349e-05, + "loss": 0.2252, + "step": 4378 + }, + { + "epoch": 0.6306163594470046, + "grad_norm": 0.8967743515968323, + "learning_rate": 1.5037266541642492e-05, + "loss": 0.0751, + "step": 4379 + }, + { + "epoch": 0.6307603686635944, + "grad_norm": 3.106053113937378, + "learning_rate": 1.5026894014112952e-05, + "loss": 0.5165, + "step": 4380 + }, + { + "epoch": 0.6309043778801844, + "grad_norm": 0.4337708055973053, + "learning_rate": 1.50165235279018e-05, + "loss": 0.0625, + "step": 4381 + }, + { + "epoch": 0.6310483870967742, + "grad_norm": 0.6855765581130981, + "learning_rate": 1.5006155085131682e-05, + "loss": 0.0906, + "step": 4382 + }, + { + "epoch": 0.6311923963133641, + "grad_norm": 8.204242706298828, + "learning_rate": 1.4995788687924856e-05, + "loss": 1.5821, + "step": 4383 + }, + { + "epoch": 0.631336405529954, + "grad_norm": 2.6269941329956055, + "learning_rate": 1.4985424338403131e-05, + "loss": 0.3846, + "step": 4384 + }, + { + "epoch": 0.6314804147465438, + "grad_norm": 0.9424605369567871, + "learning_rate": 1.4975062038687904e-05, + "loss": 0.0938, + "step": 4385 + }, + { + "epoch": 0.6316244239631337, + "grad_norm": 0.47643908858299255, + "learning_rate": 1.4964701790900154e-05, + "loss": 0.0661, + "step": 4386 + }, + { + "epoch": 0.6317684331797235, + "grad_norm": 0.6846320629119873, + "learning_rate": 1.4954343597160445e-05, + "loss": 0.0563, + "step": 4387 + }, + { + "epoch": 0.6319124423963134, + "grad_norm": 0.7990171313285828, + "learning_rate": 1.4943987459588909e-05, + "loss": 0.084, + "step": 4388 + }, + { + "epoch": 0.6320564516129032, + "grad_norm": 2.134653329849243, + "learning_rate": 1.493363338030527e-05, + "loss": 1.7421, + "step": 4389 + }, + { + "epoch": 0.6322004608294931, + "grad_norm": 0.9024360179901123, + "learning_rate": 1.4923281361428823e-05, + "loss": 0.1082, + "step": 4390 + }, + { + "epoch": 0.632344470046083, + "grad_norm": 0.45204463601112366, + "learning_rate": 1.4912931405078442e-05, + "loss": 0.06, + "step": 4391 + }, + { + "epoch": 0.6324884792626728, + "grad_norm": 1.3196176290512085, + "learning_rate": 1.4902583513372582e-05, + "loss": 0.1386, + "step": 4392 + }, + { + "epoch": 0.6326324884792627, + "grad_norm": 5.558650970458984, + "learning_rate": 1.4892237688429273e-05, + "loss": 2.4443, + "step": 4393 + }, + { + "epoch": 0.6327764976958525, + "grad_norm": 1.3096221685409546, + "learning_rate": 1.488189393236612e-05, + "loss": 0.1445, + "step": 4394 + }, + { + "epoch": 0.6329205069124424, + "grad_norm": 0.5480949878692627, + "learning_rate": 1.4871552247300307e-05, + "loss": 0.075, + "step": 4395 + }, + { + "epoch": 0.6330645161290323, + "grad_norm": 0.7715849280357361, + "learning_rate": 1.4861212635348598e-05, + "loss": 0.1007, + "step": 4396 + }, + { + "epoch": 0.6332085253456221, + "grad_norm": 0.4905400276184082, + "learning_rate": 1.4850875098627326e-05, + "loss": 0.0592, + "step": 4397 + }, + { + "epoch": 0.633352534562212, + "grad_norm": 0.5767237544059753, + "learning_rate": 1.4840539639252404e-05, + "loss": 0.0458, + "step": 4398 + }, + { + "epoch": 0.6334965437788018, + "grad_norm": 0.9371697902679443, + "learning_rate": 1.4830206259339314e-05, + "loss": 0.1185, + "step": 4399 + }, + { + "epoch": 0.6336405529953917, + "grad_norm": 4.551721572875977, + "learning_rate": 1.4819874961003118e-05, + "loss": 2.0473, + "step": 4400 + }, + { + "epoch": 0.6337845622119815, + "grad_norm": 1.1717764139175415, + "learning_rate": 1.4809545746358447e-05, + "loss": 0.1386, + "step": 4401 + }, + { + "epoch": 0.6339285714285714, + "grad_norm": 0.9106143116950989, + "learning_rate": 1.4799218617519514e-05, + "loss": 0.1097, + "step": 4402 + }, + { + "epoch": 0.6340725806451613, + "grad_norm": 6.675014495849609, + "learning_rate": 1.4788893576600099e-05, + "loss": 1.4225, + "step": 4403 + }, + { + "epoch": 0.6342165898617511, + "grad_norm": 0.9600300788879395, + "learning_rate": 1.4778570625713552e-05, + "loss": 0.1141, + "step": 4404 + }, + { + "epoch": 0.634360599078341, + "grad_norm": 0.7688263058662415, + "learning_rate": 1.4768249766972802e-05, + "loss": 0.0895, + "step": 4405 + }, + { + "epoch": 0.6345046082949308, + "grad_norm": 0.6773515343666077, + "learning_rate": 1.4757931002490344e-05, + "loss": 0.0637, + "step": 4406 + }, + { + "epoch": 0.6346486175115207, + "grad_norm": 1.2974543571472168, + "learning_rate": 1.4747614334378246e-05, + "loss": 0.1211, + "step": 4407 + }, + { + "epoch": 0.6347926267281107, + "grad_norm": 1.583511233329773, + "learning_rate": 1.4737299764748148e-05, + "loss": 0.1543, + "step": 4408 + }, + { + "epoch": 0.6349366359447005, + "grad_norm": 0.46174156665802, + "learning_rate": 1.4726987295711253e-05, + "loss": 0.0493, + "step": 4409 + }, + { + "epoch": 0.6350806451612904, + "grad_norm": 5.154824733734131, + "learning_rate": 1.4716676929378353e-05, + "loss": 1.5796, + "step": 4410 + }, + { + "epoch": 0.6352246543778802, + "grad_norm": 1.6389386653900146, + "learning_rate": 1.47063686678598e-05, + "loss": 0.2667, + "step": 4411 + }, + { + "epoch": 0.6353686635944701, + "grad_norm": 0.8225964307785034, + "learning_rate": 1.4696062513265495e-05, + "loss": 0.1095, + "step": 4412 + }, + { + "epoch": 0.6355126728110599, + "grad_norm": 0.85353022813797, + "learning_rate": 1.4685758467704947e-05, + "loss": 0.1058, + "step": 4413 + }, + { + "epoch": 0.6356566820276498, + "grad_norm": 0.8676108121871948, + "learning_rate": 1.4675456533287193e-05, + "loss": 0.1076, + "step": 4414 + }, + { + "epoch": 0.6358006912442397, + "grad_norm": 0.982374370098114, + "learning_rate": 1.4665156712120868e-05, + "loss": 0.1124, + "step": 4415 + }, + { + "epoch": 0.6359447004608295, + "grad_norm": 2.000338554382324, + "learning_rate": 1.4654859006314158e-05, + "loss": 0.1659, + "step": 4416 + }, + { + "epoch": 0.6360887096774194, + "grad_norm": 3.1548430919647217, + "learning_rate": 1.4644563417974827e-05, + "loss": 2.3847, + "step": 4417 + }, + { + "epoch": 0.6362327188940092, + "grad_norm": 3.307133436203003, + "learning_rate": 1.4634269949210191e-05, + "loss": 2.8593, + "step": 4418 + }, + { + "epoch": 0.6363767281105991, + "grad_norm": 1.5367571115493774, + "learning_rate": 1.462397860212715e-05, + "loss": 0.1302, + "step": 4419 + }, + { + "epoch": 0.636520737327189, + "grad_norm": 0.9141053557395935, + "learning_rate": 1.4613689378832152e-05, + "loss": 0.1135, + "step": 4420 + }, + { + "epoch": 0.6366647465437788, + "grad_norm": 0.6759769916534424, + "learning_rate": 1.4603402281431225e-05, + "loss": 0.0822, + "step": 4421 + }, + { + "epoch": 0.6368087557603687, + "grad_norm": 0.8669494986534119, + "learning_rate": 1.459311731202996e-05, + "loss": 0.1134, + "step": 4422 + }, + { + "epoch": 0.6369527649769585, + "grad_norm": 1.0377402305603027, + "learning_rate": 1.4582834472733501e-05, + "loss": 0.104, + "step": 4423 + }, + { + "epoch": 0.6370967741935484, + "grad_norm": 0.7227188944816589, + "learning_rate": 1.4572553765646562e-05, + "loss": 0.078, + "step": 4424 + }, + { + "epoch": 0.6372407834101382, + "grad_norm": 1.2981395721435547, + "learning_rate": 1.4562275192873428e-05, + "loss": 0.1713, + "step": 4425 + }, + { + "epoch": 0.6373847926267281, + "grad_norm": 0.6704667806625366, + "learning_rate": 1.4551998756517938e-05, + "loss": 0.0702, + "step": 4426 + }, + { + "epoch": 0.637528801843318, + "grad_norm": 0.9920125603675842, + "learning_rate": 1.4541724458683495e-05, + "loss": 0.122, + "step": 4427 + }, + { + "epoch": 0.6376728110599078, + "grad_norm": 3.6053640842437744, + "learning_rate": 1.453145230147307e-05, + "loss": 0.2739, + "step": 4428 + }, + { + "epoch": 0.6378168202764977, + "grad_norm": 1.1400728225708008, + "learning_rate": 1.4521182286989187e-05, + "loss": 0.12, + "step": 4429 + }, + { + "epoch": 0.6379608294930875, + "grad_norm": 1.023107886314392, + "learning_rate": 1.4510914417333943e-05, + "loss": 0.1182, + "step": 4430 + }, + { + "epoch": 0.6381048387096774, + "grad_norm": 0.6996926069259644, + "learning_rate": 1.4500648694608981e-05, + "loss": 0.091, + "step": 4431 + }, + { + "epoch": 0.6382488479262672, + "grad_norm": 0.5371840596199036, + "learning_rate": 1.449038512091552e-05, + "loss": 0.0594, + "step": 4432 + }, + { + "epoch": 0.6383928571428571, + "grad_norm": 0.6714287996292114, + "learning_rate": 1.4480123698354332e-05, + "loss": 0.0907, + "step": 4433 + }, + { + "epoch": 0.638536866359447, + "grad_norm": 4.270877838134766, + "learning_rate": 1.446986442902574e-05, + "loss": 1.4675, + "step": 4434 + }, + { + "epoch": 0.6386808755760369, + "grad_norm": 0.5084843039512634, + "learning_rate": 1.4459607315029644e-05, + "loss": 0.0497, + "step": 4435 + }, + { + "epoch": 0.6388248847926268, + "grad_norm": 2.770329236984253, + "learning_rate": 1.444935235846548e-05, + "loss": 0.3378, + "step": 4436 + }, + { + "epoch": 0.6389688940092166, + "grad_norm": 1.175374150276184, + "learning_rate": 1.4439099561432278e-05, + "loss": 0.1004, + "step": 4437 + }, + { + "epoch": 0.6391129032258065, + "grad_norm": 1.173275351524353, + "learning_rate": 1.4428848926028593e-05, + "loss": 0.1751, + "step": 4438 + }, + { + "epoch": 0.6392569124423964, + "grad_norm": 5.580125331878662, + "learning_rate": 1.4418600454352548e-05, + "loss": 1.476, + "step": 4439 + }, + { + "epoch": 0.6394009216589862, + "grad_norm": 0.6880161762237549, + "learning_rate": 1.4408354148501823e-05, + "loss": 0.0994, + "step": 4440 + }, + { + "epoch": 0.6395449308755761, + "grad_norm": 0.9621788263320923, + "learning_rate": 1.4398110010573662e-05, + "loss": 0.1074, + "step": 4441 + }, + { + "epoch": 0.6396889400921659, + "grad_norm": 1.9362707138061523, + "learning_rate": 1.4387868042664854e-05, + "loss": 0.2034, + "step": 4442 + }, + { + "epoch": 0.6398329493087558, + "grad_norm": 4.025570869445801, + "learning_rate": 1.4377628246871743e-05, + "loss": 2.355, + "step": 4443 + }, + { + "epoch": 0.6399769585253456, + "grad_norm": 1.139626145362854, + "learning_rate": 1.4367390625290245e-05, + "loss": 0.115, + "step": 4444 + }, + { + "epoch": 0.6401209677419355, + "grad_norm": 0.672411322593689, + "learning_rate": 1.4357155180015813e-05, + "loss": 0.0969, + "step": 4445 + }, + { + "epoch": 0.6402649769585254, + "grad_norm": 0.5399532318115234, + "learning_rate": 1.4346921913143466e-05, + "loss": 0.0738, + "step": 4446 + }, + { + "epoch": 0.6404089861751152, + "grad_norm": 0.9070764780044556, + "learning_rate": 1.4336690826767767e-05, + "loss": 0.109, + "step": 4447 + }, + { + "epoch": 0.6405529953917051, + "grad_norm": 4.564159870147705, + "learning_rate": 1.4326461922982845e-05, + "loss": 1.5397, + "step": 4448 + }, + { + "epoch": 0.6406970046082949, + "grad_norm": 0.660579264163971, + "learning_rate": 1.4316235203882371e-05, + "loss": 0.0809, + "step": 4449 + }, + { + "epoch": 0.6408410138248848, + "grad_norm": 0.9773398637771606, + "learning_rate": 1.4306010671559575e-05, + "loss": 0.1323, + "step": 4450 + }, + { + "epoch": 0.6409850230414746, + "grad_norm": 0.6993473172187805, + "learning_rate": 1.4295788328107238e-05, + "loss": 0.0674, + "step": 4451 + }, + { + "epoch": 0.6411290322580645, + "grad_norm": 0.6212794184684753, + "learning_rate": 1.4285568175617692e-05, + "loss": 0.0796, + "step": 4452 + }, + { + "epoch": 0.6412730414746544, + "grad_norm": 1.2065004110336304, + "learning_rate": 1.4275350216182822e-05, + "loss": 0.1218, + "step": 4453 + }, + { + "epoch": 0.6414170506912442, + "grad_norm": 3.7628836631774902, + "learning_rate": 1.4265134451894062e-05, + "loss": 2.4321, + "step": 4454 + }, + { + "epoch": 0.6415610599078341, + "grad_norm": 1.0175793170928955, + "learning_rate": 1.4254920884842404e-05, + "loss": 0.0952, + "step": 4455 + }, + { + "epoch": 0.6417050691244239, + "grad_norm": 1.1607370376586914, + "learning_rate": 1.4244709517118379e-05, + "loss": 0.1012, + "step": 4456 + }, + { + "epoch": 0.6418490783410138, + "grad_norm": 0.5976537466049194, + "learning_rate": 1.4234500350812074e-05, + "loss": 0.0691, + "step": 4457 + }, + { + "epoch": 0.6419930875576036, + "grad_norm": 1.4880354404449463, + "learning_rate": 1.4224293388013126e-05, + "loss": 0.1405, + "step": 4458 + }, + { + "epoch": 0.6421370967741935, + "grad_norm": 0.4825268089771271, + "learning_rate": 1.421408863081072e-05, + "loss": 0.0806, + "step": 4459 + }, + { + "epoch": 0.6422811059907834, + "grad_norm": 5.1229400634765625, + "learning_rate": 1.4203886081293589e-05, + "loss": 1.3291, + "step": 4460 + }, + { + "epoch": 0.6424251152073732, + "grad_norm": 4.1754913330078125, + "learning_rate": 1.4193685741550003e-05, + "loss": 1.0889, + "step": 4461 + }, + { + "epoch": 0.6425691244239631, + "grad_norm": 0.9822124242782593, + "learning_rate": 1.4183487613667811e-05, + "loss": 0.0718, + "step": 4462 + }, + { + "epoch": 0.642713133640553, + "grad_norm": 0.8106881976127625, + "learning_rate": 1.4173291699734384e-05, + "loss": 0.097, + "step": 4463 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.7006178498268127, + "learning_rate": 1.4163098001836638e-05, + "loss": 0.0658, + "step": 4464 + }, + { + "epoch": 0.6430011520737328, + "grad_norm": 4.871581554412842, + "learning_rate": 1.4152906522061048e-05, + "loss": 2.7982, + "step": 4465 + }, + { + "epoch": 0.6431451612903226, + "grad_norm": 5.762640953063965, + "learning_rate": 1.4142717262493629e-05, + "loss": 2.151, + "step": 4466 + }, + { + "epoch": 0.6432891705069125, + "grad_norm": 0.9403517842292786, + "learning_rate": 1.4132530225219943e-05, + "loss": 0.118, + "step": 4467 + }, + { + "epoch": 0.6434331797235023, + "grad_norm": 0.6100620627403259, + "learning_rate": 1.4122345412325092e-05, + "loss": 0.0831, + "step": 4468 + }, + { + "epoch": 0.6435771889400922, + "grad_norm": 3.7576565742492676, + "learning_rate": 1.411216282589373e-05, + "loss": 1.9463, + "step": 4469 + }, + { + "epoch": 0.643721198156682, + "grad_norm": 1.5969964265823364, + "learning_rate": 1.410198246801005e-05, + "loss": 0.1373, + "step": 4470 + }, + { + "epoch": 0.6438652073732719, + "grad_norm": 0.8089897036552429, + "learning_rate": 1.4091804340757798e-05, + "loss": 0.1133, + "step": 4471 + }, + { + "epoch": 0.6440092165898618, + "grad_norm": 1.033282995223999, + "learning_rate": 1.4081628446220246e-05, + "loss": 0.1148, + "step": 4472 + }, + { + "epoch": 0.6441532258064516, + "grad_norm": 0.5153348445892334, + "learning_rate": 1.4071454786480232e-05, + "loss": 0.0713, + "step": 4473 + }, + { + "epoch": 0.6442972350230415, + "grad_norm": 0.5099635124206543, + "learning_rate": 1.4061283363620111e-05, + "loss": 0.0528, + "step": 4474 + }, + { + "epoch": 0.6444412442396313, + "grad_norm": 0.9859601855278015, + "learning_rate": 1.4051114179721802e-05, + "loss": 0.0902, + "step": 4475 + }, + { + "epoch": 0.6445852534562212, + "grad_norm": 2.9739797115325928, + "learning_rate": 1.4040947236866758e-05, + "loss": 2.3238, + "step": 4476 + }, + { + "epoch": 0.644729262672811, + "grad_norm": 0.9726548194885254, + "learning_rate": 1.4030782537135967e-05, + "loss": 0.159, + "step": 4477 + }, + { + "epoch": 0.6448732718894009, + "grad_norm": 1.5098973512649536, + "learning_rate": 1.402062008260997e-05, + "loss": 0.2057, + "step": 4478 + }, + { + "epoch": 0.6450172811059908, + "grad_norm": 4.480342388153076, + "learning_rate": 1.401045987536883e-05, + "loss": 1.7413, + "step": 4479 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.4772697389125824, + "learning_rate": 1.4000301917492165e-05, + "loss": 0.059, + "step": 4480 + }, + { + "epoch": 0.6453052995391705, + "grad_norm": 0.5161353349685669, + "learning_rate": 1.399014621105914e-05, + "loss": 0.0597, + "step": 4481 + }, + { + "epoch": 0.6454493087557603, + "grad_norm": 1.7795709371566772, + "learning_rate": 1.3979992758148444e-05, + "loss": 0.1413, + "step": 4482 + }, + { + "epoch": 0.6455933179723502, + "grad_norm": 0.6539427042007446, + "learning_rate": 1.3969841560838307e-05, + "loss": 0.0957, + "step": 4483 + }, + { + "epoch": 0.6457373271889401, + "grad_norm": 0.6156671047210693, + "learning_rate": 1.3959692621206499e-05, + "loss": 0.0831, + "step": 4484 + }, + { + "epoch": 0.6458813364055299, + "grad_norm": 0.5547316074371338, + "learning_rate": 1.3949545941330327e-05, + "loss": 0.0838, + "step": 4485 + }, + { + "epoch": 0.6460253456221198, + "grad_norm": 2.180952787399292, + "learning_rate": 1.3939401523286643e-05, + "loss": 0.2854, + "step": 4486 + }, + { + "epoch": 0.6461693548387096, + "grad_norm": 4.729816436767578, + "learning_rate": 1.392925936915182e-05, + "loss": 1.1532, + "step": 4487 + }, + { + "epoch": 0.6463133640552995, + "grad_norm": 1.1855299472808838, + "learning_rate": 1.3919119481001792e-05, + "loss": 0.1234, + "step": 4488 + }, + { + "epoch": 0.6464573732718893, + "grad_norm": 0.7313432693481445, + "learning_rate": 1.3908981860911999e-05, + "loss": 0.0969, + "step": 4489 + }, + { + "epoch": 0.6466013824884793, + "grad_norm": 1.2460427284240723, + "learning_rate": 1.3898846510957442e-05, + "loss": 0.149, + "step": 4490 + }, + { + "epoch": 0.6467453917050692, + "grad_norm": 1.0260288715362549, + "learning_rate": 1.3888713433212645e-05, + "loss": 0.1396, + "step": 4491 + }, + { + "epoch": 0.646889400921659, + "grad_norm": 0.7487742304801941, + "learning_rate": 1.3878582629751668e-05, + "loss": 0.1184, + "step": 4492 + }, + { + "epoch": 0.6470334101382489, + "grad_norm": 0.3783435523509979, + "learning_rate": 1.3868454102648115e-05, + "loss": 0.0403, + "step": 4493 + }, + { + "epoch": 0.6471774193548387, + "grad_norm": 0.751878559589386, + "learning_rate": 1.3858327853975105e-05, + "loss": 0.0915, + "step": 4494 + }, + { + "epoch": 0.6473214285714286, + "grad_norm": 0.8690100312232971, + "learning_rate": 1.3848203885805313e-05, + "loss": 0.1189, + "step": 4495 + }, + { + "epoch": 0.6474654377880185, + "grad_norm": 2.11653208732605, + "learning_rate": 1.3838082200210931e-05, + "loss": 0.3054, + "step": 4496 + }, + { + "epoch": 0.6476094470046083, + "grad_norm": 0.6072826981544495, + "learning_rate": 1.3827962799263685e-05, + "loss": 0.0692, + "step": 4497 + }, + { + "epoch": 0.6477534562211982, + "grad_norm": 0.5214735865592957, + "learning_rate": 1.3817845685034847e-05, + "loss": 0.054, + "step": 4498 + }, + { + "epoch": 0.647897465437788, + "grad_norm": 1.8916800022125244, + "learning_rate": 1.3807730859595192e-05, + "loss": 0.1442, + "step": 4499 + }, + { + "epoch": 0.6480414746543779, + "grad_norm": 0.45447322726249695, + "learning_rate": 1.3797618325015072e-05, + "loss": 0.0443, + "step": 4500 + }, + { + "epoch": 0.6481854838709677, + "grad_norm": 0.5541004538536072, + "learning_rate": 1.3787508083364325e-05, + "loss": 0.0812, + "step": 4501 + }, + { + "epoch": 0.6483294930875576, + "grad_norm": 0.5853996872901917, + "learning_rate": 1.3777400136712354e-05, + "loss": 0.0729, + "step": 4502 + }, + { + "epoch": 0.6484735023041475, + "grad_norm": 0.9371541738510132, + "learning_rate": 1.3767294487128063e-05, + "loss": 0.1152, + "step": 4503 + }, + { + "epoch": 0.6486175115207373, + "grad_norm": 1.4786655902862549, + "learning_rate": 1.375719113667991e-05, + "loss": 0.1346, + "step": 4504 + }, + { + "epoch": 0.6487615207373272, + "grad_norm": 0.7184931635856628, + "learning_rate": 1.374709008743586e-05, + "loss": 0.0791, + "step": 4505 + }, + { + "epoch": 0.648905529953917, + "grad_norm": 0.8142910003662109, + "learning_rate": 1.373699134146343e-05, + "loss": 0.1182, + "step": 4506 + }, + { + "epoch": 0.6490495391705069, + "grad_norm": 0.5573334693908691, + "learning_rate": 1.372689490082965e-05, + "loss": 0.056, + "step": 4507 + }, + { + "epoch": 0.6491935483870968, + "grad_norm": 0.7258556485176086, + "learning_rate": 1.3716800767601085e-05, + "loss": 0.0855, + "step": 4508 + }, + { + "epoch": 0.6493375576036866, + "grad_norm": 1.310009241104126, + "learning_rate": 1.3706708943843821e-05, + "loss": 0.142, + "step": 4509 + }, + { + "epoch": 0.6494815668202765, + "grad_norm": 4.502348899841309, + "learning_rate": 1.3696619431623484e-05, + "loss": 2.5955, + "step": 4510 + }, + { + "epoch": 0.6496255760368663, + "grad_norm": 1.8213657140731812, + "learning_rate": 1.3686532233005212e-05, + "loss": 0.2726, + "step": 4511 + }, + { + "epoch": 0.6497695852534562, + "grad_norm": 4.2830705642700195, + "learning_rate": 1.3676447350053672e-05, + "loss": 1.281, + "step": 4512 + }, + { + "epoch": 0.649913594470046, + "grad_norm": 5.453336715698242, + "learning_rate": 1.3666364784833075e-05, + "loss": 1.6789, + "step": 4513 + }, + { + "epoch": 0.6500576036866359, + "grad_norm": 0.7631909251213074, + "learning_rate": 1.3656284539407133e-05, + "loss": 0.099, + "step": 4514 + }, + { + "epoch": 0.6502016129032258, + "grad_norm": 0.6468568444252014, + "learning_rate": 1.3646206615839097e-05, + "loss": 0.0799, + "step": 4515 + }, + { + "epoch": 0.6503456221198156, + "grad_norm": 0.9863340258598328, + "learning_rate": 1.3636131016191736e-05, + "loss": 0.1339, + "step": 4516 + }, + { + "epoch": 0.6504896313364056, + "grad_norm": 0.564002513885498, + "learning_rate": 1.3626057742527354e-05, + "loss": 0.0844, + "step": 4517 + }, + { + "epoch": 0.6506336405529954, + "grad_norm": 1.032504677772522, + "learning_rate": 1.3615986796907753e-05, + "loss": 0.118, + "step": 4518 + }, + { + "epoch": 0.6507776497695853, + "grad_norm": 3.8470962047576904, + "learning_rate": 1.3605918181394308e-05, + "loss": 2.439, + "step": 4519 + }, + { + "epoch": 0.6509216589861752, + "grad_norm": 0.36619964241981506, + "learning_rate": 1.3595851898047874e-05, + "loss": 0.0361, + "step": 4520 + }, + { + "epoch": 0.651065668202765, + "grad_norm": 0.9318439364433289, + "learning_rate": 1.358578794892883e-05, + "loss": 4.1984, + "step": 4521 + }, + { + "epoch": 0.6512096774193549, + "grad_norm": 0.4820261299610138, + "learning_rate": 1.3575726336097102e-05, + "loss": 0.0777, + "step": 4522 + }, + { + "epoch": 0.6513536866359447, + "grad_norm": 0.802703857421875, + "learning_rate": 1.3565667061612119e-05, + "loss": 0.1112, + "step": 4523 + }, + { + "epoch": 0.6514976958525346, + "grad_norm": 0.7840380072593689, + "learning_rate": 1.3555610127532837e-05, + "loss": 0.0794, + "step": 4524 + }, + { + "epoch": 0.6516417050691244, + "grad_norm": 0.4057624638080597, + "learning_rate": 1.3545555535917729e-05, + "loss": 0.0585, + "step": 4525 + }, + { + "epoch": 0.6517857142857143, + "grad_norm": 4.223751544952393, + "learning_rate": 1.3535503288824797e-05, + "loss": 2.557, + "step": 4526 + }, + { + "epoch": 0.6519297235023042, + "grad_norm": 1.4740676879882812, + "learning_rate": 1.3525453388311554e-05, + "loss": 0.1551, + "step": 4527 + }, + { + "epoch": 0.652073732718894, + "grad_norm": 2.0150396823883057, + "learning_rate": 1.3515405836435042e-05, + "loss": 0.1797, + "step": 4528 + }, + { + "epoch": 0.6522177419354839, + "grad_norm": 1.0365784168243408, + "learning_rate": 1.3505360635251812e-05, + "loss": 0.118, + "step": 4529 + }, + { + "epoch": 0.6523617511520737, + "grad_norm": 1.1521018743515015, + "learning_rate": 1.3495317786817945e-05, + "loss": 0.1427, + "step": 4530 + }, + { + "epoch": 0.6525057603686636, + "grad_norm": 0.695289671421051, + "learning_rate": 1.3485277293189028e-05, + "loss": 0.1278, + "step": 4531 + }, + { + "epoch": 0.6526497695852534, + "grad_norm": 3.5971713066101074, + "learning_rate": 1.3475239156420174e-05, + "loss": 2.1209, + "step": 4532 + }, + { + "epoch": 0.6527937788018433, + "grad_norm": 0.9412140250205994, + "learning_rate": 1.3465203378566017e-05, + "loss": 0.1137, + "step": 4533 + }, + { + "epoch": 0.6529377880184332, + "grad_norm": 0.6807209849357605, + "learning_rate": 1.3455169961680698e-05, + "loss": 0.0829, + "step": 4534 + }, + { + "epoch": 0.653081797235023, + "grad_norm": 0.7896579504013062, + "learning_rate": 1.3445138907817878e-05, + "loss": 0.1032, + "step": 4535 + }, + { + "epoch": 0.6532258064516129, + "grad_norm": 0.9011073112487793, + "learning_rate": 1.3435110219030742e-05, + "loss": 0.1446, + "step": 4536 + }, + { + "epoch": 0.6533698156682027, + "grad_norm": 0.5343500971794128, + "learning_rate": 1.3425083897371981e-05, + "loss": 0.0598, + "step": 4537 + }, + { + "epoch": 0.6535138248847926, + "grad_norm": 1.7730717658996582, + "learning_rate": 1.341505994489381e-05, + "loss": 0.2362, + "step": 4538 + }, + { + "epoch": 0.6536578341013825, + "grad_norm": 0.7268236875534058, + "learning_rate": 1.3405038363647953e-05, + "loss": 0.0779, + "step": 4539 + }, + { + "epoch": 0.6538018433179723, + "grad_norm": 0.9816795587539673, + "learning_rate": 1.3395019155685648e-05, + "loss": 0.1039, + "step": 4540 + }, + { + "epoch": 0.6539458525345622, + "grad_norm": 0.5152014493942261, + "learning_rate": 1.3385002323057651e-05, + "loss": 0.0691, + "step": 4541 + }, + { + "epoch": 0.654089861751152, + "grad_norm": 4.588764190673828, + "learning_rate": 1.337498786781423e-05, + "loss": 1.7476, + "step": 4542 + }, + { + "epoch": 0.6542338709677419, + "grad_norm": 5.338955402374268, + "learning_rate": 1.3364975792005172e-05, + "loss": 1.3214, + "step": 4543 + }, + { + "epoch": 0.6543778801843319, + "grad_norm": 0.9917251467704773, + "learning_rate": 1.3354966097679767e-05, + "loss": 0.1249, + "step": 4544 + }, + { + "epoch": 0.6545218894009217, + "grad_norm": 1.8246690034866333, + "learning_rate": 1.3344958786886808e-05, + "loss": 0.1525, + "step": 4545 + }, + { + "epoch": 0.6546658986175116, + "grad_norm": 0.5542665719985962, + "learning_rate": 1.3334953861674644e-05, + "loss": 0.0503, + "step": 4546 + }, + { + "epoch": 0.6548099078341014, + "grad_norm": 5.808346271514893, + "learning_rate": 1.332495132409109e-05, + "loss": 1.808, + "step": 4547 + }, + { + "epoch": 0.6549539170506913, + "grad_norm": 1.3086105585098267, + "learning_rate": 1.3314951176183488e-05, + "loss": 0.1443, + "step": 4548 + }, + { + "epoch": 0.6550979262672811, + "grad_norm": 0.3677675724029541, + "learning_rate": 1.3304953419998695e-05, + "loss": 0.0362, + "step": 4549 + }, + { + "epoch": 0.655241935483871, + "grad_norm": 1.238693356513977, + "learning_rate": 1.3294958057583076e-05, + "loss": 0.1399, + "step": 4550 + }, + { + "epoch": 0.6553859447004609, + "grad_norm": 0.7903734445571899, + "learning_rate": 1.3284965090982499e-05, + "loss": 0.0864, + "step": 4551 + }, + { + "epoch": 0.6555299539170507, + "grad_norm": 0.9130779504776001, + "learning_rate": 1.3274974522242353e-05, + "loss": 0.117, + "step": 4552 + }, + { + "epoch": 0.6556739631336406, + "grad_norm": 0.7958996295928955, + "learning_rate": 1.3264986353407527e-05, + "loss": 0.0765, + "step": 4553 + }, + { + "epoch": 0.6558179723502304, + "grad_norm": 0.4292137622833252, + "learning_rate": 1.3255000586522426e-05, + "loss": 0.0545, + "step": 4554 + }, + { + "epoch": 0.6559619815668203, + "grad_norm": 5.351078033447266, + "learning_rate": 1.3245017223630956e-05, + "loss": 0.9281, + "step": 4555 + }, + { + "epoch": 0.6561059907834101, + "grad_norm": 0.37671390175819397, + "learning_rate": 1.3235036266776535e-05, + "loss": 0.0478, + "step": 4556 + }, + { + "epoch": 0.65625, + "grad_norm": 0.3266085386276245, + "learning_rate": 1.3225057718002093e-05, + "loss": 0.0544, + "step": 4557 + }, + { + "epoch": 0.6563940092165899, + "grad_norm": 0.5219181180000305, + "learning_rate": 1.3215081579350058e-05, + "loss": 0.076, + "step": 4558 + }, + { + "epoch": 0.6565380184331797, + "grad_norm": 0.9334813952445984, + "learning_rate": 1.3205107852862373e-05, + "loss": 0.0896, + "step": 4559 + }, + { + "epoch": 0.6566820276497696, + "grad_norm": 0.6575621366500854, + "learning_rate": 1.3195136540580478e-05, + "loss": 0.0891, + "step": 4560 + }, + { + "epoch": 0.6568260368663594, + "grad_norm": 0.9452588558197021, + "learning_rate": 1.3185167644545327e-05, + "loss": 0.0955, + "step": 4561 + }, + { + "epoch": 0.6569700460829493, + "grad_norm": 0.7490657567977905, + "learning_rate": 1.3175201166797379e-05, + "loss": 0.0795, + "step": 4562 + }, + { + "epoch": 0.6571140552995391, + "grad_norm": 0.6367915868759155, + "learning_rate": 1.316523710937659e-05, + "loss": 0.1195, + "step": 4563 + }, + { + "epoch": 0.657258064516129, + "grad_norm": 0.8773254156112671, + "learning_rate": 1.3155275474322432e-05, + "loss": 0.098, + "step": 4564 + }, + { + "epoch": 0.6574020737327189, + "grad_norm": 3.547520399093628, + "learning_rate": 1.3145316263673874e-05, + "loss": 1.4819, + "step": 4565 + }, + { + "epoch": 0.6575460829493087, + "grad_norm": 0.7069854140281677, + "learning_rate": 1.3135359479469389e-05, + "loss": 0.0897, + "step": 4566 + }, + { + "epoch": 0.6576900921658986, + "grad_norm": 0.6280485391616821, + "learning_rate": 1.3125405123746957e-05, + "loss": 0.0667, + "step": 4567 + }, + { + "epoch": 0.6578341013824884, + "grad_norm": 0.5720449686050415, + "learning_rate": 1.3115453198544053e-05, + "loss": 0.0735, + "step": 4568 + }, + { + "epoch": 0.6579781105990783, + "grad_norm": 0.8677749037742615, + "learning_rate": 1.3105503705897668e-05, + "loss": 0.1038, + "step": 4569 + }, + { + "epoch": 0.6581221198156681, + "grad_norm": 0.6361798048019409, + "learning_rate": 1.3095556647844281e-05, + "loss": 0.0822, + "step": 4570 + }, + { + "epoch": 0.6582661290322581, + "grad_norm": 0.3658501207828522, + "learning_rate": 1.308561202641988e-05, + "loss": 0.0453, + "step": 4571 + }, + { + "epoch": 0.658410138248848, + "grad_norm": 0.7445298433303833, + "learning_rate": 1.3075669843659943e-05, + "loss": 0.093, + "step": 4572 + }, + { + "epoch": 0.6585541474654378, + "grad_norm": 0.9502248764038086, + "learning_rate": 1.3065730101599482e-05, + "loss": 0.1017, + "step": 4573 + }, + { + "epoch": 0.6586981566820277, + "grad_norm": 4.1502251625061035, + "learning_rate": 1.3055792802272976e-05, + "loss": 1.5263, + "step": 4574 + }, + { + "epoch": 0.6588421658986175, + "grad_norm": 0.6408762335777283, + "learning_rate": 1.304585794771441e-05, + "loss": 0.0805, + "step": 4575 + }, + { + "epoch": 0.6589861751152074, + "grad_norm": 0.7429251074790955, + "learning_rate": 1.3035925539957278e-05, + "loss": 0.0861, + "step": 4576 + }, + { + "epoch": 0.6591301843317973, + "grad_norm": 0.9646016955375671, + "learning_rate": 1.3025995581034561e-05, + "loss": 0.125, + "step": 4577 + }, + { + "epoch": 0.6592741935483871, + "grad_norm": 0.6933543682098389, + "learning_rate": 1.3016068072978754e-05, + "loss": 0.0759, + "step": 4578 + }, + { + "epoch": 0.659418202764977, + "grad_norm": 4.105437755584717, + "learning_rate": 1.300614301782184e-05, + "loss": 1.1893, + "step": 4579 + }, + { + "epoch": 0.6595622119815668, + "grad_norm": 2.459242582321167, + "learning_rate": 1.29962204175953e-05, + "loss": 0.2104, + "step": 4580 + }, + { + "epoch": 0.6597062211981567, + "grad_norm": 0.7345227003097534, + "learning_rate": 1.2986300274330115e-05, + "loss": 0.0775, + "step": 4581 + }, + { + "epoch": 0.6598502304147466, + "grad_norm": 1.457026481628418, + "learning_rate": 1.2976382590056769e-05, + "loss": 0.1462, + "step": 4582 + }, + { + "epoch": 0.6599942396313364, + "grad_norm": 4.198763370513916, + "learning_rate": 1.2966467366805224e-05, + "loss": 1.4526, + "step": 4583 + }, + { + "epoch": 0.6601382488479263, + "grad_norm": 0.681831955909729, + "learning_rate": 1.2956554606604964e-05, + "loss": 0.0757, + "step": 4584 + }, + { + "epoch": 0.6602822580645161, + "grad_norm": 0.632732629776001, + "learning_rate": 1.2946644311484946e-05, + "loss": 0.0701, + "step": 4585 + }, + { + "epoch": 0.660426267281106, + "grad_norm": 1.7269554138183594, + "learning_rate": 1.2936736483473638e-05, + "loss": 0.3061, + "step": 4586 + }, + { + "epoch": 0.6605702764976958, + "grad_norm": 2.572314500808716, + "learning_rate": 1.2926831124598998e-05, + "loss": 2.0763, + "step": 4587 + }, + { + "epoch": 0.6607142857142857, + "grad_norm": 2.1001014709472656, + "learning_rate": 1.2916928236888471e-05, + "loss": 0.2079, + "step": 4588 + }, + { + "epoch": 0.6608582949308756, + "grad_norm": 15.891912460327148, + "learning_rate": 1.2907027822369005e-05, + "loss": 4.9863, + "step": 4589 + }, + { + "epoch": 0.6610023041474654, + "grad_norm": 0.5888150334358215, + "learning_rate": 1.289712988306705e-05, + "loss": 0.0668, + "step": 4590 + }, + { + "epoch": 0.6611463133640553, + "grad_norm": 0.6298269033432007, + "learning_rate": 1.2887234421008523e-05, + "loss": 0.0763, + "step": 4591 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 3.7514803409576416, + "learning_rate": 1.287734143821886e-05, + "loss": 0.7202, + "step": 4592 + }, + { + "epoch": 0.661434331797235, + "grad_norm": 4.738045692443848, + "learning_rate": 1.286745093672298e-05, + "loss": 1.3105, + "step": 4593 + }, + { + "epoch": 0.6615783410138248, + "grad_norm": 0.9116498827934265, + "learning_rate": 1.2857562918545288e-05, + "loss": 0.1116, + "step": 4594 + }, + { + "epoch": 0.6617223502304147, + "grad_norm": 0.47090545296669006, + "learning_rate": 1.284767738570969e-05, + "loss": 0.0733, + "step": 4595 + }, + { + "epoch": 0.6618663594470046, + "grad_norm": 0.5835784077644348, + "learning_rate": 1.2837794340239579e-05, + "loss": 0.0716, + "step": 4596 + }, + { + "epoch": 0.6620103686635944, + "grad_norm": 1.3701125383377075, + "learning_rate": 1.282791378415784e-05, + "loss": 0.1426, + "step": 4597 + }, + { + "epoch": 0.6621543778801844, + "grad_norm": 3.6088249683380127, + "learning_rate": 1.2818035719486849e-05, + "loss": 0.2985, + "step": 4598 + }, + { + "epoch": 0.6622983870967742, + "grad_norm": 0.6921645402908325, + "learning_rate": 1.280816014824846e-05, + "loss": 0.094, + "step": 4599 + }, + { + "epoch": 0.6624423963133641, + "grad_norm": 0.5305364727973938, + "learning_rate": 1.2798287072464048e-05, + "loss": 0.0836, + "step": 4600 + }, + { + "epoch": 0.662586405529954, + "grad_norm": 5.556802272796631, + "learning_rate": 1.2788416494154446e-05, + "loss": 2.3082, + "step": 4601 + }, + { + "epoch": 0.6627304147465438, + "grad_norm": 1.019511342048645, + "learning_rate": 1.2778548415339986e-05, + "loss": 0.1036, + "step": 4602 + }, + { + "epoch": 0.6628744239631337, + "grad_norm": 4.054712295532227, + "learning_rate": 1.2768682838040494e-05, + "loss": 2.6986, + "step": 4603 + }, + { + "epoch": 0.6630184331797235, + "grad_norm": 3.7242984771728516, + "learning_rate": 1.2758819764275276e-05, + "loss": 1.9545, + "step": 4604 + }, + { + "epoch": 0.6631624423963134, + "grad_norm": 0.5768896341323853, + "learning_rate": 1.2748959196063127e-05, + "loss": 0.0705, + "step": 4605 + }, + { + "epoch": 0.6633064516129032, + "grad_norm": 1.3067033290863037, + "learning_rate": 1.2739101135422332e-05, + "loss": 0.1159, + "step": 4606 + }, + { + "epoch": 0.6634504608294931, + "grad_norm": 0.5009055733680725, + "learning_rate": 1.2729245584370661e-05, + "loss": 0.0576, + "step": 4607 + }, + { + "epoch": 0.663594470046083, + "grad_norm": 0.5384986400604248, + "learning_rate": 1.2719392544925374e-05, + "loss": 0.0641, + "step": 4608 + }, + { + "epoch": 0.6637384792626728, + "grad_norm": 0.8778179883956909, + "learning_rate": 1.270954201910321e-05, + "loss": 0.1242, + "step": 4609 + }, + { + "epoch": 0.6638824884792627, + "grad_norm": 4.4088664054870605, + "learning_rate": 1.2699694008920404e-05, + "loss": 2.2322, + "step": 4610 + }, + { + "epoch": 0.6640264976958525, + "grad_norm": 2.386094808578491, + "learning_rate": 1.268984851639266e-05, + "loss": 0.1779, + "step": 4611 + }, + { + "epoch": 0.6641705069124424, + "grad_norm": 4.750864505767822, + "learning_rate": 1.2680005543535183e-05, + "loss": 1.8525, + "step": 4612 + }, + { + "epoch": 0.6643145161290323, + "grad_norm": 2.085740566253662, + "learning_rate": 1.2670165092362657e-05, + "loss": 0.1456, + "step": 4613 + }, + { + "epoch": 0.6644585253456221, + "grad_norm": 0.8900134563446045, + "learning_rate": 1.2660327164889241e-05, + "loss": 0.1025, + "step": 4614 + }, + { + "epoch": 0.664602534562212, + "grad_norm": 0.49965986609458923, + "learning_rate": 1.2650491763128588e-05, + "loss": 0.0672, + "step": 4615 + }, + { + "epoch": 0.6647465437788018, + "grad_norm": 2.0807321071624756, + "learning_rate": 1.2640658889093831e-05, + "loss": 0.2544, + "step": 4616 + }, + { + "epoch": 0.6648905529953917, + "grad_norm": 0.8595982789993286, + "learning_rate": 1.2630828544797591e-05, + "loss": 0.1036, + "step": 4617 + }, + { + "epoch": 0.6650345622119815, + "grad_norm": 1.5035521984100342, + "learning_rate": 1.2621000732251953e-05, + "loss": 0.112, + "step": 4618 + }, + { + "epoch": 0.6651785714285714, + "grad_norm": 0.70725017786026, + "learning_rate": 1.261117545346851e-05, + "loss": 0.0978, + "step": 4619 + }, + { + "epoch": 0.6653225806451613, + "grad_norm": 0.7478552460670471, + "learning_rate": 1.2601352710458313e-05, + "loss": 0.0764, + "step": 4620 + }, + { + "epoch": 0.6654665898617511, + "grad_norm": 0.5613279342651367, + "learning_rate": 1.2591532505231906e-05, + "loss": 0.0677, + "step": 4621 + }, + { + "epoch": 0.665610599078341, + "grad_norm": 1.306485891342163, + "learning_rate": 1.2581714839799317e-05, + "loss": 0.1024, + "step": 4622 + }, + { + "epoch": 0.6657546082949308, + "grad_norm": 0.8799846172332764, + "learning_rate": 1.2571899716170043e-05, + "loss": 0.1135, + "step": 4623 + }, + { + "epoch": 0.6658986175115207, + "grad_norm": 2.3267722129821777, + "learning_rate": 1.2562087136353066e-05, + "loss": 0.262, + "step": 4624 + }, + { + "epoch": 0.6660426267281107, + "grad_norm": 0.6760618686676025, + "learning_rate": 1.2552277102356846e-05, + "loss": 0.0845, + "step": 4625 + }, + { + "epoch": 0.6661866359447005, + "grad_norm": 0.9315707087516785, + "learning_rate": 1.254246961618932e-05, + "loss": 0.1275, + "step": 4626 + }, + { + "epoch": 0.6663306451612904, + "grad_norm": 0.9868862628936768, + "learning_rate": 1.2532664679857923e-05, + "loss": 0.1124, + "step": 4627 + }, + { + "epoch": 0.6664746543778802, + "grad_norm": 0.8349103331565857, + "learning_rate": 1.2522862295369541e-05, + "loss": 0.1319, + "step": 4628 + }, + { + "epoch": 0.6666186635944701, + "grad_norm": 1.381345272064209, + "learning_rate": 1.2513062464730552e-05, + "loss": 0.1954, + "step": 4629 + }, + { + "epoch": 0.6667626728110599, + "grad_norm": 3.407860040664673, + "learning_rate": 1.25032651899468e-05, + "loss": 1.4311, + "step": 4630 + }, + { + "epoch": 0.6669066820276498, + "grad_norm": 0.5276654362678528, + "learning_rate": 1.2493470473023624e-05, + "loss": 0.0939, + "step": 4631 + }, + { + "epoch": 0.6670506912442397, + "grad_norm": 0.9396846890449524, + "learning_rate": 1.2483678315965827e-05, + "loss": 0.0788, + "step": 4632 + }, + { + "epoch": 0.6671947004608295, + "grad_norm": 0.987046480178833, + "learning_rate": 1.2473888720777685e-05, + "loss": 0.082, + "step": 4633 + }, + { + "epoch": 0.6673387096774194, + "grad_norm": 0.6002364754676819, + "learning_rate": 1.246410168946296e-05, + "loss": 0.0764, + "step": 4634 + }, + { + "epoch": 0.6674827188940092, + "grad_norm": 1.05910062789917, + "learning_rate": 1.245431722402488e-05, + "loss": 0.1113, + "step": 4635 + }, + { + "epoch": 0.6676267281105991, + "grad_norm": 1.3677647113800049, + "learning_rate": 1.2444535326466159e-05, + "loss": 0.1637, + "step": 4636 + }, + { + "epoch": 0.667770737327189, + "grad_norm": 0.5372827649116516, + "learning_rate": 1.243475599878897e-05, + "loss": 0.0804, + "step": 4637 + }, + { + "epoch": 0.6679147465437788, + "grad_norm": 2.073209762573242, + "learning_rate": 1.2424979242994975e-05, + "loss": 0.207, + "step": 4638 + }, + { + "epoch": 0.6680587557603687, + "grad_norm": 0.5983861088752747, + "learning_rate": 1.2415205061085297e-05, + "loss": 0.0957, + "step": 4639 + }, + { + "epoch": 0.6682027649769585, + "grad_norm": 0.8876417279243469, + "learning_rate": 1.2405433455060545e-05, + "loss": 0.1186, + "step": 4640 + }, + { + "epoch": 0.6683467741935484, + "grad_norm": 0.8208742737770081, + "learning_rate": 1.239566442692079e-05, + "loss": 0.1122, + "step": 4641 + }, + { + "epoch": 0.6684907834101382, + "grad_norm": 2.1803038120269775, + "learning_rate": 1.2385897978665579e-05, + "loss": 0.2186, + "step": 4642 + }, + { + "epoch": 0.6686347926267281, + "grad_norm": 0.7487136721611023, + "learning_rate": 1.237613411229393e-05, + "loss": 0.0766, + "step": 4643 + }, + { + "epoch": 0.668778801843318, + "grad_norm": 1.0320487022399902, + "learning_rate": 1.2366372829804337e-05, + "loss": 0.1193, + "step": 4644 + }, + { + "epoch": 0.6689228110599078, + "grad_norm": 1.2830699682235718, + "learning_rate": 1.235661413319476e-05, + "loss": 0.0901, + "step": 4645 + }, + { + "epoch": 0.6690668202764977, + "grad_norm": 3.957164764404297, + "learning_rate": 1.2346858024462632e-05, + "loss": 0.6026, + "step": 4646 + }, + { + "epoch": 0.6692108294930875, + "grad_norm": 0.8448511958122253, + "learning_rate": 1.2337104505604857e-05, + "loss": 0.1275, + "step": 4647 + }, + { + "epoch": 0.6693548387096774, + "grad_norm": 1.1357226371765137, + "learning_rate": 1.2327353578617806e-05, + "loss": 0.1205, + "step": 4648 + }, + { + "epoch": 0.6694988479262672, + "grad_norm": 4.291447639465332, + "learning_rate": 1.2317605245497323e-05, + "loss": 0.7986, + "step": 4649 + }, + { + "epoch": 0.6696428571428571, + "grad_norm": 1.167960524559021, + "learning_rate": 1.2307859508238717e-05, + "loss": 0.0896, + "step": 4650 + }, + { + "epoch": 0.669786866359447, + "grad_norm": 7.8041558265686035, + "learning_rate": 1.229811636883677e-05, + "loss": 1.2143, + "step": 4651 + }, + { + "epoch": 0.6699308755760369, + "grad_norm": 3.6697072982788086, + "learning_rate": 1.228837582928573e-05, + "loss": 1.4257, + "step": 4652 + }, + { + "epoch": 0.6700748847926268, + "grad_norm": 5.830082893371582, + "learning_rate": 1.2278637891579304e-05, + "loss": 1.99, + "step": 4653 + }, + { + "epoch": 0.6702188940092166, + "grad_norm": 1.0331271886825562, + "learning_rate": 1.2268902557710693e-05, + "loss": 0.1077, + "step": 4654 + }, + { + "epoch": 0.6703629032258065, + "grad_norm": 1.4303230047225952, + "learning_rate": 1.2259169829672539e-05, + "loss": 0.1662, + "step": 4655 + }, + { + "epoch": 0.6705069124423964, + "grad_norm": 1.00997793674469, + "learning_rate": 1.2249439709456958e-05, + "loss": 0.0954, + "step": 4656 + }, + { + "epoch": 0.6706509216589862, + "grad_norm": 5.3531084060668945, + "learning_rate": 1.223971219905554e-05, + "loss": 1.1738, + "step": 4657 + }, + { + "epoch": 0.6707949308755761, + "grad_norm": 4.470348358154297, + "learning_rate": 1.2229987300459323e-05, + "loss": 1.5206, + "step": 4658 + }, + { + "epoch": 0.6709389400921659, + "grad_norm": 1.2588025331497192, + "learning_rate": 1.222026501565883e-05, + "loss": 0.1286, + "step": 4659 + }, + { + "epoch": 0.6710829493087558, + "grad_norm": 1.8683335781097412, + "learning_rate": 1.221054534664404e-05, + "loss": 0.1708, + "step": 4660 + }, + { + "epoch": 0.6712269585253456, + "grad_norm": 1.0585113763809204, + "learning_rate": 1.2200828295404396e-05, + "loss": 3.8876, + "step": 4661 + }, + { + "epoch": 0.6713709677419355, + "grad_norm": 0.8623279333114624, + "learning_rate": 1.2191113863928805e-05, + "loss": 0.0842, + "step": 4662 + }, + { + "epoch": 0.6715149769585254, + "grad_norm": 0.7597923278808594, + "learning_rate": 1.218140205420564e-05, + "loss": 0.0939, + "step": 4663 + }, + { + "epoch": 0.6716589861751152, + "grad_norm": 5.659722805023193, + "learning_rate": 1.2171692868222739e-05, + "loss": 0.8298, + "step": 4664 + }, + { + "epoch": 0.6718029953917051, + "grad_norm": 0.8588436245918274, + "learning_rate": 1.2161986307967398e-05, + "loss": 0.1037, + "step": 4665 + }, + { + "epoch": 0.6719470046082949, + "grad_norm": 2.7791121006011963, + "learning_rate": 1.2152282375426383e-05, + "loss": 0.1114, + "step": 4666 + }, + { + "epoch": 0.6720910138248848, + "grad_norm": 0.7350063323974609, + "learning_rate": 1.214258107258591e-05, + "loss": 0.0849, + "step": 4667 + }, + { + "epoch": 0.6722350230414746, + "grad_norm": 0.5221338272094727, + "learning_rate": 1.213288240143167e-05, + "loss": 0.055, + "step": 4668 + }, + { + "epoch": 0.6723790322580645, + "grad_norm": 0.9351997971534729, + "learning_rate": 1.2123186363948805e-05, + "loss": 0.1047, + "step": 4669 + }, + { + "epoch": 0.6725230414746544, + "grad_norm": 0.9928014278411865, + "learning_rate": 1.2113492962121924e-05, + "loss": 4.2203, + "step": 4670 + }, + { + "epoch": 0.6726670506912442, + "grad_norm": 0.6971950531005859, + "learning_rate": 1.21038021979351e-05, + "loss": 0.0849, + "step": 4671 + }, + { + "epoch": 0.6728110599078341, + "grad_norm": 4.05716609954834, + "learning_rate": 1.209411407337185e-05, + "loss": 2.1528, + "step": 4672 + }, + { + "epoch": 0.6729550691244239, + "grad_norm": 0.6635546088218689, + "learning_rate": 1.2084428590415172e-05, + "loss": 0.0752, + "step": 4673 + }, + { + "epoch": 0.6730990783410138, + "grad_norm": 0.8888615369796753, + "learning_rate": 1.2074745751047509e-05, + "loss": 0.0907, + "step": 4674 + }, + { + "epoch": 0.6732430875576036, + "grad_norm": 0.6707918643951416, + "learning_rate": 1.2065065557250765e-05, + "loss": 0.0612, + "step": 4675 + }, + { + "epoch": 0.6733870967741935, + "grad_norm": 0.8071878552436829, + "learning_rate": 1.2055388011006311e-05, + "loss": 0.1085, + "step": 4676 + }, + { + "epoch": 0.6735311059907834, + "grad_norm": 1.0382342338562012, + "learning_rate": 1.204571311429496e-05, + "loss": 0.1009, + "step": 4677 + }, + { + "epoch": 0.6736751152073732, + "grad_norm": 1.0315523147583008, + "learning_rate": 1.2036040869097001e-05, + "loss": 0.1053, + "step": 4678 + }, + { + "epoch": 0.6738191244239631, + "grad_norm": 0.8072168231010437, + "learning_rate": 1.2026371277392165e-05, + "loss": 0.0852, + "step": 4679 + }, + { + "epoch": 0.673963133640553, + "grad_norm": 4.352389335632324, + "learning_rate": 1.2016704341159652e-05, + "loss": 1.3966, + "step": 4680 + }, + { + "epoch": 0.6741071428571429, + "grad_norm": 1.453310251235962, + "learning_rate": 1.20070400623781e-05, + "loss": 0.1604, + "step": 4681 + }, + { + "epoch": 0.6742511520737328, + "grad_norm": 2.0545120239257812, + "learning_rate": 1.1997378443025634e-05, + "loss": 0.1647, + "step": 4682 + }, + { + "epoch": 0.6743951612903226, + "grad_norm": 1.0553659200668335, + "learning_rate": 1.198771948507981e-05, + "loss": 0.1477, + "step": 4683 + }, + { + "epoch": 0.6745391705069125, + "grad_norm": 0.657688558101654, + "learning_rate": 1.1978063190517644e-05, + "loss": 0.1049, + "step": 4684 + }, + { + "epoch": 0.6746831797235023, + "grad_norm": 1.1342425346374512, + "learning_rate": 1.196840956131561e-05, + "loss": 0.1154, + "step": 4685 + }, + { + "epoch": 0.6748271889400922, + "grad_norm": 0.4943380355834961, + "learning_rate": 1.1958758599449631e-05, + "loss": 0.07, + "step": 4686 + }, + { + "epoch": 0.674971198156682, + "grad_norm": 0.483540803194046, + "learning_rate": 1.1949110306895095e-05, + "loss": 0.0613, + "step": 4687 + }, + { + "epoch": 0.6751152073732719, + "grad_norm": 0.9789985418319702, + "learning_rate": 1.1939464685626833e-05, + "loss": 0.0897, + "step": 4688 + }, + { + "epoch": 0.6752592165898618, + "grad_norm": 1.9113670587539673, + "learning_rate": 1.1929821737619131e-05, + "loss": 0.2315, + "step": 4689 + }, + { + "epoch": 0.6754032258064516, + "grad_norm": 1.4149609804153442, + "learning_rate": 1.1920181464845736e-05, + "loss": 0.1365, + "step": 4690 + }, + { + "epoch": 0.6755472350230415, + "grad_norm": 1.410359263420105, + "learning_rate": 1.1910543869279835e-05, + "loss": 0.1973, + "step": 4691 + }, + { + "epoch": 0.6756912442396313, + "grad_norm": 4.004401206970215, + "learning_rate": 1.1900908952894076e-05, + "loss": 1.0894, + "step": 4692 + }, + { + "epoch": 0.6758352534562212, + "grad_norm": 3.52211856842041, + "learning_rate": 1.1891276717660557e-05, + "loss": 1.447, + "step": 4693 + }, + { + "epoch": 0.675979262672811, + "grad_norm": 0.9014118313789368, + "learning_rate": 1.1881647165550824e-05, + "loss": 0.0946, + "step": 4694 + }, + { + "epoch": 0.6761232718894009, + "grad_norm": 0.7397797107696533, + "learning_rate": 1.187202029853588e-05, + "loss": 0.0568, + "step": 4695 + }, + { + "epoch": 0.6762672811059908, + "grad_norm": 0.7482265830039978, + "learning_rate": 1.1862396118586167e-05, + "loss": 0.1294, + "step": 4696 + }, + { + "epoch": 0.6764112903225806, + "grad_norm": 0.9032662510871887, + "learning_rate": 1.1852774627671592e-05, + "loss": 0.0949, + "step": 4697 + }, + { + "epoch": 0.6765552995391705, + "grad_norm": 3.7546169757843018, + "learning_rate": 1.1843155827761498e-05, + "loss": 2.2088, + "step": 4698 + }, + { + "epoch": 0.6766993087557603, + "grad_norm": 0.6201726794242859, + "learning_rate": 1.1833539720824689e-05, + "loss": 0.0791, + "step": 4699 + }, + { + "epoch": 0.6768433179723502, + "grad_norm": 0.6357205510139465, + "learning_rate": 1.1823926308829408e-05, + "loss": 0.0671, + "step": 4700 + }, + { + "epoch": 0.6769873271889401, + "grad_norm": 0.6208476424217224, + "learning_rate": 1.181431559374335e-05, + "loss": 0.0892, + "step": 4701 + }, + { + "epoch": 0.6771313364055299, + "grad_norm": 1.9482991695404053, + "learning_rate": 1.180470757753366e-05, + "loss": 0.242, + "step": 4702 + }, + { + "epoch": 0.6772753456221198, + "grad_norm": 0.6757457852363586, + "learning_rate": 1.1795102262166931e-05, + "loss": 0.0721, + "step": 4703 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.8411000967025757, + "learning_rate": 1.17854996496092e-05, + "loss": 0.0724, + "step": 4704 + }, + { + "epoch": 0.6775633640552995, + "grad_norm": 0.6702768206596375, + "learning_rate": 1.1775899741825947e-05, + "loss": 0.063, + "step": 4705 + }, + { + "epoch": 0.6777073732718893, + "grad_norm": 1.01582670211792, + "learning_rate": 1.1766302540782109e-05, + "loss": 0.1237, + "step": 4706 + }, + { + "epoch": 0.6778513824884793, + "grad_norm": 3.518317699432373, + "learning_rate": 1.175670804844206e-05, + "loss": 0.1942, + "step": 4707 + }, + { + "epoch": 0.6779953917050692, + "grad_norm": 0.6925159096717834, + "learning_rate": 1.1747116266769617e-05, + "loss": 0.0774, + "step": 4708 + }, + { + "epoch": 0.678139400921659, + "grad_norm": 0.672765851020813, + "learning_rate": 1.1737527197728066e-05, + "loss": 0.1034, + "step": 4709 + }, + { + "epoch": 0.6782834101382489, + "grad_norm": 0.6578497886657715, + "learning_rate": 1.1727940843280108e-05, + "loss": 0.0664, + "step": 4710 + }, + { + "epoch": 0.6784274193548387, + "grad_norm": 0.8283602595329285, + "learning_rate": 1.17183572053879e-05, + "loss": 0.1036, + "step": 4711 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 0.7091240286827087, + "learning_rate": 1.1708776286013046e-05, + "loss": 0.1092, + "step": 4712 + }, + { + "epoch": 0.6787154377880185, + "grad_norm": 12.330144882202148, + "learning_rate": 1.1699198087116589e-05, + "loss": 2.9213, + "step": 4713 + }, + { + "epoch": 0.6788594470046083, + "grad_norm": 0.8531458973884583, + "learning_rate": 1.1689622610659016e-05, + "loss": 0.0931, + "step": 4714 + }, + { + "epoch": 0.6790034562211982, + "grad_norm": 0.8456393480300903, + "learning_rate": 1.1680049858600262e-05, + "loss": 0.0925, + "step": 4715 + }, + { + "epoch": 0.679147465437788, + "grad_norm": 5.375692844390869, + "learning_rate": 1.1670479832899695e-05, + "loss": 1.507, + "step": 4716 + }, + { + "epoch": 0.6792914746543779, + "grad_norm": 5.265297889709473, + "learning_rate": 1.166091253551613e-05, + "loss": 0.7664, + "step": 4717 + }, + { + "epoch": 0.6794354838709677, + "grad_norm": 0.6192951798439026, + "learning_rate": 1.1651347968407827e-05, + "loss": 0.0768, + "step": 4718 + }, + { + "epoch": 0.6795794930875576, + "grad_norm": 0.8199037313461304, + "learning_rate": 1.1641786133532482e-05, + "loss": 0.0828, + "step": 4719 + }, + { + "epoch": 0.6797235023041475, + "grad_norm": 0.48319604992866516, + "learning_rate": 1.1632227032847234e-05, + "loss": 0.0733, + "step": 4720 + }, + { + "epoch": 0.6798675115207373, + "grad_norm": 0.9842122793197632, + "learning_rate": 1.1622670668308663e-05, + "loss": 0.0809, + "step": 4721 + }, + { + "epoch": 0.6800115207373272, + "grad_norm": 5.11141300201416, + "learning_rate": 1.1613117041872785e-05, + "loss": 1.6345, + "step": 4722 + }, + { + "epoch": 0.680155529953917, + "grad_norm": 0.7032898664474487, + "learning_rate": 1.1603566155495058e-05, + "loss": 0.0797, + "step": 4723 + }, + { + "epoch": 0.6802995391705069, + "grad_norm": 0.9918705821037292, + "learning_rate": 1.1594018011130381e-05, + "loss": 0.1035, + "step": 4724 + }, + { + "epoch": 0.6804435483870968, + "grad_norm": 0.7478086352348328, + "learning_rate": 1.1584472610733094e-05, + "loss": 0.0959, + "step": 4725 + }, + { + "epoch": 0.6805875576036866, + "grad_norm": 0.8036831021308899, + "learning_rate": 1.1574929956256964e-05, + "loss": 0.1, + "step": 4726 + }, + { + "epoch": 0.6807315668202765, + "grad_norm": 0.2993578016757965, + "learning_rate": 1.1565390049655208e-05, + "loss": 0.0483, + "step": 4727 + }, + { + "epoch": 0.6808755760368663, + "grad_norm": 0.8939865231513977, + "learning_rate": 1.1555852892880478e-05, + "loss": 0.092, + "step": 4728 + }, + { + "epoch": 0.6810195852534562, + "grad_norm": 1.3166944980621338, + "learning_rate": 1.1546318487884858e-05, + "loss": 0.0873, + "step": 4729 + }, + { + "epoch": 0.681163594470046, + "grad_norm": 0.8767579793930054, + "learning_rate": 1.1536786836619873e-05, + "loss": 0.115, + "step": 4730 + }, + { + "epoch": 0.6813076036866359, + "grad_norm": 0.4627261757850647, + "learning_rate": 1.1527257941036484e-05, + "loss": 0.0464, + "step": 4731 + }, + { + "epoch": 0.6814516129032258, + "grad_norm": 0.9824691414833069, + "learning_rate": 1.1517731803085086e-05, + "loss": 0.1164, + "step": 4732 + }, + { + "epoch": 0.6815956221198156, + "grad_norm": 15.782502174377441, + "learning_rate": 1.1508208424715511e-05, + "loss": 2.7415, + "step": 4733 + }, + { + "epoch": 0.6817396313364056, + "grad_norm": 4.916112422943115, + "learning_rate": 1.1498687807877028e-05, + "loss": 2.0309, + "step": 4734 + }, + { + "epoch": 0.6818836405529954, + "grad_norm": 1.4558016061782837, + "learning_rate": 1.1489169954518328e-05, + "loss": 0.1541, + "step": 4735 + }, + { + "epoch": 0.6820276497695853, + "grad_norm": 0.7218899130821228, + "learning_rate": 1.1479654866587567e-05, + "loss": 0.0799, + "step": 4736 + }, + { + "epoch": 0.6821716589861752, + "grad_norm": 0.45186707377433777, + "learning_rate": 1.1470142546032304e-05, + "loss": 0.0662, + "step": 4737 + }, + { + "epoch": 0.682315668202765, + "grad_norm": 0.9955732226371765, + "learning_rate": 1.1460632994799545e-05, + "loss": 0.1104, + "step": 4738 + }, + { + "epoch": 0.6824596774193549, + "grad_norm": 0.8259661197662354, + "learning_rate": 1.1451126214835725e-05, + "loss": 0.0937, + "step": 4739 + }, + { + "epoch": 0.6826036866359447, + "grad_norm": 1.1213934421539307, + "learning_rate": 1.1441622208086714e-05, + "loss": 0.1142, + "step": 4740 + }, + { + "epoch": 0.6827476958525346, + "grad_norm": 0.3503437042236328, + "learning_rate": 1.1432120976497815e-05, + "loss": 0.0522, + "step": 4741 + }, + { + "epoch": 0.6828917050691244, + "grad_norm": 3.7791857719421387, + "learning_rate": 1.1422622522013762e-05, + "loss": 1.058, + "step": 4742 + }, + { + "epoch": 0.6830357142857143, + "grad_norm": 0.9035323858261108, + "learning_rate": 1.1413126846578723e-05, + "loss": 0.1017, + "step": 4743 + }, + { + "epoch": 0.6831797235023042, + "grad_norm": 0.7480999231338501, + "learning_rate": 1.140363395213629e-05, + "loss": 0.0858, + "step": 4744 + }, + { + "epoch": 0.683323732718894, + "grad_norm": 1.4563089609146118, + "learning_rate": 1.1394143840629489e-05, + "loss": 0.1597, + "step": 4745 + }, + { + "epoch": 0.6834677419354839, + "grad_norm": 0.8814010620117188, + "learning_rate": 1.1384656514000786e-05, + "loss": 0.1022, + "step": 4746 + }, + { + "epoch": 0.6836117511520737, + "grad_norm": 3.595524549484253, + "learning_rate": 1.1375171974192064e-05, + "loss": 1.3394, + "step": 4747 + }, + { + "epoch": 0.6837557603686636, + "grad_norm": 1.8559273481369019, + "learning_rate": 1.136569022314464e-05, + "loss": 0.1503, + "step": 4748 + }, + { + "epoch": 0.6838997695852534, + "grad_norm": 1.5138812065124512, + "learning_rate": 1.1356211262799263e-05, + "loss": 0.1597, + "step": 4749 + }, + { + "epoch": 0.6840437788018433, + "grad_norm": 0.7536826133728027, + "learning_rate": 1.1346735095096106e-05, + "loss": 0.0879, + "step": 4750 + }, + { + "epoch": 0.6841877880184332, + "grad_norm": 0.8588994145393372, + "learning_rate": 1.1337261721974776e-05, + "loss": 0.0891, + "step": 4751 + }, + { + "epoch": 0.684331797235023, + "grad_norm": 0.6611497402191162, + "learning_rate": 1.1327791145374304e-05, + "loss": 0.0847, + "step": 4752 + }, + { + "epoch": 0.6844758064516129, + "grad_norm": 0.7358364462852478, + "learning_rate": 1.1318323367233146e-05, + "loss": 0.1015, + "step": 4753 + }, + { + "epoch": 0.6846198156682027, + "grad_norm": 0.8127564191818237, + "learning_rate": 1.1308858389489191e-05, + "loss": 0.0907, + "step": 4754 + }, + { + "epoch": 0.6847638248847926, + "grad_norm": 5.6609907150268555, + "learning_rate": 1.1299396214079756e-05, + "loss": 2.0068, + "step": 4755 + }, + { + "epoch": 0.6849078341013825, + "grad_norm": 0.6417825818061829, + "learning_rate": 1.1289936842941575e-05, + "loss": 0.0721, + "step": 4756 + }, + { + "epoch": 0.6850518433179723, + "grad_norm": 4.629251956939697, + "learning_rate": 1.1280480278010811e-05, + "loss": 1.5813, + "step": 4757 + }, + { + "epoch": 0.6851958525345622, + "grad_norm": 0.46111515164375305, + "learning_rate": 1.1271026521223066e-05, + "loss": 0.079, + "step": 4758 + }, + { + "epoch": 0.685339861751152, + "grad_norm": 1.4948408603668213, + "learning_rate": 1.1261575574513355e-05, + "loss": 0.1461, + "step": 4759 + }, + { + "epoch": 0.6854838709677419, + "grad_norm": 1.2255606651306152, + "learning_rate": 1.1252127439816117e-05, + "loss": 0.1401, + "step": 4760 + }, + { + "epoch": 0.6856278801843319, + "grad_norm": 9.555606842041016, + "learning_rate": 1.1242682119065218e-05, + "loss": 2.5795, + "step": 4761 + }, + { + "epoch": 0.6857718894009217, + "grad_norm": 0.7744283676147461, + "learning_rate": 1.1233239614193947e-05, + "loss": 0.0874, + "step": 4762 + }, + { + "epoch": 0.6859158986175116, + "grad_norm": 0.9498125910758972, + "learning_rate": 1.1223799927135018e-05, + "loss": 0.1036, + "step": 4763 + }, + { + "epoch": 0.6860599078341014, + "grad_norm": 0.8468683362007141, + "learning_rate": 1.1214363059820571e-05, + "loss": 0.1234, + "step": 4764 + }, + { + "epoch": 0.6862039170506913, + "grad_norm": 0.624097466468811, + "learning_rate": 1.1204929014182162e-05, + "loss": 0.0799, + "step": 4765 + }, + { + "epoch": 0.6863479262672811, + "grad_norm": 0.34801191091537476, + "learning_rate": 1.1195497792150776e-05, + "loss": 0.0677, + "step": 4766 + }, + { + "epoch": 0.686491935483871, + "grad_norm": 3.0948917865753174, + "learning_rate": 1.1186069395656814e-05, + "loss": 1.6341, + "step": 4767 + }, + { + "epoch": 0.6866359447004609, + "grad_norm": 0.8932515382766724, + "learning_rate": 1.1176643826630104e-05, + "loss": 0.0839, + "step": 4768 + }, + { + "epoch": 0.6867799539170507, + "grad_norm": 7.016218662261963, + "learning_rate": 1.1167221086999895e-05, + "loss": 1.9781, + "step": 4769 + }, + { + "epoch": 0.6869239631336406, + "grad_norm": 0.6244432330131531, + "learning_rate": 1.1157801178694854e-05, + "loss": 0.0621, + "step": 4770 + }, + { + "epoch": 0.6870679723502304, + "grad_norm": 6.107020854949951, + "learning_rate": 1.1148384103643068e-05, + "loss": 1.1037, + "step": 4771 + }, + { + "epoch": 0.6872119815668203, + "grad_norm": 0.5177151560783386, + "learning_rate": 1.1138969863772048e-05, + "loss": 0.0789, + "step": 4772 + }, + { + "epoch": 0.6873559907834101, + "grad_norm": 0.2918751835823059, + "learning_rate": 1.1129558461008718e-05, + "loss": 0.0641, + "step": 4773 + }, + { + "epoch": 0.6875, + "grad_norm": 0.5692476630210876, + "learning_rate": 1.112014989727943e-05, + "loss": 0.0706, + "step": 4774 + }, + { + "epoch": 0.6876440092165899, + "grad_norm": 1.167686104774475, + "learning_rate": 1.1110744174509952e-05, + "loss": 0.1358, + "step": 4775 + }, + { + "epoch": 0.6877880184331797, + "grad_norm": 1.9432713985443115, + "learning_rate": 1.1101341294625456e-05, + "loss": 0.1648, + "step": 4776 + }, + { + "epoch": 0.6879320276497696, + "grad_norm": 0.46722209453582764, + "learning_rate": 1.1091941259550562e-05, + "loss": 0.0681, + "step": 4777 + }, + { + "epoch": 0.6880760368663594, + "grad_norm": 0.7130010724067688, + "learning_rate": 1.1082544071209289e-05, + "loss": 0.0952, + "step": 4778 + }, + { + "epoch": 0.6882200460829493, + "grad_norm": 1.0648661851882935, + "learning_rate": 1.1073149731525068e-05, + "loss": 0.1111, + "step": 4779 + }, + { + "epoch": 0.6883640552995391, + "grad_norm": 0.3914143145084381, + "learning_rate": 1.1063758242420757e-05, + "loss": 0.0417, + "step": 4780 + }, + { + "epoch": 0.688508064516129, + "grad_norm": 0.6815327405929565, + "learning_rate": 1.1054369605818629e-05, + "loss": 0.0893, + "step": 4781 + }, + { + "epoch": 0.6886520737327189, + "grad_norm": 0.615227222442627, + "learning_rate": 1.1044983823640371e-05, + "loss": 4.3103, + "step": 4782 + }, + { + "epoch": 0.6887960829493087, + "grad_norm": 1.181067705154419, + "learning_rate": 1.1035600897807084e-05, + "loss": 0.1111, + "step": 4783 + }, + { + "epoch": 0.6889400921658986, + "grad_norm": 0.7578305602073669, + "learning_rate": 1.102622083023929e-05, + "loss": 0.0954, + "step": 4784 + }, + { + "epoch": 0.6890841013824884, + "grad_norm": 1.8270394802093506, + "learning_rate": 1.1016843622856923e-05, + "loss": 0.2176, + "step": 4785 + }, + { + "epoch": 0.6892281105990783, + "grad_norm": 0.9862242341041565, + "learning_rate": 1.100746927757933e-05, + "loss": 0.1325, + "step": 4786 + }, + { + "epoch": 0.6893721198156681, + "grad_norm": 0.6676281690597534, + "learning_rate": 1.0998097796325273e-05, + "loss": 0.0822, + "step": 4787 + }, + { + "epoch": 0.6895161290322581, + "grad_norm": 0.40233927965164185, + "learning_rate": 1.098872918101293e-05, + "loss": 0.0636, + "step": 4788 + }, + { + "epoch": 0.689660138248848, + "grad_norm": 0.6483054161071777, + "learning_rate": 1.0979363433559891e-05, + "loss": 0.0814, + "step": 4789 + }, + { + "epoch": 0.6898041474654378, + "grad_norm": 0.9914736151695251, + "learning_rate": 1.097000055588316e-05, + "loss": 0.079, + "step": 4790 + }, + { + "epoch": 0.6899481566820277, + "grad_norm": 0.589655876159668, + "learning_rate": 1.0960640549899149e-05, + "loss": 0.0532, + "step": 4791 + }, + { + "epoch": 0.6900921658986175, + "grad_norm": 0.6544744372367859, + "learning_rate": 1.0951283417523687e-05, + "loss": 0.0926, + "step": 4792 + }, + { + "epoch": 0.6902361751152074, + "grad_norm": 3.925034284591675, + "learning_rate": 1.0941929160672013e-05, + "loss": 1.1986, + "step": 4793 + }, + { + "epoch": 0.6903801843317973, + "grad_norm": 1.7778503894805908, + "learning_rate": 1.093257778125877e-05, + "loss": 0.1404, + "step": 4794 + }, + { + "epoch": 0.6905241935483871, + "grad_norm": 0.7887038588523865, + "learning_rate": 1.0923229281198039e-05, + "loss": 0.0818, + "step": 4795 + }, + { + "epoch": 0.690668202764977, + "grad_norm": 0.8982183933258057, + "learning_rate": 1.0913883662403283e-05, + "loss": 0.102, + "step": 4796 + }, + { + "epoch": 0.6908122119815668, + "grad_norm": 0.6467613577842712, + "learning_rate": 1.0904540926787382e-05, + "loss": 0.0724, + "step": 4797 + }, + { + "epoch": 0.6909562211981567, + "grad_norm": 1.0168997049331665, + "learning_rate": 1.0895201076262631e-05, + "loss": 0.1289, + "step": 4798 + }, + { + "epoch": 0.6911002304147466, + "grad_norm": 0.49085086584091187, + "learning_rate": 1.0885864112740734e-05, + "loss": 0.0615, + "step": 4799 + }, + { + "epoch": 0.6912442396313364, + "grad_norm": 0.5700295567512512, + "learning_rate": 1.0876530038132802e-05, + "loss": 0.0705, + "step": 4800 + }, + { + "epoch": 0.6913882488479263, + "grad_norm": 0.5666221380233765, + "learning_rate": 1.086719885434935e-05, + "loss": 0.0716, + "step": 4801 + }, + { + "epoch": 0.6915322580645161, + "grad_norm": 0.7403396368026733, + "learning_rate": 1.085787056330031e-05, + "loss": 0.0776, + "step": 4802 + }, + { + "epoch": 0.691676267281106, + "grad_norm": 1.085893154144287, + "learning_rate": 1.084854516689502e-05, + "loss": 0.1139, + "step": 4803 + }, + { + "epoch": 0.6918202764976958, + "grad_norm": 1.0145865678787231, + "learning_rate": 1.0839222667042218e-05, + "loss": 0.1018, + "step": 4804 + }, + { + "epoch": 0.6919642857142857, + "grad_norm": 0.6114829182624817, + "learning_rate": 1.082990306565006e-05, + "loss": 0.0638, + "step": 4805 + }, + { + "epoch": 0.6921082949308756, + "grad_norm": 0.9362423419952393, + "learning_rate": 1.0820586364626104e-05, + "loss": 0.0914, + "step": 4806 + }, + { + "epoch": 0.6922523041474654, + "grad_norm": 1.070720911026001, + "learning_rate": 1.0811272565877309e-05, + "loss": 0.1066, + "step": 4807 + }, + { + "epoch": 0.6923963133640553, + "grad_norm": 1.6487164497375488, + "learning_rate": 1.080196167131005e-05, + "loss": 0.1489, + "step": 4808 + }, + { + "epoch": 0.6925403225806451, + "grad_norm": 1.256685495376587, + "learning_rate": 1.0792653682830099e-05, + "loss": 0.159, + "step": 4809 + }, + { + "epoch": 0.692684331797235, + "grad_norm": 0.6120016574859619, + "learning_rate": 1.0783348602342639e-05, + "loss": 0.0822, + "step": 4810 + }, + { + "epoch": 0.6928283410138248, + "grad_norm": 0.6002236604690552, + "learning_rate": 1.0774046431752253e-05, + "loss": 0.0604, + "step": 4811 + }, + { + "epoch": 0.6929723502304147, + "grad_norm": 0.813616931438446, + "learning_rate": 1.076474717296293e-05, + "loss": 0.0824, + "step": 4812 + }, + { + "epoch": 0.6931163594470046, + "grad_norm": 3.389533042907715, + "learning_rate": 1.0755450827878067e-05, + "loss": 0.1933, + "step": 4813 + }, + { + "epoch": 0.6932603686635944, + "grad_norm": 1.9095733165740967, + "learning_rate": 1.074615739840046e-05, + "loss": 0.1308, + "step": 4814 + }, + { + "epoch": 0.6934043778801844, + "grad_norm": 0.8831154108047485, + "learning_rate": 1.0736866886432311e-05, + "loss": 0.1555, + "step": 4815 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 0.8943410515785217, + "learning_rate": 1.0727579293875211e-05, + "loss": 0.0951, + "step": 4816 + }, + { + "epoch": 0.6936923963133641, + "grad_norm": 0.7077829241752625, + "learning_rate": 1.0718294622630188e-05, + "loss": 0.0805, + "step": 4817 + }, + { + "epoch": 0.693836405529954, + "grad_norm": 0.7476462125778198, + "learning_rate": 1.0709012874597637e-05, + "loss": 0.0923, + "step": 4818 + }, + { + "epoch": 0.6939804147465438, + "grad_norm": 5.395347595214844, + "learning_rate": 1.0699734051677373e-05, + "loss": 1.5429, + "step": 4819 + }, + { + "epoch": 0.6941244239631337, + "grad_norm": 0.6682755947113037, + "learning_rate": 1.06904581557686e-05, + "loss": 0.0678, + "step": 4820 + }, + { + "epoch": 0.6942684331797235, + "grad_norm": 0.5142917037010193, + "learning_rate": 1.0681185188769935e-05, + "loss": 0.0627, + "step": 4821 + }, + { + "epoch": 0.6944124423963134, + "grad_norm": 0.8123272061347961, + "learning_rate": 1.067191515257939e-05, + "loss": 0.0912, + "step": 4822 + }, + { + "epoch": 0.6945564516129032, + "grad_norm": 1.1369386911392212, + "learning_rate": 1.0662648049094375e-05, + "loss": 0.1053, + "step": 4823 + }, + { + "epoch": 0.6947004608294931, + "grad_norm": 1.0002615451812744, + "learning_rate": 1.0653383880211704e-05, + "loss": 4.1569, + "step": 4824 + }, + { + "epoch": 0.694844470046083, + "grad_norm": 0.8034593462944031, + "learning_rate": 1.0644122647827587e-05, + "loss": 0.0952, + "step": 4825 + }, + { + "epoch": 0.6949884792626728, + "grad_norm": 0.42761844396591187, + "learning_rate": 1.0634864353837636e-05, + "loss": 0.0587, + "step": 4826 + }, + { + "epoch": 0.6951324884792627, + "grad_norm": 3.3680851459503174, + "learning_rate": 1.062560900013686e-05, + "loss": 0.9209, + "step": 4827 + }, + { + "epoch": 0.6952764976958525, + "grad_norm": 0.8636692762374878, + "learning_rate": 1.0616356588619663e-05, + "loss": 0.1035, + "step": 4828 + }, + { + "epoch": 0.6954205069124424, + "grad_norm": 1.2419947385787964, + "learning_rate": 1.0607107121179855e-05, + "loss": 0.1238, + "step": 4829 + }, + { + "epoch": 0.6955645161290323, + "grad_norm": 0.7801353931427002, + "learning_rate": 1.0597860599710636e-05, + "loss": 0.0795, + "step": 4830 + }, + { + "epoch": 0.6957085253456221, + "grad_norm": 0.9628891944885254, + "learning_rate": 1.0588617026104602e-05, + "loss": 0.1312, + "step": 4831 + }, + { + "epoch": 0.695852534562212, + "grad_norm": 0.8527974486351013, + "learning_rate": 1.0579376402253755e-05, + "loss": 0.0939, + "step": 4832 + }, + { + "epoch": 0.6959965437788018, + "grad_norm": 0.3488309979438782, + "learning_rate": 1.0570138730049484e-05, + "loss": 0.0523, + "step": 4833 + }, + { + "epoch": 0.6961405529953917, + "grad_norm": 0.905326247215271, + "learning_rate": 1.0560904011382578e-05, + "loss": 0.125, + "step": 4834 + }, + { + "epoch": 0.6962845622119815, + "grad_norm": 1.2500941753387451, + "learning_rate": 1.055167224814322e-05, + "loss": 0.1341, + "step": 4835 + }, + { + "epoch": 0.6964285714285714, + "grad_norm": 0.790368378162384, + "learning_rate": 1.0542443442220989e-05, + "loss": 0.0844, + "step": 4836 + }, + { + "epoch": 0.6965725806451613, + "grad_norm": 7.263467788696289, + "learning_rate": 1.0533217595504858e-05, + "loss": 1.8405, + "step": 4837 + }, + { + "epoch": 0.6967165898617511, + "grad_norm": 1.1433465480804443, + "learning_rate": 1.0523994709883195e-05, + "loss": 0.141, + "step": 4838 + }, + { + "epoch": 0.696860599078341, + "grad_norm": 0.5899380445480347, + "learning_rate": 1.0514774787243761e-05, + "loss": 0.0687, + "step": 4839 + }, + { + "epoch": 0.6970046082949308, + "grad_norm": 1.0479309558868408, + "learning_rate": 1.0505557829473714e-05, + "loss": 0.148, + "step": 4840 + }, + { + "epoch": 0.6971486175115207, + "grad_norm": 0.4414390027523041, + "learning_rate": 1.0496343838459596e-05, + "loss": 0.0578, + "step": 4841 + }, + { + "epoch": 0.6972926267281107, + "grad_norm": 0.8643466830253601, + "learning_rate": 1.0487132816087353e-05, + "loss": 0.1064, + "step": 4842 + }, + { + "epoch": 0.6974366359447005, + "grad_norm": 1.0440400838851929, + "learning_rate": 1.0477924764242308e-05, + "loss": 0.1386, + "step": 4843 + }, + { + "epoch": 0.6975806451612904, + "grad_norm": 0.7496224045753479, + "learning_rate": 1.0468719684809206e-05, + "loss": 0.0803, + "step": 4844 + }, + { + "epoch": 0.6977246543778802, + "grad_norm": 3.984884023666382, + "learning_rate": 1.0459517579672151e-05, + "loss": 1.2317, + "step": 4845 + }, + { + "epoch": 0.6978686635944701, + "grad_norm": 1.4668391942977905, + "learning_rate": 1.0450318450714656e-05, + "loss": 0.1584, + "step": 4846 + }, + { + "epoch": 0.6980126728110599, + "grad_norm": 6.180446147918701, + "learning_rate": 1.0441122299819613e-05, + "loss": 1.9644, + "step": 4847 + }, + { + "epoch": 0.6981566820276498, + "grad_norm": 1.026832938194275, + "learning_rate": 1.0431929128869319e-05, + "loss": 0.1345, + "step": 4848 + }, + { + "epoch": 0.6983006912442397, + "grad_norm": 4.298906326293945, + "learning_rate": 1.0422738939745452e-05, + "loss": 2.7915, + "step": 4849 + }, + { + "epoch": 0.6984447004608295, + "grad_norm": 0.9413024187088013, + "learning_rate": 1.0413551734329077e-05, + "loss": 0.1169, + "step": 4850 + }, + { + "epoch": 0.6985887096774194, + "grad_norm": 0.9595313668251038, + "learning_rate": 1.0404367514500656e-05, + "loss": 0.0891, + "step": 4851 + }, + { + "epoch": 0.6987327188940092, + "grad_norm": 1.056171178817749, + "learning_rate": 1.0395186282140034e-05, + "loss": 0.1363, + "step": 4852 + }, + { + "epoch": 0.6988767281105991, + "grad_norm": 3.9335131645202637, + "learning_rate": 1.038600803912645e-05, + "loss": 1.1171, + "step": 4853 + }, + { + "epoch": 0.699020737327189, + "grad_norm": 0.8594435453414917, + "learning_rate": 1.0376832787338525e-05, + "loss": 4.2123, + "step": 4854 + }, + { + "epoch": 0.6991647465437788, + "grad_norm": 3.0843794345855713, + "learning_rate": 1.0367660528654272e-05, + "loss": 2.1771, + "step": 4855 + }, + { + "epoch": 0.6993087557603687, + "grad_norm": 0.8873035907745361, + "learning_rate": 1.0358491264951089e-05, + "loss": 0.1927, + "step": 4856 + }, + { + "epoch": 0.6994527649769585, + "grad_norm": 0.9314242005348206, + "learning_rate": 1.0349324998105766e-05, + "loss": 0.0681, + "step": 4857 + }, + { + "epoch": 0.6995967741935484, + "grad_norm": 1.0956226587295532, + "learning_rate": 1.0340161729994471e-05, + "loss": 0.105, + "step": 4858 + }, + { + "epoch": 0.6997407834101382, + "grad_norm": 1.077183485031128, + "learning_rate": 1.0331001462492765e-05, + "loss": 0.1018, + "step": 4859 + }, + { + "epoch": 0.6998847926267281, + "grad_norm": 2.0230069160461426, + "learning_rate": 1.0321844197475591e-05, + "loss": 0.1621, + "step": 4860 + }, + { + "epoch": 0.700028801843318, + "grad_norm": 2.991854190826416, + "learning_rate": 1.0312689936817283e-05, + "loss": 1.279, + "step": 4861 + }, + { + "epoch": 0.7001728110599078, + "grad_norm": 0.6946030259132385, + "learning_rate": 1.0303538682391553e-05, + "loss": 0.0641, + "step": 4862 + }, + { + "epoch": 0.7003168202764977, + "grad_norm": 0.8233190774917603, + "learning_rate": 1.02943904360715e-05, + "loss": 0.0938, + "step": 4863 + }, + { + "epoch": 0.7004608294930875, + "grad_norm": 0.8065478205680847, + "learning_rate": 1.0285245199729613e-05, + "loss": 0.1008, + "step": 4864 + }, + { + "epoch": 0.7006048387096774, + "grad_norm": 1.33478581905365, + "learning_rate": 1.0276102975237754e-05, + "loss": 0.1229, + "step": 4865 + }, + { + "epoch": 0.7007488479262672, + "grad_norm": 0.9628174901008606, + "learning_rate": 1.026696376446718e-05, + "loss": 0.1022, + "step": 4866 + }, + { + "epoch": 0.7008928571428571, + "grad_norm": 1.0528013706207275, + "learning_rate": 1.0257827569288522e-05, + "loss": 0.1398, + "step": 4867 + }, + { + "epoch": 0.701036866359447, + "grad_norm": 0.6772385835647583, + "learning_rate": 1.0248694391571801e-05, + "loss": 0.0751, + "step": 4868 + }, + { + "epoch": 0.7011808755760369, + "grad_norm": 0.916251540184021, + "learning_rate": 1.0239564233186413e-05, + "loss": 0.1012, + "step": 4869 + }, + { + "epoch": 0.7013248847926268, + "grad_norm": 3.2754061222076416, + "learning_rate": 1.0230437096001133e-05, + "loss": 2.1146, + "step": 4870 + }, + { + "epoch": 0.7014688940092166, + "grad_norm": 1.1040641069412231, + "learning_rate": 1.0221312981884143e-05, + "loss": 0.1662, + "step": 4871 + }, + { + "epoch": 0.7016129032258065, + "grad_norm": 5.0917277336120605, + "learning_rate": 1.0212191892702979e-05, + "loss": 1.2142, + "step": 4872 + }, + { + "epoch": 0.7017569124423964, + "grad_norm": 0.4761129915714264, + "learning_rate": 1.0203073830324567e-05, + "loss": 0.0628, + "step": 4873 + }, + { + "epoch": 0.7019009216589862, + "grad_norm": 1.3846977949142456, + "learning_rate": 1.0193958796615208e-05, + "loss": 0.1218, + "step": 4874 + }, + { + "epoch": 0.7020449308755761, + "grad_norm": 0.7719039916992188, + "learning_rate": 1.0184846793440594e-05, + "loss": 0.0706, + "step": 4875 + }, + { + "epoch": 0.7021889400921659, + "grad_norm": 0.7065379619598389, + "learning_rate": 1.017573782266579e-05, + "loss": 0.0821, + "step": 4876 + }, + { + "epoch": 0.7023329493087558, + "grad_norm": 0.6928699016571045, + "learning_rate": 1.016663188615524e-05, + "loss": 0.0881, + "step": 4877 + }, + { + "epoch": 0.7024769585253456, + "grad_norm": 0.6490471363067627, + "learning_rate": 1.0157528985772769e-05, + "loss": 0.0809, + "step": 4878 + }, + { + "epoch": 0.7026209677419355, + "grad_norm": 0.7507237792015076, + "learning_rate": 1.0148429123381577e-05, + "loss": 0.0795, + "step": 4879 + }, + { + "epoch": 0.7027649769585254, + "grad_norm": 0.7258495688438416, + "learning_rate": 1.0139332300844248e-05, + "loss": 0.0725, + "step": 4880 + }, + { + "epoch": 0.7029089861751152, + "grad_norm": 0.6100548505783081, + "learning_rate": 1.013023852002274e-05, + "loss": 0.0877, + "step": 4881 + }, + { + "epoch": 0.7030529953917051, + "grad_norm": 1.8649711608886719, + "learning_rate": 1.0121147782778387e-05, + "loss": 0.1854, + "step": 4882 + }, + { + "epoch": 0.7031970046082949, + "grad_norm": 4.081342697143555, + "learning_rate": 1.0112060090971906e-05, + "loss": 0.972, + "step": 4883 + }, + { + "epoch": 0.7033410138248848, + "grad_norm": 0.4281999468803406, + "learning_rate": 1.0102975446463384e-05, + "loss": 0.0477, + "step": 4884 + }, + { + "epoch": 0.7034850230414746, + "grad_norm": 0.6405277848243713, + "learning_rate": 1.0093893851112284e-05, + "loss": 0.0885, + "step": 4885 + }, + { + "epoch": 0.7036290322580645, + "grad_norm": 1.0750473737716675, + "learning_rate": 1.0084815306777456e-05, + "loss": 0.1239, + "step": 4886 + }, + { + "epoch": 0.7037730414746544, + "grad_norm": 1.355873942375183, + "learning_rate": 1.0075739815317112e-05, + "loss": 0.1264, + "step": 4887 + }, + { + "epoch": 0.7039170506912442, + "grad_norm": 0.8532301783561707, + "learning_rate": 1.0066667378588843e-05, + "loss": 0.1387, + "step": 4888 + }, + { + "epoch": 0.7040610599078341, + "grad_norm": 0.9129965305328369, + "learning_rate": 1.0057597998449623e-05, + "loss": 0.1062, + "step": 4889 + }, + { + "epoch": 0.7042050691244239, + "grad_norm": 5.247020721435547, + "learning_rate": 1.0048531676755784e-05, + "loss": 1.6395, + "step": 4890 + }, + { + "epoch": 0.7043490783410138, + "grad_norm": 7.282968521118164, + "learning_rate": 1.0039468415363052e-05, + "loss": 2.4704, + "step": 4891 + }, + { + "epoch": 0.7044930875576036, + "grad_norm": 3.2972543239593506, + "learning_rate": 1.0030408216126511e-05, + "loss": 1.7014, + "step": 4892 + }, + { + "epoch": 0.7046370967741935, + "grad_norm": 0.4449438154697418, + "learning_rate": 1.0021351080900621e-05, + "loss": 0.0582, + "step": 4893 + }, + { + "epoch": 0.7047811059907834, + "grad_norm": 0.8362402319908142, + "learning_rate": 1.0012297011539224e-05, + "loss": 0.0857, + "step": 4894 + }, + { + "epoch": 0.7049251152073732, + "grad_norm": 0.785132110118866, + "learning_rate": 1.0003246009895522e-05, + "loss": 0.1133, + "step": 4895 + }, + { + "epoch": 0.7050691244239631, + "grad_norm": 5.496160507202148, + "learning_rate": 9.994198077822098e-06, + "loss": 0.9829, + "step": 4896 + }, + { + "epoch": 0.705213133640553, + "grad_norm": 0.6295954585075378, + "learning_rate": 9.985153217170903e-06, + "loss": 0.0948, + "step": 4897 + }, + { + "epoch": 0.7053571428571429, + "grad_norm": 1.487380027770996, + "learning_rate": 9.97611142979325e-06, + "loss": 3.828, + "step": 4898 + }, + { + "epoch": 0.7055011520737328, + "grad_norm": 0.8750095963478088, + "learning_rate": 9.967072717539851e-06, + "loss": 0.0855, + "step": 4899 + }, + { + "epoch": 0.7056451612903226, + "grad_norm": 3.421016216278076, + "learning_rate": 9.958037082260765e-06, + "loss": 1.8305, + "step": 4900 + }, + { + "epoch": 0.7057891705069125, + "grad_norm": 0.8601051568984985, + "learning_rate": 9.949004525805423e-06, + "loss": 0.1009, + "step": 4901 + }, + { + "epoch": 0.7059331797235023, + "grad_norm": 0.7996153831481934, + "learning_rate": 9.93997505002263e-06, + "loss": 0.1079, + "step": 4902 + }, + { + "epoch": 0.7060771889400922, + "grad_norm": 0.49436822533607483, + "learning_rate": 9.930948656760561e-06, + "loss": 0.0623, + "step": 4903 + }, + { + "epoch": 0.706221198156682, + "grad_norm": 4.935552597045898, + "learning_rate": 9.921925347866759e-06, + "loss": 1.9608, + "step": 4904 + }, + { + "epoch": 0.7063652073732719, + "grad_norm": 4.917835235595703, + "learning_rate": 9.912905125188136e-06, + "loss": 1.2968, + "step": 4905 + }, + { + "epoch": 0.7065092165898618, + "grad_norm": 0.4318293631076813, + "learning_rate": 9.903887990570967e-06, + "loss": 0.0455, + "step": 4906 + }, + { + "epoch": 0.7066532258064516, + "grad_norm": 0.935074508190155, + "learning_rate": 9.894873945860908e-06, + "loss": 0.0846, + "step": 4907 + }, + { + "epoch": 0.7067972350230415, + "grad_norm": 0.46710893511772156, + "learning_rate": 9.88586299290297e-06, + "loss": 0.0608, + "step": 4908 + }, + { + "epoch": 0.7069412442396313, + "grad_norm": 1.1125516891479492, + "learning_rate": 9.876855133541538e-06, + "loss": 0.0865, + "step": 4909 + }, + { + "epoch": 0.7070852534562212, + "grad_norm": 0.7064957022666931, + "learning_rate": 9.867850369620357e-06, + "loss": 0.1065, + "step": 4910 + }, + { + "epoch": 0.707229262672811, + "grad_norm": 0.3770734667778015, + "learning_rate": 9.858848702982547e-06, + "loss": 0.0646, + "step": 4911 + }, + { + "epoch": 0.7073732718894009, + "grad_norm": 0.5443082451820374, + "learning_rate": 9.849850135470589e-06, + "loss": 0.0498, + "step": 4912 + }, + { + "epoch": 0.7075172811059908, + "grad_norm": 0.8735293745994568, + "learning_rate": 9.840854668926333e-06, + "loss": 0.1116, + "step": 4913 + }, + { + "epoch": 0.7076612903225806, + "grad_norm": 0.9680543541908264, + "learning_rate": 9.831862305190986e-06, + "loss": 0.1293, + "step": 4914 + }, + { + "epoch": 0.7078052995391705, + "grad_norm": 0.7244268655776978, + "learning_rate": 9.82287304610513e-06, + "loss": 0.0767, + "step": 4915 + }, + { + "epoch": 0.7079493087557603, + "grad_norm": 0.3100583553314209, + "learning_rate": 9.81388689350871e-06, + "loss": 0.059, + "step": 4916 + }, + { + "epoch": 0.7080933179723502, + "grad_norm": 0.9453049898147583, + "learning_rate": 9.804903849241023e-06, + "loss": 0.0848, + "step": 4917 + }, + { + "epoch": 0.7082373271889401, + "grad_norm": 5.3725199699401855, + "learning_rate": 9.795923915140747e-06, + "loss": 0.9657, + "step": 4918 + }, + { + "epoch": 0.7083813364055299, + "grad_norm": 0.6829004883766174, + "learning_rate": 9.786947093045915e-06, + "loss": 0.0783, + "step": 4919 + }, + { + "epoch": 0.7085253456221198, + "grad_norm": 0.49028468132019043, + "learning_rate": 9.777973384793923e-06, + "loss": 0.0683, + "step": 4920 + }, + { + "epoch": 0.7086693548387096, + "grad_norm": 3.3874411582946777, + "learning_rate": 9.76900279222153e-06, + "loss": 0.4161, + "step": 4921 + }, + { + "epoch": 0.7088133640552995, + "grad_norm": 0.9112091064453125, + "learning_rate": 9.760035317164857e-06, + "loss": 0.0784, + "step": 4922 + }, + { + "epoch": 0.7089573732718893, + "grad_norm": 1.0357558727264404, + "learning_rate": 9.751070961459385e-06, + "loss": 0.1036, + "step": 4923 + }, + { + "epoch": 0.7091013824884793, + "grad_norm": 0.6794096827507019, + "learning_rate": 9.742109726939966e-06, + "loss": 0.0571, + "step": 4924 + }, + { + "epoch": 0.7092453917050692, + "grad_norm": 4.938579559326172, + "learning_rate": 9.733151615440791e-06, + "loss": 2.0031, + "step": 4925 + }, + { + "epoch": 0.709389400921659, + "grad_norm": 0.583015501499176, + "learning_rate": 9.724196628795449e-06, + "loss": 0.0527, + "step": 4926 + }, + { + "epoch": 0.7095334101382489, + "grad_norm": 1.4248499870300293, + "learning_rate": 9.715244768836856e-06, + "loss": 0.1941, + "step": 4927 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 0.7896797060966492, + "learning_rate": 9.7062960373973e-06, + "loss": 0.0896, + "step": 4928 + }, + { + "epoch": 0.7098214285714286, + "grad_norm": 2.4041390419006348, + "learning_rate": 9.697350436308427e-06, + "loss": 0.1882, + "step": 4929 + }, + { + "epoch": 0.7099654377880185, + "grad_norm": 3.239039659500122, + "learning_rate": 9.688407967401248e-06, + "loss": 2.0638, + "step": 4930 + }, + { + "epoch": 0.7101094470046083, + "grad_norm": 1.281614899635315, + "learning_rate": 9.679468632506122e-06, + "loss": 0.2602, + "step": 4931 + }, + { + "epoch": 0.7102534562211982, + "grad_norm": 1.9477370977401733, + "learning_rate": 9.670532433452776e-06, + "loss": 0.1745, + "step": 4932 + }, + { + "epoch": 0.710397465437788, + "grad_norm": 0.8715617060661316, + "learning_rate": 9.66159937207029e-06, + "loss": 0.095, + "step": 4933 + }, + { + "epoch": 0.7105414746543779, + "grad_norm": 0.735914945602417, + "learning_rate": 9.652669450187105e-06, + "loss": 0.0761, + "step": 4934 + }, + { + "epoch": 0.7106854838709677, + "grad_norm": 4.957225799560547, + "learning_rate": 9.643742669631018e-06, + "loss": 2.0627, + "step": 4935 + }, + { + "epoch": 0.7108294930875576, + "grad_norm": 0.8353914022445679, + "learning_rate": 9.634819032229183e-06, + "loss": 4.0226, + "step": 4936 + }, + { + "epoch": 0.7109735023041475, + "grad_norm": 0.6469784379005432, + "learning_rate": 9.625898539808112e-06, + "loss": 0.0651, + "step": 4937 + }, + { + "epoch": 0.7111175115207373, + "grad_norm": 6.083950042724609, + "learning_rate": 9.61698119419367e-06, + "loss": 1.6593, + "step": 4938 + }, + { + "epoch": 0.7112615207373272, + "grad_norm": 0.4387098550796509, + "learning_rate": 9.608066997211081e-06, + "loss": 0.0561, + "step": 4939 + }, + { + "epoch": 0.711405529953917, + "grad_norm": 7.350257396697998, + "learning_rate": 9.599155950684924e-06, + "loss": 1.6496, + "step": 4940 + }, + { + "epoch": 0.7115495391705069, + "grad_norm": 0.6636157631874084, + "learning_rate": 9.59024805643913e-06, + "loss": 0.0781, + "step": 4941 + }, + { + "epoch": 0.7116935483870968, + "grad_norm": 1.1349948644638062, + "learning_rate": 9.58134331629699e-06, + "loss": 0.1295, + "step": 4942 + }, + { + "epoch": 0.7118375576036866, + "grad_norm": 0.49261102080345154, + "learning_rate": 9.572441732081144e-06, + "loss": 0.0599, + "step": 4943 + }, + { + "epoch": 0.7119815668202765, + "grad_norm": 1.003208041191101, + "learning_rate": 9.563543305613592e-06, + "loss": 0.098, + "step": 4944 + }, + { + "epoch": 0.7121255760368663, + "grad_norm": 1.1798664331436157, + "learning_rate": 9.554648038715685e-06, + "loss": 0.1457, + "step": 4945 + }, + { + "epoch": 0.7122695852534562, + "grad_norm": 0.8700708746910095, + "learning_rate": 9.545755933208122e-06, + "loss": 0.1011, + "step": 4946 + }, + { + "epoch": 0.712413594470046, + "grad_norm": 0.6956058144569397, + "learning_rate": 9.536866990910967e-06, + "loss": 0.0861, + "step": 4947 + }, + { + "epoch": 0.7125576036866359, + "grad_norm": 1.1103557348251343, + "learning_rate": 9.527981213643623e-06, + "loss": 3.8229, + "step": 4948 + }, + { + "epoch": 0.7127016129032258, + "grad_norm": 1.0432049036026, + "learning_rate": 9.519098603224852e-06, + "loss": 0.1399, + "step": 4949 + }, + { + "epoch": 0.7128456221198156, + "grad_norm": 0.6420832276344299, + "learning_rate": 9.510219161472768e-06, + "loss": 0.1011, + "step": 4950 + }, + { + "epoch": 0.7129896313364056, + "grad_norm": 0.9363003373146057, + "learning_rate": 9.50134289020484e-06, + "loss": 0.091, + "step": 4951 + }, + { + "epoch": 0.7131336405529954, + "grad_norm": 5.807102203369141, + "learning_rate": 9.492469791237868e-06, + "loss": 1.7334, + "step": 4952 + }, + { + "epoch": 0.7132776497695853, + "grad_norm": 7.013087749481201, + "learning_rate": 9.48359986638804e-06, + "loss": 1.9543, + "step": 4953 + }, + { + "epoch": 0.7134216589861752, + "grad_norm": 0.8779212832450867, + "learning_rate": 9.474733117470865e-06, + "loss": 0.079, + "step": 4954 + }, + { + "epoch": 0.713565668202765, + "grad_norm": 0.4843643307685852, + "learning_rate": 9.465869546301206e-06, + "loss": 0.0737, + "step": 4955 + }, + { + "epoch": 0.7137096774193549, + "grad_norm": 0.6358640789985657, + "learning_rate": 9.457009154693284e-06, + "loss": 0.0829, + "step": 4956 + }, + { + "epoch": 0.7138536866359447, + "grad_norm": 0.9013643264770508, + "learning_rate": 9.448151944460657e-06, + "loss": 0.1028, + "step": 4957 + }, + { + "epoch": 0.7139976958525346, + "grad_norm": 1.2465806007385254, + "learning_rate": 9.439297917416245e-06, + "loss": 0.109, + "step": 4958 + }, + { + "epoch": 0.7141417050691244, + "grad_norm": 6.788650989532471, + "learning_rate": 9.430447075372311e-06, + "loss": 2.6666, + "step": 4959 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.5060356855392456, + "learning_rate": 9.421599420140465e-06, + "loss": 0.0636, + "step": 4960 + }, + { + "epoch": 0.7144297235023042, + "grad_norm": 1.3295824527740479, + "learning_rate": 9.412754953531663e-06, + "loss": 0.1402, + "step": 4961 + }, + { + "epoch": 0.714573732718894, + "grad_norm": 0.7116462588310242, + "learning_rate": 9.403913677356216e-06, + "loss": 0.0827, + "step": 4962 + }, + { + "epoch": 0.7147177419354839, + "grad_norm": 0.32883596420288086, + "learning_rate": 9.395075593423769e-06, + "loss": 0.0446, + "step": 4963 + }, + { + "epoch": 0.7148617511520737, + "grad_norm": 0.6007466316223145, + "learning_rate": 9.386240703543328e-06, + "loss": 0.07, + "step": 4964 + }, + { + "epoch": 0.7150057603686636, + "grad_norm": 0.6701818704605103, + "learning_rate": 9.377409009523238e-06, + "loss": 0.0722, + "step": 4965 + }, + { + "epoch": 0.7151497695852534, + "grad_norm": 1.3902980089187622, + "learning_rate": 9.368580513171188e-06, + "loss": 0.1392, + "step": 4966 + }, + { + "epoch": 0.7152937788018433, + "grad_norm": 0.6690689325332642, + "learning_rate": 9.359755216294217e-06, + "loss": 0.0687, + "step": 4967 + }, + { + "epoch": 0.7154377880184332, + "grad_norm": 0.7595102787017822, + "learning_rate": 9.350933120698708e-06, + "loss": 0.0761, + "step": 4968 + }, + { + "epoch": 0.715581797235023, + "grad_norm": 5.165990352630615, + "learning_rate": 9.342114228190383e-06, + "loss": 0.9377, + "step": 4969 + }, + { + "epoch": 0.7157258064516129, + "grad_norm": 0.9476959109306335, + "learning_rate": 9.333298540574317e-06, + "loss": 0.1274, + "step": 4970 + }, + { + "epoch": 0.7158698156682027, + "grad_norm": 1.238048791885376, + "learning_rate": 9.324486059654926e-06, + "loss": 0.1382, + "step": 4971 + }, + { + "epoch": 0.7160138248847926, + "grad_norm": 0.6726256608963013, + "learning_rate": 9.31567678723597e-06, + "loss": 0.1046, + "step": 4972 + }, + { + "epoch": 0.7161578341013825, + "grad_norm": 0.9060871601104736, + "learning_rate": 9.306870725120547e-06, + "loss": 0.0906, + "step": 4973 + }, + { + "epoch": 0.7163018433179723, + "grad_norm": 1.462412714958191, + "learning_rate": 9.298067875111105e-06, + "loss": 0.1423, + "step": 4974 + }, + { + "epoch": 0.7164458525345622, + "grad_norm": 0.8069825172424316, + "learning_rate": 9.289268239009433e-06, + "loss": 0.1023, + "step": 4975 + }, + { + "epoch": 0.716589861751152, + "grad_norm": 0.9794593453407288, + "learning_rate": 9.280471818616656e-06, + "loss": 0.0957, + "step": 4976 + }, + { + "epoch": 0.7167338709677419, + "grad_norm": 3.884295701980591, + "learning_rate": 9.271678615733252e-06, + "loss": 1.2084, + "step": 4977 + }, + { + "epoch": 0.7168778801843319, + "grad_norm": 1.5349549055099487, + "learning_rate": 9.262888632159028e-06, + "loss": 0.1698, + "step": 4978 + }, + { + "epoch": 0.7170218894009217, + "grad_norm": 1.2579960823059082, + "learning_rate": 9.254101869693133e-06, + "loss": 0.1375, + "step": 4979 + }, + { + "epoch": 0.7171658986175116, + "grad_norm": 4.931596755981445, + "learning_rate": 9.245318330134078e-06, + "loss": 2.5124, + "step": 4980 + }, + { + "epoch": 0.7173099078341014, + "grad_norm": 0.9565901756286621, + "learning_rate": 9.23653801527969e-06, + "loss": 0.1052, + "step": 4981 + }, + { + "epoch": 0.7174539170506913, + "grad_norm": 1.1721570491790771, + "learning_rate": 9.227760926927142e-06, + "loss": 0.1186, + "step": 4982 + }, + { + "epoch": 0.7175979262672811, + "grad_norm": 0.7117530703544617, + "learning_rate": 9.218987066872955e-06, + "loss": 0.1079, + "step": 4983 + }, + { + "epoch": 0.717741935483871, + "grad_norm": 0.7227891683578491, + "learning_rate": 9.210216436912978e-06, + "loss": 0.0751, + "step": 4984 + }, + { + "epoch": 0.7178859447004609, + "grad_norm": 0.8088095784187317, + "learning_rate": 9.201449038842402e-06, + "loss": 0.0998, + "step": 4985 + }, + { + "epoch": 0.7180299539170507, + "grad_norm": 0.8518248796463013, + "learning_rate": 9.192684874455761e-06, + "loss": 0.1146, + "step": 4986 + }, + { + "epoch": 0.7181739631336406, + "grad_norm": 5.036011695861816, + "learning_rate": 9.183923945546926e-06, + "loss": 1.3808, + "step": 4987 + }, + { + "epoch": 0.7183179723502304, + "grad_norm": 8.629231452941895, + "learning_rate": 9.175166253909104e-06, + "loss": 2.3738, + "step": 4988 + }, + { + "epoch": 0.7184619815668203, + "grad_norm": 0.7670775651931763, + "learning_rate": 9.166411801334835e-06, + "loss": 0.0736, + "step": 4989 + }, + { + "epoch": 0.7186059907834101, + "grad_norm": 3.2244083881378174, + "learning_rate": 9.157660589616005e-06, + "loss": 0.1816, + "step": 4990 + }, + { + "epoch": 0.71875, + "grad_norm": 1.3236039876937866, + "learning_rate": 9.148912620543831e-06, + "loss": 0.1314, + "step": 4991 + }, + { + "epoch": 0.7188940092165899, + "grad_norm": 1.0900664329528809, + "learning_rate": 9.140167895908867e-06, + "loss": 0.1251, + "step": 4992 + }, + { + "epoch": 0.7190380184331797, + "grad_norm": 0.6665103435516357, + "learning_rate": 9.131426417501005e-06, + "loss": 0.068, + "step": 4993 + }, + { + "epoch": 0.7191820276497696, + "grad_norm": 1.4688986539840698, + "learning_rate": 9.122688187109468e-06, + "loss": 0.1462, + "step": 4994 + }, + { + "epoch": 0.7193260368663594, + "grad_norm": 2.982624053955078, + "learning_rate": 9.113953206522822e-06, + "loss": 0.2178, + "step": 4995 + }, + { + "epoch": 0.7194700460829493, + "grad_norm": 0.6060623526573181, + "learning_rate": 9.105221477528956e-06, + "loss": 0.0848, + "step": 4996 + }, + { + "epoch": 0.7196140552995391, + "grad_norm": 1.5291404724121094, + "learning_rate": 9.096493001915107e-06, + "loss": 0.1607, + "step": 4997 + }, + { + "epoch": 0.719758064516129, + "grad_norm": 4.418877124786377, + "learning_rate": 9.087767781467838e-06, + "loss": 1.6048, + "step": 4998 + }, + { + "epoch": 0.7199020737327189, + "grad_norm": 0.5943484306335449, + "learning_rate": 9.079045817973045e-06, + "loss": 0.0595, + "step": 4999 + }, + { + "epoch": 0.7200460829493087, + "grad_norm": 0.7174344062805176, + "learning_rate": 9.070327113215963e-06, + "loss": 0.0977, + "step": 5000 + }, + { + "epoch": 0.7201900921658986, + "grad_norm": 3.837473154067993, + "learning_rate": 9.061611668981151e-06, + "loss": 2.2372, + "step": 5001 + }, + { + "epoch": 0.7203341013824884, + "grad_norm": 0.7518596649169922, + "learning_rate": 9.052899487052513e-06, + "loss": 0.0946, + "step": 5002 + }, + { + "epoch": 0.7204781105990783, + "grad_norm": 1.1363276243209839, + "learning_rate": 9.044190569213276e-06, + "loss": 0.1181, + "step": 5003 + }, + { + "epoch": 0.7206221198156681, + "grad_norm": 0.8435840606689453, + "learning_rate": 9.035484917245998e-06, + "loss": 0.1157, + "step": 5004 + }, + { + "epoch": 0.7207661290322581, + "grad_norm": 5.6875433921813965, + "learning_rate": 9.026782532932578e-06, + "loss": 2.464, + "step": 5005 + }, + { + "epoch": 0.720910138248848, + "grad_norm": 0.9437582492828369, + "learning_rate": 9.018083418054227e-06, + "loss": 0.1142, + "step": 5006 + }, + { + "epoch": 0.7210541474654378, + "grad_norm": 0.6912900805473328, + "learning_rate": 9.00938757439152e-06, + "loss": 0.0753, + "step": 5007 + }, + { + "epoch": 0.7211981566820277, + "grad_norm": 0.8736713528633118, + "learning_rate": 9.000695003724329e-06, + "loss": 0.0998, + "step": 5008 + }, + { + "epoch": 0.7213421658986175, + "grad_norm": 0.7083357572555542, + "learning_rate": 8.992005707831876e-06, + "loss": 0.1038, + "step": 5009 + }, + { + "epoch": 0.7214861751152074, + "grad_norm": 0.8825226426124573, + "learning_rate": 8.983319688492706e-06, + "loss": 0.0955, + "step": 5010 + }, + { + "epoch": 0.7216301843317973, + "grad_norm": 1.278828740119934, + "learning_rate": 8.974636947484686e-06, + "loss": 0.133, + "step": 5011 + }, + { + "epoch": 0.7217741935483871, + "grad_norm": 0.7883100509643555, + "learning_rate": 8.965957486585025e-06, + "loss": 0.0824, + "step": 5012 + }, + { + "epoch": 0.721918202764977, + "grad_norm": 1.5151152610778809, + "learning_rate": 8.957281307570253e-06, + "loss": 0.1392, + "step": 5013 + }, + { + "epoch": 0.7220622119815668, + "grad_norm": 0.6848189830780029, + "learning_rate": 8.948608412216234e-06, + "loss": 0.056, + "step": 5014 + }, + { + "epoch": 0.7222062211981567, + "grad_norm": 0.6679043769836426, + "learning_rate": 8.939938802298154e-06, + "loss": 0.0973, + "step": 5015 + }, + { + "epoch": 0.7223502304147466, + "grad_norm": 0.5782454609870911, + "learning_rate": 8.931272479590528e-06, + "loss": 0.0584, + "step": 5016 + }, + { + "epoch": 0.7224942396313364, + "grad_norm": 0.7533441781997681, + "learning_rate": 8.9226094458672e-06, + "loss": 0.096, + "step": 5017 + }, + { + "epoch": 0.7226382488479263, + "grad_norm": 0.8059340715408325, + "learning_rate": 8.913949702901337e-06, + "loss": 0.082, + "step": 5018 + }, + { + "epoch": 0.7227822580645161, + "grad_norm": 5.499565124511719, + "learning_rate": 8.905293252465441e-06, + "loss": 2.3031, + "step": 5019 + }, + { + "epoch": 0.722926267281106, + "grad_norm": 0.7962672710418701, + "learning_rate": 8.896640096331329e-06, + "loss": 0.0937, + "step": 5020 + }, + { + "epoch": 0.7230702764976958, + "grad_norm": 1.5325440168380737, + "learning_rate": 8.88799023627015e-06, + "loss": 0.1666, + "step": 5021 + }, + { + "epoch": 0.7232142857142857, + "grad_norm": 1.0892744064331055, + "learning_rate": 8.879343674052381e-06, + "loss": 0.2283, + "step": 5022 + }, + { + "epoch": 0.7233582949308756, + "grad_norm": 4.148657321929932, + "learning_rate": 8.870700411447816e-06, + "loss": 1.5021, + "step": 5023 + }, + { + "epoch": 0.7235023041474654, + "grad_norm": 0.815376341342926, + "learning_rate": 8.862060450225579e-06, + "loss": 0.1236, + "step": 5024 + }, + { + "epoch": 0.7236463133640553, + "grad_norm": 4.036196708679199, + "learning_rate": 8.85342379215412e-06, + "loss": 2.3819, + "step": 5025 + }, + { + "epoch": 0.7237903225806451, + "grad_norm": 3.316540002822876, + "learning_rate": 8.844790439001205e-06, + "loss": 1.5413, + "step": 5026 + }, + { + "epoch": 0.723934331797235, + "grad_norm": 0.9485151171684265, + "learning_rate": 8.836160392533935e-06, + "loss": 0.0977, + "step": 5027 + }, + { + "epoch": 0.7240783410138248, + "grad_norm": 4.230076313018799, + "learning_rate": 8.827533654518721e-06, + "loss": 1.4517, + "step": 5028 + }, + { + "epoch": 0.7242223502304147, + "grad_norm": 0.6347218155860901, + "learning_rate": 8.818910226721308e-06, + "loss": 0.0718, + "step": 5029 + }, + { + "epoch": 0.7243663594470046, + "grad_norm": 0.6860532164573669, + "learning_rate": 8.81029011090676e-06, + "loss": 0.0745, + "step": 5030 + }, + { + "epoch": 0.7245103686635944, + "grad_norm": 0.9442921876907349, + "learning_rate": 8.801673308839461e-06, + "loss": 0.0763, + "step": 5031 + }, + { + "epoch": 0.7246543778801844, + "grad_norm": 0.4953638017177582, + "learning_rate": 8.793059822283114e-06, + "loss": 0.0612, + "step": 5032 + }, + { + "epoch": 0.7247983870967742, + "grad_norm": 0.7386147975921631, + "learning_rate": 8.784449653000746e-06, + "loss": 0.1005, + "step": 5033 + }, + { + "epoch": 0.7249423963133641, + "grad_norm": 3.6697123050689697, + "learning_rate": 8.77584280275472e-06, + "loss": 2.1392, + "step": 5034 + }, + { + "epoch": 0.725086405529954, + "grad_norm": 1.5847963094711304, + "learning_rate": 8.767239273306696e-06, + "loss": 0.1457, + "step": 5035 + }, + { + "epoch": 0.7252304147465438, + "grad_norm": 0.9093308448791504, + "learning_rate": 8.758639066417666e-06, + "loss": 0.12, + "step": 5036 + }, + { + "epoch": 0.7253744239631337, + "grad_norm": 1.0084137916564941, + "learning_rate": 8.750042183847936e-06, + "loss": 0.1113, + "step": 5037 + }, + { + "epoch": 0.7255184331797235, + "grad_norm": 1.0922138690948486, + "learning_rate": 8.741448627357143e-06, + "loss": 0.0936, + "step": 5038 + }, + { + "epoch": 0.7256624423963134, + "grad_norm": 0.6656127572059631, + "learning_rate": 8.732858398704233e-06, + "loss": 0.0791, + "step": 5039 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 1.3686954975128174, + "learning_rate": 8.72427149964747e-06, + "loss": 0.1579, + "step": 5040 + }, + { + "epoch": 0.7259504608294931, + "grad_norm": 0.4280448257923126, + "learning_rate": 8.715687931944449e-06, + "loss": 0.0485, + "step": 5041 + }, + { + "epoch": 0.726094470046083, + "grad_norm": 0.9114478230476379, + "learning_rate": 8.707107697352065e-06, + "loss": 0.0942, + "step": 5042 + }, + { + "epoch": 0.7262384792626728, + "grad_norm": 0.8093544244766235, + "learning_rate": 8.698530797626547e-06, + "loss": 0.0895, + "step": 5043 + }, + { + "epoch": 0.7263824884792627, + "grad_norm": 0.7736591100692749, + "learning_rate": 8.689957234523432e-06, + "loss": 4.3351, + "step": 5044 + }, + { + "epoch": 0.7265264976958525, + "grad_norm": 0.34243088960647583, + "learning_rate": 8.681387009797577e-06, + "loss": 0.0533, + "step": 5045 + }, + { + "epoch": 0.7266705069124424, + "grad_norm": 5.472765922546387, + "learning_rate": 8.672820125203154e-06, + "loss": 2.3849, + "step": 5046 + }, + { + "epoch": 0.7268145161290323, + "grad_norm": 1.2159645557403564, + "learning_rate": 8.664256582493654e-06, + "loss": 0.0976, + "step": 5047 + }, + { + "epoch": 0.7269585253456221, + "grad_norm": 1.4383227825164795, + "learning_rate": 8.655696383421883e-06, + "loss": 0.1414, + "step": 5048 + }, + { + "epoch": 0.727102534562212, + "grad_norm": 0.8138378262519836, + "learning_rate": 8.647139529739964e-06, + "loss": 0.1136, + "step": 5049 + }, + { + "epoch": 0.7272465437788018, + "grad_norm": 0.8940933346748352, + "learning_rate": 8.63858602319933e-06, + "loss": 0.1079, + "step": 5050 + }, + { + "epoch": 0.7273905529953917, + "grad_norm": 1.3959314823150635, + "learning_rate": 8.630035865550734e-06, + "loss": 0.1089, + "step": 5051 + }, + { + "epoch": 0.7275345622119815, + "grad_norm": 1.4803367853164673, + "learning_rate": 8.621489058544233e-06, + "loss": 0.2306, + "step": 5052 + }, + { + "epoch": 0.7276785714285714, + "grad_norm": 0.7629092931747437, + "learning_rate": 8.612945603929226e-06, + "loss": 0.0862, + "step": 5053 + }, + { + "epoch": 0.7278225806451613, + "grad_norm": 0.5082203149795532, + "learning_rate": 8.604405503454399e-06, + "loss": 0.0543, + "step": 5054 + }, + { + "epoch": 0.7279665898617511, + "grad_norm": 4.513160705566406, + "learning_rate": 8.595868758867755e-06, + "loss": 2.0357, + "step": 5055 + }, + { + "epoch": 0.728110599078341, + "grad_norm": 0.41346287727355957, + "learning_rate": 8.587335371916621e-06, + "loss": 0.0602, + "step": 5056 + }, + { + "epoch": 0.7282546082949308, + "grad_norm": 0.5664450526237488, + "learning_rate": 8.578805344347623e-06, + "loss": 0.0616, + "step": 5057 + }, + { + "epoch": 0.7283986175115207, + "grad_norm": 0.6376262903213501, + "learning_rate": 8.570278677906715e-06, + "loss": 0.0813, + "step": 5058 + }, + { + "epoch": 0.7285426267281107, + "grad_norm": 3.9467949867248535, + "learning_rate": 8.561755374339147e-06, + "loss": 0.9298, + "step": 5059 + }, + { + "epoch": 0.7286866359447005, + "grad_norm": 0.657094419002533, + "learning_rate": 8.553235435389496e-06, + "loss": 0.0671, + "step": 5060 + }, + { + "epoch": 0.7288306451612904, + "grad_norm": 1.0424162149429321, + "learning_rate": 8.544718862801635e-06, + "loss": 0.1904, + "step": 5061 + }, + { + "epoch": 0.7289746543778802, + "grad_norm": 0.9942811727523804, + "learning_rate": 8.53620565831876e-06, + "loss": 0.1115, + "step": 5062 + }, + { + "epoch": 0.7291186635944701, + "grad_norm": 2.8471529483795166, + "learning_rate": 8.527695823683374e-06, + "loss": 2.5333, + "step": 5063 + }, + { + "epoch": 0.7292626728110599, + "grad_norm": 0.9915168285369873, + "learning_rate": 8.519189360637289e-06, + "loss": 4.0885, + "step": 5064 + }, + { + "epoch": 0.7294066820276498, + "grad_norm": 0.848027765750885, + "learning_rate": 8.510686270921624e-06, + "loss": 0.0844, + "step": 5065 + }, + { + "epoch": 0.7295506912442397, + "grad_norm": 1.0527311563491821, + "learning_rate": 8.50218655627682e-06, + "loss": 0.1118, + "step": 5066 + }, + { + "epoch": 0.7296947004608295, + "grad_norm": 0.5292297601699829, + "learning_rate": 8.493690218442606e-06, + "loss": 0.0766, + "step": 5067 + }, + { + "epoch": 0.7298387096774194, + "grad_norm": 2.932905435562134, + "learning_rate": 8.485197259158044e-06, + "loss": 1.7393, + "step": 5068 + }, + { + "epoch": 0.7299827188940092, + "grad_norm": 0.7756822109222412, + "learning_rate": 8.476707680161486e-06, + "loss": 0.1032, + "step": 5069 + }, + { + "epoch": 0.7301267281105991, + "grad_norm": 0.3804536759853363, + "learning_rate": 8.468221483190597e-06, + "loss": 0.0432, + "step": 5070 + }, + { + "epoch": 0.730270737327189, + "grad_norm": 0.6975178122520447, + "learning_rate": 8.459738669982348e-06, + "loss": 0.0821, + "step": 5071 + }, + { + "epoch": 0.7304147465437788, + "grad_norm": 0.7198653221130371, + "learning_rate": 8.451259242273032e-06, + "loss": 0.0605, + "step": 5072 + }, + { + "epoch": 0.7305587557603687, + "grad_norm": 1.0976951122283936, + "learning_rate": 8.442783201798237e-06, + "loss": 0.116, + "step": 5073 + }, + { + "epoch": 0.7307027649769585, + "grad_norm": 0.8420084118843079, + "learning_rate": 8.434310550292854e-06, + "loss": 0.0965, + "step": 5074 + }, + { + "epoch": 0.7308467741935484, + "grad_norm": 0.8571888208389282, + "learning_rate": 8.425841289491083e-06, + "loss": 0.1, + "step": 5075 + }, + { + "epoch": 0.7309907834101382, + "grad_norm": 1.241355299949646, + "learning_rate": 8.417375421126433e-06, + "loss": 0.1547, + "step": 5076 + }, + { + "epoch": 0.7311347926267281, + "grad_norm": 0.6523582339286804, + "learning_rate": 8.408912946931721e-06, + "loss": 0.0715, + "step": 5077 + }, + { + "epoch": 0.731278801843318, + "grad_norm": 0.7481032609939575, + "learning_rate": 8.400453868639064e-06, + "loss": 0.09, + "step": 5078 + }, + { + "epoch": 0.7314228110599078, + "grad_norm": 0.4782191812992096, + "learning_rate": 8.391998187979886e-06, + "loss": 0.0709, + "step": 5079 + }, + { + "epoch": 0.7315668202764977, + "grad_norm": 0.7423643469810486, + "learning_rate": 8.383545906684912e-06, + "loss": 3.9399, + "step": 5080 + }, + { + "epoch": 0.7317108294930875, + "grad_norm": 5.230178356170654, + "learning_rate": 8.375097026484176e-06, + "loss": 1.905, + "step": 5081 + }, + { + "epoch": 0.7318548387096774, + "grad_norm": 0.9498891234397888, + "learning_rate": 8.366651549107016e-06, + "loss": 0.0727, + "step": 5082 + }, + { + "epoch": 0.7319988479262672, + "grad_norm": 0.7386761903762817, + "learning_rate": 8.358209476282073e-06, + "loss": 0.0825, + "step": 5083 + }, + { + "epoch": 0.7321428571428571, + "grad_norm": 0.9155988693237305, + "learning_rate": 8.349770809737288e-06, + "loss": 0.0899, + "step": 5084 + }, + { + "epoch": 0.732286866359447, + "grad_norm": 0.6412478685379028, + "learning_rate": 8.341335551199902e-06, + "loss": 0.0984, + "step": 5085 + }, + { + "epoch": 0.7324308755760369, + "grad_norm": 0.7030891180038452, + "learning_rate": 8.332903702396472e-06, + "loss": 0.079, + "step": 5086 + }, + { + "epoch": 0.7325748847926268, + "grad_norm": 0.8422414660453796, + "learning_rate": 8.324475265052845e-06, + "loss": 0.0788, + "step": 5087 + }, + { + "epoch": 0.7327188940092166, + "grad_norm": 1.0093605518341064, + "learning_rate": 8.316050240894171e-06, + "loss": 0.1015, + "step": 5088 + }, + { + "epoch": 0.7328629032258065, + "grad_norm": 1.0122052431106567, + "learning_rate": 8.307628631644903e-06, + "loss": 0.1121, + "step": 5089 + }, + { + "epoch": 0.7330069124423964, + "grad_norm": 0.6912545561790466, + "learning_rate": 8.299210439028794e-06, + "loss": 0.0724, + "step": 5090 + }, + { + "epoch": 0.7331509216589862, + "grad_norm": 0.7719224691390991, + "learning_rate": 8.290795664768906e-06, + "loss": 0.1229, + "step": 5091 + }, + { + "epoch": 0.7332949308755761, + "grad_norm": 0.4275592565536499, + "learning_rate": 8.282384310587593e-06, + "loss": 0.0471, + "step": 5092 + }, + { + "epoch": 0.7334389400921659, + "grad_norm": 0.5021594762802124, + "learning_rate": 8.273976378206508e-06, + "loss": 0.0617, + "step": 5093 + }, + { + "epoch": 0.7335829493087558, + "grad_norm": 1.0093579292297363, + "learning_rate": 8.265571869346605e-06, + "loss": 0.0935, + "step": 5094 + }, + { + "epoch": 0.7337269585253456, + "grad_norm": 5.54874849319458, + "learning_rate": 8.25717078572814e-06, + "loss": 1.5995, + "step": 5095 + }, + { + "epoch": 0.7338709677419355, + "grad_norm": 0.7613135576248169, + "learning_rate": 8.248773129070666e-06, + "loss": 0.0864, + "step": 5096 + }, + { + "epoch": 0.7340149769585254, + "grad_norm": 5.481618404388428, + "learning_rate": 8.240378901093034e-06, + "loss": 1.7934, + "step": 5097 + }, + { + "epoch": 0.7341589861751152, + "grad_norm": 0.6723498106002808, + "learning_rate": 8.231988103513397e-06, + "loss": 0.0628, + "step": 5098 + }, + { + "epoch": 0.7343029953917051, + "grad_norm": 1.4777655601501465, + "learning_rate": 8.223600738049198e-06, + "loss": 0.0927, + "step": 5099 + }, + { + "epoch": 0.7344470046082949, + "grad_norm": 0.8319818377494812, + "learning_rate": 8.215216806417183e-06, + "loss": 0.1084, + "step": 5100 + }, + { + "epoch": 0.7345910138248848, + "grad_norm": 0.7786383032798767, + "learning_rate": 8.206836310333401e-06, + "loss": 0.0818, + "step": 5101 + }, + { + "epoch": 0.7347350230414746, + "grad_norm": 0.8746985793113708, + "learning_rate": 8.198459251513182e-06, + "loss": 0.0767, + "step": 5102 + }, + { + "epoch": 0.7348790322580645, + "grad_norm": 1.3719249963760376, + "learning_rate": 8.190085631671172e-06, + "loss": 0.1176, + "step": 5103 + }, + { + "epoch": 0.7350230414746544, + "grad_norm": 0.7941738963127136, + "learning_rate": 8.18171545252129e-06, + "loss": 0.1601, + "step": 5104 + }, + { + "epoch": 0.7351670506912442, + "grad_norm": 1.6319234371185303, + "learning_rate": 8.173348715776777e-06, + "loss": 0.139, + "step": 5105 + }, + { + "epoch": 0.7353110599078341, + "grad_norm": 4.880118370056152, + "learning_rate": 8.164985423150148e-06, + "loss": 1.8048, + "step": 5106 + }, + { + "epoch": 0.7354550691244239, + "grad_norm": 0.7027468681335449, + "learning_rate": 8.156625576353222e-06, + "loss": 0.073, + "step": 5107 + }, + { + "epoch": 0.7355990783410138, + "grad_norm": 0.9825553894042969, + "learning_rate": 8.148269177097111e-06, + "loss": 0.1026, + "step": 5108 + }, + { + "epoch": 0.7357430875576036, + "grad_norm": 0.9629524350166321, + "learning_rate": 8.139916227092229e-06, + "loss": 0.1227, + "step": 5109 + }, + { + "epoch": 0.7358870967741935, + "grad_norm": 1.1058712005615234, + "learning_rate": 8.131566728048268e-06, + "loss": 0.1084, + "step": 5110 + }, + { + "epoch": 0.7360311059907834, + "grad_norm": 1.2226344347000122, + "learning_rate": 8.123220681674227e-06, + "loss": 0.1255, + "step": 5111 + }, + { + "epoch": 0.7361751152073732, + "grad_norm": 7.253852844238281, + "learning_rate": 8.114878089678393e-06, + "loss": 1.0023, + "step": 5112 + }, + { + "epoch": 0.7363191244239631, + "grad_norm": 4.014724254608154, + "learning_rate": 8.10653895376835e-06, + "loss": 1.524, + "step": 5113 + }, + { + "epoch": 0.736463133640553, + "grad_norm": 0.574944794178009, + "learning_rate": 8.098203275650967e-06, + "loss": 0.0878, + "step": 5114 + }, + { + "epoch": 0.7366071428571429, + "grad_norm": 0.699097216129303, + "learning_rate": 8.089871057032405e-06, + "loss": 0.0772, + "step": 5115 + }, + { + "epoch": 0.7367511520737328, + "grad_norm": 1.2430492639541626, + "learning_rate": 8.081542299618139e-06, + "loss": 0.0954, + "step": 5116 + }, + { + "epoch": 0.7368951612903226, + "grad_norm": 3.174675703048706, + "learning_rate": 8.073217005112907e-06, + "loss": 2.4932, + "step": 5117 + }, + { + "epoch": 0.7370391705069125, + "grad_norm": 1.7436814308166504, + "learning_rate": 8.064895175220752e-06, + "loss": 0.1154, + "step": 5118 + }, + { + "epoch": 0.7371831797235023, + "grad_norm": 1.6984812021255493, + "learning_rate": 8.056576811645003e-06, + "loss": 0.1501, + "step": 5119 + }, + { + "epoch": 0.7373271889400922, + "grad_norm": 0.9108030796051025, + "learning_rate": 8.048261916088281e-06, + "loss": 0.1318, + "step": 5120 + }, + { + "epoch": 0.737471198156682, + "grad_norm": 0.46936559677124023, + "learning_rate": 8.039950490252505e-06, + "loss": 0.0476, + "step": 5121 + }, + { + "epoch": 0.7376152073732719, + "grad_norm": 0.42100265622138977, + "learning_rate": 8.031642535838868e-06, + "loss": 0.0689, + "step": 5122 + }, + { + "epoch": 0.7377592165898618, + "grad_norm": 13.069024085998535, + "learning_rate": 8.023338054547869e-06, + "loss": 2.6183, + "step": 5123 + }, + { + "epoch": 0.7379032258064516, + "grad_norm": 3.7977070808410645, + "learning_rate": 8.015037048079282e-06, + "loss": 1.6843, + "step": 5124 + }, + { + "epoch": 0.7380472350230415, + "grad_norm": 0.8357831239700317, + "learning_rate": 8.006739518132177e-06, + "loss": 0.102, + "step": 5125 + }, + { + "epoch": 0.7381912442396313, + "grad_norm": 0.940102756023407, + "learning_rate": 7.998445466404919e-06, + "loss": 0.0897, + "step": 5126 + }, + { + "epoch": 0.7383352534562212, + "grad_norm": 1.3404045104980469, + "learning_rate": 7.990154894595144e-06, + "loss": 4.1018, + "step": 5127 + }, + { + "epoch": 0.738479262672811, + "grad_norm": 5.420523643493652, + "learning_rate": 7.981867804399792e-06, + "loss": 1.8511, + "step": 5128 + }, + { + "epoch": 0.7386232718894009, + "grad_norm": 3.170828104019165, + "learning_rate": 7.97358419751508e-06, + "loss": 1.9159, + "step": 5129 + }, + { + "epoch": 0.7387672811059908, + "grad_norm": 1.074342966079712, + "learning_rate": 7.965304075636518e-06, + "loss": 0.1077, + "step": 5130 + }, + { + "epoch": 0.7389112903225806, + "grad_norm": 0.5176472663879395, + "learning_rate": 7.9570274404589e-06, + "loss": 0.0673, + "step": 5131 + }, + { + "epoch": 0.7390552995391705, + "grad_norm": 6.042227268218994, + "learning_rate": 7.948754293676306e-06, + "loss": 1.5087, + "step": 5132 + }, + { + "epoch": 0.7391993087557603, + "grad_norm": 0.8003148436546326, + "learning_rate": 7.940484636982104e-06, + "loss": 0.107, + "step": 5133 + }, + { + "epoch": 0.7393433179723502, + "grad_norm": 0.995682954788208, + "learning_rate": 7.932218472068945e-06, + "loss": 0.1809, + "step": 5134 + }, + { + "epoch": 0.7394873271889401, + "grad_norm": 0.7535360455513, + "learning_rate": 7.923955800628768e-06, + "loss": 0.0732, + "step": 5135 + }, + { + "epoch": 0.7396313364055299, + "grad_norm": 0.30824464559555054, + "learning_rate": 7.915696624352797e-06, + "loss": 0.0444, + "step": 5136 + }, + { + "epoch": 0.7397753456221198, + "grad_norm": 0.6070700287818909, + "learning_rate": 7.907440944931536e-06, + "loss": 0.0576, + "step": 5137 + }, + { + "epoch": 0.7399193548387096, + "grad_norm": 1.7566314935684204, + "learning_rate": 7.899188764054777e-06, + "loss": 0.1394, + "step": 5138 + }, + { + "epoch": 0.7400633640552995, + "grad_norm": 1.0645490884780884, + "learning_rate": 7.890940083411599e-06, + "loss": 0.0917, + "step": 5139 + }, + { + "epoch": 0.7402073732718893, + "grad_norm": 1.2144794464111328, + "learning_rate": 7.882694904690358e-06, + "loss": 0.1457, + "step": 5140 + }, + { + "epoch": 0.7403513824884793, + "grad_norm": 0.5165092349052429, + "learning_rate": 7.874453229578696e-06, + "loss": 0.0711, + "step": 5141 + }, + { + "epoch": 0.7404953917050692, + "grad_norm": 0.44213229417800903, + "learning_rate": 7.86621505976353e-06, + "loss": 0.0419, + "step": 5142 + }, + { + "epoch": 0.740639400921659, + "grad_norm": 0.8519138097763062, + "learning_rate": 7.857980396931086e-06, + "loss": 0.099, + "step": 5143 + }, + { + "epoch": 0.7407834101382489, + "grad_norm": 0.6757403016090393, + "learning_rate": 7.849749242766844e-06, + "loss": 0.0798, + "step": 5144 + }, + { + "epoch": 0.7409274193548387, + "grad_norm": 0.7939171195030212, + "learning_rate": 7.841521598955576e-06, + "loss": 0.1007, + "step": 5145 + }, + { + "epoch": 0.7410714285714286, + "grad_norm": 1.087679386138916, + "learning_rate": 7.833297467181336e-06, + "loss": 0.1126, + "step": 5146 + }, + { + "epoch": 0.7412154377880185, + "grad_norm": 0.8373237252235413, + "learning_rate": 7.825076849127458e-06, + "loss": 0.1201, + "step": 5147 + }, + { + "epoch": 0.7413594470046083, + "grad_norm": 0.753200352191925, + "learning_rate": 7.816859746476554e-06, + "loss": 0.0707, + "step": 5148 + }, + { + "epoch": 0.7415034562211982, + "grad_norm": 2.1236417293548584, + "learning_rate": 7.808646160910525e-06, + "loss": 0.2067, + "step": 5149 + }, + { + "epoch": 0.741647465437788, + "grad_norm": 1.0240137577056885, + "learning_rate": 7.800436094110543e-06, + "loss": 0.1139, + "step": 5150 + }, + { + "epoch": 0.7417914746543779, + "grad_norm": 1.221002459526062, + "learning_rate": 7.792229547757065e-06, + "loss": 0.1223, + "step": 5151 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 0.8360595107078552, + "learning_rate": 7.784026523529824e-06, + "loss": 0.0977, + "step": 5152 + }, + { + "epoch": 0.7420794930875576, + "grad_norm": 4.683023929595947, + "learning_rate": 7.775827023107835e-06, + "loss": 0.2167, + "step": 5153 + }, + { + "epoch": 0.7422235023041475, + "grad_norm": 0.9469450116157532, + "learning_rate": 7.767631048169393e-06, + "loss": 0.1126, + "step": 5154 + }, + { + "epoch": 0.7423675115207373, + "grad_norm": 0.9593278765678406, + "learning_rate": 7.759438600392065e-06, + "loss": 0.1545, + "step": 5155 + }, + { + "epoch": 0.7425115207373272, + "grad_norm": 3.4657764434814453, + "learning_rate": 7.751249681452702e-06, + "loss": 2.4307, + "step": 5156 + }, + { + "epoch": 0.742655529953917, + "grad_norm": 0.5459062457084656, + "learning_rate": 7.743064293027433e-06, + "loss": 0.0771, + "step": 5157 + }, + { + "epoch": 0.7427995391705069, + "grad_norm": 0.8837961554527283, + "learning_rate": 7.73488243679166e-06, + "loss": 0.097, + "step": 5158 + }, + { + "epoch": 0.7429435483870968, + "grad_norm": 0.5904355645179749, + "learning_rate": 7.726704114420064e-06, + "loss": 0.0904, + "step": 5159 + }, + { + "epoch": 0.7430875576036866, + "grad_norm": 0.609336793422699, + "learning_rate": 7.718529327586601e-06, + "loss": 0.0785, + "step": 5160 + }, + { + "epoch": 0.7432315668202765, + "grad_norm": 4.734989643096924, + "learning_rate": 7.71035807796451e-06, + "loss": 0.9262, + "step": 5161 + }, + { + "epoch": 0.7433755760368663, + "grad_norm": 6.757307529449463, + "learning_rate": 7.702190367226301e-06, + "loss": 1.7777, + "step": 5162 + }, + { + "epoch": 0.7435195852534562, + "grad_norm": 0.66618812084198, + "learning_rate": 7.694026197043756e-06, + "loss": 0.0786, + "step": 5163 + }, + { + "epoch": 0.743663594470046, + "grad_norm": 4.517458438873291, + "learning_rate": 7.685865569087935e-06, + "loss": 2.115, + "step": 5164 + }, + { + "epoch": 0.7438076036866359, + "grad_norm": 0.9335571527481079, + "learning_rate": 7.677708485029182e-06, + "loss": 0.1138, + "step": 5165 + }, + { + "epoch": 0.7439516129032258, + "grad_norm": 4.167686462402344, + "learning_rate": 7.6695549465371e-06, + "loss": 1.8304, + "step": 5166 + }, + { + "epoch": 0.7440956221198156, + "grad_norm": 1.9542595148086548, + "learning_rate": 7.66140495528058e-06, + "loss": 0.1896, + "step": 5167 + }, + { + "epoch": 0.7442396313364056, + "grad_norm": 3.6489205360412598, + "learning_rate": 7.653258512927778e-06, + "loss": 1.0004, + "step": 5168 + }, + { + "epoch": 0.7443836405529954, + "grad_norm": 0.9295651912689209, + "learning_rate": 7.645115621146115e-06, + "loss": 0.095, + "step": 5169 + }, + { + "epoch": 0.7445276497695853, + "grad_norm": 1.209572196006775, + "learning_rate": 7.63697628160232e-06, + "loss": 0.0915, + "step": 5170 + }, + { + "epoch": 0.7446716589861752, + "grad_norm": 0.6118320226669312, + "learning_rate": 7.628840495962361e-06, + "loss": 0.0777, + "step": 5171 + }, + { + "epoch": 0.744815668202765, + "grad_norm": 1.3106482028961182, + "learning_rate": 7.620708265891488e-06, + "loss": 0.1188, + "step": 5172 + }, + { + "epoch": 0.7449596774193549, + "grad_norm": 1.1879103183746338, + "learning_rate": 7.612579593054225e-06, + "loss": 0.1026, + "step": 5173 + }, + { + "epoch": 0.7451036866359447, + "grad_norm": 0.8158475160598755, + "learning_rate": 7.60445447911437e-06, + "loss": 0.1172, + "step": 5174 + }, + { + "epoch": 0.7452476958525346, + "grad_norm": 0.8713764548301697, + "learning_rate": 7.5963329257349895e-06, + "loss": 0.1025, + "step": 5175 + }, + { + "epoch": 0.7453917050691244, + "grad_norm": 1.0633515119552612, + "learning_rate": 7.588214934578419e-06, + "loss": 0.1287, + "step": 5176 + }, + { + "epoch": 0.7455357142857143, + "grad_norm": 0.8359399437904358, + "learning_rate": 7.5801005073062675e-06, + "loss": 0.0831, + "step": 5177 + }, + { + "epoch": 0.7456797235023042, + "grad_norm": 4.409516334533691, + "learning_rate": 7.571989645579419e-06, + "loss": 1.215, + "step": 5178 + }, + { + "epoch": 0.745823732718894, + "grad_norm": 0.8604900240898132, + "learning_rate": 7.5638823510580215e-06, + "loss": 0.1001, + "step": 5179 + }, + { + "epoch": 0.7459677419354839, + "grad_norm": 0.2948078215122223, + "learning_rate": 7.555778625401494e-06, + "loss": 0.0523, + "step": 5180 + }, + { + "epoch": 0.7461117511520737, + "grad_norm": 1.4474557638168335, + "learning_rate": 7.547678470268526e-06, + "loss": 0.1128, + "step": 5181 + }, + { + "epoch": 0.7462557603686636, + "grad_norm": 0.9530521631240845, + "learning_rate": 7.5395818873170764e-06, + "loss": 0.1026, + "step": 5182 + }, + { + "epoch": 0.7463997695852534, + "grad_norm": 0.5932387113571167, + "learning_rate": 7.531488878204371e-06, + "loss": 0.0674, + "step": 5183 + }, + { + "epoch": 0.7465437788018433, + "grad_norm": 1.1104680299758911, + "learning_rate": 7.523399444586909e-06, + "loss": 0.1326, + "step": 5184 + }, + { + "epoch": 0.7466877880184332, + "grad_norm": 1.0883698463439941, + "learning_rate": 7.515313588120451e-06, + "loss": 0.0979, + "step": 5185 + }, + { + "epoch": 0.746831797235023, + "grad_norm": 0.43003159761428833, + "learning_rate": 7.5072313104600305e-06, + "loss": 0.0521, + "step": 5186 + }, + { + "epoch": 0.7469758064516129, + "grad_norm": 2.0218396186828613, + "learning_rate": 7.4991526132599435e-06, + "loss": 0.2088, + "step": 5187 + }, + { + "epoch": 0.7471198156682027, + "grad_norm": 1.1027439832687378, + "learning_rate": 7.4910774981737625e-06, + "loss": 0.0986, + "step": 5188 + }, + { + "epoch": 0.7472638248847926, + "grad_norm": 0.7492672204971313, + "learning_rate": 7.4830059668543174e-06, + "loss": 0.0873, + "step": 5189 + }, + { + "epoch": 0.7474078341013825, + "grad_norm": 0.8446651101112366, + "learning_rate": 7.474938020953709e-06, + "loss": 0.0714, + "step": 5190 + }, + { + "epoch": 0.7475518433179723, + "grad_norm": 0.5629956126213074, + "learning_rate": 7.4668736621233e-06, + "loss": 0.0829, + "step": 5191 + }, + { + "epoch": 0.7476958525345622, + "grad_norm": 0.9636806845664978, + "learning_rate": 7.458812892013722e-06, + "loss": 0.1062, + "step": 5192 + }, + { + "epoch": 0.747839861751152, + "grad_norm": 0.4932549297809601, + "learning_rate": 7.450755712274879e-06, + "loss": 0.0516, + "step": 5193 + }, + { + "epoch": 0.7479838709677419, + "grad_norm": 0.7291005253791809, + "learning_rate": 7.442702124555925e-06, + "loss": 0.0783, + "step": 5194 + }, + { + "epoch": 0.7481278801843319, + "grad_norm": 0.41483160853385925, + "learning_rate": 7.434652130505293e-06, + "loss": 0.0544, + "step": 5195 + }, + { + "epoch": 0.7482718894009217, + "grad_norm": 1.367686152458191, + "learning_rate": 7.426605731770661e-06, + "loss": 0.1136, + "step": 5196 + }, + { + "epoch": 0.7484158986175116, + "grad_norm": 0.6419558525085449, + "learning_rate": 7.418562929999004e-06, + "loss": 0.0645, + "step": 5197 + }, + { + "epoch": 0.7485599078341014, + "grad_norm": 0.3571107089519501, + "learning_rate": 7.410523726836533e-06, + "loss": 0.0529, + "step": 5198 + }, + { + "epoch": 0.7487039170506913, + "grad_norm": 0.8279479146003723, + "learning_rate": 7.40248812392873e-06, + "loss": 0.0897, + "step": 5199 + }, + { + "epoch": 0.7488479262672811, + "grad_norm": 5.361515998840332, + "learning_rate": 7.39445612292034e-06, + "loss": 1.3097, + "step": 5200 + }, + { + "epoch": 0.748991935483871, + "grad_norm": 0.7612103223800659, + "learning_rate": 7.386427725455372e-06, + "loss": 0.1025, + "step": 5201 + }, + { + "epoch": 0.7491359447004609, + "grad_norm": 7.822725772857666, + "learning_rate": 7.3784029331771e-06, + "loss": 1.9435, + "step": 5202 + }, + { + "epoch": 0.7492799539170507, + "grad_norm": 19.498985290527344, + "learning_rate": 7.3703817477280525e-06, + "loss": 2.4733, + "step": 5203 + }, + { + "epoch": 0.7494239631336406, + "grad_norm": 1.0362223386764526, + "learning_rate": 7.362364170750028e-06, + "loss": 0.1199, + "step": 5204 + }, + { + "epoch": 0.7495679723502304, + "grad_norm": 0.922974169254303, + "learning_rate": 7.354350203884078e-06, + "loss": 0.1273, + "step": 5205 + }, + { + "epoch": 0.7497119815668203, + "grad_norm": 1.1421761512756348, + "learning_rate": 7.3463398487705255e-06, + "loss": 0.1226, + "step": 5206 + }, + { + "epoch": 0.7498559907834101, + "grad_norm": 0.8571959137916565, + "learning_rate": 7.3383331070489446e-06, + "loss": 0.1046, + "step": 5207 + }, + { + "epoch": 0.75, + "grad_norm": 0.635453462600708, + "learning_rate": 7.3303299803581745e-06, + "loss": 0.0583, + "step": 5208 + }, + { + "epoch": 0.7501440092165899, + "grad_norm": 0.7989516854286194, + "learning_rate": 7.3223304703363135e-06, + "loss": 0.0876, + "step": 5209 + }, + { + "epoch": 0.7502880184331797, + "grad_norm": 0.8713171482086182, + "learning_rate": 7.314334578620721e-06, + "loss": 0.1073, + "step": 5210 + }, + { + "epoch": 0.7504320276497696, + "grad_norm": 0.9894455075263977, + "learning_rate": 7.306342306848013e-06, + "loss": 0.2175, + "step": 5211 + }, + { + "epoch": 0.7505760368663594, + "grad_norm": 0.6952599883079529, + "learning_rate": 7.298353656654069e-06, + "loss": 0.0955, + "step": 5212 + }, + { + "epoch": 0.7507200460829493, + "grad_norm": 6.694179534912109, + "learning_rate": 7.2903686296740215e-06, + "loss": 1.7412, + "step": 5213 + }, + { + "epoch": 0.7508640552995391, + "grad_norm": 6.206746578216553, + "learning_rate": 7.282387227542265e-06, + "loss": 1.0479, + "step": 5214 + }, + { + "epoch": 0.751008064516129, + "grad_norm": 1.140989065170288, + "learning_rate": 7.27440945189245e-06, + "loss": 0.0904, + "step": 5215 + }, + { + "epoch": 0.7511520737327189, + "grad_norm": 1.0890384912490845, + "learning_rate": 7.266435304357491e-06, + "loss": 0.1188, + "step": 5216 + }, + { + "epoch": 0.7512960829493087, + "grad_norm": 0.5488396883010864, + "learning_rate": 7.258464786569549e-06, + "loss": 0.0506, + "step": 5217 + }, + { + "epoch": 0.7514400921658986, + "grad_norm": 5.567606449127197, + "learning_rate": 7.25049790016005e-06, + "loss": 2.3805, + "step": 5218 + }, + { + "epoch": 0.7515841013824884, + "grad_norm": 0.8697788119316101, + "learning_rate": 7.242534646759677e-06, + "loss": 0.0958, + "step": 5219 + }, + { + "epoch": 0.7517281105990783, + "grad_norm": 10.158220291137695, + "learning_rate": 7.234575027998367e-06, + "loss": 1.8067, + "step": 5220 + }, + { + "epoch": 0.7518721198156681, + "grad_norm": 4.745046615600586, + "learning_rate": 7.226619045505309e-06, + "loss": 1.5941, + "step": 5221 + }, + { + "epoch": 0.7520161290322581, + "grad_norm": 1.6896940469741821, + "learning_rate": 7.218666700908955e-06, + "loss": 0.1475, + "step": 5222 + }, + { + "epoch": 0.752160138248848, + "grad_norm": 0.9235973358154297, + "learning_rate": 7.210717995836999e-06, + "loss": 0.12, + "step": 5223 + }, + { + "epoch": 0.7523041474654378, + "grad_norm": 1.2874013185501099, + "learning_rate": 7.202772931916421e-06, + "loss": 0.1178, + "step": 5224 + }, + { + "epoch": 0.7524481566820277, + "grad_norm": 0.5031424164772034, + "learning_rate": 7.194831510773423e-06, + "loss": 0.0794, + "step": 5225 + }, + { + "epoch": 0.7525921658986175, + "grad_norm": 0.9236135482788086, + "learning_rate": 7.186893734033473e-06, + "loss": 4.452, + "step": 5226 + }, + { + "epoch": 0.7527361751152074, + "grad_norm": 0.8769807815551758, + "learning_rate": 7.178959603321298e-06, + "loss": 0.0964, + "step": 5227 + }, + { + "epoch": 0.7528801843317973, + "grad_norm": 0.49568620324134827, + "learning_rate": 7.171029120260869e-06, + "loss": 0.0781, + "step": 5228 + }, + { + "epoch": 0.7530241935483871, + "grad_norm": 5.808595180511475, + "learning_rate": 7.16310228647542e-06, + "loss": 2.2394, + "step": 5229 + }, + { + "epoch": 0.753168202764977, + "grad_norm": 0.37017911672592163, + "learning_rate": 7.155179103587428e-06, + "loss": 0.0481, + "step": 5230 + }, + { + "epoch": 0.7533122119815668, + "grad_norm": 1.0182045698165894, + "learning_rate": 7.147259573218634e-06, + "loss": 0.0999, + "step": 5231 + }, + { + "epoch": 0.7534562211981567, + "grad_norm": 0.7926430106163025, + "learning_rate": 7.139343696990025e-06, + "loss": 0.0889, + "step": 5232 + }, + { + "epoch": 0.7536002304147466, + "grad_norm": 0.7172993421554565, + "learning_rate": 7.131431476521838e-06, + "loss": 0.0759, + "step": 5233 + }, + { + "epoch": 0.7537442396313364, + "grad_norm": 1.029604434967041, + "learning_rate": 7.123522913433567e-06, + "loss": 0.0968, + "step": 5234 + }, + { + "epoch": 0.7538882488479263, + "grad_norm": 0.7579464912414551, + "learning_rate": 7.115618009343955e-06, + "loss": 0.0715, + "step": 5235 + }, + { + "epoch": 0.7540322580645161, + "grad_norm": 0.8270736932754517, + "learning_rate": 7.107716765870995e-06, + "loss": 0.1373, + "step": 5236 + }, + { + "epoch": 0.754176267281106, + "grad_norm": 0.7689620852470398, + "learning_rate": 7.099819184631928e-06, + "loss": 0.0636, + "step": 5237 + }, + { + "epoch": 0.7543202764976958, + "grad_norm": 0.5976075530052185, + "learning_rate": 7.091925267243257e-06, + "loss": 0.0681, + "step": 5238 + }, + { + "epoch": 0.7544642857142857, + "grad_norm": 0.9032452702522278, + "learning_rate": 7.084035015320722e-06, + "loss": 0.1037, + "step": 5239 + }, + { + "epoch": 0.7546082949308756, + "grad_norm": 0.723436713218689, + "learning_rate": 7.076148430479321e-06, + "loss": 0.0886, + "step": 5240 + }, + { + "epoch": 0.7547523041474654, + "grad_norm": 6.461106300354004, + "learning_rate": 7.0682655143332945e-06, + "loss": 1.9506, + "step": 5241 + }, + { + "epoch": 0.7548963133640553, + "grad_norm": 1.0118249654769897, + "learning_rate": 7.060386268496141e-06, + "loss": 0.1249, + "step": 5242 + }, + { + "epoch": 0.7550403225806451, + "grad_norm": 0.9937555193901062, + "learning_rate": 7.0525106945805994e-06, + "loss": 0.116, + "step": 5243 + }, + { + "epoch": 0.755184331797235, + "grad_norm": 0.991585373878479, + "learning_rate": 7.04463879419866e-06, + "loss": 0.1139, + "step": 5244 + }, + { + "epoch": 0.7553283410138248, + "grad_norm": 0.918014645576477, + "learning_rate": 7.036770568961562e-06, + "loss": 0.1181, + "step": 5245 + }, + { + "epoch": 0.7554723502304147, + "grad_norm": 1.3015087842941284, + "learning_rate": 7.028906020479795e-06, + "loss": 0.1333, + "step": 5246 + }, + { + "epoch": 0.7556163594470046, + "grad_norm": 4.809826850891113, + "learning_rate": 7.021045150363087e-06, + "loss": 1.3471, + "step": 5247 + }, + { + "epoch": 0.7557603686635944, + "grad_norm": 0.5973480939865112, + "learning_rate": 7.013187960220425e-06, + "loss": 0.0635, + "step": 5248 + }, + { + "epoch": 0.7559043778801844, + "grad_norm": 0.8798346519470215, + "learning_rate": 7.005334451660034e-06, + "loss": 0.1064, + "step": 5249 + }, + { + "epoch": 0.7560483870967742, + "grad_norm": 0.6280069947242737, + "learning_rate": 6.99748462628938e-06, + "loss": 0.0703, + "step": 5250 + }, + { + "epoch": 0.7561923963133641, + "grad_norm": 0.8606945872306824, + "learning_rate": 6.989638485715202e-06, + "loss": 0.0756, + "step": 5251 + }, + { + "epoch": 0.756336405529954, + "grad_norm": 5.73184871673584, + "learning_rate": 6.981796031543456e-06, + "loss": 1.571, + "step": 5252 + }, + { + "epoch": 0.7564804147465438, + "grad_norm": 1.0273476839065552, + "learning_rate": 6.973957265379352e-06, + "loss": 0.1095, + "step": 5253 + }, + { + "epoch": 0.7566244239631337, + "grad_norm": 0.7825251221656799, + "learning_rate": 6.966122188827351e-06, + "loss": 0.0875, + "step": 5254 + }, + { + "epoch": 0.7567684331797235, + "grad_norm": 0.46219635009765625, + "learning_rate": 6.958290803491149e-06, + "loss": 0.0689, + "step": 5255 + }, + { + "epoch": 0.7569124423963134, + "grad_norm": 1.0910745859146118, + "learning_rate": 6.950463110973698e-06, + "loss": 0.1036, + "step": 5256 + }, + { + "epoch": 0.7570564516129032, + "grad_norm": 0.6532211899757385, + "learning_rate": 6.942639112877186e-06, + "loss": 0.0559, + "step": 5257 + }, + { + "epoch": 0.7572004608294931, + "grad_norm": 1.0370570421218872, + "learning_rate": 6.934818810803045e-06, + "loss": 0.0955, + "step": 5258 + }, + { + "epoch": 0.757344470046083, + "grad_norm": 0.9251962304115295, + "learning_rate": 6.927002206351957e-06, + "loss": 0.1181, + "step": 5259 + }, + { + "epoch": 0.7574884792626728, + "grad_norm": 0.941900372505188, + "learning_rate": 6.919189301123835e-06, + "loss": 0.0928, + "step": 5260 + }, + { + "epoch": 0.7576324884792627, + "grad_norm": 1.5216771364212036, + "learning_rate": 6.911380096717851e-06, + "loss": 0.1221, + "step": 5261 + }, + { + "epoch": 0.7577764976958525, + "grad_norm": 0.6096512079238892, + "learning_rate": 6.903574594732407e-06, + "loss": 0.0789, + "step": 5262 + }, + { + "epoch": 0.7579205069124424, + "grad_norm": 1.4737403392791748, + "learning_rate": 6.895772796765151e-06, + "loss": 0.1604, + "step": 5263 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 0.3473174273967743, + "learning_rate": 6.887974704412972e-06, + "loss": 0.0564, + "step": 5264 + }, + { + "epoch": 0.7582085253456221, + "grad_norm": 0.8365328311920166, + "learning_rate": 6.880180319272006e-06, + "loss": 0.0859, + "step": 5265 + }, + { + "epoch": 0.758352534562212, + "grad_norm": 0.773053765296936, + "learning_rate": 6.872389642937621e-06, + "loss": 0.0983, + "step": 5266 + }, + { + "epoch": 0.7584965437788018, + "grad_norm": 0.7565257549285889, + "learning_rate": 6.864602677004431e-06, + "loss": 0.073, + "step": 5267 + }, + { + "epoch": 0.7586405529953917, + "grad_norm": 0.6025420427322388, + "learning_rate": 6.856819423066294e-06, + "loss": 0.0608, + "step": 5268 + }, + { + "epoch": 0.7587845622119815, + "grad_norm": 1.2261170148849487, + "learning_rate": 6.8490398827163015e-06, + "loss": 0.1343, + "step": 5269 + }, + { + "epoch": 0.7589285714285714, + "grad_norm": 0.7005963921546936, + "learning_rate": 6.84126405754679e-06, + "loss": 0.0996, + "step": 5270 + }, + { + "epoch": 0.7590725806451613, + "grad_norm": 0.4194117486476898, + "learning_rate": 6.833491949149329e-06, + "loss": 0.0449, + "step": 5271 + }, + { + "epoch": 0.7592165898617511, + "grad_norm": 4.49902868270874, + "learning_rate": 6.825723559114736e-06, + "loss": 2.0777, + "step": 5272 + }, + { + "epoch": 0.759360599078341, + "grad_norm": 0.6494269967079163, + "learning_rate": 6.817958889033061e-06, + "loss": 0.0905, + "step": 5273 + }, + { + "epoch": 0.7595046082949308, + "grad_norm": 3.314570188522339, + "learning_rate": 6.810197940493596e-06, + "loss": 0.91, + "step": 5274 + }, + { + "epoch": 0.7596486175115207, + "grad_norm": 0.9716746807098389, + "learning_rate": 6.802440715084868e-06, + "loss": 0.0885, + "step": 5275 + }, + { + "epoch": 0.7597926267281107, + "grad_norm": 0.5138939619064331, + "learning_rate": 6.794687214394646e-06, + "loss": 0.0579, + "step": 5276 + }, + { + "epoch": 0.7599366359447005, + "grad_norm": 0.9741175174713135, + "learning_rate": 6.786937440009924e-06, + "loss": 0.1253, + "step": 5277 + }, + { + "epoch": 0.7600806451612904, + "grad_norm": 0.668060302734375, + "learning_rate": 6.779191393516962e-06, + "loss": 0.0829, + "step": 5278 + }, + { + "epoch": 0.7602246543778802, + "grad_norm": 0.6900793313980103, + "learning_rate": 6.7714490765012265e-06, + "loss": 0.078, + "step": 5279 + }, + { + "epoch": 0.7603686635944701, + "grad_norm": 0.8054764866828918, + "learning_rate": 6.76371049054744e-06, + "loss": 0.0863, + "step": 5280 + }, + { + "epoch": 0.7605126728110599, + "grad_norm": 0.6815091371536255, + "learning_rate": 6.7559756372395475e-06, + "loss": 0.0834, + "step": 5281 + }, + { + "epoch": 0.7606566820276498, + "grad_norm": 4.730888843536377, + "learning_rate": 6.74824451816074e-06, + "loss": 2.3007, + "step": 5282 + }, + { + "epoch": 0.7608006912442397, + "grad_norm": 0.7733036875724792, + "learning_rate": 6.7405171348934425e-06, + "loss": 0.0777, + "step": 5283 + }, + { + "epoch": 0.7609447004608295, + "grad_norm": 0.5508599877357483, + "learning_rate": 6.7327934890193095e-06, + "loss": 0.0524, + "step": 5284 + }, + { + "epoch": 0.7610887096774194, + "grad_norm": 1.0533008575439453, + "learning_rate": 6.725073582119235e-06, + "loss": 0.1159, + "step": 5285 + }, + { + "epoch": 0.7612327188940092, + "grad_norm": 1.5758484601974487, + "learning_rate": 6.717357415773351e-06, + "loss": 0.1211, + "step": 5286 + }, + { + "epoch": 0.7613767281105991, + "grad_norm": 0.4455191195011139, + "learning_rate": 6.709644991561017e-06, + "loss": 0.0588, + "step": 5287 + }, + { + "epoch": 0.761520737327189, + "grad_norm": 0.9223089814186096, + "learning_rate": 6.701936311060833e-06, + "loss": 0.0929, + "step": 5288 + }, + { + "epoch": 0.7616647465437788, + "grad_norm": 0.6265849471092224, + "learning_rate": 6.694231375850626e-06, + "loss": 0.0845, + "step": 5289 + }, + { + "epoch": 0.7618087557603687, + "grad_norm": 0.9372100830078125, + "learning_rate": 6.6865301875074614e-06, + "loss": 0.1001, + "step": 5290 + }, + { + "epoch": 0.7619527649769585, + "grad_norm": 0.7364844083786011, + "learning_rate": 6.678832747607636e-06, + "loss": 0.0978, + "step": 5291 + }, + { + "epoch": 0.7620967741935484, + "grad_norm": 1.091269850730896, + "learning_rate": 6.671139057726681e-06, + "loss": 0.176, + "step": 5292 + }, + { + "epoch": 0.7622407834101382, + "grad_norm": 1.5863310098648071, + "learning_rate": 6.663449119439358e-06, + "loss": 0.1721, + "step": 5293 + }, + { + "epoch": 0.7623847926267281, + "grad_norm": 0.6712037324905396, + "learning_rate": 6.6557629343196595e-06, + "loss": 0.091, + "step": 5294 + }, + { + "epoch": 0.762528801843318, + "grad_norm": 0.453029990196228, + "learning_rate": 6.648080503940812e-06, + "loss": 0.053, + "step": 5295 + }, + { + "epoch": 0.7626728110599078, + "grad_norm": 1.3903285264968872, + "learning_rate": 6.640401829875275e-06, + "loss": 0.1175, + "step": 5296 + }, + { + "epoch": 0.7628168202764977, + "grad_norm": 0.9672045707702637, + "learning_rate": 6.6327269136947395e-06, + "loss": 0.0967, + "step": 5297 + }, + { + "epoch": 0.7629608294930875, + "grad_norm": 6.00899600982666, + "learning_rate": 6.625055756970119e-06, + "loss": 2.4114, + "step": 5298 + }, + { + "epoch": 0.7631048387096774, + "grad_norm": 5.179117679595947, + "learning_rate": 6.617388361271567e-06, + "loss": 0.8981, + "step": 5299 + }, + { + "epoch": 0.7632488479262672, + "grad_norm": 1.1031781435012817, + "learning_rate": 6.609724728168465e-06, + "loss": 0.0984, + "step": 5300 + }, + { + "epoch": 0.7633928571428571, + "grad_norm": 0.9210220575332642, + "learning_rate": 6.60206485922942e-06, + "loss": 0.0882, + "step": 5301 + }, + { + "epoch": 0.763536866359447, + "grad_norm": 3.9241695404052734, + "learning_rate": 6.594408756022272e-06, + "loss": 1.0635, + "step": 5302 + }, + { + "epoch": 0.7636808755760369, + "grad_norm": 0.4632914364337921, + "learning_rate": 6.586756420114093e-06, + "loss": 0.0471, + "step": 5303 + }, + { + "epoch": 0.7638248847926268, + "grad_norm": 21.318885803222656, + "learning_rate": 6.57910785307117e-06, + "loss": 2.8122, + "step": 5304 + }, + { + "epoch": 0.7639688940092166, + "grad_norm": 0.9405459761619568, + "learning_rate": 6.571463056459048e-06, + "loss": 0.1051, + "step": 5305 + }, + { + "epoch": 0.7641129032258065, + "grad_norm": 0.5682744383811951, + "learning_rate": 6.5638220318424705e-06, + "loss": 0.067, + "step": 5306 + }, + { + "epoch": 0.7642569124423964, + "grad_norm": 0.7630172371864319, + "learning_rate": 6.556184780785421e-06, + "loss": 0.0896, + "step": 5307 + }, + { + "epoch": 0.7644009216589862, + "grad_norm": 4.888513088226318, + "learning_rate": 6.548551304851111e-06, + "loss": 1.7618, + "step": 5308 + }, + { + "epoch": 0.7645449308755761, + "grad_norm": 0.6683850288391113, + "learning_rate": 6.540921605601977e-06, + "loss": 0.0953, + "step": 5309 + }, + { + "epoch": 0.7646889400921659, + "grad_norm": 0.7905837297439575, + "learning_rate": 6.5332956845996856e-06, + "loss": 0.1082, + "step": 5310 + }, + { + "epoch": 0.7648329493087558, + "grad_norm": 0.8225821256637573, + "learning_rate": 6.525673543405123e-06, + "loss": 0.0913, + "step": 5311 + }, + { + "epoch": 0.7649769585253456, + "grad_norm": 0.30259841680526733, + "learning_rate": 6.518055183578412e-06, + "loss": 0.05, + "step": 5312 + }, + { + "epoch": 0.7651209677419355, + "grad_norm": 0.7458184957504272, + "learning_rate": 6.5104406066788915e-06, + "loss": 0.0763, + "step": 5313 + }, + { + "epoch": 0.7652649769585254, + "grad_norm": 1.7997928857803345, + "learning_rate": 6.5028298142651355e-06, + "loss": 0.145, + "step": 5314 + }, + { + "epoch": 0.7654089861751152, + "grad_norm": 3.9027748107910156, + "learning_rate": 6.495222807894935e-06, + "loss": 2.3711, + "step": 5315 + }, + { + "epoch": 0.7655529953917051, + "grad_norm": 0.8464882373809814, + "learning_rate": 6.48761958912531e-06, + "loss": 0.0775, + "step": 5316 + }, + { + "epoch": 0.7656970046082949, + "grad_norm": 0.8890600204467773, + "learning_rate": 6.480020159512506e-06, + "loss": 0.0945, + "step": 5317 + }, + { + "epoch": 0.7658410138248848, + "grad_norm": 0.7245268821716309, + "learning_rate": 6.472424520611994e-06, + "loss": 0.0799, + "step": 5318 + }, + { + "epoch": 0.7659850230414746, + "grad_norm": 0.9961941242218018, + "learning_rate": 6.46483267397846e-06, + "loss": 0.1917, + "step": 5319 + }, + { + "epoch": 0.7661290322580645, + "grad_norm": 0.5269209146499634, + "learning_rate": 6.4572446211658285e-06, + "loss": 0.0693, + "step": 5320 + }, + { + "epoch": 0.7662730414746544, + "grad_norm": 1.3851406574249268, + "learning_rate": 6.449660363727236e-06, + "loss": 0.1579, + "step": 5321 + }, + { + "epoch": 0.7664170506912442, + "grad_norm": 1.7773683071136475, + "learning_rate": 6.442079903215045e-06, + "loss": 0.1448, + "step": 5322 + }, + { + "epoch": 0.7665610599078341, + "grad_norm": 5.252831935882568, + "learning_rate": 6.434503241180845e-06, + "loss": 1.5068, + "step": 5323 + }, + { + "epoch": 0.7667050691244239, + "grad_norm": 1.6900036334991455, + "learning_rate": 6.426930379175439e-06, + "loss": 0.1396, + "step": 5324 + }, + { + "epoch": 0.7668490783410138, + "grad_norm": 0.7700390815734863, + "learning_rate": 6.419361318748865e-06, + "loss": 0.0754, + "step": 5325 + }, + { + "epoch": 0.7669930875576036, + "grad_norm": 0.7721136808395386, + "learning_rate": 6.41179606145037e-06, + "loss": 0.0878, + "step": 5326 + }, + { + "epoch": 0.7671370967741935, + "grad_norm": 0.5693567991256714, + "learning_rate": 6.404234608828433e-06, + "loss": 0.0655, + "step": 5327 + }, + { + "epoch": 0.7672811059907834, + "grad_norm": 0.3042580187320709, + "learning_rate": 6.396676962430745e-06, + "loss": 0.0426, + "step": 5328 + }, + { + "epoch": 0.7674251152073732, + "grad_norm": 2.3220674991607666, + "learning_rate": 6.389123123804217e-06, + "loss": 0.1785, + "step": 5329 + }, + { + "epoch": 0.7675691244239631, + "grad_norm": 5.566410541534424, + "learning_rate": 6.381573094495003e-06, + "loss": 2.8038, + "step": 5330 + }, + { + "epoch": 0.767713133640553, + "grad_norm": 0.9007775783538818, + "learning_rate": 6.37402687604845e-06, + "loss": 0.0735, + "step": 5331 + }, + { + "epoch": 0.7678571428571429, + "grad_norm": 0.39220622181892395, + "learning_rate": 6.3664844700091375e-06, + "loss": 0.0603, + "step": 5332 + }, + { + "epoch": 0.7680011520737328, + "grad_norm": 0.9887630939483643, + "learning_rate": 6.358945877920861e-06, + "loss": 0.1101, + "step": 5333 + }, + { + "epoch": 0.7681451612903226, + "grad_norm": 0.8976712226867676, + "learning_rate": 6.351411101326641e-06, + "loss": 0.1223, + "step": 5334 + }, + { + "epoch": 0.7682891705069125, + "grad_norm": 5.135114669799805, + "learning_rate": 6.343880141768707e-06, + "loss": 1.9654, + "step": 5335 + }, + { + "epoch": 0.7684331797235023, + "grad_norm": 0.6358884572982788, + "learning_rate": 6.336353000788514e-06, + "loss": 0.0809, + "step": 5336 + }, + { + "epoch": 0.7685771889400922, + "grad_norm": 1.0790034532546997, + "learning_rate": 6.32882967992674e-06, + "loss": 0.1195, + "step": 5337 + }, + { + "epoch": 0.768721198156682, + "grad_norm": 0.6725444793701172, + "learning_rate": 6.321310180723272e-06, + "loss": 0.0828, + "step": 5338 + }, + { + "epoch": 0.7688652073732719, + "grad_norm": 0.6962867379188538, + "learning_rate": 6.313794504717218e-06, + "loss": 0.1113, + "step": 5339 + }, + { + "epoch": 0.7690092165898618, + "grad_norm": 3.232154607772827, + "learning_rate": 6.306282653446907e-06, + "loss": 1.2291, + "step": 5340 + }, + { + "epoch": 0.7691532258064516, + "grad_norm": 6.585620880126953, + "learning_rate": 6.2987746284498774e-06, + "loss": 1.7674, + "step": 5341 + }, + { + "epoch": 0.7692972350230415, + "grad_norm": 4.384052276611328, + "learning_rate": 6.291270431262891e-06, + "loss": 2.4194, + "step": 5342 + }, + { + "epoch": 0.7694412442396313, + "grad_norm": 1.2075700759887695, + "learning_rate": 6.2837700634219285e-06, + "loss": 0.1037, + "step": 5343 + }, + { + "epoch": 0.7695852534562212, + "grad_norm": 0.6206492185592651, + "learning_rate": 6.276273526462176e-06, + "loss": 0.0651, + "step": 5344 + }, + { + "epoch": 0.769729262672811, + "grad_norm": 5.489933013916016, + "learning_rate": 6.268780821918044e-06, + "loss": 1.5762, + "step": 5345 + }, + { + "epoch": 0.7698732718894009, + "grad_norm": 0.28365832567214966, + "learning_rate": 6.261291951323159e-06, + "loss": 0.0518, + "step": 5346 + }, + { + "epoch": 0.7700172811059908, + "grad_norm": 1.1303784847259521, + "learning_rate": 6.253806916210361e-06, + "loss": 0.1263, + "step": 5347 + }, + { + "epoch": 0.7701612903225806, + "grad_norm": 0.8359056711196899, + "learning_rate": 6.2463257181116924e-06, + "loss": 0.0931, + "step": 5348 + }, + { + "epoch": 0.7703052995391705, + "grad_norm": 0.6771881580352783, + "learning_rate": 6.238848358558438e-06, + "loss": 0.0658, + "step": 5349 + }, + { + "epoch": 0.7704493087557603, + "grad_norm": 0.5284289717674255, + "learning_rate": 6.231374839081078e-06, + "loss": 0.0584, + "step": 5350 + }, + { + "epoch": 0.7705933179723502, + "grad_norm": 0.8116962909698486, + "learning_rate": 6.223905161209304e-06, + "loss": 0.1097, + "step": 5351 + }, + { + "epoch": 0.7707373271889401, + "grad_norm": 0.8368083238601685, + "learning_rate": 6.216439326472029e-06, + "loss": 0.0983, + "step": 5352 + }, + { + "epoch": 0.7708813364055299, + "grad_norm": 0.9688600897789001, + "learning_rate": 6.208977336397379e-06, + "loss": 0.0782, + "step": 5353 + }, + { + "epoch": 0.7710253456221198, + "grad_norm": 5.267709732055664, + "learning_rate": 6.2015191925126896e-06, + "loss": 1.7101, + "step": 5354 + }, + { + "epoch": 0.7711693548387096, + "grad_norm": 0.7175696492195129, + "learning_rate": 6.194064896344512e-06, + "loss": 0.0636, + "step": 5355 + }, + { + "epoch": 0.7713133640552995, + "grad_norm": 0.47985726594924927, + "learning_rate": 6.186614449418609e-06, + "loss": 0.0569, + "step": 5356 + }, + { + "epoch": 0.7714573732718893, + "grad_norm": 0.6417503952980042, + "learning_rate": 6.179167853259954e-06, + "loss": 0.0717, + "step": 5357 + }, + { + "epoch": 0.7716013824884793, + "grad_norm": 0.8344873189926147, + "learning_rate": 6.1717251093927345e-06, + "loss": 0.0836, + "step": 5358 + }, + { + "epoch": 0.7717453917050692, + "grad_norm": 1.3324946165084839, + "learning_rate": 6.164286219340346e-06, + "loss": 0.104, + "step": 5359 + }, + { + "epoch": 0.771889400921659, + "grad_norm": 5.238208770751953, + "learning_rate": 6.156851184625401e-06, + "loss": 1.3447, + "step": 5360 + }, + { + "epoch": 0.7720334101382489, + "grad_norm": 0.6727806329727173, + "learning_rate": 6.149420006769718e-06, + "loss": 0.0561, + "step": 5361 + }, + { + "epoch": 0.7721774193548387, + "grad_norm": 0.6232426166534424, + "learning_rate": 6.141992687294329e-06, + "loss": 0.0719, + "step": 5362 + }, + { + "epoch": 0.7723214285714286, + "grad_norm": 5.165831565856934, + "learning_rate": 6.134569227719475e-06, + "loss": 1.3882, + "step": 5363 + }, + { + "epoch": 0.7724654377880185, + "grad_norm": 0.7302985787391663, + "learning_rate": 6.127149629564605e-06, + "loss": 0.0877, + "step": 5364 + }, + { + "epoch": 0.7726094470046083, + "grad_norm": 0.5442237854003906, + "learning_rate": 6.119733894348378e-06, + "loss": 0.0518, + "step": 5365 + }, + { + "epoch": 0.7727534562211982, + "grad_norm": 3.7198894023895264, + "learning_rate": 6.112322023588668e-06, + "loss": 1.4858, + "step": 5366 + }, + { + "epoch": 0.772897465437788, + "grad_norm": 1.2653369903564453, + "learning_rate": 6.104914018802546e-06, + "loss": 0.1393, + "step": 5367 + }, + { + "epoch": 0.7730414746543779, + "grad_norm": 0.7023096084594727, + "learning_rate": 6.097509881506311e-06, + "loss": 0.0923, + "step": 5368 + }, + { + "epoch": 0.7731854838709677, + "grad_norm": 0.828292191028595, + "learning_rate": 6.090109613215456e-06, + "loss": 0.0941, + "step": 5369 + }, + { + "epoch": 0.7733294930875576, + "grad_norm": 0.5951153039932251, + "learning_rate": 6.08271321544468e-06, + "loss": 0.0484, + "step": 5370 + }, + { + "epoch": 0.7734735023041475, + "grad_norm": 0.2763633728027344, + "learning_rate": 6.075320689707898e-06, + "loss": 0.0489, + "step": 5371 + }, + { + "epoch": 0.7736175115207373, + "grad_norm": 0.7028306722640991, + "learning_rate": 6.067932037518228e-06, + "loss": 0.0735, + "step": 5372 + }, + { + "epoch": 0.7737615207373272, + "grad_norm": 0.7882200479507446, + "learning_rate": 6.060547260387997e-06, + "loss": 0.0846, + "step": 5373 + }, + { + "epoch": 0.773905529953917, + "grad_norm": 1.130556583404541, + "learning_rate": 6.053166359828741e-06, + "loss": 0.082, + "step": 5374 + }, + { + "epoch": 0.7740495391705069, + "grad_norm": 1.150143027305603, + "learning_rate": 6.045789337351193e-06, + "loss": 0.1038, + "step": 5375 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.5490731596946716, + "learning_rate": 6.038416194465305e-06, + "loss": 0.0714, + "step": 5376 + }, + { + "epoch": 0.7743375576036866, + "grad_norm": 1.164311170578003, + "learning_rate": 6.031046932680229e-06, + "loss": 0.1095, + "step": 5377 + }, + { + "epoch": 0.7744815668202765, + "grad_norm": 1.4771367311477661, + "learning_rate": 6.023681553504318e-06, + "loss": 3.9169, + "step": 5378 + }, + { + "epoch": 0.7746255760368663, + "grad_norm": 0.7707672119140625, + "learning_rate": 6.016320058445138e-06, + "loss": 0.0798, + "step": 5379 + }, + { + "epoch": 0.7747695852534562, + "grad_norm": 0.6281468272209167, + "learning_rate": 6.008962449009456e-06, + "loss": 0.0901, + "step": 5380 + }, + { + "epoch": 0.774913594470046, + "grad_norm": 1.9250162839889526, + "learning_rate": 6.001608726703245e-06, + "loss": 0.1661, + "step": 5381 + }, + { + "epoch": 0.7750576036866359, + "grad_norm": 6.752156734466553, + "learning_rate": 5.994258893031679e-06, + "loss": 1.927, + "step": 5382 + }, + { + "epoch": 0.7752016129032258, + "grad_norm": 0.813305675983429, + "learning_rate": 5.986912949499146e-06, + "loss": 0.0659, + "step": 5383 + }, + { + "epoch": 0.7753456221198156, + "grad_norm": 1.4100722074508667, + "learning_rate": 5.979570897609224e-06, + "loss": 0.1431, + "step": 5384 + }, + { + "epoch": 0.7754896313364056, + "grad_norm": 0.6267372965812683, + "learning_rate": 5.972232738864702e-06, + "loss": 0.0808, + "step": 5385 + }, + { + "epoch": 0.7756336405529954, + "grad_norm": 3.104781150817871, + "learning_rate": 5.9648984747675665e-06, + "loss": 1.0693, + "step": 5386 + }, + { + "epoch": 0.7757776497695853, + "grad_norm": 0.9029744267463684, + "learning_rate": 5.9575681068190254e-06, + "loss": 0.1074, + "step": 5387 + }, + { + "epoch": 0.7759216589861752, + "grad_norm": 0.7970690727233887, + "learning_rate": 5.9502416365194684e-06, + "loss": 0.0829, + "step": 5388 + }, + { + "epoch": 0.776065668202765, + "grad_norm": 0.5078460574150085, + "learning_rate": 5.9429190653684935e-06, + "loss": 0.0505, + "step": 5389 + }, + { + "epoch": 0.7762096774193549, + "grad_norm": 0.44675347208976746, + "learning_rate": 5.935600394864901e-06, + "loss": 0.0573, + "step": 5390 + }, + { + "epoch": 0.7763536866359447, + "grad_norm": 1.2546271085739136, + "learning_rate": 5.928285626506697e-06, + "loss": 0.0902, + "step": 5391 + }, + { + "epoch": 0.7764976958525346, + "grad_norm": 5.971429347991943, + "learning_rate": 5.920974761791079e-06, + "loss": 0.9328, + "step": 5392 + }, + { + "epoch": 0.7766417050691244, + "grad_norm": 0.9156694412231445, + "learning_rate": 5.9136678022144566e-06, + "loss": 0.0801, + "step": 5393 + }, + { + "epoch": 0.7767857142857143, + "grad_norm": 1.7245357036590576, + "learning_rate": 5.90636474927243e-06, + "loss": 0.1298, + "step": 5394 + }, + { + "epoch": 0.7769297235023042, + "grad_norm": 0.8955510258674622, + "learning_rate": 5.899065604459814e-06, + "loss": 0.0923, + "step": 5395 + }, + { + "epoch": 0.777073732718894, + "grad_norm": 0.8358110189437866, + "learning_rate": 5.891770369270605e-06, + "loss": 0.0744, + "step": 5396 + }, + { + "epoch": 0.7772177419354839, + "grad_norm": 0.9973549246788025, + "learning_rate": 5.884479045198013e-06, + "loss": 0.0724, + "step": 5397 + }, + { + "epoch": 0.7773617511520737, + "grad_norm": 0.8483684659004211, + "learning_rate": 5.877191633734444e-06, + "loss": 0.0823, + "step": 5398 + }, + { + "epoch": 0.7775057603686636, + "grad_norm": 4.032780170440674, + "learning_rate": 5.8699081363714995e-06, + "loss": 1.7818, + "step": 5399 + }, + { + "epoch": 0.7776497695852534, + "grad_norm": 0.8275244832038879, + "learning_rate": 5.8626285545999835e-06, + "loss": 0.0926, + "step": 5400 + }, + { + "epoch": 0.7777937788018433, + "grad_norm": 4.289844512939453, + "learning_rate": 5.8553528899098984e-06, + "loss": 2.3665, + "step": 5401 + }, + { + "epoch": 0.7779377880184332, + "grad_norm": 0.7958599328994751, + "learning_rate": 5.848081143790446e-06, + "loss": 4.405, + "step": 5402 + }, + { + "epoch": 0.778081797235023, + "grad_norm": 0.9775769114494324, + "learning_rate": 5.840813317730018e-06, + "loss": 0.1715, + "step": 5403 + }, + { + "epoch": 0.7782258064516129, + "grad_norm": 0.6907666921615601, + "learning_rate": 5.833549413216216e-06, + "loss": 0.1001, + "step": 5404 + }, + { + "epoch": 0.7783698156682027, + "grad_norm": 1.3794169425964355, + "learning_rate": 5.826289431735832e-06, + "loss": 0.1229, + "step": 5405 + }, + { + "epoch": 0.7785138248847926, + "grad_norm": 0.7126482129096985, + "learning_rate": 5.819033374774851e-06, + "loss": 0.0946, + "step": 5406 + }, + { + "epoch": 0.7786578341013825, + "grad_norm": 0.5441449284553528, + "learning_rate": 5.811781243818465e-06, + "loss": 0.082, + "step": 5407 + }, + { + "epoch": 0.7788018433179723, + "grad_norm": 0.5492852926254272, + "learning_rate": 5.804533040351051e-06, + "loss": 0.0463, + "step": 5408 + }, + { + "epoch": 0.7789458525345622, + "grad_norm": 2.1019694805145264, + "learning_rate": 5.7972887658561955e-06, + "loss": 0.195, + "step": 5409 + }, + { + "epoch": 0.779089861751152, + "grad_norm": 2.8671226501464844, + "learning_rate": 5.790048421816668e-06, + "loss": 1.7315, + "step": 5410 + }, + { + "epoch": 0.7792338709677419, + "grad_norm": 0.7898489832878113, + "learning_rate": 5.7828120097144416e-06, + "loss": 0.0841, + "step": 5411 + }, + { + "epoch": 0.7793778801843319, + "grad_norm": 3.3308849334716797, + "learning_rate": 5.7755795310306785e-06, + "loss": 1.4174, + "step": 5412 + }, + { + "epoch": 0.7795218894009217, + "grad_norm": 0.6697842478752136, + "learning_rate": 5.768350987245735e-06, + "loss": 0.0799, + "step": 5413 + }, + { + "epoch": 0.7796658986175116, + "grad_norm": 3.9006760120391846, + "learning_rate": 5.76112637983918e-06, + "loss": 1.518, + "step": 5414 + }, + { + "epoch": 0.7798099078341014, + "grad_norm": 0.9804200530052185, + "learning_rate": 5.753905710289756e-06, + "loss": 0.0838, + "step": 5415 + }, + { + "epoch": 0.7799539170506913, + "grad_norm": 0.8499330878257751, + "learning_rate": 5.746688980075404e-06, + "loss": 0.0833, + "step": 5416 + }, + { + "epoch": 0.7800979262672811, + "grad_norm": 1.0896598100662231, + "learning_rate": 5.739476190673265e-06, + "loss": 0.0996, + "step": 5417 + }, + { + "epoch": 0.780241935483871, + "grad_norm": 1.4857573509216309, + "learning_rate": 5.732267343559666e-06, + "loss": 0.1266, + "step": 5418 + }, + { + "epoch": 0.7803859447004609, + "grad_norm": 0.9323676824569702, + "learning_rate": 5.72506244021013e-06, + "loss": 0.0682, + "step": 5419 + }, + { + "epoch": 0.7805299539170507, + "grad_norm": 0.7665029168128967, + "learning_rate": 5.717861482099376e-06, + "loss": 0.0816, + "step": 5420 + }, + { + "epoch": 0.7806739631336406, + "grad_norm": 1.3643512725830078, + "learning_rate": 5.710664470701313e-06, + "loss": 0.1283, + "step": 5421 + }, + { + "epoch": 0.7808179723502304, + "grad_norm": 0.698563277721405, + "learning_rate": 5.7034714074890385e-06, + "loss": 0.0741, + "step": 5422 + }, + { + "epoch": 0.7809619815668203, + "grad_norm": 2.1237213611602783, + "learning_rate": 5.696282293934848e-06, + "loss": 0.128, + "step": 5423 + }, + { + "epoch": 0.7811059907834101, + "grad_norm": 1.355142593383789, + "learning_rate": 5.689097131510224e-06, + "loss": 0.1331, + "step": 5424 + }, + { + "epoch": 0.78125, + "grad_norm": 0.960864245891571, + "learning_rate": 5.681915921685846e-06, + "loss": 0.142, + "step": 5425 + }, + { + "epoch": 0.7813940092165899, + "grad_norm": 3.7930221557617188, + "learning_rate": 5.674738665931575e-06, + "loss": 2.2534, + "step": 5426 + }, + { + "epoch": 0.7815380184331797, + "grad_norm": 3.6577372550964355, + "learning_rate": 5.667565365716473e-06, + "loss": 2.1246, + "step": 5427 + }, + { + "epoch": 0.7816820276497696, + "grad_norm": 0.7232699990272522, + "learning_rate": 5.6603960225087875e-06, + "loss": 0.0683, + "step": 5428 + }, + { + "epoch": 0.7818260368663594, + "grad_norm": 0.8169407248497009, + "learning_rate": 5.653230637775953e-06, + "loss": 0.0635, + "step": 5429 + }, + { + "epoch": 0.7819700460829493, + "grad_norm": 0.9431648850440979, + "learning_rate": 5.646069212984598e-06, + "loss": 0.0803, + "step": 5430 + }, + { + "epoch": 0.7821140552995391, + "grad_norm": 1.2387700080871582, + "learning_rate": 5.638911749600543e-06, + "loss": 0.0982, + "step": 5431 + }, + { + "epoch": 0.782258064516129, + "grad_norm": 3.8298699855804443, + "learning_rate": 5.6317582490887865e-06, + "loss": 0.4484, + "step": 5432 + }, + { + "epoch": 0.7824020737327189, + "grad_norm": 1.0303114652633667, + "learning_rate": 5.6246087129135315e-06, + "loss": 0.1254, + "step": 5433 + }, + { + "epoch": 0.7825460829493087, + "grad_norm": 0.9839345812797546, + "learning_rate": 5.617463142538159e-06, + "loss": 0.1003, + "step": 5434 + }, + { + "epoch": 0.7826900921658986, + "grad_norm": 0.8259469866752625, + "learning_rate": 5.610321539425239e-06, + "loss": 0.1086, + "step": 5435 + }, + { + "epoch": 0.7828341013824884, + "grad_norm": 3.16471791267395, + "learning_rate": 5.6031839050365354e-06, + "loss": 3.2267, + "step": 5436 + }, + { + "epoch": 0.7829781105990783, + "grad_norm": 3.131964683532715, + "learning_rate": 5.5960502408329896e-06, + "loss": 1.5057, + "step": 5437 + }, + { + "epoch": 0.7831221198156681, + "grad_norm": 3.5779812335968018, + "learning_rate": 5.588920548274742e-06, + "loss": 2.0239, + "step": 5438 + }, + { + "epoch": 0.7832661290322581, + "grad_norm": 3.458552360534668, + "learning_rate": 5.58179482882111e-06, + "loss": 2.4492, + "step": 5439 + }, + { + "epoch": 0.783410138248848, + "grad_norm": 0.9859439134597778, + "learning_rate": 5.574673083930601e-06, + "loss": 0.1182, + "step": 5440 + }, + { + "epoch": 0.7835541474654378, + "grad_norm": 1.1449369192123413, + "learning_rate": 5.567555315060918e-06, + "loss": 0.1277, + "step": 5441 + }, + { + "epoch": 0.7836981566820277, + "grad_norm": 1.1369670629501343, + "learning_rate": 5.560441523668941e-06, + "loss": 0.1256, + "step": 5442 + }, + { + "epoch": 0.7838421658986175, + "grad_norm": 0.299077570438385, + "learning_rate": 5.553331711210733e-06, + "loss": 0.0413, + "step": 5443 + }, + { + "epoch": 0.7839861751152074, + "grad_norm": 0.6299145817756653, + "learning_rate": 5.546225879141548e-06, + "loss": 0.0636, + "step": 5444 + }, + { + "epoch": 0.7841301843317973, + "grad_norm": 0.6620802283287048, + "learning_rate": 5.539124028915826e-06, + "loss": 3.9919, + "step": 5445 + }, + { + "epoch": 0.7842741935483871, + "grad_norm": 0.6989325284957886, + "learning_rate": 5.532026161987189e-06, + "loss": 0.0821, + "step": 5446 + }, + { + "epoch": 0.784418202764977, + "grad_norm": 0.9361629486083984, + "learning_rate": 5.524932279808442e-06, + "loss": 0.1817, + "step": 5447 + }, + { + "epoch": 0.7845622119815668, + "grad_norm": 0.7521358132362366, + "learning_rate": 5.517842383831581e-06, + "loss": 0.0748, + "step": 5448 + }, + { + "epoch": 0.7847062211981567, + "grad_norm": 0.9303933382034302, + "learning_rate": 5.510756475507783e-06, + "loss": 0.1131, + "step": 5449 + }, + { + "epoch": 0.7848502304147466, + "grad_norm": 1.2272651195526123, + "learning_rate": 5.503674556287405e-06, + "loss": 0.1064, + "step": 5450 + }, + { + "epoch": 0.7849942396313364, + "grad_norm": 0.6073123216629028, + "learning_rate": 5.496596627619991e-06, + "loss": 0.0709, + "step": 5451 + }, + { + "epoch": 0.7851382488479263, + "grad_norm": 3.6347131729125977, + "learning_rate": 5.489522690954269e-06, + "loss": 1.5997, + "step": 5452 + }, + { + "epoch": 0.7852822580645161, + "grad_norm": 0.4048953354358673, + "learning_rate": 5.482452747738148e-06, + "loss": 0.0509, + "step": 5453 + }, + { + "epoch": 0.785426267281106, + "grad_norm": 0.4562678337097168, + "learning_rate": 5.475386799418722e-06, + "loss": 0.0735, + "step": 5454 + }, + { + "epoch": 0.7855702764976958, + "grad_norm": 0.8803917169570923, + "learning_rate": 5.46832484744226e-06, + "loss": 0.0928, + "step": 5455 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.7932612895965576, + "learning_rate": 5.461266893254227e-06, + "loss": 0.0843, + "step": 5456 + }, + { + "epoch": 0.7858582949308756, + "grad_norm": 0.8562942743301392, + "learning_rate": 5.454212938299255e-06, + "loss": 0.1153, + "step": 5457 + }, + { + "epoch": 0.7860023041474654, + "grad_norm": 0.6788212060928345, + "learning_rate": 5.447162984021167e-06, + "loss": 0.0831, + "step": 5458 + }, + { + "epoch": 0.7861463133640553, + "grad_norm": 0.7967512011528015, + "learning_rate": 5.440117031862965e-06, + "loss": 0.0904, + "step": 5459 + }, + { + "epoch": 0.7862903225806451, + "grad_norm": 0.2944360077381134, + "learning_rate": 5.433075083266828e-06, + "loss": 0.038, + "step": 5460 + }, + { + "epoch": 0.786434331797235, + "grad_norm": 0.6663317680358887, + "learning_rate": 5.4260371396741175e-06, + "loss": 0.0887, + "step": 5461 + }, + { + "epoch": 0.7865783410138248, + "grad_norm": 0.352446049451828, + "learning_rate": 5.419003202525377e-06, + "loss": 0.047, + "step": 5462 + }, + { + "epoch": 0.7867223502304147, + "grad_norm": 1.0499064922332764, + "learning_rate": 5.411973273260332e-06, + "loss": 3.9255, + "step": 5463 + }, + { + "epoch": 0.7868663594470046, + "grad_norm": 6.2898406982421875, + "learning_rate": 5.4049473533178794e-06, + "loss": 1.5797, + "step": 5464 + }, + { + "epoch": 0.7870103686635944, + "grad_norm": 0.683993399143219, + "learning_rate": 5.397925444136106e-06, + "loss": 0.0615, + "step": 5465 + }, + { + "epoch": 0.7871543778801844, + "grad_norm": 3.7903244495391846, + "learning_rate": 5.390907547152271e-06, + "loss": 1.2326, + "step": 5466 + }, + { + "epoch": 0.7872983870967742, + "grad_norm": 0.6513239741325378, + "learning_rate": 5.383893663802806e-06, + "loss": 0.0624, + "step": 5467 + }, + { + "epoch": 0.7874423963133641, + "grad_norm": 0.8120680451393127, + "learning_rate": 5.376883795523338e-06, + "loss": 0.0834, + "step": 5468 + }, + { + "epoch": 0.787586405529954, + "grad_norm": 0.6615743637084961, + "learning_rate": 5.369877943748666e-06, + "loss": 0.076, + "step": 5469 + }, + { + "epoch": 0.7877304147465438, + "grad_norm": 0.6682198643684387, + "learning_rate": 5.362876109912756e-06, + "loss": 0.0909, + "step": 5470 + }, + { + "epoch": 0.7878744239631337, + "grad_norm": 1.2267130613327026, + "learning_rate": 5.355878295448763e-06, + "loss": 0.0912, + "step": 5471 + }, + { + "epoch": 0.7880184331797235, + "grad_norm": 4.0855021476745605, + "learning_rate": 5.348884501789015e-06, + "loss": 2.5259, + "step": 5472 + }, + { + "epoch": 0.7881624423963134, + "grad_norm": 0.8121361136436462, + "learning_rate": 5.3418947303650185e-06, + "loss": 4.3892, + "step": 5473 + }, + { + "epoch": 0.7883064516129032, + "grad_norm": 0.7022048830986023, + "learning_rate": 5.334908982607456e-06, + "loss": 0.0883, + "step": 5474 + }, + { + "epoch": 0.7884504608294931, + "grad_norm": 0.8731916546821594, + "learning_rate": 5.327927259946183e-06, + "loss": 0.0969, + "step": 5475 + }, + { + "epoch": 0.788594470046083, + "grad_norm": 0.7854814529418945, + "learning_rate": 5.32094956381024e-06, + "loss": 0.1022, + "step": 5476 + }, + { + "epoch": 0.7887384792626728, + "grad_norm": 3.2081053256988525, + "learning_rate": 5.3139758956278375e-06, + "loss": 2.5978, + "step": 5477 + }, + { + "epoch": 0.7888824884792627, + "grad_norm": 11.754436492919922, + "learning_rate": 5.307006256826358e-06, + "loss": 2.3818, + "step": 5478 + }, + { + "epoch": 0.7890264976958525, + "grad_norm": 0.9437799453735352, + "learning_rate": 5.300040648832363e-06, + "loss": 0.0856, + "step": 5479 + }, + { + "epoch": 0.7891705069124424, + "grad_norm": 2.5209240913391113, + "learning_rate": 5.293079073071597e-06, + "loss": 0.1772, + "step": 5480 + }, + { + "epoch": 0.7893145161290323, + "grad_norm": 1.5827209949493408, + "learning_rate": 5.2861215309689625e-06, + "loss": 0.1572, + "step": 5481 + }, + { + "epoch": 0.7894585253456221, + "grad_norm": 0.5701189637184143, + "learning_rate": 5.27916802394855e-06, + "loss": 0.0556, + "step": 5482 + }, + { + "epoch": 0.789602534562212, + "grad_norm": 0.9499281644821167, + "learning_rate": 5.2722185534336165e-06, + "loss": 0.0744, + "step": 5483 + }, + { + "epoch": 0.7897465437788018, + "grad_norm": 0.9440422058105469, + "learning_rate": 5.2652731208466e-06, + "loss": 0.0742, + "step": 5484 + }, + { + "epoch": 0.7898905529953917, + "grad_norm": 0.36454612016677856, + "learning_rate": 5.258331727609103e-06, + "loss": 0.0517, + "step": 5485 + }, + { + "epoch": 0.7900345622119815, + "grad_norm": 0.53998202085495, + "learning_rate": 5.25139437514191e-06, + "loss": 0.0527, + "step": 5486 + }, + { + "epoch": 0.7901785714285714, + "grad_norm": 0.26299113035202026, + "learning_rate": 5.244461064864972e-06, + "loss": 0.0492, + "step": 5487 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 0.7024050354957581, + "learning_rate": 5.237531798197415e-06, + "loss": 0.0733, + "step": 5488 + }, + { + "epoch": 0.7904665898617511, + "grad_norm": 0.875095784664154, + "learning_rate": 5.23060657655754e-06, + "loss": 0.1032, + "step": 5489 + }, + { + "epoch": 0.790610599078341, + "grad_norm": 4.119052886962891, + "learning_rate": 5.2236854013628125e-06, + "loss": 1.6109, + "step": 5490 + }, + { + "epoch": 0.7907546082949308, + "grad_norm": 0.7426705360412598, + "learning_rate": 5.216768274029879e-06, + "loss": 0.0735, + "step": 5491 + }, + { + "epoch": 0.7908986175115207, + "grad_norm": 0.8335259556770325, + "learning_rate": 5.2098551959745504e-06, + "loss": 0.1336, + "step": 5492 + }, + { + "epoch": 0.7910426267281107, + "grad_norm": 1.6811248064041138, + "learning_rate": 5.202946168611811e-06, + "loss": 3.9972, + "step": 5493 + }, + { + "epoch": 0.7911866359447005, + "grad_norm": 5.817631721496582, + "learning_rate": 5.196041193355814e-06, + "loss": 0.8981, + "step": 5494 + }, + { + "epoch": 0.7913306451612904, + "grad_norm": 3.7086904048919678, + "learning_rate": 5.1891402716198935e-06, + "loss": 1.2512, + "step": 5495 + }, + { + "epoch": 0.7914746543778802, + "grad_norm": 4.038156986236572, + "learning_rate": 5.1822434048165444e-06, + "loss": 2.4079, + "step": 5496 + }, + { + "epoch": 0.7916186635944701, + "grad_norm": 0.5126023888587952, + "learning_rate": 5.175350594357431e-06, + "loss": 0.0665, + "step": 5497 + }, + { + "epoch": 0.7917626728110599, + "grad_norm": 4.493897438049316, + "learning_rate": 5.168461841653388e-06, + "loss": 1.3376, + "step": 5498 + }, + { + "epoch": 0.7919066820276498, + "grad_norm": 1.0445137023925781, + "learning_rate": 5.161577148114427e-06, + "loss": 0.0999, + "step": 5499 + }, + { + "epoch": 0.7920506912442397, + "grad_norm": 0.5159420967102051, + "learning_rate": 5.154696515149715e-06, + "loss": 0.0872, + "step": 5500 + }, + { + "epoch": 0.7921947004608295, + "grad_norm": 0.8466431498527527, + "learning_rate": 5.147819944167604e-06, + "loss": 0.1055, + "step": 5501 + }, + { + "epoch": 0.7923387096774194, + "grad_norm": 0.664910078048706, + "learning_rate": 5.1409474365755994e-06, + "loss": 0.0708, + "step": 5502 + }, + { + "epoch": 0.7924827188940092, + "grad_norm": 5.085620403289795, + "learning_rate": 5.134078993780386e-06, + "loss": 1.7534, + "step": 5503 + }, + { + "epoch": 0.7926267281105991, + "grad_norm": 0.7765867114067078, + "learning_rate": 5.1272146171878115e-06, + "loss": 0.0573, + "step": 5504 + }, + { + "epoch": 0.792770737327189, + "grad_norm": 5.3642754554748535, + "learning_rate": 5.120354308202893e-06, + "loss": 2.6839, + "step": 5505 + }, + { + "epoch": 0.7929147465437788, + "grad_norm": 0.587340235710144, + "learning_rate": 5.113498068229813e-06, + "loss": 0.0651, + "step": 5506 + }, + { + "epoch": 0.7930587557603687, + "grad_norm": 0.5974069237709045, + "learning_rate": 5.106645898671921e-06, + "loss": 0.0718, + "step": 5507 + }, + { + "epoch": 0.7932027649769585, + "grad_norm": 0.9098638296127319, + "learning_rate": 5.099797800931741e-06, + "loss": 0.1101, + "step": 5508 + }, + { + "epoch": 0.7933467741935484, + "grad_norm": 0.5567060708999634, + "learning_rate": 5.092953776410953e-06, + "loss": 0.0742, + "step": 5509 + }, + { + "epoch": 0.7934907834101382, + "grad_norm": 0.933390200138092, + "learning_rate": 5.086113826510408e-06, + "loss": 0.0969, + "step": 5510 + }, + { + "epoch": 0.7936347926267281, + "grad_norm": 0.8874287009239197, + "learning_rate": 5.079277952630123e-06, + "loss": 4.0262, + "step": 5511 + }, + { + "epoch": 0.793778801843318, + "grad_norm": 4.305150985717773, + "learning_rate": 5.072446156169283e-06, + "loss": 0.9489, + "step": 5512 + }, + { + "epoch": 0.7939228110599078, + "grad_norm": 1.0666857957839966, + "learning_rate": 5.0656184385262315e-06, + "loss": 0.1684, + "step": 5513 + }, + { + "epoch": 0.7940668202764977, + "grad_norm": 3.8498971462249756, + "learning_rate": 5.058794801098482e-06, + "loss": 1.9206, + "step": 5514 + }, + { + "epoch": 0.7942108294930875, + "grad_norm": 0.5848127603530884, + "learning_rate": 5.051975245282717e-06, + "loss": 0.054, + "step": 5515 + }, + { + "epoch": 0.7943548387096774, + "grad_norm": 0.5936590433120728, + "learning_rate": 5.045159772474775e-06, + "loss": 0.0818, + "step": 5516 + }, + { + "epoch": 0.7944988479262672, + "grad_norm": 10.31969165802002, + "learning_rate": 5.038348384069663e-06, + "loss": 2.7953, + "step": 5517 + }, + { + "epoch": 0.7946428571428571, + "grad_norm": 0.4784112274646759, + "learning_rate": 5.031541081461552e-06, + "loss": 0.0556, + "step": 5518 + }, + { + "epoch": 0.794786866359447, + "grad_norm": 1.1941205263137817, + "learning_rate": 5.02473786604378e-06, + "loss": 0.1103, + "step": 5519 + }, + { + "epoch": 0.7949308755760369, + "grad_norm": 0.8528487682342529, + "learning_rate": 5.017938739208838e-06, + "loss": 4.035, + "step": 5520 + }, + { + "epoch": 0.7950748847926268, + "grad_norm": 0.5896127223968506, + "learning_rate": 5.011143702348387e-06, + "loss": 0.0767, + "step": 5521 + }, + { + "epoch": 0.7952188940092166, + "grad_norm": 0.8014258146286011, + "learning_rate": 5.004352756853259e-06, + "loss": 0.0927, + "step": 5522 + }, + { + "epoch": 0.7953629032258065, + "grad_norm": 1.1266649961471558, + "learning_rate": 4.997565904113438e-06, + "loss": 0.1093, + "step": 5523 + }, + { + "epoch": 0.7955069124423964, + "grad_norm": 0.49781519174575806, + "learning_rate": 4.990783145518069e-06, + "loss": 0.0632, + "step": 5524 + }, + { + "epoch": 0.7956509216589862, + "grad_norm": 0.700327455997467, + "learning_rate": 4.984004482455465e-06, + "loss": 0.0768, + "step": 5525 + }, + { + "epoch": 0.7957949308755761, + "grad_norm": 0.5601661205291748, + "learning_rate": 4.977229916313097e-06, + "loss": 0.0748, + "step": 5526 + }, + { + "epoch": 0.7959389400921659, + "grad_norm": 0.7448676228523254, + "learning_rate": 4.970459448477602e-06, + "loss": 0.0764, + "step": 5527 + }, + { + "epoch": 0.7960829493087558, + "grad_norm": 1.1759289503097534, + "learning_rate": 4.963693080334772e-06, + "loss": 0.1366, + "step": 5528 + }, + { + "epoch": 0.7962269585253456, + "grad_norm": 1.0650155544281006, + "learning_rate": 4.956930813269564e-06, + "loss": 0.1003, + "step": 5529 + }, + { + "epoch": 0.7963709677419355, + "grad_norm": 0.9334713816642761, + "learning_rate": 4.9501726486660935e-06, + "loss": 0.1161, + "step": 5530 + }, + { + "epoch": 0.7965149769585254, + "grad_norm": 0.8954907059669495, + "learning_rate": 4.943418587907636e-06, + "loss": 0.1175, + "step": 5531 + }, + { + "epoch": 0.7966589861751152, + "grad_norm": 0.521531343460083, + "learning_rate": 4.936668632376632e-06, + "loss": 0.0677, + "step": 5532 + }, + { + "epoch": 0.7968029953917051, + "grad_norm": 0.46486952900886536, + "learning_rate": 4.929922783454674e-06, + "loss": 0.0489, + "step": 5533 + }, + { + "epoch": 0.7969470046082949, + "grad_norm": 4.938531875610352, + "learning_rate": 4.923181042522523e-06, + "loss": 1.5606, + "step": 5534 + }, + { + "epoch": 0.7970910138248848, + "grad_norm": 0.748135507106781, + "learning_rate": 4.91644341096009e-06, + "loss": 0.0699, + "step": 5535 + }, + { + "epoch": 0.7972350230414746, + "grad_norm": 0.554397702217102, + "learning_rate": 4.909709890146449e-06, + "loss": 0.0585, + "step": 5536 + }, + { + "epoch": 0.7973790322580645, + "grad_norm": 1.1282291412353516, + "learning_rate": 4.902980481459835e-06, + "loss": 0.1424, + "step": 5537 + }, + { + "epoch": 0.7975230414746544, + "grad_norm": 0.655218780040741, + "learning_rate": 4.896255186277637e-06, + "loss": 0.0779, + "step": 5538 + }, + { + "epoch": 0.7976670506912442, + "grad_norm": 4.264650344848633, + "learning_rate": 4.889534005976407e-06, + "loss": 2.135, + "step": 5539 + }, + { + "epoch": 0.7978110599078341, + "grad_norm": 0.8399022817611694, + "learning_rate": 4.882816941931848e-06, + "loss": 0.1021, + "step": 5540 + }, + { + "epoch": 0.7979550691244239, + "grad_norm": 0.9600359797477722, + "learning_rate": 4.876103995518825e-06, + "loss": 0.1398, + "step": 5541 + }, + { + "epoch": 0.7980990783410138, + "grad_norm": 0.736477792263031, + "learning_rate": 4.86939516811136e-06, + "loss": 0.1002, + "step": 5542 + }, + { + "epoch": 0.7982430875576036, + "grad_norm": 0.3063916862010956, + "learning_rate": 4.862690461082631e-06, + "loss": 0.0468, + "step": 5543 + }, + { + "epoch": 0.7983870967741935, + "grad_norm": 0.3455817401409149, + "learning_rate": 4.855989875804973e-06, + "loss": 0.052, + "step": 5544 + }, + { + "epoch": 0.7985311059907834, + "grad_norm": 1.8203670978546143, + "learning_rate": 4.84929341364988e-06, + "loss": 0.1991, + "step": 5545 + }, + { + "epoch": 0.7986751152073732, + "grad_norm": 0.8142427802085876, + "learning_rate": 4.842601075987993e-06, + "loss": 0.0684, + "step": 5546 + }, + { + "epoch": 0.7988191244239631, + "grad_norm": 0.8068048357963562, + "learning_rate": 4.835912864189121e-06, + "loss": 0.0653, + "step": 5547 + }, + { + "epoch": 0.798963133640553, + "grad_norm": 4.245194435119629, + "learning_rate": 4.829228779622222e-06, + "loss": 2.0883, + "step": 5548 + }, + { + "epoch": 0.7991071428571429, + "grad_norm": 0.8747363090515137, + "learning_rate": 4.822548823655401e-06, + "loss": 0.17, + "step": 5549 + }, + { + "epoch": 0.7992511520737328, + "grad_norm": 0.6297752261161804, + "learning_rate": 4.81587299765594e-06, + "loss": 0.1081, + "step": 5550 + }, + { + "epoch": 0.7993951612903226, + "grad_norm": 1.597418189048767, + "learning_rate": 4.809201302990257e-06, + "loss": 0.1684, + "step": 5551 + }, + { + "epoch": 0.7995391705069125, + "grad_norm": 0.7054194808006287, + "learning_rate": 4.802533741023932e-06, + "loss": 0.0904, + "step": 5552 + }, + { + "epoch": 0.7996831797235023, + "grad_norm": 4.857336044311523, + "learning_rate": 4.795870313121692e-06, + "loss": 1.611, + "step": 5553 + }, + { + "epoch": 0.7998271889400922, + "grad_norm": 1.120185375213623, + "learning_rate": 4.7892110206474275e-06, + "loss": 0.1122, + "step": 5554 + }, + { + "epoch": 0.799971198156682, + "grad_norm": 1.1344316005706787, + "learning_rate": 4.782555864964175e-06, + "loss": 0.1161, + "step": 5555 + }, + { + "epoch": 0.8001152073732719, + "grad_norm": 0.5716903209686279, + "learning_rate": 4.775904847434126e-06, + "loss": 0.0754, + "step": 5556 + }, + { + "epoch": 0.8002592165898618, + "grad_norm": 0.6570586562156677, + "learning_rate": 4.7692579694186305e-06, + "loss": 0.0854, + "step": 5557 + }, + { + "epoch": 0.8004032258064516, + "grad_norm": 4.474412441253662, + "learning_rate": 4.762615232278186e-06, + "loss": 1.6438, + "step": 5558 + }, + { + "epoch": 0.8005472350230415, + "grad_norm": 0.885647177696228, + "learning_rate": 4.755976637372442e-06, + "loss": 0.0584, + "step": 5559 + }, + { + "epoch": 0.8006912442396313, + "grad_norm": 0.4292488992214203, + "learning_rate": 4.7493421860601986e-06, + "loss": 0.0505, + "step": 5560 + }, + { + "epoch": 0.8008352534562212, + "grad_norm": 0.6456418633460999, + "learning_rate": 4.742711879699413e-06, + "loss": 0.0754, + "step": 5561 + }, + { + "epoch": 0.800979262672811, + "grad_norm": 1.1883567571640015, + "learning_rate": 4.736085719647196e-06, + "loss": 0.1154, + "step": 5562 + }, + { + "epoch": 0.8011232718894009, + "grad_norm": 1.0241668224334717, + "learning_rate": 4.7294637072597985e-06, + "loss": 0.1579, + "step": 5563 + }, + { + "epoch": 0.8012672811059908, + "grad_norm": 0.798965573310852, + "learning_rate": 4.7228458438926316e-06, + "loss": 0.0934, + "step": 5564 + }, + { + "epoch": 0.8014112903225806, + "grad_norm": 1.0449869632720947, + "learning_rate": 4.716232130900258e-06, + "loss": 0.1149, + "step": 5565 + }, + { + "epoch": 0.8015552995391705, + "grad_norm": 0.6446225047111511, + "learning_rate": 4.709622569636382e-06, + "loss": 0.0678, + "step": 5566 + }, + { + "epoch": 0.8016993087557603, + "grad_norm": 1.0272629261016846, + "learning_rate": 4.703017161453871e-06, + "loss": 0.0944, + "step": 5567 + }, + { + "epoch": 0.8018433179723502, + "grad_norm": 0.9455484747886658, + "learning_rate": 4.6964159077047296e-06, + "loss": 0.1043, + "step": 5568 + }, + { + "epoch": 0.8019873271889401, + "grad_norm": 0.712273120880127, + "learning_rate": 4.689818809740118e-06, + "loss": 0.0869, + "step": 5569 + }, + { + "epoch": 0.8021313364055299, + "grad_norm": 1.0001845359802246, + "learning_rate": 4.683225868910348e-06, + "loss": 0.1215, + "step": 5570 + }, + { + "epoch": 0.8022753456221198, + "grad_norm": 0.7992826104164124, + "learning_rate": 4.676637086564878e-06, + "loss": 0.0804, + "step": 5571 + }, + { + "epoch": 0.8024193548387096, + "grad_norm": 0.7753897309303284, + "learning_rate": 4.670052464052313e-06, + "loss": 0.0731, + "step": 5572 + }, + { + "epoch": 0.8025633640552995, + "grad_norm": 0.6893078684806824, + "learning_rate": 4.663472002720409e-06, + "loss": 0.0759, + "step": 5573 + }, + { + "epoch": 0.8027073732718893, + "grad_norm": 3.313457727432251, + "learning_rate": 4.6568957039160725e-06, + "loss": 2.0109, + "step": 5574 + }, + { + "epoch": 0.8028513824884793, + "grad_norm": 0.9155951738357544, + "learning_rate": 4.650323568985351e-06, + "loss": 3.7634, + "step": 5575 + }, + { + "epoch": 0.8029953917050692, + "grad_norm": 0.9766839146614075, + "learning_rate": 4.643755599273444e-06, + "loss": 0.1101, + "step": 5576 + }, + { + "epoch": 0.803139400921659, + "grad_norm": 4.63414192199707, + "learning_rate": 4.637191796124707e-06, + "loss": 0.9548, + "step": 5577 + }, + { + "epoch": 0.8032834101382489, + "grad_norm": 0.9845439791679382, + "learning_rate": 4.630632160882628e-06, + "loss": 0.0697, + "step": 5578 + }, + { + "epoch": 0.8034274193548387, + "grad_norm": 1.0261425971984863, + "learning_rate": 4.62407669488985e-06, + "loss": 0.1288, + "step": 5579 + }, + { + "epoch": 0.8035714285714286, + "grad_norm": 1.1813730001449585, + "learning_rate": 4.617525399488163e-06, + "loss": 0.1595, + "step": 5580 + }, + { + "epoch": 0.8037154377880185, + "grad_norm": 1.1587378978729248, + "learning_rate": 4.610978276018496e-06, + "loss": 0.1042, + "step": 5581 + }, + { + "epoch": 0.8038594470046083, + "grad_norm": 0.7047373056411743, + "learning_rate": 4.6044353258209355e-06, + "loss": 0.0649, + "step": 5582 + }, + { + "epoch": 0.8040034562211982, + "grad_norm": 0.6445158123970032, + "learning_rate": 4.597896550234701e-06, + "loss": 0.1197, + "step": 5583 + }, + { + "epoch": 0.804147465437788, + "grad_norm": 3.374495029449463, + "learning_rate": 4.59136195059817e-06, + "loss": 2.4056, + "step": 5584 + }, + { + "epoch": 0.8042914746543779, + "grad_norm": 0.6496677398681641, + "learning_rate": 4.584831528248856e-06, + "loss": 0.0727, + "step": 5585 + }, + { + "epoch": 0.8044354838709677, + "grad_norm": 0.6356351375579834, + "learning_rate": 4.578305284523421e-06, + "loss": 0.0533, + "step": 5586 + }, + { + "epoch": 0.8045794930875576, + "grad_norm": 0.6269351840019226, + "learning_rate": 4.571783220757675e-06, + "loss": 0.065, + "step": 5587 + }, + { + "epoch": 0.8047235023041475, + "grad_norm": 0.9602194428443909, + "learning_rate": 4.565265338286564e-06, + "loss": 0.1177, + "step": 5588 + }, + { + "epoch": 0.8048675115207373, + "grad_norm": 0.6264612078666687, + "learning_rate": 4.558751638444186e-06, + "loss": 0.099, + "step": 5589 + }, + { + "epoch": 0.8050115207373272, + "grad_norm": 1.2304637432098389, + "learning_rate": 4.552242122563782e-06, + "loss": 0.116, + "step": 5590 + }, + { + "epoch": 0.805155529953917, + "grad_norm": 0.7098305225372314, + "learning_rate": 4.545736791977731e-06, + "loss": 0.0657, + "step": 5591 + }, + { + "epoch": 0.8052995391705069, + "grad_norm": 0.771705687046051, + "learning_rate": 4.539235648017564e-06, + "loss": 0.0781, + "step": 5592 + }, + { + "epoch": 0.8054435483870968, + "grad_norm": 0.6015930771827698, + "learning_rate": 4.532738692013944e-06, + "loss": 0.0689, + "step": 5593 + }, + { + "epoch": 0.8055875576036866, + "grad_norm": 1.4788239002227783, + "learning_rate": 4.526245925296687e-06, + "loss": 0.1505, + "step": 5594 + }, + { + "epoch": 0.8057315668202765, + "grad_norm": 1.2516186237335205, + "learning_rate": 4.519757349194748e-06, + "loss": 0.1222, + "step": 5595 + }, + { + "epoch": 0.8058755760368663, + "grad_norm": 0.8119708299636841, + "learning_rate": 4.513272965036222e-06, + "loss": 0.0863, + "step": 5596 + }, + { + "epoch": 0.8060195852534562, + "grad_norm": 0.8366161584854126, + "learning_rate": 4.506792774148347e-06, + "loss": 0.0797, + "step": 5597 + }, + { + "epoch": 0.806163594470046, + "grad_norm": 2.968687057495117, + "learning_rate": 4.500316777857505e-06, + "loss": 1.7051, + "step": 5598 + }, + { + "epoch": 0.8063076036866359, + "grad_norm": 1.0461727380752563, + "learning_rate": 4.4938449774892175e-06, + "loss": 0.0974, + "step": 5599 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.90492182970047, + "learning_rate": 4.487377374368146e-06, + "loss": 0.0897, + "step": 5600 + }, + { + "epoch": 0.8065956221198156, + "grad_norm": 1.282418131828308, + "learning_rate": 4.480913969818098e-06, + "loss": 0.1264, + "step": 5601 + }, + { + "epoch": 0.8067396313364056, + "grad_norm": 0.9028319120407104, + "learning_rate": 4.474454765162017e-06, + "loss": 0.0943, + "step": 5602 + }, + { + "epoch": 0.8068836405529954, + "grad_norm": 0.6019465327262878, + "learning_rate": 4.4679997617219776e-06, + "loss": 0.0559, + "step": 5603 + }, + { + "epoch": 0.8070276497695853, + "grad_norm": 0.939986526966095, + "learning_rate": 4.461548960819223e-06, + "loss": 0.0918, + "step": 5604 + }, + { + "epoch": 0.8071716589861752, + "grad_norm": 4.9369282722473145, + "learning_rate": 4.455102363774108e-06, + "loss": 1.512, + "step": 5605 + }, + { + "epoch": 0.807315668202765, + "grad_norm": 0.8758142590522766, + "learning_rate": 4.448659971906138e-06, + "loss": 0.1086, + "step": 5606 + }, + { + "epoch": 0.8074596774193549, + "grad_norm": 0.6369112133979797, + "learning_rate": 4.442221786533959e-06, + "loss": 0.0694, + "step": 5607 + }, + { + "epoch": 0.8076036866359447, + "grad_norm": 5.3766889572143555, + "learning_rate": 4.435787808975351e-06, + "loss": 1.5167, + "step": 5608 + }, + { + "epoch": 0.8077476958525346, + "grad_norm": 1.6302547454833984, + "learning_rate": 4.4293580405472355e-06, + "loss": 0.1326, + "step": 5609 + }, + { + "epoch": 0.8078917050691244, + "grad_norm": 0.9294933676719666, + "learning_rate": 4.422932482565673e-06, + "loss": 0.1142, + "step": 5610 + }, + { + "epoch": 0.8080357142857143, + "grad_norm": 1.0535054206848145, + "learning_rate": 4.41651113634586e-06, + "loss": 0.115, + "step": 5611 + }, + { + "epoch": 0.8081797235023042, + "grad_norm": 0.5416832566261292, + "learning_rate": 4.410094003202134e-06, + "loss": 0.0689, + "step": 5612 + }, + { + "epoch": 0.808323732718894, + "grad_norm": 0.83980792760849, + "learning_rate": 4.403681084447969e-06, + "loss": 0.0923, + "step": 5613 + }, + { + "epoch": 0.8084677419354839, + "grad_norm": 1.72817862033844, + "learning_rate": 4.3972723813959785e-06, + "loss": 3.8282, + "step": 5614 + }, + { + "epoch": 0.8086117511520737, + "grad_norm": 0.6128783226013184, + "learning_rate": 4.390867895357906e-06, + "loss": 0.0633, + "step": 5615 + }, + { + "epoch": 0.8087557603686636, + "grad_norm": 0.8147634863853455, + "learning_rate": 4.384467627644637e-06, + "loss": 0.0751, + "step": 5616 + }, + { + "epoch": 0.8088997695852534, + "grad_norm": 1.640428900718689, + "learning_rate": 4.378071579566195e-06, + "loss": 0.1517, + "step": 5617 + }, + { + "epoch": 0.8090437788018433, + "grad_norm": 0.4882017970085144, + "learning_rate": 4.371679752431737e-06, + "loss": 0.0852, + "step": 5618 + }, + { + "epoch": 0.8091877880184332, + "grad_norm": 0.6583877205848694, + "learning_rate": 4.3652921475495575e-06, + "loss": 0.08, + "step": 5619 + }, + { + "epoch": 0.809331797235023, + "grad_norm": 6.256714820861816, + "learning_rate": 4.358908766227085e-06, + "loss": 1.6865, + "step": 5620 + }, + { + "epoch": 0.8094758064516129, + "grad_norm": 0.9170699715614319, + "learning_rate": 4.352529609770886e-06, + "loss": 0.1019, + "step": 5621 + }, + { + "epoch": 0.8096198156682027, + "grad_norm": 0.8315218687057495, + "learning_rate": 4.346154679486664e-06, + "loss": 0.1206, + "step": 5622 + }, + { + "epoch": 0.8097638248847926, + "grad_norm": 0.8681950569152832, + "learning_rate": 4.339783976679246e-06, + "loss": 0.079, + "step": 5623 + }, + { + "epoch": 0.8099078341013825, + "grad_norm": 0.9370468258857727, + "learning_rate": 4.333417502652612e-06, + "loss": 0.0907, + "step": 5624 + }, + { + "epoch": 0.8100518433179723, + "grad_norm": 0.8950841426849365, + "learning_rate": 4.327055258709853e-06, + "loss": 0.0921, + "step": 5625 + }, + { + "epoch": 0.8101958525345622, + "grad_norm": 0.8443909883499146, + "learning_rate": 4.320697246153224e-06, + "loss": 0.0855, + "step": 5626 + }, + { + "epoch": 0.810339861751152, + "grad_norm": 4.520545482635498, + "learning_rate": 4.31434346628409e-06, + "loss": 0.8357, + "step": 5627 + }, + { + "epoch": 0.8104838709677419, + "grad_norm": 3.1453630924224854, + "learning_rate": 4.307993920402958e-06, + "loss": 0.6518, + "step": 5628 + }, + { + "epoch": 0.8106278801843319, + "grad_norm": 0.9930550456047058, + "learning_rate": 4.301648609809466e-06, + "loss": 4.0104, + "step": 5629 + }, + { + "epoch": 0.8107718894009217, + "grad_norm": 0.87530517578125, + "learning_rate": 4.29530753580239e-06, + "loss": 0.1654, + "step": 5630 + }, + { + "epoch": 0.8109158986175116, + "grad_norm": 1.1404467821121216, + "learning_rate": 4.288970699679634e-06, + "loss": 0.1193, + "step": 5631 + }, + { + "epoch": 0.8110599078341014, + "grad_norm": 4.692922115325928, + "learning_rate": 4.282638102738237e-06, + "loss": 1.8991, + "step": 5632 + }, + { + "epoch": 0.8112039170506913, + "grad_norm": 5.2051167488098145, + "learning_rate": 4.276309746274368e-06, + "loss": 1.4054, + "step": 5633 + }, + { + "epoch": 0.8113479262672811, + "grad_norm": 0.5380373597145081, + "learning_rate": 4.269985631583331e-06, + "loss": 0.0687, + "step": 5634 + }, + { + "epoch": 0.811491935483871, + "grad_norm": 0.5309250354766846, + "learning_rate": 4.263665759959559e-06, + "loss": 0.0647, + "step": 5635 + }, + { + "epoch": 0.8116359447004609, + "grad_norm": 4.492489337921143, + "learning_rate": 4.257350132696619e-06, + "loss": 2.3449, + "step": 5636 + }, + { + "epoch": 0.8117799539170507, + "grad_norm": 1.6606534719467163, + "learning_rate": 4.251038751087211e-06, + "loss": 0.1215, + "step": 5637 + }, + { + "epoch": 0.8119239631336406, + "grad_norm": 0.6541664600372314, + "learning_rate": 4.244731616423156e-06, + "loss": 0.0762, + "step": 5638 + }, + { + "epoch": 0.8120679723502304, + "grad_norm": 1.0805423259735107, + "learning_rate": 4.23842872999542e-06, + "loss": 0.1119, + "step": 5639 + }, + { + "epoch": 0.8122119815668203, + "grad_norm": 0.830976665019989, + "learning_rate": 4.232130093094089e-06, + "loss": 3.9146, + "step": 5640 + }, + { + "epoch": 0.8123559907834101, + "grad_norm": 4.603256702423096, + "learning_rate": 4.2258357070083825e-06, + "loss": 1.3107, + "step": 5641 + }, + { + "epoch": 0.8125, + "grad_norm": 1.1092886924743652, + "learning_rate": 4.219545573026651e-06, + "loss": 0.1239, + "step": 5642 + }, + { + "epoch": 0.8126440092165899, + "grad_norm": 0.631516695022583, + "learning_rate": 4.213259692436367e-06, + "loss": 0.0834, + "step": 5643 + }, + { + "epoch": 0.8127880184331797, + "grad_norm": 0.5958740711212158, + "learning_rate": 4.206978066524153e-06, + "loss": 0.061, + "step": 5644 + }, + { + "epoch": 0.8129320276497696, + "grad_norm": 0.2705039978027344, + "learning_rate": 4.200700696575738e-06, + "loss": 0.0474, + "step": 5645 + }, + { + "epoch": 0.8130760368663594, + "grad_norm": 7.687735080718994, + "learning_rate": 4.194427583875987e-06, + "loss": 1.9826, + "step": 5646 + }, + { + "epoch": 0.8132200460829493, + "grad_norm": 0.5955629944801331, + "learning_rate": 4.188158729708902e-06, + "loss": 0.0901, + "step": 5647 + }, + { + "epoch": 0.8133640552995391, + "grad_norm": 0.8354419469833374, + "learning_rate": 4.1818941353576005e-06, + "loss": 0.0974, + "step": 5648 + }, + { + "epoch": 0.813508064516129, + "grad_norm": 6.157891750335693, + "learning_rate": 4.1756338021043366e-06, + "loss": 1.5097, + "step": 5649 + }, + { + "epoch": 0.8136520737327189, + "grad_norm": 3.958375930786133, + "learning_rate": 4.16937773123049e-06, + "loss": 1.575, + "step": 5650 + }, + { + "epoch": 0.8137960829493087, + "grad_norm": 1.2480089664459229, + "learning_rate": 4.163125924016564e-06, + "loss": 0.1637, + "step": 5651 + }, + { + "epoch": 0.8139400921658986, + "grad_norm": 4.328015327453613, + "learning_rate": 4.156878381742199e-06, + "loss": 1.5621, + "step": 5652 + }, + { + "epoch": 0.8140841013824884, + "grad_norm": 1.2277511358261108, + "learning_rate": 4.150635105686151e-06, + "loss": 0.1408, + "step": 5653 + }, + { + "epoch": 0.8142281105990783, + "grad_norm": 1.008254051208496, + "learning_rate": 4.144396097126313e-06, + "loss": 0.1112, + "step": 5654 + }, + { + "epoch": 0.8143721198156681, + "grad_norm": 1.029069185256958, + "learning_rate": 4.138161357339696e-06, + "loss": 0.1189, + "step": 5655 + }, + { + "epoch": 0.8145161290322581, + "grad_norm": 0.9235564470291138, + "learning_rate": 4.131930887602442e-06, + "loss": 0.082, + "step": 5656 + }, + { + "epoch": 0.814660138248848, + "grad_norm": 0.6473867893218994, + "learning_rate": 4.125704689189819e-06, + "loss": 0.0732, + "step": 5657 + }, + { + "epoch": 0.8148041474654378, + "grad_norm": 0.565091073513031, + "learning_rate": 4.119482763376218e-06, + "loss": 0.0486, + "step": 5658 + }, + { + "epoch": 0.8149481566820277, + "grad_norm": 0.7708825469017029, + "learning_rate": 4.1132651114351575e-06, + "loss": 0.0914, + "step": 5659 + }, + { + "epoch": 0.8150921658986175, + "grad_norm": 6.384041786193848, + "learning_rate": 4.107051734639281e-06, + "loss": 2.023, + "step": 5660 + }, + { + "epoch": 0.8152361751152074, + "grad_norm": 1.242798089981079, + "learning_rate": 4.100842634260358e-06, + "loss": 0.1297, + "step": 5661 + }, + { + "epoch": 0.8153801843317973, + "grad_norm": 1.0121406316757202, + "learning_rate": 4.094637811569274e-06, + "loss": 0.103, + "step": 5662 + }, + { + "epoch": 0.8155241935483871, + "grad_norm": 0.48554450273513794, + "learning_rate": 4.0884372678360625e-06, + "loss": 0.0619, + "step": 5663 + }, + { + "epoch": 0.815668202764977, + "grad_norm": 0.880331814289093, + "learning_rate": 4.082241004329854e-06, + "loss": 0.0975, + "step": 5664 + }, + { + "epoch": 0.8158122119815668, + "grad_norm": 1.4122341871261597, + "learning_rate": 4.0760490223189144e-06, + "loss": 0.1341, + "step": 5665 + }, + { + "epoch": 0.8159562211981567, + "grad_norm": 1.0104811191558838, + "learning_rate": 4.069861323070634e-06, + "loss": 0.1239, + "step": 5666 + }, + { + "epoch": 0.8161002304147466, + "grad_norm": 3.995256185531616, + "learning_rate": 4.063677907851527e-06, + "loss": 2.4476, + "step": 5667 + }, + { + "epoch": 0.8162442396313364, + "grad_norm": 0.5079808235168457, + "learning_rate": 4.0574987779272264e-06, + "loss": 0.0728, + "step": 5668 + }, + { + "epoch": 0.8163882488479263, + "grad_norm": 0.9739224314689636, + "learning_rate": 4.051323934562495e-06, + "loss": 0.0938, + "step": 5669 + }, + { + "epoch": 0.8165322580645161, + "grad_norm": 1.8266545534133911, + "learning_rate": 4.045153379021211e-06, + "loss": 0.1562, + "step": 5670 + }, + { + "epoch": 0.816676267281106, + "grad_norm": 1.106123447418213, + "learning_rate": 4.038987112566375e-06, + "loss": 0.1376, + "step": 5671 + }, + { + "epoch": 0.8168202764976958, + "grad_norm": 0.6574618816375732, + "learning_rate": 4.03282513646012e-06, + "loss": 0.0896, + "step": 5672 + }, + { + "epoch": 0.8169642857142857, + "grad_norm": 0.37691769003868103, + "learning_rate": 4.026667451963687e-06, + "loss": 0.0588, + "step": 5673 + }, + { + "epoch": 0.8171082949308756, + "grad_norm": 1.1574516296386719, + "learning_rate": 4.0205140603374465e-06, + "loss": 0.141, + "step": 5674 + }, + { + "epoch": 0.8172523041474654, + "grad_norm": 5.628852367401123, + "learning_rate": 4.014364962840892e-06, + "loss": 1.5955, + "step": 5675 + }, + { + "epoch": 0.8173963133640553, + "grad_norm": 0.6793567538261414, + "learning_rate": 4.008220160732631e-06, + "loss": 0.0919, + "step": 5676 + }, + { + "epoch": 0.8175403225806451, + "grad_norm": 5.08930778503418, + "learning_rate": 4.002079655270399e-06, + "loss": 1.444, + "step": 5677 + }, + { + "epoch": 0.817684331797235, + "grad_norm": 8.678194046020508, + "learning_rate": 3.995943447711048e-06, + "loss": 2.0071, + "step": 5678 + }, + { + "epoch": 0.8178283410138248, + "grad_norm": 0.7522678971290588, + "learning_rate": 3.989811539310548e-06, + "loss": 0.0672, + "step": 5679 + }, + { + "epoch": 0.8179723502304147, + "grad_norm": 0.7799726128578186, + "learning_rate": 3.983683931323998e-06, + "loss": 0.0807, + "step": 5680 + }, + { + "epoch": 0.8181163594470046, + "grad_norm": 0.5283710360527039, + "learning_rate": 3.977560625005608e-06, + "loss": 0.0587, + "step": 5681 + }, + { + "epoch": 0.8182603686635944, + "grad_norm": 1.2061012983322144, + "learning_rate": 3.9714416216087066e-06, + "loss": 0.1098, + "step": 5682 + }, + { + "epoch": 0.8184043778801844, + "grad_norm": 5.11699104309082, + "learning_rate": 3.965326922385754e-06, + "loss": 1.6148, + "step": 5683 + }, + { + "epoch": 0.8185483870967742, + "grad_norm": 0.5344350337982178, + "learning_rate": 3.959216528588308e-06, + "loss": 0.0795, + "step": 5684 + }, + { + "epoch": 0.8186923963133641, + "grad_norm": 0.6244778037071228, + "learning_rate": 3.953110441467073e-06, + "loss": 0.066, + "step": 5685 + }, + { + "epoch": 0.818836405529954, + "grad_norm": 2.5037012100219727, + "learning_rate": 3.947008662271851e-06, + "loss": 0.3323, + "step": 5686 + }, + { + "epoch": 0.8189804147465438, + "grad_norm": 1.101998209953308, + "learning_rate": 3.940911192251564e-06, + "loss": 0.1068, + "step": 5687 + }, + { + "epoch": 0.8191244239631337, + "grad_norm": 1.5283931493759155, + "learning_rate": 3.934818032654264e-06, + "loss": 0.126, + "step": 5688 + }, + { + "epoch": 0.8192684331797235, + "grad_norm": 0.8414387106895447, + "learning_rate": 3.928729184727109e-06, + "loss": 0.0695, + "step": 5689 + }, + { + "epoch": 0.8194124423963134, + "grad_norm": 0.7058428525924683, + "learning_rate": 3.922644649716378e-06, + "loss": 0.079, + "step": 5690 + }, + { + "epoch": 0.8195564516129032, + "grad_norm": 1.123640775680542, + "learning_rate": 3.916564428867467e-06, + "loss": 0.1101, + "step": 5691 + }, + { + "epoch": 0.8197004608294931, + "grad_norm": 3.3291409015655518, + "learning_rate": 3.91048852342489e-06, + "loss": 1.5022, + "step": 5692 + }, + { + "epoch": 0.819844470046083, + "grad_norm": 0.9546395540237427, + "learning_rate": 3.90441693463228e-06, + "loss": 0.0992, + "step": 5693 + }, + { + "epoch": 0.8199884792626728, + "grad_norm": 1.0804682970046997, + "learning_rate": 3.898349663732381e-06, + "loss": 0.0848, + "step": 5694 + }, + { + "epoch": 0.8201324884792627, + "grad_norm": 1.271114468574524, + "learning_rate": 3.892286711967058e-06, + "loss": 0.1342, + "step": 5695 + }, + { + "epoch": 0.8202764976958525, + "grad_norm": 0.826106607913971, + "learning_rate": 3.88622808057729e-06, + "loss": 0.1053, + "step": 5696 + }, + { + "epoch": 0.8204205069124424, + "grad_norm": 0.6782070398330688, + "learning_rate": 3.880173770803169e-06, + "loss": 0.0875, + "step": 5697 + }, + { + "epoch": 0.8205645161290323, + "grad_norm": 4.499250411987305, + "learning_rate": 3.874123783883907e-06, + "loss": 1.9796, + "step": 5698 + }, + { + "epoch": 0.8207085253456221, + "grad_norm": 4.920770645141602, + "learning_rate": 3.86807812105783e-06, + "loss": 2.1083, + "step": 5699 + }, + { + "epoch": 0.820852534562212, + "grad_norm": 0.8893893957138062, + "learning_rate": 3.862036783562375e-06, + "loss": 0.1112, + "step": 5700 + }, + { + "epoch": 0.8209965437788018, + "grad_norm": 7.075972557067871, + "learning_rate": 3.855999772634103e-06, + "loss": 1.72, + "step": 5701 + }, + { + "epoch": 0.8211405529953917, + "grad_norm": 3.867929697036743, + "learning_rate": 3.849967089508677e-06, + "loss": 1.3308, + "step": 5702 + }, + { + "epoch": 0.8212845622119815, + "grad_norm": 0.8141931295394897, + "learning_rate": 3.843938735420882e-06, + "loss": 0.0857, + "step": 5703 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 0.7247657179832458, + "learning_rate": 3.83791471160462e-06, + "loss": 0.0871, + "step": 5704 + }, + { + "epoch": 0.8215725806451613, + "grad_norm": 0.6431891322135925, + "learning_rate": 3.831895019292897e-06, + "loss": 0.0751, + "step": 5705 + }, + { + "epoch": 0.8217165898617511, + "grad_norm": 1.890597939491272, + "learning_rate": 3.82587965971784e-06, + "loss": 0.1725, + "step": 5706 + }, + { + "epoch": 0.821860599078341, + "grad_norm": 0.7684743404388428, + "learning_rate": 3.819868634110685e-06, + "loss": 0.0866, + "step": 5707 + }, + { + "epoch": 0.8220046082949308, + "grad_norm": 0.9694284200668335, + "learning_rate": 3.813861943701785e-06, + "loss": 0.0985, + "step": 5708 + }, + { + "epoch": 0.8221486175115207, + "grad_norm": 1.3621492385864258, + "learning_rate": 3.8078595897206e-06, + "loss": 0.1302, + "step": 5709 + }, + { + "epoch": 0.8222926267281107, + "grad_norm": 3.520977735519409, + "learning_rate": 3.80186157339571e-06, + "loss": 0.9167, + "step": 5710 + }, + { + "epoch": 0.8224366359447005, + "grad_norm": 0.7184532880783081, + "learning_rate": 3.795867895954794e-06, + "loss": 0.0849, + "step": 5711 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 2.965632915496826, + "learning_rate": 3.7898785586246625e-06, + "loss": 0.2442, + "step": 5712 + }, + { + "epoch": 0.8227246543778802, + "grad_norm": 0.6296013593673706, + "learning_rate": 3.7838935626312242e-06, + "loss": 0.0978, + "step": 5713 + }, + { + "epoch": 0.8228686635944701, + "grad_norm": 0.5444818735122681, + "learning_rate": 3.777912909199499e-06, + "loss": 0.0801, + "step": 5714 + }, + { + "epoch": 0.8230126728110599, + "grad_norm": 1.3678511381149292, + "learning_rate": 3.7719365995536243e-06, + "loss": 0.1266, + "step": 5715 + }, + { + "epoch": 0.8231566820276498, + "grad_norm": 0.9155946373939514, + "learning_rate": 3.765964634916841e-06, + "loss": 0.1051, + "step": 5716 + }, + { + "epoch": 0.8233006912442397, + "grad_norm": 0.6138092875480652, + "learning_rate": 3.7599970165115073e-06, + "loss": 0.0837, + "step": 5717 + }, + { + "epoch": 0.8234447004608295, + "grad_norm": 1.2021164894104004, + "learning_rate": 3.7540337455590878e-06, + "loss": 0.1214, + "step": 5718 + }, + { + "epoch": 0.8235887096774194, + "grad_norm": 1.2230726480484009, + "learning_rate": 3.7480748232801595e-06, + "loss": 0.1362, + "step": 5719 + }, + { + "epoch": 0.8237327188940092, + "grad_norm": 4.254180908203125, + "learning_rate": 3.742120250894407e-06, + "loss": 2.1511, + "step": 5720 + }, + { + "epoch": 0.8238767281105991, + "grad_norm": 0.8160505890846252, + "learning_rate": 3.736170029620628e-06, + "loss": 0.0875, + "step": 5721 + }, + { + "epoch": 0.824020737327189, + "grad_norm": 3.850980281829834, + "learning_rate": 3.7302241606767262e-06, + "loss": 0.8734, + "step": 5722 + }, + { + "epoch": 0.8241647465437788, + "grad_norm": 0.42900750041007996, + "learning_rate": 3.724282645279717e-06, + "loss": 0.0489, + "step": 5723 + }, + { + "epoch": 0.8243087557603687, + "grad_norm": 0.4569042921066284, + "learning_rate": 3.7183454846457215e-06, + "loss": 0.0697, + "step": 5724 + }, + { + "epoch": 0.8244527649769585, + "grad_norm": 0.5383082032203674, + "learning_rate": 3.712412679989971e-06, + "loss": 0.0677, + "step": 5725 + }, + { + "epoch": 0.8245967741935484, + "grad_norm": 0.9109310507774353, + "learning_rate": 3.706484232526811e-06, + "loss": 0.1105, + "step": 5726 + }, + { + "epoch": 0.8247407834101382, + "grad_norm": 1.4366624355316162, + "learning_rate": 3.7005601434696833e-06, + "loss": 0.1371, + "step": 5727 + }, + { + "epoch": 0.8248847926267281, + "grad_norm": 0.9045280814170837, + "learning_rate": 3.6946404140311475e-06, + "loss": 0.1018, + "step": 5728 + }, + { + "epoch": 0.825028801843318, + "grad_norm": 0.365533709526062, + "learning_rate": 3.688725045422867e-06, + "loss": 0.0467, + "step": 5729 + }, + { + "epoch": 0.8251728110599078, + "grad_norm": 5.98647403717041, + "learning_rate": 3.6828140388556143e-06, + "loss": 1.9074, + "step": 5730 + }, + { + "epoch": 0.8253168202764977, + "grad_norm": 0.5335524678230286, + "learning_rate": 3.676907395539267e-06, + "loss": 0.0863, + "step": 5731 + }, + { + "epoch": 0.8254608294930875, + "grad_norm": 0.9253371953964233, + "learning_rate": 3.6710051166828072e-06, + "loss": 0.071, + "step": 5732 + }, + { + "epoch": 0.8256048387096774, + "grad_norm": 0.9720125198364258, + "learning_rate": 3.665107203494331e-06, + "loss": 0.113, + "step": 5733 + }, + { + "epoch": 0.8257488479262672, + "grad_norm": 1.2624318599700928, + "learning_rate": 3.6592136571810376e-06, + "loss": 0.1571, + "step": 5734 + }, + { + "epoch": 0.8258928571428571, + "grad_norm": 0.7792217135429382, + "learning_rate": 3.653324478949227e-06, + "loss": 0.0809, + "step": 5735 + }, + { + "epoch": 0.826036866359447, + "grad_norm": 1.373515248298645, + "learning_rate": 3.6474396700043158e-06, + "loss": 0.1271, + "step": 5736 + }, + { + "epoch": 0.8261808755760369, + "grad_norm": 1.0016330480575562, + "learning_rate": 3.6415592315508145e-06, + "loss": 0.0857, + "step": 5737 + }, + { + "epoch": 0.8263248847926268, + "grad_norm": 5.745224952697754, + "learning_rate": 3.6356831647923444e-06, + "loss": 1.3753, + "step": 5738 + }, + { + "epoch": 0.8264688940092166, + "grad_norm": 0.8744258880615234, + "learning_rate": 3.6298114709316404e-06, + "loss": 0.0792, + "step": 5739 + }, + { + "epoch": 0.8266129032258065, + "grad_norm": 3.656125783920288, + "learning_rate": 3.62394415117053e-06, + "loss": 1.3926, + "step": 5740 + }, + { + "epoch": 0.8267569124423964, + "grad_norm": 0.9674009680747986, + "learning_rate": 3.6180812067099474e-06, + "loss": 0.1037, + "step": 5741 + }, + { + "epoch": 0.8269009216589862, + "grad_norm": 0.5412290692329407, + "learning_rate": 3.6122226387499376e-06, + "loss": 0.0654, + "step": 5742 + }, + { + "epoch": 0.8270449308755761, + "grad_norm": 2.9489707946777344, + "learning_rate": 3.606368448489644e-06, + "loss": 0.5838, + "step": 5743 + }, + { + "epoch": 0.8271889400921659, + "grad_norm": 0.6971514225006104, + "learning_rate": 3.600518637127315e-06, + "loss": 0.0756, + "step": 5744 + }, + { + "epoch": 0.8273329493087558, + "grad_norm": 0.5973557829856873, + "learning_rate": 3.5946732058603023e-06, + "loss": 0.0801, + "step": 5745 + }, + { + "epoch": 0.8274769585253456, + "grad_norm": 0.5124143362045288, + "learning_rate": 3.588832155885066e-06, + "loss": 0.0706, + "step": 5746 + }, + { + "epoch": 0.8276209677419355, + "grad_norm": 0.3405382037162781, + "learning_rate": 3.5829954883971644e-06, + "loss": 0.0502, + "step": 5747 + }, + { + "epoch": 0.8277649769585254, + "grad_norm": 0.8623670339584351, + "learning_rate": 3.577163204591258e-06, + "loss": 0.1025, + "step": 5748 + }, + { + "epoch": 0.8279089861751152, + "grad_norm": 1.749442458152771, + "learning_rate": 3.571335305661114e-06, + "loss": 0.1353, + "step": 5749 + }, + { + "epoch": 0.8280529953917051, + "grad_norm": 0.7064062356948853, + "learning_rate": 3.5655117927996e-06, + "loss": 0.0832, + "step": 5750 + }, + { + "epoch": 0.8281970046082949, + "grad_norm": 8.983888626098633, + "learning_rate": 3.5596926671986857e-06, + "loss": 1.9728, + "step": 5751 + }, + { + "epoch": 0.8283410138248848, + "grad_norm": 1.1961779594421387, + "learning_rate": 3.5538779300494428e-06, + "loss": 0.1455, + "step": 5752 + }, + { + "epoch": 0.8284850230414746, + "grad_norm": 4.6293110847473145, + "learning_rate": 3.548067582542047e-06, + "loss": 1.4217, + "step": 5753 + }, + { + "epoch": 0.8286290322580645, + "grad_norm": 0.9616038799285889, + "learning_rate": 3.5422616258657727e-06, + "loss": 0.1116, + "step": 5754 + }, + { + "epoch": 0.8287730414746544, + "grad_norm": 1.3978736400604248, + "learning_rate": 3.536460061208996e-06, + "loss": 0.12, + "step": 5755 + }, + { + "epoch": 0.8289170506912442, + "grad_norm": 0.7925077676773071, + "learning_rate": 3.5306628897591955e-06, + "loss": 0.0805, + "step": 5756 + }, + { + "epoch": 0.8290610599078341, + "grad_norm": 0.7018588781356812, + "learning_rate": 3.5248701127029466e-06, + "loss": 0.084, + "step": 5757 + }, + { + "epoch": 0.8292050691244239, + "grad_norm": 8.18397331237793, + "learning_rate": 3.519081731225932e-06, + "loss": 1.8247, + "step": 5758 + }, + { + "epoch": 0.8293490783410138, + "grad_norm": 0.48459482192993164, + "learning_rate": 3.513297746512931e-06, + "loss": 0.0651, + "step": 5759 + }, + { + "epoch": 0.8294930875576036, + "grad_norm": 0.7625690698623657, + "learning_rate": 3.507518159747819e-06, + "loss": 0.0887, + "step": 5760 + }, + { + "epoch": 0.8296370967741935, + "grad_norm": 0.9152089357376099, + "learning_rate": 3.5017429721135807e-06, + "loss": 0.1039, + "step": 5761 + }, + { + "epoch": 0.8297811059907834, + "grad_norm": 6.146718502044678, + "learning_rate": 3.49597218479229e-06, + "loss": 1.7706, + "step": 5762 + }, + { + "epoch": 0.8299251152073732, + "grad_norm": 0.6506029367446899, + "learning_rate": 3.4902057989651294e-06, + "loss": 0.0619, + "step": 5763 + }, + { + "epoch": 0.8300691244239631, + "grad_norm": 0.33552056550979614, + "learning_rate": 3.4844438158123714e-06, + "loss": 0.0483, + "step": 5764 + }, + { + "epoch": 0.830213133640553, + "grad_norm": 1.2470617294311523, + "learning_rate": 3.4786862365133954e-06, + "loss": 0.1062, + "step": 5765 + }, + { + "epoch": 0.8303571428571429, + "grad_norm": 0.5995163321495056, + "learning_rate": 3.4729330622466667e-06, + "loss": 0.0755, + "step": 5766 + }, + { + "epoch": 0.8305011520737328, + "grad_norm": 0.9444689750671387, + "learning_rate": 3.4671842941897765e-06, + "loss": 0.0722, + "step": 5767 + }, + { + "epoch": 0.8306451612903226, + "grad_norm": 0.4715471863746643, + "learning_rate": 3.4614399335193836e-06, + "loss": 0.0455, + "step": 5768 + }, + { + "epoch": 0.8307891705069125, + "grad_norm": 0.6130551695823669, + "learning_rate": 3.455699981411259e-06, + "loss": 0.0763, + "step": 5769 + }, + { + "epoch": 0.8309331797235023, + "grad_norm": 0.8668193817138672, + "learning_rate": 3.4499644390402708e-06, + "loss": 0.1248, + "step": 5770 + }, + { + "epoch": 0.8310771889400922, + "grad_norm": 1.5180587768554688, + "learning_rate": 3.44423330758038e-06, + "loss": 0.1051, + "step": 5771 + }, + { + "epoch": 0.831221198156682, + "grad_norm": 0.5775026082992554, + "learning_rate": 3.438506588204651e-06, + "loss": 0.0819, + "step": 5772 + }, + { + "epoch": 0.8313652073732719, + "grad_norm": 0.9345541596412659, + "learning_rate": 3.432784282085241e-06, + "loss": 0.0711, + "step": 5773 + }, + { + "epoch": 0.8315092165898618, + "grad_norm": 1.1650793552398682, + "learning_rate": 3.427066390393405e-06, + "loss": 0.1182, + "step": 5774 + }, + { + "epoch": 0.8316532258064516, + "grad_norm": 1.1351021528244019, + "learning_rate": 3.4213529142994944e-06, + "loss": 0.1202, + "step": 5775 + }, + { + "epoch": 0.8317972350230415, + "grad_norm": 1.1097873449325562, + "learning_rate": 3.4156438549729554e-06, + "loss": 0.1171, + "step": 5776 + }, + { + "epoch": 0.8319412442396313, + "grad_norm": 0.5530514717102051, + "learning_rate": 3.4099392135823335e-06, + "loss": 0.0488, + "step": 5777 + }, + { + "epoch": 0.8320852534562212, + "grad_norm": 5.493278503417969, + "learning_rate": 3.4042389912952664e-06, + "loss": 1.2299, + "step": 5778 + }, + { + "epoch": 0.832229262672811, + "grad_norm": 0.9741085767745972, + "learning_rate": 3.3985431892784888e-06, + "loss": 0.1229, + "step": 5779 + }, + { + "epoch": 0.8323732718894009, + "grad_norm": 0.5834336280822754, + "learning_rate": 3.3928518086978305e-06, + "loss": 0.059, + "step": 5780 + }, + { + "epoch": 0.8325172811059908, + "grad_norm": 0.31622380018234253, + "learning_rate": 3.3871648507182163e-06, + "loss": 0.0445, + "step": 5781 + }, + { + "epoch": 0.8326612903225806, + "grad_norm": 1.0833041667938232, + "learning_rate": 3.3814823165036673e-06, + "loss": 0.1013, + "step": 5782 + }, + { + "epoch": 0.8328052995391705, + "grad_norm": 0.8436938524246216, + "learning_rate": 3.375804207217298e-06, + "loss": 0.1022, + "step": 5783 + }, + { + "epoch": 0.8329493087557603, + "grad_norm": 0.954557478427887, + "learning_rate": 3.3701305240213142e-06, + "loss": 0.1033, + "step": 5784 + }, + { + "epoch": 0.8330933179723502, + "grad_norm": 0.33539852499961853, + "learning_rate": 3.364461268077021e-06, + "loss": 0.0485, + "step": 5785 + }, + { + "epoch": 0.8332373271889401, + "grad_norm": 0.8433091044425964, + "learning_rate": 3.3587964405448147e-06, + "loss": 0.1056, + "step": 5786 + }, + { + "epoch": 0.8333813364055299, + "grad_norm": 0.8966030478477478, + "learning_rate": 3.353136042584182e-06, + "loss": 0.1329, + "step": 5787 + }, + { + "epoch": 0.8335253456221198, + "grad_norm": 1.0553398132324219, + "learning_rate": 3.347480075353712e-06, + "loss": 0.1125, + "step": 5788 + }, + { + "epoch": 0.8336693548387096, + "grad_norm": 0.8356336355209351, + "learning_rate": 3.3418285400110742e-06, + "loss": 0.085, + "step": 5789 + }, + { + "epoch": 0.8338133640552995, + "grad_norm": 0.6303702592849731, + "learning_rate": 3.3361814377130457e-06, + "loss": 0.0877, + "step": 5790 + }, + { + "epoch": 0.8339573732718893, + "grad_norm": 0.9828336834907532, + "learning_rate": 3.330538769615482e-06, + "loss": 0.1, + "step": 5791 + }, + { + "epoch": 0.8341013824884793, + "grad_norm": 0.5085030794143677, + "learning_rate": 3.3249005368733405e-06, + "loss": 0.0571, + "step": 5792 + }, + { + "epoch": 0.8342453917050692, + "grad_norm": 0.6073365211486816, + "learning_rate": 3.319266740640661e-06, + "loss": 0.0858, + "step": 5793 + }, + { + "epoch": 0.834389400921659, + "grad_norm": 1.0546207427978516, + "learning_rate": 3.3136373820705945e-06, + "loss": 0.119, + "step": 5794 + }, + { + "epoch": 0.8345334101382489, + "grad_norm": 0.590812087059021, + "learning_rate": 3.308012462315363e-06, + "loss": 0.0537, + "step": 5795 + }, + { + "epoch": 0.8346774193548387, + "grad_norm": 1.0150461196899414, + "learning_rate": 3.30239198252629e-06, + "loss": 0.1195, + "step": 5796 + }, + { + "epoch": 0.8348214285714286, + "grad_norm": 0.9059827923774719, + "learning_rate": 3.296775943853789e-06, + "loss": 0.0911, + "step": 5797 + }, + { + "epoch": 0.8349654377880185, + "grad_norm": 1.041749119758606, + "learning_rate": 3.2911643474473646e-06, + "loss": 0.1034, + "step": 5798 + }, + { + "epoch": 0.8351094470046083, + "grad_norm": 0.5894806385040283, + "learning_rate": 3.2855571944556075e-06, + "loss": 0.0592, + "step": 5799 + }, + { + "epoch": 0.8352534562211982, + "grad_norm": 1.114082932472229, + "learning_rate": 3.2799544860262045e-06, + "loss": 0.088, + "step": 5800 + }, + { + "epoch": 0.835397465437788, + "grad_norm": 1.568297028541565, + "learning_rate": 3.27435622330593e-06, + "loss": 0.1656, + "step": 5801 + }, + { + "epoch": 0.8355414746543779, + "grad_norm": 0.6253515481948853, + "learning_rate": 3.2687624074406537e-06, + "loss": 0.0796, + "step": 5802 + }, + { + "epoch": 0.8356854838709677, + "grad_norm": 0.5274111032485962, + "learning_rate": 3.2631730395753235e-06, + "loss": 0.0752, + "step": 5803 + }, + { + "epoch": 0.8358294930875576, + "grad_norm": 0.5824552774429321, + "learning_rate": 3.257588120853991e-06, + "loss": 0.0576, + "step": 5804 + }, + { + "epoch": 0.8359735023041475, + "grad_norm": 7.081813812255859, + "learning_rate": 3.252007652419789e-06, + "loss": 2.1717, + "step": 5805 + }, + { + "epoch": 0.8361175115207373, + "grad_norm": 0.5808261036872864, + "learning_rate": 3.246431635414937e-06, + "loss": 0.0568, + "step": 5806 + }, + { + "epoch": 0.8362615207373272, + "grad_norm": 0.7197732925415039, + "learning_rate": 3.2408600709807472e-06, + "loss": 0.0701, + "step": 5807 + }, + { + "epoch": 0.836405529953917, + "grad_norm": 0.8185047507286072, + "learning_rate": 3.2352929602576272e-06, + "loss": 0.1119, + "step": 5808 + }, + { + "epoch": 0.8365495391705069, + "grad_norm": 0.6439456939697266, + "learning_rate": 3.2297303043850565e-06, + "loss": 0.0949, + "step": 5809 + }, + { + "epoch": 0.8366935483870968, + "grad_norm": 0.6841358542442322, + "learning_rate": 3.2241721045016214e-06, + "loss": 0.0845, + "step": 5810 + }, + { + "epoch": 0.8368375576036866, + "grad_norm": 0.6352851390838623, + "learning_rate": 3.2186183617449794e-06, + "loss": 0.0566, + "step": 5811 + }, + { + "epoch": 0.8369815668202765, + "grad_norm": 3.674607753753662, + "learning_rate": 3.2130690772518874e-06, + "loss": 1.0529, + "step": 5812 + }, + { + "epoch": 0.8371255760368663, + "grad_norm": 3.030747890472412, + "learning_rate": 3.2075242521581865e-06, + "loss": 0.7806, + "step": 5813 + }, + { + "epoch": 0.8372695852534562, + "grad_norm": 0.8242539167404175, + "learning_rate": 3.201983887598803e-06, + "loss": 0.0847, + "step": 5814 + }, + { + "epoch": 0.837413594470046, + "grad_norm": 0.5156833529472351, + "learning_rate": 3.196447984707751e-06, + "loss": 0.0423, + "step": 5815 + }, + { + "epoch": 0.8375576036866359, + "grad_norm": 4.377691268920898, + "learning_rate": 3.1909165446181304e-06, + "loss": 2.1893, + "step": 5816 + }, + { + "epoch": 0.8377016129032258, + "grad_norm": 8.74673843383789, + "learning_rate": 3.1853895684621326e-06, + "loss": 1.7546, + "step": 5817 + }, + { + "epoch": 0.8378456221198156, + "grad_norm": 1.1936315298080444, + "learning_rate": 3.179867057371033e-06, + "loss": 0.1602, + "step": 5818 + }, + { + "epoch": 0.8379896313364056, + "grad_norm": 0.5014047026634216, + "learning_rate": 3.174349012475186e-06, + "loss": 0.0735, + "step": 5819 + }, + { + "epoch": 0.8381336405529954, + "grad_norm": 0.590679407119751, + "learning_rate": 3.1688354349040383e-06, + "loss": 0.0833, + "step": 5820 + }, + { + "epoch": 0.8382776497695853, + "grad_norm": 0.6310086250305176, + "learning_rate": 3.1633263257861283e-06, + "loss": 0.057, + "step": 5821 + }, + { + "epoch": 0.8384216589861752, + "grad_norm": 0.5322380065917969, + "learning_rate": 3.1578216862490685e-06, + "loss": 0.0716, + "step": 5822 + }, + { + "epoch": 0.838565668202765, + "grad_norm": 0.8916221261024475, + "learning_rate": 3.1523215174195624e-06, + "loss": 0.0978, + "step": 5823 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 4.8056206703186035, + "learning_rate": 3.1468258204233993e-06, + "loss": 1.7922, + "step": 5824 + }, + { + "epoch": 0.8388536866359447, + "grad_norm": 4.988178730010986, + "learning_rate": 3.141334596385448e-06, + "loss": 1.3931, + "step": 5825 + }, + { + "epoch": 0.8389976958525346, + "grad_norm": 1.3904671669006348, + "learning_rate": 3.1358478464296653e-06, + "loss": 0.1045, + "step": 5826 + }, + { + "epoch": 0.8391417050691244, + "grad_norm": 0.9908193349838257, + "learning_rate": 3.130365571679092e-06, + "loss": 0.0754, + "step": 5827 + }, + { + "epoch": 0.8392857142857143, + "grad_norm": 0.9491931796073914, + "learning_rate": 3.124887773255855e-06, + "loss": 0.1097, + "step": 5828 + }, + { + "epoch": 0.8394297235023042, + "grad_norm": 0.7194050550460815, + "learning_rate": 3.119414452281158e-06, + "loss": 0.0757, + "step": 5829 + }, + { + "epoch": 0.839573732718894, + "grad_norm": 0.8574138283729553, + "learning_rate": 3.113945609875299e-06, + "loss": 0.1601, + "step": 5830 + }, + { + "epoch": 0.8397177419354839, + "grad_norm": 0.43731042742729187, + "learning_rate": 3.108481247157649e-06, + "loss": 0.0815, + "step": 5831 + }, + { + "epoch": 0.8398617511520737, + "grad_norm": 1.0416803359985352, + "learning_rate": 3.1030213652466667e-06, + "loss": 0.1216, + "step": 5832 + }, + { + "epoch": 0.8400057603686636, + "grad_norm": 0.5668613314628601, + "learning_rate": 3.0975659652598967e-06, + "loss": 0.0525, + "step": 5833 + }, + { + "epoch": 0.8401497695852534, + "grad_norm": 0.7912011742591858, + "learning_rate": 3.0921150483139565e-06, + "loss": 0.0902, + "step": 5834 + }, + { + "epoch": 0.8402937788018433, + "grad_norm": 16.6788387298584, + "learning_rate": 3.08666861552456e-06, + "loss": 2.021, + "step": 5835 + }, + { + "epoch": 0.8404377880184332, + "grad_norm": 8.196538925170898, + "learning_rate": 3.08122666800649e-06, + "loss": 1.7015, + "step": 5836 + }, + { + "epoch": 0.840581797235023, + "grad_norm": 0.44551989436149597, + "learning_rate": 3.0757892068736195e-06, + "loss": 0.0414, + "step": 5837 + }, + { + "epoch": 0.8407258064516129, + "grad_norm": 0.9096925258636475, + "learning_rate": 3.0703562332388995e-06, + "loss": 0.1014, + "step": 5838 + }, + { + "epoch": 0.8408698156682027, + "grad_norm": 0.7915933132171631, + "learning_rate": 3.064927748214366e-06, + "loss": 0.0787, + "step": 5839 + }, + { + "epoch": 0.8410138248847926, + "grad_norm": 1.5945643186569214, + "learning_rate": 3.05950375291113e-06, + "loss": 0.1237, + "step": 5840 + }, + { + "epoch": 0.8411578341013825, + "grad_norm": 0.9472807049751282, + "learning_rate": 3.05408424843939e-06, + "loss": 0.1165, + "step": 5841 + }, + { + "epoch": 0.8413018433179723, + "grad_norm": 0.8731685876846313, + "learning_rate": 3.0486692359084217e-06, + "loss": 0.0866, + "step": 5842 + }, + { + "epoch": 0.8414458525345622, + "grad_norm": 0.6559296250343323, + "learning_rate": 3.0432587164265835e-06, + "loss": 0.093, + "step": 5843 + }, + { + "epoch": 0.841589861751152, + "grad_norm": 0.7186678647994995, + "learning_rate": 3.0378526911013142e-06, + "loss": 0.0888, + "step": 5844 + }, + { + "epoch": 0.8417338709677419, + "grad_norm": 0.8708643913269043, + "learning_rate": 3.0324511610391265e-06, + "loss": 0.0882, + "step": 5845 + }, + { + "epoch": 0.8418778801843319, + "grad_norm": 0.9439960718154907, + "learning_rate": 3.0270541273456216e-06, + "loss": 0.1178, + "step": 5846 + }, + { + "epoch": 0.8420218894009217, + "grad_norm": 4.311249732971191, + "learning_rate": 3.0216615911254713e-06, + "loss": 1.8514, + "step": 5847 + }, + { + "epoch": 0.8421658986175116, + "grad_norm": 0.4681931436061859, + "learning_rate": 3.016273553482443e-06, + "loss": 0.0545, + "step": 5848 + }, + { + "epoch": 0.8423099078341014, + "grad_norm": 0.6447994112968445, + "learning_rate": 3.0108900155193686e-06, + "loss": 0.0481, + "step": 5849 + }, + { + "epoch": 0.8424539170506913, + "grad_norm": 1.1496820449829102, + "learning_rate": 3.0055109783381606e-06, + "loss": 0.1316, + "step": 5850 + }, + { + "epoch": 0.8425979262672811, + "grad_norm": 0.7546249032020569, + "learning_rate": 3.000136443039814e-06, + "loss": 0.0753, + "step": 5851 + }, + { + "epoch": 0.842741935483871, + "grad_norm": 0.8460968136787415, + "learning_rate": 2.9947664107244004e-06, + "loss": 0.0657, + "step": 5852 + }, + { + "epoch": 0.8428859447004609, + "grad_norm": 0.3258618116378784, + "learning_rate": 2.9894008824910726e-06, + "loss": 0.0471, + "step": 5853 + }, + { + "epoch": 0.8430299539170507, + "grad_norm": 1.085638165473938, + "learning_rate": 2.9840398594380562e-06, + "loss": 0.1674, + "step": 5854 + }, + { + "epoch": 0.8431739631336406, + "grad_norm": 0.8579515218734741, + "learning_rate": 2.978683342662661e-06, + "loss": 0.158, + "step": 5855 + }, + { + "epoch": 0.8433179723502304, + "grad_norm": 3.372187852859497, + "learning_rate": 2.97333133326127e-06, + "loss": 1.6742, + "step": 5856 + }, + { + "epoch": 0.8434619815668203, + "grad_norm": 0.8304616808891296, + "learning_rate": 2.967983832329341e-06, + "loss": 0.1069, + "step": 5857 + }, + { + "epoch": 0.8436059907834101, + "grad_norm": 1.1610180139541626, + "learning_rate": 2.96264084096142e-06, + "loss": 0.1342, + "step": 5858 + }, + { + "epoch": 0.84375, + "grad_norm": 0.9473800659179688, + "learning_rate": 2.9573023602511158e-06, + "loss": 0.1034, + "step": 5859 + }, + { + "epoch": 0.8438940092165899, + "grad_norm": 1.0118968486785889, + "learning_rate": 2.9519683912911266e-06, + "loss": 0.0897, + "step": 5860 + }, + { + "epoch": 0.8440380184331797, + "grad_norm": 0.5371156334877014, + "learning_rate": 2.946638935173218e-06, + "loss": 0.0599, + "step": 5861 + }, + { + "epoch": 0.8441820276497696, + "grad_norm": 0.32357150316238403, + "learning_rate": 2.941313992988237e-06, + "loss": 0.0466, + "step": 5862 + }, + { + "epoch": 0.8443260368663594, + "grad_norm": 1.1529347896575928, + "learning_rate": 2.9359935658261063e-06, + "loss": 0.0957, + "step": 5863 + }, + { + "epoch": 0.8444700460829493, + "grad_norm": 0.9494339227676392, + "learning_rate": 2.930677654775821e-06, + "loss": 0.0913, + "step": 5864 + }, + { + "epoch": 0.8446140552995391, + "grad_norm": 0.8860004544258118, + "learning_rate": 2.925366260925452e-06, + "loss": 0.0979, + "step": 5865 + }, + { + "epoch": 0.844758064516129, + "grad_norm": 0.9644821882247925, + "learning_rate": 2.9200593853621533e-06, + "loss": 0.1055, + "step": 5866 + }, + { + "epoch": 0.8449020737327189, + "grad_norm": 0.9522638916969299, + "learning_rate": 2.9147570291721437e-06, + "loss": 0.1038, + "step": 5867 + }, + { + "epoch": 0.8450460829493087, + "grad_norm": 0.4702010452747345, + "learning_rate": 2.909459193440725e-06, + "loss": 0.0527, + "step": 5868 + }, + { + "epoch": 0.8451900921658986, + "grad_norm": 1.0123698711395264, + "learning_rate": 2.9041658792522685e-06, + "loss": 0.1201, + "step": 5869 + }, + { + "epoch": 0.8453341013824884, + "grad_norm": 0.8371678590774536, + "learning_rate": 2.8988770876902216e-06, + "loss": 0.0906, + "step": 5870 + }, + { + "epoch": 0.8454781105990783, + "grad_norm": 1.1956064701080322, + "learning_rate": 2.893592819837107e-06, + "loss": 0.1167, + "step": 5871 + }, + { + "epoch": 0.8456221198156681, + "grad_norm": 0.45266279578208923, + "learning_rate": 2.8883130767745235e-06, + "loss": 0.0514, + "step": 5872 + }, + { + "epoch": 0.8457661290322581, + "grad_norm": 0.937174379825592, + "learning_rate": 2.8830378595831377e-06, + "loss": 0.1146, + "step": 5873 + }, + { + "epoch": 0.845910138248848, + "grad_norm": 1.4976797103881836, + "learning_rate": 2.877767169342688e-06, + "loss": 0.1437, + "step": 5874 + }, + { + "epoch": 0.8460541474654378, + "grad_norm": 1.1773018836975098, + "learning_rate": 2.872501007132003e-06, + "loss": 0.1123, + "step": 5875 + }, + { + "epoch": 0.8461981566820277, + "grad_norm": 0.6801338195800781, + "learning_rate": 2.8672393740289683e-06, + "loss": 0.0758, + "step": 5876 + }, + { + "epoch": 0.8463421658986175, + "grad_norm": 0.9421706199645996, + "learning_rate": 2.861982271110547e-06, + "loss": 0.0855, + "step": 5877 + }, + { + "epoch": 0.8464861751152074, + "grad_norm": 1.0108715295791626, + "learning_rate": 2.856729699452773e-06, + "loss": 0.0981, + "step": 5878 + }, + { + "epoch": 0.8466301843317973, + "grad_norm": 1.124652624130249, + "learning_rate": 2.8514816601307587e-06, + "loss": 0.1265, + "step": 5879 + }, + { + "epoch": 0.8467741935483871, + "grad_norm": 5.865814685821533, + "learning_rate": 2.8462381542186807e-06, + "loss": 2.6306, + "step": 5880 + }, + { + "epoch": 0.846918202764977, + "grad_norm": 3.8573532104492188, + "learning_rate": 2.840999182789797e-06, + "loss": 1.8086, + "step": 5881 + }, + { + "epoch": 0.8470622119815668, + "grad_norm": 0.9275877475738525, + "learning_rate": 2.835764746916425e-06, + "loss": 0.1052, + "step": 5882 + }, + { + "epoch": 0.8472062211981567, + "grad_norm": 0.7797316312789917, + "learning_rate": 2.830534847669969e-06, + "loss": 0.088, + "step": 5883 + }, + { + "epoch": 0.8473502304147466, + "grad_norm": 0.8296979665756226, + "learning_rate": 2.8253094861208917e-06, + "loss": 0.0974, + "step": 5884 + }, + { + "epoch": 0.8474942396313364, + "grad_norm": 0.6300123929977417, + "learning_rate": 2.8200886633387323e-06, + "loss": 0.058, + "step": 5885 + }, + { + "epoch": 0.8476382488479263, + "grad_norm": 0.5528960824012756, + "learning_rate": 2.8148723803921027e-06, + "loss": 0.077, + "step": 5886 + }, + { + "epoch": 0.8477822580645161, + "grad_norm": 1.1087194681167603, + "learning_rate": 2.809660638348685e-06, + "loss": 0.1066, + "step": 5887 + }, + { + "epoch": 0.847926267281106, + "grad_norm": 0.8985524773597717, + "learning_rate": 2.8044534382752284e-06, + "loss": 0.0657, + "step": 5888 + }, + { + "epoch": 0.8480702764976958, + "grad_norm": 0.4889260530471802, + "learning_rate": 2.7992507812375556e-06, + "loss": 0.075, + "step": 5889 + }, + { + "epoch": 0.8482142857142857, + "grad_norm": 0.599880576133728, + "learning_rate": 2.7940526683005564e-06, + "loss": 0.0642, + "step": 5890 + }, + { + "epoch": 0.8483582949308756, + "grad_norm": 0.2601206302642822, + "learning_rate": 2.788859100528196e-06, + "loss": 0.0421, + "step": 5891 + }, + { + "epoch": 0.8485023041474654, + "grad_norm": 1.1043705940246582, + "learning_rate": 2.783670078983505e-06, + "loss": 0.1101, + "step": 5892 + }, + { + "epoch": 0.8486463133640553, + "grad_norm": 3.9957237243652344, + "learning_rate": 2.7784856047285814e-06, + "loss": 1.1313, + "step": 5893 + }, + { + "epoch": 0.8487903225806451, + "grad_norm": 0.6227494478225708, + "learning_rate": 2.7733056788245974e-06, + "loss": 0.0641, + "step": 5894 + }, + { + "epoch": 0.848934331797235, + "grad_norm": 0.5229582786560059, + "learning_rate": 2.7681303023317924e-06, + "loss": 0.0714, + "step": 5895 + }, + { + "epoch": 0.8490783410138248, + "grad_norm": 0.6617391705513, + "learning_rate": 2.762959476309476e-06, + "loss": 0.0764, + "step": 5896 + }, + { + "epoch": 0.8492223502304147, + "grad_norm": 3.025758981704712, + "learning_rate": 2.7577932018160225e-06, + "loss": 0.8988, + "step": 5897 + }, + { + "epoch": 0.8493663594470046, + "grad_norm": 1.4679323434829712, + "learning_rate": 2.7526314799088766e-06, + "loss": 4.1153, + "step": 5898 + }, + { + "epoch": 0.8495103686635944, + "grad_norm": 0.3068355321884155, + "learning_rate": 2.747474311644552e-06, + "loss": 0.0422, + "step": 5899 + }, + { + "epoch": 0.8496543778801844, + "grad_norm": 0.925337016582489, + "learning_rate": 2.7423216980786315e-06, + "loss": 0.1141, + "step": 5900 + }, + { + "epoch": 0.8497983870967742, + "grad_norm": 0.6320428848266602, + "learning_rate": 2.7371736402657556e-06, + "loss": 0.0813, + "step": 5901 + }, + { + "epoch": 0.8499423963133641, + "grad_norm": 0.9027456045150757, + "learning_rate": 2.7320301392596533e-06, + "loss": 0.1553, + "step": 5902 + }, + { + "epoch": 0.850086405529954, + "grad_norm": 0.6171061396598816, + "learning_rate": 2.7268911961131042e-06, + "loss": 0.0787, + "step": 5903 + }, + { + "epoch": 0.8502304147465438, + "grad_norm": 1.345431923866272, + "learning_rate": 2.721756811877957e-06, + "loss": 0.1207, + "step": 5904 + }, + { + "epoch": 0.8503744239631337, + "grad_norm": 1.3495194911956787, + "learning_rate": 2.716626987605131e-06, + "loss": 0.1205, + "step": 5905 + }, + { + "epoch": 0.8505184331797235, + "grad_norm": 0.8572656512260437, + "learning_rate": 2.711501724344606e-06, + "loss": 0.0835, + "step": 5906 + }, + { + "epoch": 0.8506624423963134, + "grad_norm": 4.630234718322754, + "learning_rate": 2.706381023145438e-06, + "loss": 2.4876, + "step": 5907 + }, + { + "epoch": 0.8508064516129032, + "grad_norm": 0.8986215591430664, + "learning_rate": 2.701264885055743e-06, + "loss": 0.1096, + "step": 5908 + }, + { + "epoch": 0.8509504608294931, + "grad_norm": 0.39053407311439514, + "learning_rate": 2.696153311122704e-06, + "loss": 0.0446, + "step": 5909 + }, + { + "epoch": 0.851094470046083, + "grad_norm": 0.6473718881607056, + "learning_rate": 2.6910463023925665e-06, + "loss": 0.0922, + "step": 5910 + }, + { + "epoch": 0.8512384792626728, + "grad_norm": 0.9316853284835815, + "learning_rate": 2.685943859910647e-06, + "loss": 0.0631, + "step": 5911 + }, + { + "epoch": 0.8513824884792627, + "grad_norm": 0.5312709808349609, + "learning_rate": 2.6808459847213254e-06, + "loss": 0.072, + "step": 5912 + }, + { + "epoch": 0.8515264976958525, + "grad_norm": 3.095158576965332, + "learning_rate": 2.6757526778680487e-06, + "loss": 0.4401, + "step": 5913 + }, + { + "epoch": 0.8516705069124424, + "grad_norm": 0.8995490074157715, + "learning_rate": 2.6706639403933225e-06, + "loss": 0.1086, + "step": 5914 + }, + { + "epoch": 0.8518145161290323, + "grad_norm": 0.5791245698928833, + "learning_rate": 2.665579773338725e-06, + "loss": 0.0624, + "step": 5915 + }, + { + "epoch": 0.8519585253456221, + "grad_norm": 0.8366076946258545, + "learning_rate": 2.660500177744893e-06, + "loss": 0.0759, + "step": 5916 + }, + { + "epoch": 0.852102534562212, + "grad_norm": 1.3468854427337646, + "learning_rate": 2.6554251546515305e-06, + "loss": 0.1168, + "step": 5917 + }, + { + "epoch": 0.8522465437788018, + "grad_norm": 5.115444660186768, + "learning_rate": 2.650354705097405e-06, + "loss": 0.9366, + "step": 5918 + }, + { + "epoch": 0.8523905529953917, + "grad_norm": 0.9617175459861755, + "learning_rate": 2.645288830120349e-06, + "loss": 0.1295, + "step": 5919 + }, + { + "epoch": 0.8525345622119815, + "grad_norm": 3.0584025382995605, + "learning_rate": 2.64022753075725e-06, + "loss": 2.2664, + "step": 5920 + }, + { + "epoch": 0.8526785714285714, + "grad_norm": 0.8997029662132263, + "learning_rate": 2.635170808044077e-06, + "loss": 0.1074, + "step": 5921 + }, + { + "epoch": 0.8528225806451613, + "grad_norm": 0.3095710575580597, + "learning_rate": 2.6301186630158485e-06, + "loss": 0.0486, + "step": 5922 + }, + { + "epoch": 0.8529665898617511, + "grad_norm": 8.461623191833496, + "learning_rate": 2.6250710967066494e-06, + "loss": 2.2783, + "step": 5923 + }, + { + "epoch": 0.853110599078341, + "grad_norm": 0.6149989366531372, + "learning_rate": 2.620028110149625e-06, + "loss": 0.073, + "step": 5924 + }, + { + "epoch": 0.8532546082949308, + "grad_norm": 1.5962885618209839, + "learning_rate": 2.6149897043769884e-06, + "loss": 0.1513, + "step": 5925 + }, + { + "epoch": 0.8533986175115207, + "grad_norm": 0.6420953869819641, + "learning_rate": 2.6099558804200064e-06, + "loss": 0.0806, + "step": 5926 + }, + { + "epoch": 0.8535426267281107, + "grad_norm": 5.487829208374023, + "learning_rate": 2.6049266393090218e-06, + "loss": 2.5326, + "step": 5927 + }, + { + "epoch": 0.8536866359447005, + "grad_norm": 0.8891505599021912, + "learning_rate": 2.5999019820734243e-06, + "loss": 0.0916, + "step": 5928 + }, + { + "epoch": 0.8538306451612904, + "grad_norm": 6.811285495758057, + "learning_rate": 2.5948819097416754e-06, + "loss": 2.3534, + "step": 5929 + }, + { + "epoch": 0.8539746543778802, + "grad_norm": 0.37253308296203613, + "learning_rate": 2.5898664233412974e-06, + "loss": 0.0423, + "step": 5930 + }, + { + "epoch": 0.8541186635944701, + "grad_norm": 1.3282060623168945, + "learning_rate": 2.584855523898866e-06, + "loss": 0.1202, + "step": 5931 + }, + { + "epoch": 0.8542626728110599, + "grad_norm": 0.4510582685470581, + "learning_rate": 2.5798492124400273e-06, + "loss": 0.0446, + "step": 5932 + }, + { + "epoch": 0.8544066820276498, + "grad_norm": 0.751460611820221, + "learning_rate": 2.574847489989485e-06, + "loss": 0.0789, + "step": 5933 + }, + { + "epoch": 0.8545506912442397, + "grad_norm": 6.59741735458374, + "learning_rate": 2.569850357571002e-06, + "loss": 1.3404, + "step": 5934 + }, + { + "epoch": 0.8546947004608295, + "grad_norm": 0.5492342114448547, + "learning_rate": 2.5648578162074054e-06, + "loss": 0.0746, + "step": 5935 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 0.9154043793678284, + "learning_rate": 2.559869866920575e-06, + "loss": 0.1076, + "step": 5936 + }, + { + "epoch": 0.8549827188940092, + "grad_norm": 3.0612828731536865, + "learning_rate": 2.5548865107314607e-06, + "loss": 1.2736, + "step": 5937 + }, + { + "epoch": 0.8551267281105991, + "grad_norm": 5.228409290313721, + "learning_rate": 2.5499077486600658e-06, + "loss": 1.6407, + "step": 5938 + }, + { + "epoch": 0.855270737327189, + "grad_norm": 0.5153233408927917, + "learning_rate": 2.5449335817254504e-06, + "loss": 0.0475, + "step": 5939 + }, + { + "epoch": 0.8554147465437788, + "grad_norm": 1.1954318284988403, + "learning_rate": 2.5399640109457444e-06, + "loss": 0.1171, + "step": 5940 + }, + { + "epoch": 0.8555587557603687, + "grad_norm": 0.6474929451942444, + "learning_rate": 2.5349990373381314e-06, + "loss": 0.0778, + "step": 5941 + }, + { + "epoch": 0.8557027649769585, + "grad_norm": 0.7406175136566162, + "learning_rate": 2.5300386619188515e-06, + "loss": 0.0751, + "step": 5942 + }, + { + "epoch": 0.8558467741935484, + "grad_norm": 4.519582748413086, + "learning_rate": 2.525082885703206e-06, + "loss": 2.0809, + "step": 5943 + }, + { + "epoch": 0.8559907834101382, + "grad_norm": 4.630527496337891, + "learning_rate": 2.5201317097055534e-06, + "loss": 1.6274, + "step": 5944 + }, + { + "epoch": 0.8561347926267281, + "grad_norm": 0.8378430604934692, + "learning_rate": 2.515185134939313e-06, + "loss": 0.0847, + "step": 5945 + }, + { + "epoch": 0.856278801843318, + "grad_norm": 0.5023699998855591, + "learning_rate": 2.5102431624169615e-06, + "loss": 0.0463, + "step": 5946 + }, + { + "epoch": 0.8564228110599078, + "grad_norm": 1.0625436305999756, + "learning_rate": 2.505305793150034e-06, + "loss": 0.1002, + "step": 5947 + }, + { + "epoch": 0.8565668202764977, + "grad_norm": 0.6256807446479797, + "learning_rate": 2.5003730281491195e-06, + "loss": 0.0752, + "step": 5948 + }, + { + "epoch": 0.8567108294930875, + "grad_norm": 5.181367874145508, + "learning_rate": 2.4954448684238714e-06, + "loss": 1.7492, + "step": 5949 + }, + { + "epoch": 0.8568548387096774, + "grad_norm": 0.8292328715324402, + "learning_rate": 2.4905213149829937e-06, + "loss": 0.0757, + "step": 5950 + }, + { + "epoch": 0.8569988479262672, + "grad_norm": 1.078902006149292, + "learning_rate": 2.485602368834253e-06, + "loss": 0.0932, + "step": 5951 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 6.113372802734375, + "learning_rate": 2.480688030984471e-06, + "loss": 1.2474, + "step": 5952 + }, + { + "epoch": 0.857286866359447, + "grad_norm": 0.5621035695075989, + "learning_rate": 2.475778302439524e-06, + "loss": 0.0742, + "step": 5953 + }, + { + "epoch": 0.8574308755760369, + "grad_norm": 0.7005934119224548, + "learning_rate": 2.4708731842043446e-06, + "loss": 0.0765, + "step": 5954 + }, + { + "epoch": 0.8575748847926268, + "grad_norm": 5.286518573760986, + "learning_rate": 2.4659726772829294e-06, + "loss": 0.8833, + "step": 5955 + }, + { + "epoch": 0.8577188940092166, + "grad_norm": 5.967059135437012, + "learning_rate": 2.4610767826783204e-06, + "loss": 1.3001, + "step": 5956 + }, + { + "epoch": 0.8578629032258065, + "grad_norm": 1.7627156972885132, + "learning_rate": 2.4561855013926215e-06, + "loss": 0.1242, + "step": 5957 + }, + { + "epoch": 0.8580069124423964, + "grad_norm": 0.7128841280937195, + "learning_rate": 2.4512988344269905e-06, + "loss": 0.0898, + "step": 5958 + }, + { + "epoch": 0.8581509216589862, + "grad_norm": 0.9371341466903687, + "learning_rate": 2.4464167827816463e-06, + "loss": 0.1008, + "step": 5959 + }, + { + "epoch": 0.8582949308755761, + "grad_norm": 0.5934833288192749, + "learning_rate": 2.441539347455857e-06, + "loss": 0.0731, + "step": 5960 + }, + { + "epoch": 0.8584389400921659, + "grad_norm": 0.9920409321784973, + "learning_rate": 2.436666529447948e-06, + "loss": 0.1151, + "step": 5961 + }, + { + "epoch": 0.8585829493087558, + "grad_norm": 3.2245242595672607, + "learning_rate": 2.431798329755294e-06, + "loss": 1.9395, + "step": 5962 + }, + { + "epoch": 0.8587269585253456, + "grad_norm": 0.2649165391921997, + "learning_rate": 2.426934749374335e-06, + "loss": 0.0465, + "step": 5963 + }, + { + "epoch": 0.8588709677419355, + "grad_norm": 4.189086437225342, + "learning_rate": 2.422075789300554e-06, + "loss": 1.6616, + "step": 5964 + }, + { + "epoch": 0.8590149769585254, + "grad_norm": 0.8409771919250488, + "learning_rate": 2.4172214505285007e-06, + "loss": 0.0709, + "step": 5965 + }, + { + "epoch": 0.8591589861751152, + "grad_norm": 0.6892798542976379, + "learning_rate": 2.4123717340517687e-06, + "loss": 0.064, + "step": 5966 + }, + { + "epoch": 0.8593029953917051, + "grad_norm": 6.052571773529053, + "learning_rate": 2.4075266408630087e-06, + "loss": 1.4788, + "step": 5967 + }, + { + "epoch": 0.8594470046082949, + "grad_norm": 3.813689947128296, + "learning_rate": 2.4026861719539275e-06, + "loss": 1.6783, + "step": 5968 + }, + { + "epoch": 0.8595910138248848, + "grad_norm": 0.7560158371925354, + "learning_rate": 2.397850328315285e-06, + "loss": 0.0773, + "step": 5969 + }, + { + "epoch": 0.8597350230414746, + "grad_norm": 0.869491457939148, + "learning_rate": 2.3930191109368865e-06, + "loss": 0.0921, + "step": 5970 + }, + { + "epoch": 0.8598790322580645, + "grad_norm": 0.7725517749786377, + "learning_rate": 2.388192520807603e-06, + "loss": 0.0656, + "step": 5971 + }, + { + "epoch": 0.8600230414746544, + "grad_norm": 0.8144054412841797, + "learning_rate": 2.3833705589153487e-06, + "loss": 0.0688, + "step": 5972 + }, + { + "epoch": 0.8601670506912442, + "grad_norm": 0.5422839522361755, + "learning_rate": 2.378553226247096e-06, + "loss": 0.0657, + "step": 5973 + }, + { + "epoch": 0.8603110599078341, + "grad_norm": 0.8600287437438965, + "learning_rate": 2.373740523788867e-06, + "loss": 0.096, + "step": 5974 + }, + { + "epoch": 0.8604550691244239, + "grad_norm": 0.47502464056015015, + "learning_rate": 2.368932452525735e-06, + "loss": 0.0708, + "step": 5975 + }, + { + "epoch": 0.8605990783410138, + "grad_norm": 2.5614240169525146, + "learning_rate": 2.3641290134418294e-06, + "loss": 0.1846, + "step": 5976 + }, + { + "epoch": 0.8607430875576036, + "grad_norm": 0.6214435696601868, + "learning_rate": 2.3593302075203273e-06, + "loss": 0.0696, + "step": 5977 + }, + { + "epoch": 0.8608870967741935, + "grad_norm": 1.03584623336792, + "learning_rate": 2.3545360357434624e-06, + "loss": 0.145, + "step": 5978 + }, + { + "epoch": 0.8610311059907834, + "grad_norm": 1.456084132194519, + "learning_rate": 2.3497464990925146e-06, + "loss": 0.1158, + "step": 5979 + }, + { + "epoch": 0.8611751152073732, + "grad_norm": 0.3917291760444641, + "learning_rate": 2.344961598547818e-06, + "loss": 0.0506, + "step": 5980 + }, + { + "epoch": 0.8613191244239631, + "grad_norm": 0.9327602386474609, + "learning_rate": 2.3401813350887566e-06, + "loss": 0.0816, + "step": 5981 + }, + { + "epoch": 0.861463133640553, + "grad_norm": 1.2481937408447266, + "learning_rate": 2.3354057096937665e-06, + "loss": 0.0981, + "step": 5982 + }, + { + "epoch": 0.8616071428571429, + "grad_norm": 0.6559556722640991, + "learning_rate": 2.3306347233403277e-06, + "loss": 0.1189, + "step": 5983 + }, + { + "epoch": 0.8617511520737328, + "grad_norm": 4.8339080810546875, + "learning_rate": 2.325868377004986e-06, + "loss": 1.6572, + "step": 5984 + }, + { + "epoch": 0.8618951612903226, + "grad_norm": 0.9310925602912903, + "learning_rate": 2.3211066716633257e-06, + "loss": 0.0988, + "step": 5985 + }, + { + "epoch": 0.8620391705069125, + "grad_norm": 1.7793490886688232, + "learning_rate": 2.316349608289983e-06, + "loss": 0.1526, + "step": 5986 + }, + { + "epoch": 0.8621831797235023, + "grad_norm": 1.008823037147522, + "learning_rate": 2.311597187858644e-06, + "loss": 0.1278, + "step": 5987 + }, + { + "epoch": 0.8623271889400922, + "grad_norm": 0.519679844379425, + "learning_rate": 2.3068494113420436e-06, + "loss": 0.059, + "step": 5988 + }, + { + "epoch": 0.862471198156682, + "grad_norm": 0.4281577467918396, + "learning_rate": 2.3021062797119714e-06, + "loss": 0.0407, + "step": 5989 + }, + { + "epoch": 0.8626152073732719, + "grad_norm": 5.381777286529541, + "learning_rate": 2.297367793939259e-06, + "loss": 2.4915, + "step": 5990 + }, + { + "epoch": 0.8627592165898618, + "grad_norm": 3.687757730484009, + "learning_rate": 2.292633954993792e-06, + "loss": 1.715, + "step": 5991 + }, + { + "epoch": 0.8629032258064516, + "grad_norm": 0.5700912475585938, + "learning_rate": 2.2879047638445035e-06, + "loss": 0.0697, + "step": 5992 + }, + { + "epoch": 0.8630472350230415, + "grad_norm": 0.9138510227203369, + "learning_rate": 2.2831802214593775e-06, + "loss": 0.083, + "step": 5993 + }, + { + "epoch": 0.8631912442396313, + "grad_norm": 0.766527533531189, + "learning_rate": 2.27846032880544e-06, + "loss": 0.0846, + "step": 5994 + }, + { + "epoch": 0.8633352534562212, + "grad_norm": 0.6534144878387451, + "learning_rate": 2.273745086848772e-06, + "loss": 0.0585, + "step": 5995 + }, + { + "epoch": 0.863479262672811, + "grad_norm": 1.5267399549484253, + "learning_rate": 2.269034496554498e-06, + "loss": 0.1061, + "step": 5996 + }, + { + "epoch": 0.8636232718894009, + "grad_norm": 1.0995954275131226, + "learning_rate": 2.264328558886797e-06, + "loss": 0.1018, + "step": 5997 + }, + { + "epoch": 0.8637672811059908, + "grad_norm": 0.8411165475845337, + "learning_rate": 2.2596272748088872e-06, + "loss": 0.0744, + "step": 5998 + }, + { + "epoch": 0.8639112903225806, + "grad_norm": 0.566990315914154, + "learning_rate": 2.2549306452830376e-06, + "loss": 0.085, + "step": 5999 + }, + { + "epoch": 0.8640552995391705, + "grad_norm": 1.102766752243042, + "learning_rate": 2.2502386712705714e-06, + "loss": 0.1209, + "step": 6000 + }, + { + "epoch": 0.8641993087557603, + "grad_norm": 4.814225196838379, + "learning_rate": 2.245551353731845e-06, + "loss": 1.8562, + "step": 6001 + }, + { + "epoch": 0.8643433179723502, + "grad_norm": 0.9276507496833801, + "learning_rate": 2.2408686936262744e-06, + "loss": 0.0606, + "step": 6002 + }, + { + "epoch": 0.8644873271889401, + "grad_norm": 0.8698006272315979, + "learning_rate": 2.2361906919123156e-06, + "loss": 0.0778, + "step": 6003 + }, + { + "epoch": 0.8646313364055299, + "grad_norm": 3.8811588287353516, + "learning_rate": 2.231517349547471e-06, + "loss": 1.8843, + "step": 6004 + }, + { + "epoch": 0.8647753456221198, + "grad_norm": 0.5648912787437439, + "learning_rate": 2.226848667488296e-06, + "loss": 0.0592, + "step": 6005 + }, + { + "epoch": 0.8649193548387096, + "grad_norm": 0.8914675116539001, + "learning_rate": 2.222184646690381e-06, + "loss": 0.1308, + "step": 6006 + }, + { + "epoch": 0.8650633640552995, + "grad_norm": 0.8030669093132019, + "learning_rate": 2.2175252881083743e-06, + "loss": 0.099, + "step": 6007 + }, + { + "epoch": 0.8652073732718893, + "grad_norm": 1.074925184249878, + "learning_rate": 2.212870592695962e-06, + "loss": 0.0997, + "step": 6008 + }, + { + "epoch": 0.8653513824884793, + "grad_norm": 0.8525258898735046, + "learning_rate": 2.2082205614058743e-06, + "loss": 0.0989, + "step": 6009 + }, + { + "epoch": 0.8654953917050692, + "grad_norm": 5.421370506286621, + "learning_rate": 2.2035751951898915e-06, + "loss": 1.7819, + "step": 6010 + }, + { + "epoch": 0.865639400921659, + "grad_norm": 0.6615511178970337, + "learning_rate": 2.1989344949988443e-06, + "loss": 0.0698, + "step": 6011 + }, + { + "epoch": 0.8657834101382489, + "grad_norm": 0.5479465126991272, + "learning_rate": 2.1942984617825984e-06, + "loss": 0.0702, + "step": 6012 + }, + { + "epoch": 0.8659274193548387, + "grad_norm": 1.5291005373001099, + "learning_rate": 2.1896670964900666e-06, + "loss": 0.1354, + "step": 6013 + }, + { + "epoch": 0.8660714285714286, + "grad_norm": 1.323798656463623, + "learning_rate": 2.1850404000692075e-06, + "loss": 0.1099, + "step": 6014 + }, + { + "epoch": 0.8662154377880185, + "grad_norm": 0.7925248146057129, + "learning_rate": 2.1804183734670277e-06, + "loss": 0.0892, + "step": 6015 + }, + { + "epoch": 0.8663594470046083, + "grad_norm": 6.459387302398682, + "learning_rate": 2.1758010176295667e-06, + "loss": 1.9689, + "step": 6016 + }, + { + "epoch": 0.8665034562211982, + "grad_norm": 0.6898562908172607, + "learning_rate": 2.1711883335019225e-06, + "loss": 0.0701, + "step": 6017 + }, + { + "epoch": 0.866647465437788, + "grad_norm": 4.814693450927734, + "learning_rate": 2.166580322028225e-06, + "loss": 1.1745, + "step": 6018 + }, + { + "epoch": 0.8667914746543779, + "grad_norm": 0.8891019225120544, + "learning_rate": 2.1619769841516563e-06, + "loss": 0.1039, + "step": 6019 + }, + { + "epoch": 0.8669354838709677, + "grad_norm": 2.19057035446167, + "learning_rate": 2.157378320814438e-06, + "loss": 0.1681, + "step": 6020 + }, + { + "epoch": 0.8670794930875576, + "grad_norm": 0.5488921403884888, + "learning_rate": 2.1527843329578328e-06, + "loss": 0.0543, + "step": 6021 + }, + { + "epoch": 0.8672235023041475, + "grad_norm": 0.8447344899177551, + "learning_rate": 2.148195021522151e-06, + "loss": 0.0738, + "step": 6022 + }, + { + "epoch": 0.8673675115207373, + "grad_norm": 0.9107891321182251, + "learning_rate": 2.1436103874467427e-06, + "loss": 0.0933, + "step": 6023 + }, + { + "epoch": 0.8675115207373272, + "grad_norm": 11.96793270111084, + "learning_rate": 2.13903043167e-06, + "loss": 2.2458, + "step": 6024 + }, + { + "epoch": 0.867655529953917, + "grad_norm": 0.8276005983352661, + "learning_rate": 2.134455155129361e-06, + "loss": 0.0813, + "step": 6025 + }, + { + "epoch": 0.8677995391705069, + "grad_norm": 0.8069449067115784, + "learning_rate": 2.1298845587613024e-06, + "loss": 0.0803, + "step": 6026 + }, + { + "epoch": 0.8679435483870968, + "grad_norm": 1.1031756401062012, + "learning_rate": 2.125318643501345e-06, + "loss": 0.1035, + "step": 6027 + }, + { + "epoch": 0.8680875576036866, + "grad_norm": 0.5850995182991028, + "learning_rate": 2.120757410284052e-06, + "loss": 0.0826, + "step": 6028 + }, + { + "epoch": 0.8682315668202765, + "grad_norm": 1.2741107940673828, + "learning_rate": 2.1162008600430245e-06, + "loss": 0.0734, + "step": 6029 + }, + { + "epoch": 0.8683755760368663, + "grad_norm": 0.7602206468582153, + "learning_rate": 2.11164899371091e-06, + "loss": 0.0873, + "step": 6030 + }, + { + "epoch": 0.8685195852534562, + "grad_norm": 0.6626525521278381, + "learning_rate": 2.1071018122193946e-06, + "loss": 0.0749, + "step": 6031 + }, + { + "epoch": 0.868663594470046, + "grad_norm": 0.5575090646743774, + "learning_rate": 2.102559316499206e-06, + "loss": 0.0486, + "step": 6032 + }, + { + "epoch": 0.8688076036866359, + "grad_norm": 0.9136234521865845, + "learning_rate": 2.098021507480111e-06, + "loss": 0.1157, + "step": 6033 + }, + { + "epoch": 0.8689516129032258, + "grad_norm": 1.0054880380630493, + "learning_rate": 2.093488386090922e-06, + "loss": 0.1135, + "step": 6034 + }, + { + "epoch": 0.8690956221198156, + "grad_norm": 5.6835713386535645, + "learning_rate": 2.088959953259484e-06, + "loss": 1.9366, + "step": 6035 + }, + { + "epoch": 0.8692396313364056, + "grad_norm": 1.2892321348190308, + "learning_rate": 2.0844362099126935e-06, + "loss": 3.9307, + "step": 6036 + }, + { + "epoch": 0.8693836405529954, + "grad_norm": 0.9284005761146545, + "learning_rate": 2.079917156976471e-06, + "loss": 0.098, + "step": 6037 + }, + { + "epoch": 0.8695276497695853, + "grad_norm": 1.1868491172790527, + "learning_rate": 2.075402795375797e-06, + "loss": 0.1031, + "step": 6038 + }, + { + "epoch": 0.8696716589861752, + "grad_norm": 5.471475601196289, + "learning_rate": 2.0708931260346786e-06, + "loss": 1.3401, + "step": 6039 + }, + { + "epoch": 0.869815668202765, + "grad_norm": 0.5500563383102417, + "learning_rate": 2.066388149876164e-06, + "loss": 0.0654, + "step": 6040 + }, + { + "epoch": 0.8699596774193549, + "grad_norm": 1.398278832435608, + "learning_rate": 2.061887867822343e-06, + "loss": 0.1281, + "step": 6041 + }, + { + "epoch": 0.8701036866359447, + "grad_norm": 7.859118938446045, + "learning_rate": 2.0573922807943402e-06, + "loss": 2.0829, + "step": 6042 + }, + { + "epoch": 0.8702476958525346, + "grad_norm": 1.289530873298645, + "learning_rate": 2.0529013897123277e-06, + "loss": 0.108, + "step": 6043 + }, + { + "epoch": 0.8703917050691244, + "grad_norm": 1.1173110008239746, + "learning_rate": 2.0484151954955095e-06, + "loss": 0.0973, + "step": 6044 + }, + { + "epoch": 0.8705357142857143, + "grad_norm": 1.1346895694732666, + "learning_rate": 2.043933699062131e-06, + "loss": 0.1103, + "step": 6045 + }, + { + "epoch": 0.8706797235023042, + "grad_norm": 0.48105883598327637, + "learning_rate": 2.039456901329473e-06, + "loss": 0.0698, + "step": 6046 + }, + { + "epoch": 0.870823732718894, + "grad_norm": 0.699238657951355, + "learning_rate": 2.0349848032138572e-06, + "loss": 0.0699, + "step": 6047 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 1.0774401426315308, + "learning_rate": 2.030517405630647e-06, + "loss": 0.1203, + "step": 6048 + }, + { + "epoch": 0.8711117511520737, + "grad_norm": 0.7911561727523804, + "learning_rate": 2.026054709494235e-06, + "loss": 0.0994, + "step": 6049 + }, + { + "epoch": 0.8712557603686636, + "grad_norm": 0.9388158917427063, + "learning_rate": 2.0215967157180577e-06, + "loss": 0.0596, + "step": 6050 + }, + { + "epoch": 0.8713997695852534, + "grad_norm": 0.9047279953956604, + "learning_rate": 2.0171434252145878e-06, + "loss": 0.0907, + "step": 6051 + }, + { + "epoch": 0.8715437788018433, + "grad_norm": 0.7348905205726624, + "learning_rate": 2.012694838895335e-06, + "loss": 0.0903, + "step": 6052 + }, + { + "epoch": 0.8716877880184332, + "grad_norm": 0.4398072361946106, + "learning_rate": 2.0082509576708456e-06, + "loss": 0.0412, + "step": 6053 + }, + { + "epoch": 0.871831797235023, + "grad_norm": 0.5445512533187866, + "learning_rate": 2.003811782450704e-06, + "loss": 0.064, + "step": 6054 + }, + { + "epoch": 0.8719758064516129, + "grad_norm": 0.5500966906547546, + "learning_rate": 1.999377314143533e-06, + "loss": 0.0587, + "step": 6055 + }, + { + "epoch": 0.8721198156682027, + "grad_norm": 0.6673213839530945, + "learning_rate": 1.994947553656984e-06, + "loss": 0.0698, + "step": 6056 + }, + { + "epoch": 0.8722638248847926, + "grad_norm": 0.830053985118866, + "learning_rate": 1.9905225018977567e-06, + "loss": 0.0968, + "step": 6057 + }, + { + "epoch": 0.8724078341013825, + "grad_norm": 0.7945166230201721, + "learning_rate": 1.98610215977158e-06, + "loss": 0.0706, + "step": 6058 + }, + { + "epoch": 0.8725518433179723, + "grad_norm": 0.9634775519371033, + "learning_rate": 1.981686528183216e-06, + "loss": 0.0883, + "step": 6059 + }, + { + "epoch": 0.8726958525345622, + "grad_norm": 0.34195491671562195, + "learning_rate": 1.977275608036469e-06, + "loss": 0.049, + "step": 6060 + }, + { + "epoch": 0.872839861751152, + "grad_norm": 0.8091119527816772, + "learning_rate": 1.972869400234176e-06, + "loss": 0.0927, + "step": 6061 + }, + { + "epoch": 0.8729838709677419, + "grad_norm": 5.566616535186768, + "learning_rate": 1.968467905678212e-06, + "loss": 2.0814, + "step": 6062 + }, + { + "epoch": 0.8731278801843319, + "grad_norm": 0.5135546922683716, + "learning_rate": 1.9640711252694816e-06, + "loss": 0.0585, + "step": 6063 + }, + { + "epoch": 0.8732718894009217, + "grad_norm": 0.25247910618782043, + "learning_rate": 1.9596790599079236e-06, + "loss": 0.0414, + "step": 6064 + }, + { + "epoch": 0.8734158986175116, + "grad_norm": 4.582229137420654, + "learning_rate": 1.9552917104925267e-06, + "loss": 1.9036, + "step": 6065 + }, + { + "epoch": 0.8735599078341014, + "grad_norm": 0.9648818373680115, + "learning_rate": 1.950909077921301e-06, + "loss": 0.105, + "step": 6066 + }, + { + "epoch": 0.8737039170506913, + "grad_norm": 5.919749736785889, + "learning_rate": 1.946531163091289e-06, + "loss": 1.073, + "step": 6067 + }, + { + "epoch": 0.8738479262672811, + "grad_norm": 4.888855457305908, + "learning_rate": 1.942157966898575e-06, + "loss": 1.6562, + "step": 6068 + }, + { + "epoch": 0.873991935483871, + "grad_norm": 0.8750132322311401, + "learning_rate": 1.937789490238276e-06, + "loss": 0.0976, + "step": 6069 + }, + { + "epoch": 0.8741359447004609, + "grad_norm": 0.8843021392822266, + "learning_rate": 1.9334257340045405e-06, + "loss": 0.0681, + "step": 6070 + }, + { + "epoch": 0.8742799539170507, + "grad_norm": 4.2050089836120605, + "learning_rate": 1.9290666990905536e-06, + "loss": 1.8989, + "step": 6071 + }, + { + "epoch": 0.8744239631336406, + "grad_norm": 0.7497116923332214, + "learning_rate": 1.924712386388533e-06, + "loss": 0.0867, + "step": 6072 + }, + { + "epoch": 0.8745679723502304, + "grad_norm": 0.4891282618045807, + "learning_rate": 1.9203627967897235e-06, + "loss": 0.051, + "step": 6073 + }, + { + "epoch": 0.8747119815668203, + "grad_norm": 0.6144672632217407, + "learning_rate": 1.916017931184419e-06, + "loss": 0.0724, + "step": 6074 + }, + { + "epoch": 0.8748559907834101, + "grad_norm": 1.1479864120483398, + "learning_rate": 1.9116777904619273e-06, + "loss": 0.1177, + "step": 6075 + }, + { + "epoch": 0.875, + "grad_norm": 0.43072637915611267, + "learning_rate": 1.907342375510604e-06, + "loss": 0.0403, + "step": 6076 + }, + { + "epoch": 0.8751440092165899, + "grad_norm": 0.8329504132270813, + "learning_rate": 1.9030116872178316e-06, + "loss": 0.1165, + "step": 6077 + }, + { + "epoch": 0.8752880184331797, + "grad_norm": 4.990556716918945, + "learning_rate": 1.898685726470023e-06, + "loss": 1.4391, + "step": 6078 + }, + { + "epoch": 0.8754320276497696, + "grad_norm": 1.0919259786605835, + "learning_rate": 1.8943644941526283e-06, + "loss": 0.0952, + "step": 6079 + }, + { + "epoch": 0.8755760368663594, + "grad_norm": 0.4609887897968292, + "learning_rate": 1.8900479911501262e-06, + "loss": 0.047, + "step": 6080 + }, + { + "epoch": 0.8757200460829493, + "grad_norm": 0.8105335831642151, + "learning_rate": 1.8857362183460264e-06, + "loss": 0.0631, + "step": 6081 + }, + { + "epoch": 0.8758640552995391, + "grad_norm": 5.0950026512146, + "learning_rate": 1.881429176622876e-06, + "loss": 0.7687, + "step": 6082 + }, + { + "epoch": 0.876008064516129, + "grad_norm": 3.5349137783050537, + "learning_rate": 1.87712686686225e-06, + "loss": 0.3923, + "step": 6083 + }, + { + "epoch": 0.8761520737327189, + "grad_norm": 3.7672178745269775, + "learning_rate": 1.8728292899447525e-06, + "loss": 0.7525, + "step": 6084 + }, + { + "epoch": 0.8762960829493087, + "grad_norm": 0.8319607377052307, + "learning_rate": 1.8685364467500217e-06, + "loss": 0.1025, + "step": 6085 + }, + { + "epoch": 0.8764400921658986, + "grad_norm": 0.67367023229599, + "learning_rate": 1.8642483381567294e-06, + "loss": 0.07, + "step": 6086 + }, + { + "epoch": 0.8765841013824884, + "grad_norm": 5.334803581237793, + "learning_rate": 1.8599649650425738e-06, + "loss": 1.1194, + "step": 6087 + }, + { + "epoch": 0.8767281105990783, + "grad_norm": 0.45948171615600586, + "learning_rate": 1.8556863282842867e-06, + "loss": 0.0551, + "step": 6088 + }, + { + "epoch": 0.8768721198156681, + "grad_norm": 0.38141727447509766, + "learning_rate": 1.8514124287576262e-06, + "loss": 0.0494, + "step": 6089 + }, + { + "epoch": 0.8770161290322581, + "grad_norm": 0.6199442148208618, + "learning_rate": 1.8471432673373868e-06, + "loss": 0.0652, + "step": 6090 + }, + { + "epoch": 0.877160138248848, + "grad_norm": 0.8763055801391602, + "learning_rate": 1.8428788448973887e-06, + "loss": 0.1057, + "step": 6091 + }, + { + "epoch": 0.8773041474654378, + "grad_norm": 0.625943660736084, + "learning_rate": 1.8386191623104843e-06, + "loss": 0.1133, + "step": 6092 + }, + { + "epoch": 0.8774481566820277, + "grad_norm": 0.8939620852470398, + "learning_rate": 1.834364220448559e-06, + "loss": 4.3903, + "step": 6093 + }, + { + "epoch": 0.8775921658986175, + "grad_norm": 0.9898948669433594, + "learning_rate": 1.8301140201825217e-06, + "loss": 0.0824, + "step": 6094 + }, + { + "epoch": 0.8777361751152074, + "grad_norm": 0.9607778191566467, + "learning_rate": 1.8258685623823103e-06, + "loss": 0.0969, + "step": 6095 + }, + { + "epoch": 0.8778801843317973, + "grad_norm": 0.6323534250259399, + "learning_rate": 1.8216278479168985e-06, + "loss": 0.1147, + "step": 6096 + }, + { + "epoch": 0.8780241935483871, + "grad_norm": 0.775446891784668, + "learning_rate": 1.8173918776542815e-06, + "loss": 0.0824, + "step": 6097 + }, + { + "epoch": 0.878168202764977, + "grad_norm": 1.0111337900161743, + "learning_rate": 1.813160652461493e-06, + "loss": 0.1106, + "step": 6098 + }, + { + "epoch": 0.8783122119815668, + "grad_norm": 0.6312023997306824, + "learning_rate": 1.808934173204585e-06, + "loss": 0.0675, + "step": 6099 + }, + { + "epoch": 0.8784562211981567, + "grad_norm": 0.45155397057533264, + "learning_rate": 1.804712440748646e-06, + "loss": 0.055, + "step": 6100 + }, + { + "epoch": 0.8786002304147466, + "grad_norm": 1.0602601766586304, + "learning_rate": 1.8004954559577902e-06, + "loss": 0.1409, + "step": 6101 + }, + { + "epoch": 0.8787442396313364, + "grad_norm": 0.7944156527519226, + "learning_rate": 1.7962832196951579e-06, + "loss": 0.0744, + "step": 6102 + }, + { + "epoch": 0.8788882488479263, + "grad_norm": 0.5582792162895203, + "learning_rate": 1.7920757328229205e-06, + "loss": 0.0705, + "step": 6103 + }, + { + "epoch": 0.8790322580645161, + "grad_norm": 0.9071877598762512, + "learning_rate": 1.787872996202275e-06, + "loss": 0.1039, + "step": 6104 + }, + { + "epoch": 0.879176267281106, + "grad_norm": 1.0849002599716187, + "learning_rate": 1.7836750106934474e-06, + "loss": 0.0905, + "step": 6105 + }, + { + "epoch": 0.8793202764976958, + "grad_norm": 0.6373359560966492, + "learning_rate": 1.779481777155692e-06, + "loss": 0.0916, + "step": 6106 + }, + { + "epoch": 0.8794642857142857, + "grad_norm": 1.1006416082382202, + "learning_rate": 1.775293296447289e-06, + "loss": 0.1034, + "step": 6107 + }, + { + "epoch": 0.8796082949308756, + "grad_norm": 3.02547550201416, + "learning_rate": 1.771109569425547e-06, + "loss": 1.1332, + "step": 6108 + }, + { + "epoch": 0.8797523041474654, + "grad_norm": 0.6019315123558044, + "learning_rate": 1.7669305969468003e-06, + "loss": 0.0701, + "step": 6109 + }, + { + "epoch": 0.8798963133640553, + "grad_norm": 0.6519468426704407, + "learning_rate": 1.7627563798664121e-06, + "loss": 0.0939, + "step": 6110 + }, + { + "epoch": 0.8800403225806451, + "grad_norm": 0.7984176278114319, + "learning_rate": 1.7585869190387683e-06, + "loss": 0.0983, + "step": 6111 + }, + { + "epoch": 0.880184331797235, + "grad_norm": 0.9399605393409729, + "learning_rate": 1.7544222153172862e-06, + "loss": 0.1075, + "step": 6112 + }, + { + "epoch": 0.8803283410138248, + "grad_norm": 4.129300117492676, + "learning_rate": 1.7502622695544036e-06, + "loss": 2.0284, + "step": 6113 + }, + { + "epoch": 0.8804723502304147, + "grad_norm": 0.8231692314147949, + "learning_rate": 1.7461070826015918e-06, + "loss": 0.1513, + "step": 6114 + }, + { + "epoch": 0.8806163594470046, + "grad_norm": 8.89306640625, + "learning_rate": 1.7419566553093402e-06, + "loss": 2.6538, + "step": 6115 + }, + { + "epoch": 0.8807603686635944, + "grad_norm": 0.8428215384483337, + "learning_rate": 1.737810988527172e-06, + "loss": 0.0984, + "step": 6116 + }, + { + "epoch": 0.8809043778801844, + "grad_norm": 3.9746909141540527, + "learning_rate": 1.7336700831036307e-06, + "loss": 0.8851, + "step": 6117 + }, + { + "epoch": 0.8810483870967742, + "grad_norm": 1.487005591392517, + "learning_rate": 1.7295339398862797e-06, + "loss": 0.1214, + "step": 6118 + }, + { + "epoch": 0.8811923963133641, + "grad_norm": 0.8743619918823242, + "learning_rate": 1.7254025597217228e-06, + "loss": 0.1012, + "step": 6119 + }, + { + "epoch": 0.881336405529954, + "grad_norm": 3.3104171752929688, + "learning_rate": 1.7212759434555803e-06, + "loss": 0.5208, + "step": 6120 + }, + { + "epoch": 0.8814804147465438, + "grad_norm": 0.7032500505447388, + "learning_rate": 1.7171540919324936e-06, + "loss": 0.0745, + "step": 6121 + }, + { + "epoch": 0.8816244239631337, + "grad_norm": 3.2777678966522217, + "learning_rate": 1.7130370059961347e-06, + "loss": 1.501, + "step": 6122 + }, + { + "epoch": 0.8817684331797235, + "grad_norm": 6.320735454559326, + "learning_rate": 1.7089246864891967e-06, + "loss": 1.5166, + "step": 6123 + }, + { + "epoch": 0.8819124423963134, + "grad_norm": 0.804033100605011, + "learning_rate": 1.7048171342534004e-06, + "loss": 0.0885, + "step": 6124 + }, + { + "epoch": 0.8820564516129032, + "grad_norm": 1.1183923482894897, + "learning_rate": 1.7007143501294898e-06, + "loss": 0.0934, + "step": 6125 + }, + { + "epoch": 0.8822004608294931, + "grad_norm": 1.350594401359558, + "learning_rate": 1.6966163349572295e-06, + "loss": 0.0916, + "step": 6126 + }, + { + "epoch": 0.882344470046083, + "grad_norm": 3.7183244228363037, + "learning_rate": 1.6925230895754125e-06, + "loss": 1.1052, + "step": 6127 + }, + { + "epoch": 0.8824884792626728, + "grad_norm": 0.9842641949653625, + "learning_rate": 1.6884346148218545e-06, + "loss": 0.1388, + "step": 6128 + }, + { + "epoch": 0.8826324884792627, + "grad_norm": 0.7876224517822266, + "learning_rate": 1.6843509115333917e-06, + "loss": 0.0954, + "step": 6129 + }, + { + "epoch": 0.8827764976958525, + "grad_norm": 1.013680338859558, + "learning_rate": 1.680271980545886e-06, + "loss": 0.0844, + "step": 6130 + }, + { + "epoch": 0.8829205069124424, + "grad_norm": 4.7914533615112305, + "learning_rate": 1.6761978226942255e-06, + "loss": 0.8431, + "step": 6131 + }, + { + "epoch": 0.8830645161290323, + "grad_norm": 1.3973395824432373, + "learning_rate": 1.6721284388123148e-06, + "loss": 0.1252, + "step": 6132 + }, + { + "epoch": 0.8832085253456221, + "grad_norm": 0.9760321378707886, + "learning_rate": 1.6680638297330854e-06, + "loss": 0.0827, + "step": 6133 + }, + { + "epoch": 0.883352534562212, + "grad_norm": 0.5483518242835999, + "learning_rate": 1.6640039962884935e-06, + "loss": 0.0538, + "step": 6134 + }, + { + "epoch": 0.8834965437788018, + "grad_norm": 0.30208924412727356, + "learning_rate": 1.6599489393095109e-06, + "loss": 0.048, + "step": 6135 + }, + { + "epoch": 0.8836405529953917, + "grad_norm": 2.804206132888794, + "learning_rate": 1.6558986596261429e-06, + "loss": 1.8883, + "step": 6136 + }, + { + "epoch": 0.8837845622119815, + "grad_norm": 0.8207404613494873, + "learning_rate": 1.6518531580674013e-06, + "loss": 0.0987, + "step": 6137 + }, + { + "epoch": 0.8839285714285714, + "grad_norm": 1.2812961339950562, + "learning_rate": 1.647812435461335e-06, + "loss": 0.0862, + "step": 6138 + }, + { + "epoch": 0.8840725806451613, + "grad_norm": 0.8106695413589478, + "learning_rate": 1.6437764926350074e-06, + "loss": 0.0896, + "step": 6139 + }, + { + "epoch": 0.8842165898617511, + "grad_norm": 0.6480234861373901, + "learning_rate": 1.6397453304145022e-06, + "loss": 0.0725, + "step": 6140 + }, + { + "epoch": 0.884360599078341, + "grad_norm": 0.8648846745491028, + "learning_rate": 1.6357189496249287e-06, + "loss": 0.0734, + "step": 6141 + }, + { + "epoch": 0.8845046082949308, + "grad_norm": 0.7604034543037415, + "learning_rate": 1.6316973510904165e-06, + "loss": 0.0844, + "step": 6142 + }, + { + "epoch": 0.8846486175115207, + "grad_norm": 0.812138557434082, + "learning_rate": 1.6276805356341157e-06, + "loss": 0.0883, + "step": 6143 + }, + { + "epoch": 0.8847926267281107, + "grad_norm": 0.8356630206108093, + "learning_rate": 1.6236685040781935e-06, + "loss": 0.0764, + "step": 6144 + }, + { + "epoch": 0.8849366359447005, + "grad_norm": 1.0130105018615723, + "learning_rate": 1.6196612572438429e-06, + "loss": 0.121, + "step": 6145 + }, + { + "epoch": 0.8850806451612904, + "grad_norm": 0.6587336659431458, + "learning_rate": 1.6156587959512832e-06, + "loss": 0.0939, + "step": 6146 + }, + { + "epoch": 0.8852246543778802, + "grad_norm": 0.596451997756958, + "learning_rate": 1.6116611210197419e-06, + "loss": 0.0716, + "step": 6147 + }, + { + "epoch": 0.8853686635944701, + "grad_norm": 0.5204094648361206, + "learning_rate": 1.607668233267473e-06, + "loss": 0.0494, + "step": 6148 + }, + { + "epoch": 0.8855126728110599, + "grad_norm": 1.0460714101791382, + "learning_rate": 1.6036801335117507e-06, + "loss": 0.105, + "step": 6149 + }, + { + "epoch": 0.8856566820276498, + "grad_norm": 0.914931058883667, + "learning_rate": 1.5996968225688663e-06, + "loss": 0.1062, + "step": 6150 + }, + { + "epoch": 0.8858006912442397, + "grad_norm": 5.922601699829102, + "learning_rate": 1.5957183012541372e-06, + "loss": 1.78, + "step": 6151 + }, + { + "epoch": 0.8859447004608295, + "grad_norm": 0.29838085174560547, + "learning_rate": 1.5917445703818923e-06, + "loss": 0.0471, + "step": 6152 + }, + { + "epoch": 0.8860887096774194, + "grad_norm": 0.8995446562767029, + "learning_rate": 1.587775630765484e-06, + "loss": 0.0768, + "step": 6153 + }, + { + "epoch": 0.8862327188940092, + "grad_norm": 0.9987413883209229, + "learning_rate": 1.5838114832172873e-06, + "loss": 0.0924, + "step": 6154 + }, + { + "epoch": 0.8863767281105991, + "grad_norm": 0.5493403077125549, + "learning_rate": 1.5798521285486922e-06, + "loss": 0.0831, + "step": 6155 + }, + { + "epoch": 0.886520737327189, + "grad_norm": 2.9912424087524414, + "learning_rate": 1.5758975675701059e-06, + "loss": 0.5729, + "step": 6156 + }, + { + "epoch": 0.8866647465437788, + "grad_norm": 0.9380465149879456, + "learning_rate": 1.5719478010909589e-06, + "loss": 0.1546, + "step": 6157 + }, + { + "epoch": 0.8868087557603687, + "grad_norm": 0.7085320353507996, + "learning_rate": 1.5680028299197014e-06, + "loss": 0.1019, + "step": 6158 + }, + { + "epoch": 0.8869527649769585, + "grad_norm": 0.9574716687202454, + "learning_rate": 1.5640626548637932e-06, + "loss": 0.106, + "step": 6159 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 0.6224703788757324, + "learning_rate": 1.5601272767297226e-06, + "loss": 0.0646, + "step": 6160 + }, + { + "epoch": 0.8872407834101382, + "grad_norm": 1.5933703184127808, + "learning_rate": 1.5561966963229924e-06, + "loss": 0.1301, + "step": 6161 + }, + { + "epoch": 0.8873847926267281, + "grad_norm": 4.734592437744141, + "learning_rate": 1.5522709144481174e-06, + "loss": 0.8244, + "step": 6162 + }, + { + "epoch": 0.887528801843318, + "grad_norm": 0.8965024352073669, + "learning_rate": 1.5483499319086436e-06, + "loss": 0.1024, + "step": 6163 + }, + { + "epoch": 0.8876728110599078, + "grad_norm": 1.1468241214752197, + "learning_rate": 1.5444337495071209e-06, + "loss": 0.1097, + "step": 6164 + }, + { + "epoch": 0.8878168202764977, + "grad_norm": 0.7317185401916504, + "learning_rate": 1.5405223680451248e-06, + "loss": 0.0851, + "step": 6165 + }, + { + "epoch": 0.8879608294930875, + "grad_norm": 1.5322787761688232, + "learning_rate": 1.536615788323245e-06, + "loss": 0.1439, + "step": 6166 + }, + { + "epoch": 0.8881048387096774, + "grad_norm": 0.9246971011161804, + "learning_rate": 1.5327140111410927e-06, + "loss": 0.1487, + "step": 6167 + }, + { + "epoch": 0.8882488479262672, + "grad_norm": 3.9984188079833984, + "learning_rate": 1.5288170372972865e-06, + "loss": 2.1834, + "step": 6168 + }, + { + "epoch": 0.8883928571428571, + "grad_norm": 0.6195973753929138, + "learning_rate": 1.5249248675894724e-06, + "loss": 0.0598, + "step": 6169 + }, + { + "epoch": 0.888536866359447, + "grad_norm": 0.651159405708313, + "learning_rate": 1.5210375028143097e-06, + "loss": 0.059, + "step": 6170 + }, + { + "epoch": 0.8886808755760369, + "grad_norm": 0.9245191812515259, + "learning_rate": 1.5171549437674682e-06, + "loss": 0.1018, + "step": 6171 + }, + { + "epoch": 0.8888248847926268, + "grad_norm": 0.9022085070610046, + "learning_rate": 1.5132771912436394e-06, + "loss": 0.1109, + "step": 6172 + }, + { + "epoch": 0.8889688940092166, + "grad_norm": 4.453837871551514, + "learning_rate": 1.5094042460365387e-06, + "loss": 1.9006, + "step": 6173 + }, + { + "epoch": 0.8891129032258065, + "grad_norm": 5.638201713562012, + "learning_rate": 1.505536108938882e-06, + "loss": 1.6965, + "step": 6174 + }, + { + "epoch": 0.8892569124423964, + "grad_norm": 0.8157364726066589, + "learning_rate": 1.5016727807424107e-06, + "loss": 0.0926, + "step": 6175 + }, + { + "epoch": 0.8894009216589862, + "grad_norm": 0.8632445335388184, + "learning_rate": 1.4978142622378815e-06, + "loss": 0.103, + "step": 6176 + }, + { + "epoch": 0.8895449308755761, + "grad_norm": 7.983931064605713, + "learning_rate": 1.4939605542150598e-06, + "loss": 1.6278, + "step": 6177 + }, + { + "epoch": 0.8896889400921659, + "grad_norm": 0.8035715818405151, + "learning_rate": 1.4901116574627366e-06, + "loss": 0.1039, + "step": 6178 + }, + { + "epoch": 0.8898329493087558, + "grad_norm": 0.7240176796913147, + "learning_rate": 1.4862675727687124e-06, + "loss": 0.0856, + "step": 6179 + }, + { + "epoch": 0.8899769585253456, + "grad_norm": 0.38020825386047363, + "learning_rate": 1.4824283009197998e-06, + "loss": 0.0437, + "step": 6180 + }, + { + "epoch": 0.8901209677419355, + "grad_norm": 0.8915135264396667, + "learning_rate": 1.4785938427018337e-06, + "loss": 0.1024, + "step": 6181 + }, + { + "epoch": 0.8902649769585254, + "grad_norm": 5.854917049407959, + "learning_rate": 1.4747641988996585e-06, + "loss": 1.3997, + "step": 6182 + }, + { + "epoch": 0.8904089861751152, + "grad_norm": 0.37661975622177124, + "learning_rate": 1.4709393702971335e-06, + "loss": 0.0463, + "step": 6183 + }, + { + "epoch": 0.8905529953917051, + "grad_norm": 0.5167691707611084, + "learning_rate": 1.4671193576771325e-06, + "loss": 0.0468, + "step": 6184 + }, + { + "epoch": 0.8906970046082949, + "grad_norm": 0.9706976413726807, + "learning_rate": 1.4633041618215493e-06, + "loss": 0.0896, + "step": 6185 + }, + { + "epoch": 0.8908410138248848, + "grad_norm": 0.4834651052951813, + "learning_rate": 1.4594937835112815e-06, + "loss": 0.0638, + "step": 6186 + }, + { + "epoch": 0.8909850230414746, + "grad_norm": 2.491966962814331, + "learning_rate": 1.4556882235262498e-06, + "loss": 0.1574, + "step": 6187 + }, + { + "epoch": 0.8911290322580645, + "grad_norm": 0.7694783210754395, + "learning_rate": 1.4518874826453838e-06, + "loss": 0.0908, + "step": 6188 + }, + { + "epoch": 0.8912730414746544, + "grad_norm": 0.814163327217102, + "learning_rate": 1.4480915616466279e-06, + "loss": 0.0934, + "step": 6189 + }, + { + "epoch": 0.8914170506912442, + "grad_norm": 0.9293427467346191, + "learning_rate": 1.444300461306941e-06, + "loss": 0.0901, + "step": 6190 + }, + { + "epoch": 0.8915610599078341, + "grad_norm": 2.410121202468872, + "learning_rate": 1.4405141824022917e-06, + "loss": 0.146, + "step": 6191 + }, + { + "epoch": 0.8917050691244239, + "grad_norm": 0.7762468457221985, + "learning_rate": 1.4367327257076678e-06, + "loss": 0.0778, + "step": 6192 + }, + { + "epoch": 0.8918490783410138, + "grad_norm": 3.470663547515869, + "learning_rate": 1.4329560919970647e-06, + "loss": 2.0259, + "step": 6193 + }, + { + "epoch": 0.8919930875576036, + "grad_norm": 7.858127117156982, + "learning_rate": 1.4291842820434915e-06, + "loss": 1.3233, + "step": 6194 + }, + { + "epoch": 0.8921370967741935, + "grad_norm": 0.8199609518051147, + "learning_rate": 1.425417296618972e-06, + "loss": 0.1096, + "step": 6195 + }, + { + "epoch": 0.8922811059907834, + "grad_norm": 0.8653063178062439, + "learning_rate": 1.4216551364945402e-06, + "loss": 0.0968, + "step": 6196 + }, + { + "epoch": 0.8924251152073732, + "grad_norm": 4.730999946594238, + "learning_rate": 1.4178978024402433e-06, + "loss": 1.3306, + "step": 6197 + }, + { + "epoch": 0.8925691244239631, + "grad_norm": 1.0790722370147705, + "learning_rate": 1.414145295225147e-06, + "loss": 0.1194, + "step": 6198 + }, + { + "epoch": 0.892713133640553, + "grad_norm": 1.0988192558288574, + "learning_rate": 1.4103976156173176e-06, + "loss": 0.0963, + "step": 6199 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.7790629863739014, + "learning_rate": 1.4066547643838413e-06, + "loss": 0.1278, + "step": 6200 + }, + { + "epoch": 0.8930011520737328, + "grad_norm": 0.8788543939590454, + "learning_rate": 1.4029167422908107e-06, + "loss": 0.094, + "step": 6201 + }, + { + "epoch": 0.8931451612903226, + "grad_norm": 0.4396914839744568, + "learning_rate": 1.3991835501033362e-06, + "loss": 0.0818, + "step": 6202 + }, + { + "epoch": 0.8932891705069125, + "grad_norm": 0.8240845799446106, + "learning_rate": 1.3954551885855343e-06, + "loss": 0.0974, + "step": 6203 + }, + { + "epoch": 0.8934331797235023, + "grad_norm": 1.0625944137573242, + "learning_rate": 1.3917316585005363e-06, + "loss": 0.1438, + "step": 6204 + }, + { + "epoch": 0.8935771889400922, + "grad_norm": 0.8223533034324646, + "learning_rate": 1.3880129606104796e-06, + "loss": 0.0716, + "step": 6205 + }, + { + "epoch": 0.893721198156682, + "grad_norm": 3.709771156311035, + "learning_rate": 1.3842990956765195e-06, + "loss": 0.6818, + "step": 6206 + }, + { + "epoch": 0.8938652073732719, + "grad_norm": 0.7553238868713379, + "learning_rate": 1.3805900644588171e-06, + "loss": 0.071, + "step": 6207 + }, + { + "epoch": 0.8940092165898618, + "grad_norm": 1.018292784690857, + "learning_rate": 1.376885867716543e-06, + "loss": 0.0933, + "step": 6208 + }, + { + "epoch": 0.8941532258064516, + "grad_norm": 0.6677776575088501, + "learning_rate": 1.3731865062078852e-06, + "loss": 0.0685, + "step": 6209 + }, + { + "epoch": 0.8942972350230415, + "grad_norm": 0.8252946138381958, + "learning_rate": 1.3694919806900353e-06, + "loss": 0.0849, + "step": 6210 + }, + { + "epoch": 0.8944412442396313, + "grad_norm": 8.19139289855957, + "learning_rate": 1.3658022919191964e-06, + "loss": 1.747, + "step": 6211 + }, + { + "epoch": 0.8945852534562212, + "grad_norm": 0.5520724058151245, + "learning_rate": 1.3621174406505844e-06, + "loss": 0.0483, + "step": 6212 + }, + { + "epoch": 0.894729262672811, + "grad_norm": 0.7285502552986145, + "learning_rate": 1.3584374276384205e-06, + "loss": 0.0907, + "step": 6213 + }, + { + "epoch": 0.8948732718894009, + "grad_norm": 0.44423186779022217, + "learning_rate": 1.354762253635941e-06, + "loss": 0.0449, + "step": 6214 + }, + { + "epoch": 0.8950172811059908, + "grad_norm": 0.8905736804008484, + "learning_rate": 1.3510919193953891e-06, + "loss": 0.1016, + "step": 6215 + }, + { + "epoch": 0.8951612903225806, + "grad_norm": 4.5847625732421875, + "learning_rate": 1.3474264256680109e-06, + "loss": 1.3038, + "step": 6216 + }, + { + "epoch": 0.8953052995391705, + "grad_norm": 5.152536869049072, + "learning_rate": 1.3437657732040782e-06, + "loss": 1.0659, + "step": 6217 + }, + { + "epoch": 0.8954493087557603, + "grad_norm": 0.6653900146484375, + "learning_rate": 1.3401099627528586e-06, + "loss": 0.0736, + "step": 6218 + }, + { + "epoch": 0.8955933179723502, + "grad_norm": 0.8098300695419312, + "learning_rate": 1.3364589950626282e-06, + "loss": 0.0706, + "step": 6219 + }, + { + "epoch": 0.8957373271889401, + "grad_norm": 1.0141727924346924, + "learning_rate": 1.3328128708806786e-06, + "loss": 0.0983, + "step": 6220 + }, + { + "epoch": 0.8958813364055299, + "grad_norm": 0.3425619602203369, + "learning_rate": 1.3291715909533042e-06, + "loss": 0.0478, + "step": 6221 + }, + { + "epoch": 0.8960253456221198, + "grad_norm": 0.8159298896789551, + "learning_rate": 1.3255351560258145e-06, + "loss": 0.0837, + "step": 6222 + }, + { + "epoch": 0.8961693548387096, + "grad_norm": 1.1305605173110962, + "learning_rate": 1.3219035668425195e-06, + "loss": 0.0745, + "step": 6223 + }, + { + "epoch": 0.8963133640552995, + "grad_norm": 0.6341632008552551, + "learning_rate": 1.318276824146747e-06, + "loss": 0.0791, + "step": 6224 + }, + { + "epoch": 0.8964573732718893, + "grad_norm": 4.296459197998047, + "learning_rate": 1.3146549286808195e-06, + "loss": 1.2194, + "step": 6225 + }, + { + "epoch": 0.8966013824884793, + "grad_norm": 0.8738852739334106, + "learning_rate": 1.311037881186078e-06, + "loss": 0.0944, + "step": 6226 + }, + { + "epoch": 0.8967453917050692, + "grad_norm": 0.8476899862289429, + "learning_rate": 1.3074256824028713e-06, + "loss": 0.1029, + "step": 6227 + }, + { + "epoch": 0.896889400921659, + "grad_norm": 0.994813859462738, + "learning_rate": 1.30381833307055e-06, + "loss": 0.0896, + "step": 6228 + }, + { + "epoch": 0.8970334101382489, + "grad_norm": 0.8724350929260254, + "learning_rate": 1.3002158339274733e-06, + "loss": 0.094, + "step": 6229 + }, + { + "epoch": 0.8971774193548387, + "grad_norm": 0.26734721660614014, + "learning_rate": 1.2966181857110098e-06, + "loss": 0.0461, + "step": 6230 + }, + { + "epoch": 0.8973214285714286, + "grad_norm": 1.0043789148330688, + "learning_rate": 1.2930253891575372e-06, + "loss": 0.107, + "step": 6231 + }, + { + "epoch": 0.8974654377880185, + "grad_norm": 0.5999282002449036, + "learning_rate": 1.2894374450024338e-06, + "loss": 0.0571, + "step": 6232 + }, + { + "epoch": 0.8976094470046083, + "grad_norm": 0.6172305941581726, + "learning_rate": 1.28585435398009e-06, + "loss": 0.0704, + "step": 6233 + }, + { + "epoch": 0.8977534562211982, + "grad_norm": 1.231062889099121, + "learning_rate": 1.2822761168239023e-06, + "loss": 0.113, + "step": 6234 + }, + { + "epoch": 0.897897465437788, + "grad_norm": 6.129392147064209, + "learning_rate": 1.2787027342662655e-06, + "loss": 1.9393, + "step": 6235 + }, + { + "epoch": 0.8980414746543779, + "grad_norm": 0.8083063364028931, + "learning_rate": 1.2751342070385974e-06, + "loss": 0.1057, + "step": 6236 + }, + { + "epoch": 0.8981854838709677, + "grad_norm": 0.9148820042610168, + "learning_rate": 1.271570535871311e-06, + "loss": 0.0969, + "step": 6237 + }, + { + "epoch": 0.8983294930875576, + "grad_norm": 0.8862453699111938, + "learning_rate": 1.2680117214938226e-06, + "loss": 0.099, + "step": 6238 + }, + { + "epoch": 0.8984735023041475, + "grad_norm": 0.7427366375923157, + "learning_rate": 1.2644577646345607e-06, + "loss": 0.0802, + "step": 6239 + }, + { + "epoch": 0.8986175115207373, + "grad_norm": 0.8294971585273743, + "learning_rate": 1.2609086660209575e-06, + "loss": 0.1115, + "step": 6240 + }, + { + "epoch": 0.8987615207373272, + "grad_norm": 0.8213338851928711, + "learning_rate": 1.2573644263794483e-06, + "loss": 0.0974, + "step": 6241 + }, + { + "epoch": 0.898905529953917, + "grad_norm": 0.834966778755188, + "learning_rate": 1.2538250464354778e-06, + "loss": 0.1154, + "step": 6242 + }, + { + "epoch": 0.8990495391705069, + "grad_norm": 0.5415728688240051, + "learning_rate": 1.2502905269134974e-06, + "loss": 0.0639, + "step": 6243 + }, + { + "epoch": 0.8991935483870968, + "grad_norm": 0.6259236335754395, + "learning_rate": 1.2467608685369558e-06, + "loss": 0.0571, + "step": 6244 + }, + { + "epoch": 0.8993375576036866, + "grad_norm": 0.6352747082710266, + "learning_rate": 1.243236072028317e-06, + "loss": 0.0492, + "step": 6245 + }, + { + "epoch": 0.8994815668202765, + "grad_norm": 0.9023754000663757, + "learning_rate": 1.2397161381090399e-06, + "loss": 0.1062, + "step": 6246 + }, + { + "epoch": 0.8996255760368663, + "grad_norm": 8.935270309448242, + "learning_rate": 1.2362010674995928e-06, + "loss": 1.9799, + "step": 6247 + }, + { + "epoch": 0.8997695852534562, + "grad_norm": 0.7627495527267456, + "learning_rate": 1.2326908609194525e-06, + "loss": 0.0734, + "step": 6248 + }, + { + "epoch": 0.899913594470046, + "grad_norm": 0.8289364576339722, + "learning_rate": 1.229185519087092e-06, + "loss": 0.1019, + "step": 6249 + }, + { + "epoch": 0.9000576036866359, + "grad_norm": 0.8416813015937805, + "learning_rate": 1.2256850427199957e-06, + "loss": 0.0922, + "step": 6250 + }, + { + "epoch": 0.9002016129032258, + "grad_norm": 0.264920175075531, + "learning_rate": 1.2221894325346456e-06, + "loss": 0.0457, + "step": 6251 + }, + { + "epoch": 0.9003456221198156, + "grad_norm": 0.8423996567726135, + "learning_rate": 1.2186986892465362e-06, + "loss": 0.089, + "step": 6252 + }, + { + "epoch": 0.9004896313364056, + "grad_norm": 0.8737819790840149, + "learning_rate": 1.2152128135701546e-06, + "loss": 0.0884, + "step": 6253 + }, + { + "epoch": 0.9006336405529954, + "grad_norm": 3.618335008621216, + "learning_rate": 1.211731806219002e-06, + "loss": 1.4558, + "step": 6254 + }, + { + "epoch": 0.9007776497695853, + "grad_norm": 0.3685612380504608, + "learning_rate": 1.2082556679055807e-06, + "loss": 0.042, + "step": 6255 + }, + { + "epoch": 0.9009216589861752, + "grad_norm": 0.7758889198303223, + "learning_rate": 1.2047843993413938e-06, + "loss": 0.0811, + "step": 6256 + }, + { + "epoch": 0.901065668202765, + "grad_norm": 0.5414485335350037, + "learning_rate": 1.2013180012369452e-06, + "loss": 0.0772, + "step": 6257 + }, + { + "epoch": 0.9012096774193549, + "grad_norm": 3.632169723510742, + "learning_rate": 1.197856474301748e-06, + "loss": 1.7584, + "step": 6258 + }, + { + "epoch": 0.9013536866359447, + "grad_norm": 0.967963457107544, + "learning_rate": 1.1943998192443157e-06, + "loss": 0.099, + "step": 6259 + }, + { + "epoch": 0.9014976958525346, + "grad_norm": 1.0966789722442627, + "learning_rate": 1.190948036772166e-06, + "loss": 0.114, + "step": 6260 + }, + { + "epoch": 0.9016417050691244, + "grad_norm": 0.5547144412994385, + "learning_rate": 1.1875011275918114e-06, + "loss": 0.061, + "step": 6261 + }, + { + "epoch": 0.9017857142857143, + "grad_norm": 2.3924248218536377, + "learning_rate": 1.184059092408779e-06, + "loss": 0.4411, + "step": 6262 + }, + { + "epoch": 0.9019297235023042, + "grad_norm": 0.5357797145843506, + "learning_rate": 1.180621931927592e-06, + "loss": 0.0621, + "step": 6263 + }, + { + "epoch": 0.902073732718894, + "grad_norm": 1.8539278507232666, + "learning_rate": 1.1771896468517758e-06, + "loss": 0.1619, + "step": 6264 + }, + { + "epoch": 0.9022177419354839, + "grad_norm": 0.676307737827301, + "learning_rate": 1.1737622378838548e-06, + "loss": 0.0656, + "step": 6265 + }, + { + "epoch": 0.9023617511520737, + "grad_norm": 3.613600492477417, + "learning_rate": 1.1703397057253651e-06, + "loss": 0.6393, + "step": 6266 + }, + { + "epoch": 0.9025057603686636, + "grad_norm": 3.1352665424346924, + "learning_rate": 1.1669220510768325e-06, + "loss": 1.2073, + "step": 6267 + }, + { + "epoch": 0.9026497695852534, + "grad_norm": 0.880558431148529, + "learning_rate": 1.1635092746377946e-06, + "loss": 0.0937, + "step": 6268 + }, + { + "epoch": 0.9027937788018433, + "grad_norm": 3.592954635620117, + "learning_rate": 1.160101377106787e-06, + "loss": 1.6114, + "step": 6269 + }, + { + "epoch": 0.9029377880184332, + "grad_norm": 2.0002315044403076, + "learning_rate": 1.1566983591813408e-06, + "loss": 0.0867, + "step": 6270 + }, + { + "epoch": 0.903081797235023, + "grad_norm": 3.2180542945861816, + "learning_rate": 1.1533002215580013e-06, + "loss": 1.6641, + "step": 6271 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.6562220454216003, + "learning_rate": 1.1499069649322985e-06, + "loss": 0.0718, + "step": 6272 + }, + { + "epoch": 0.9033698156682027, + "grad_norm": 0.6338951587677002, + "learning_rate": 1.1465185899987797e-06, + "loss": 0.1149, + "step": 6273 + }, + { + "epoch": 0.9035138248847926, + "grad_norm": 0.9003986120223999, + "learning_rate": 1.1431350974509814e-06, + "loss": 0.087, + "step": 6274 + }, + { + "epoch": 0.9036578341013825, + "grad_norm": 0.8012667894363403, + "learning_rate": 1.1397564879814443e-06, + "loss": 0.0837, + "step": 6275 + }, + { + "epoch": 0.9038018433179723, + "grad_norm": 0.7677321434020996, + "learning_rate": 1.1363827622817098e-06, + "loss": 0.0785, + "step": 6276 + }, + { + "epoch": 0.9039458525345622, + "grad_norm": 0.4615861773490906, + "learning_rate": 1.1330139210423224e-06, + "loss": 0.0652, + "step": 6277 + }, + { + "epoch": 0.904089861751152, + "grad_norm": 0.47265729308128357, + "learning_rate": 1.1296499649528224e-06, + "loss": 0.0513, + "step": 6278 + }, + { + "epoch": 0.9042338709677419, + "grad_norm": 0.5076233744621277, + "learning_rate": 1.1262908947017536e-06, + "loss": 0.0593, + "step": 6279 + }, + { + "epoch": 0.9043778801843319, + "grad_norm": 0.7414261102676392, + "learning_rate": 1.1229367109766576e-06, + "loss": 0.0873, + "step": 6280 + }, + { + "epoch": 0.9045218894009217, + "grad_norm": 0.6389042139053345, + "learning_rate": 1.1195874144640738e-06, + "loss": 0.0904, + "step": 6281 + }, + { + "epoch": 0.9046658986175116, + "grad_norm": 0.9176819920539856, + "learning_rate": 1.116243005849546e-06, + "loss": 0.0893, + "step": 6282 + }, + { + "epoch": 0.9048099078341014, + "grad_norm": 0.4441218674182892, + "learning_rate": 1.112903485817618e-06, + "loss": 0.0543, + "step": 6283 + }, + { + "epoch": 0.9049539170506913, + "grad_norm": 0.809941828250885, + "learning_rate": 1.109568855051829e-06, + "loss": 0.0819, + "step": 6284 + }, + { + "epoch": 0.9050979262672811, + "grad_norm": 4.157713890075684, + "learning_rate": 1.1062391142347195e-06, + "loss": 1.0166, + "step": 6285 + }, + { + "epoch": 0.905241935483871, + "grad_norm": 0.8317684531211853, + "learning_rate": 1.1029142640478247e-06, + "loss": 0.0884, + "step": 6286 + }, + { + "epoch": 0.9053859447004609, + "grad_norm": 4.514431953430176, + "learning_rate": 1.0995943051716862e-06, + "loss": 2.3954, + "step": 6287 + }, + { + "epoch": 0.9055299539170507, + "grad_norm": 0.9427332282066345, + "learning_rate": 1.0962792382858383e-06, + "loss": 0.0612, + "step": 6288 + }, + { + "epoch": 0.9056739631336406, + "grad_norm": 0.9077480435371399, + "learning_rate": 1.0929690640688218e-06, + "loss": 0.0666, + "step": 6289 + }, + { + "epoch": 0.9058179723502304, + "grad_norm": 0.8841196298599243, + "learning_rate": 1.0896637831981637e-06, + "loss": 0.1052, + "step": 6290 + }, + { + "epoch": 0.9059619815668203, + "grad_norm": 1.143865942955017, + "learning_rate": 1.086363396350401e-06, + "loss": 0.1041, + "step": 6291 + }, + { + "epoch": 0.9061059907834101, + "grad_norm": 1.0774363279342651, + "learning_rate": 1.0830679042010628e-06, + "loss": 0.0989, + "step": 6292 + }, + { + "epoch": 0.90625, + "grad_norm": 1.3654228448867798, + "learning_rate": 1.0797773074246813e-06, + "loss": 0.0966, + "step": 6293 + }, + { + "epoch": 0.9063940092165899, + "grad_norm": 1.0248435735702515, + "learning_rate": 1.0764916066947794e-06, + "loss": 0.1104, + "step": 6294 + }, + { + "epoch": 0.9065380184331797, + "grad_norm": 1.0117584466934204, + "learning_rate": 1.0732108026838827e-06, + "loss": 0.1104, + "step": 6295 + }, + { + "epoch": 0.9066820276497696, + "grad_norm": 0.7172563076019287, + "learning_rate": 1.0699348960635153e-06, + "loss": 0.0827, + "step": 6296 + }, + { + "epoch": 0.9068260368663594, + "grad_norm": 0.6859202980995178, + "learning_rate": 1.0666638875041962e-06, + "loss": 0.0999, + "step": 6297 + }, + { + "epoch": 0.9069700460829493, + "grad_norm": 0.4581853151321411, + "learning_rate": 1.0633977776754429e-06, + "loss": 0.0519, + "step": 6298 + }, + { + "epoch": 0.9071140552995391, + "grad_norm": 0.3973190188407898, + "learning_rate": 1.0601365672457702e-06, + "loss": 0.0485, + "step": 6299 + }, + { + "epoch": 0.907258064516129, + "grad_norm": 0.8381580114364624, + "learning_rate": 1.056880256882692e-06, + "loss": 0.0632, + "step": 6300 + }, + { + "epoch": 0.9074020737327189, + "grad_norm": 0.448598712682724, + "learning_rate": 1.0536288472527162e-06, + "loss": 0.0518, + "step": 6301 + }, + { + "epoch": 0.9075460829493087, + "grad_norm": 0.8570807576179504, + "learning_rate": 1.0503823390213496e-06, + "loss": 0.0884, + "step": 6302 + }, + { + "epoch": 0.9076900921658986, + "grad_norm": 8.543415069580078, + "learning_rate": 1.0471407328530914e-06, + "loss": 2.558, + "step": 6303 + }, + { + "epoch": 0.9078341013824884, + "grad_norm": 0.847062349319458, + "learning_rate": 1.0439040294114467e-06, + "loss": 0.0971, + "step": 6304 + }, + { + "epoch": 0.9079781105990783, + "grad_norm": 0.9605914950370789, + "learning_rate": 1.0406722293589078e-06, + "loss": 0.0759, + "step": 6305 + }, + { + "epoch": 0.9081221198156681, + "grad_norm": 0.4336520731449127, + "learning_rate": 1.0374453333569679e-06, + "loss": 0.0436, + "step": 6306 + }, + { + "epoch": 0.9082661290322581, + "grad_norm": 1.457255244255066, + "learning_rate": 1.0342233420661124e-06, + "loss": 0.127, + "step": 6307 + }, + { + "epoch": 0.908410138248848, + "grad_norm": 0.519717276096344, + "learning_rate": 1.0310062561458305e-06, + "loss": 0.0617, + "step": 6308 + }, + { + "epoch": 0.9085541474654378, + "grad_norm": 0.906395435333252, + "learning_rate": 1.0277940762546012e-06, + "loss": 0.0958, + "step": 6309 + }, + { + "epoch": 0.9086981566820277, + "grad_norm": 1.01953125, + "learning_rate": 1.0245868030499012e-06, + "loss": 0.1106, + "step": 6310 + }, + { + "epoch": 0.9088421658986175, + "grad_norm": 0.6965455412864685, + "learning_rate": 1.0213844371882025e-06, + "loss": 0.0728, + "step": 6311 + }, + { + "epoch": 0.9089861751152074, + "grad_norm": 0.7147218585014343, + "learning_rate": 1.0181869793249753e-06, + "loss": 0.0816, + "step": 6312 + }, + { + "epoch": 0.9091301843317973, + "grad_norm": 0.474361777305603, + "learning_rate": 1.014994430114677e-06, + "loss": 0.0602, + "step": 6313 + }, + { + "epoch": 0.9092741935483871, + "grad_norm": 1.0692532062530518, + "learning_rate": 1.0118067902107702e-06, + "loss": 0.1027, + "step": 6314 + }, + { + "epoch": 0.909418202764977, + "grad_norm": 0.7904343605041504, + "learning_rate": 1.008624060265706e-06, + "loss": 0.0851, + "step": 6315 + }, + { + "epoch": 0.9095622119815668, + "grad_norm": 6.182342529296875, + "learning_rate": 1.0054462409309351e-06, + "loss": 0.7778, + "step": 6316 + }, + { + "epoch": 0.9097062211981567, + "grad_norm": 1.0851466655731201, + "learning_rate": 1.0022733328568983e-06, + "loss": 0.0778, + "step": 6317 + }, + { + "epoch": 0.9098502304147466, + "grad_norm": 0.9192031621932983, + "learning_rate": 9.991053366930375e-07, + "loss": 0.0763, + "step": 6318 + }, + { + "epoch": 0.9099942396313364, + "grad_norm": 0.8826895356178284, + "learning_rate": 9.95942253087781e-07, + "loss": 0.0799, + "step": 6319 + }, + { + "epoch": 0.9101382488479263, + "grad_norm": 0.713643491268158, + "learning_rate": 9.92784082688561e-07, + "loss": 0.0871, + "step": 6320 + }, + { + "epoch": 0.9102822580645161, + "grad_norm": 4.429367542266846, + "learning_rate": 9.896308261417936e-07, + "loss": 2.0145, + "step": 6321 + }, + { + "epoch": 0.910426267281106, + "grad_norm": 0.6601721048355103, + "learning_rate": 9.864824840928987e-07, + "loss": 0.0801, + "step": 6322 + }, + { + "epoch": 0.9105702764976958, + "grad_norm": 0.752079963684082, + "learning_rate": 9.833390571862861e-07, + "loss": 0.0675, + "step": 6323 + }, + { + "epoch": 0.9107142857142857, + "grad_norm": 0.978042721748352, + "learning_rate": 9.802005460653573e-07, + "loss": 0.097, + "step": 6324 + }, + { + "epoch": 0.9108582949308756, + "grad_norm": 0.9117527604103088, + "learning_rate": 9.770669513725128e-07, + "loss": 0.0954, + "step": 6325 + }, + { + "epoch": 0.9110023041474654, + "grad_norm": 0.47375184297561646, + "learning_rate": 9.739382737491421e-07, + "loss": 0.0687, + "step": 6326 + }, + { + "epoch": 0.9111463133640553, + "grad_norm": 0.8595511317253113, + "learning_rate": 9.7081451383563e-07, + "loss": 0.0777, + "step": 6327 + }, + { + "epoch": 0.9112903225806451, + "grad_norm": 0.44263768196105957, + "learning_rate": 9.676956722713542e-07, + "loss": 0.0544, + "step": 6328 + }, + { + "epoch": 0.911434331797235, + "grad_norm": 1.3252954483032227, + "learning_rate": 9.645817496946903e-07, + "loss": 0.1113, + "step": 6329 + }, + { + "epoch": 0.9115783410138248, + "grad_norm": 0.6103683710098267, + "learning_rate": 9.614727467429975e-07, + "loss": 0.0615, + "step": 6330 + }, + { + "epoch": 0.9117223502304147, + "grad_norm": 0.8672882318496704, + "learning_rate": 9.583686640526391e-07, + "loss": 0.1573, + "step": 6331 + }, + { + "epoch": 0.9118663594470046, + "grad_norm": 2.0263593196868896, + "learning_rate": 9.552695022589624e-07, + "loss": 0.1416, + "step": 6332 + }, + { + "epoch": 0.9120103686635944, + "grad_norm": 4.142277240753174, + "learning_rate": 9.521752619963131e-07, + "loss": 3.1591, + "step": 6333 + }, + { + "epoch": 0.9121543778801844, + "grad_norm": 0.7587898373603821, + "learning_rate": 9.49085943898026e-07, + "loss": 0.0814, + "step": 6334 + }, + { + "epoch": 0.9122983870967742, + "grad_norm": 1.3250900506973267, + "learning_rate": 9.460015485964285e-07, + "loss": 0.1095, + "step": 6335 + }, + { + "epoch": 0.9124423963133641, + "grad_norm": 0.5191977024078369, + "learning_rate": 9.429220767228464e-07, + "loss": 0.0714, + "step": 6336 + }, + { + "epoch": 0.912586405529954, + "grad_norm": 0.824079155921936, + "learning_rate": 9.398475289075892e-07, + "loss": 0.0924, + "step": 6337 + }, + { + "epoch": 0.9127304147465438, + "grad_norm": 5.326190948486328, + "learning_rate": 9.367779057799647e-07, + "loss": 1.3508, + "step": 6338 + }, + { + "epoch": 0.9128744239631337, + "grad_norm": 0.7214009165763855, + "learning_rate": 9.337132079682704e-07, + "loss": 0.1109, + "step": 6339 + }, + { + "epoch": 0.9130184331797235, + "grad_norm": 0.7310283184051514, + "learning_rate": 9.306534360997932e-07, + "loss": 0.072, + "step": 6340 + }, + { + "epoch": 0.9131624423963134, + "grad_norm": 0.6043365001678467, + "learning_rate": 9.275985908008155e-07, + "loss": 0.056, + "step": 6341 + }, + { + "epoch": 0.9133064516129032, + "grad_norm": 0.7465869784355164, + "learning_rate": 9.245486726966123e-07, + "loss": 0.083, + "step": 6342 + }, + { + "epoch": 0.9134504608294931, + "grad_norm": 0.6639095544815063, + "learning_rate": 9.215036824114454e-07, + "loss": 0.0931, + "step": 6343 + }, + { + "epoch": 0.913594470046083, + "grad_norm": 0.546794056892395, + "learning_rate": 9.184636205685687e-07, + "loss": 0.0633, + "step": 6344 + }, + { + "epoch": 0.9137384792626728, + "grad_norm": 1.1459436416625977, + "learning_rate": 9.154284877902347e-07, + "loss": 0.1288, + "step": 6345 + }, + { + "epoch": 0.9138824884792627, + "grad_norm": 1.0744820833206177, + "learning_rate": 9.12398284697677e-07, + "loss": 0.0899, + "step": 6346 + }, + { + "epoch": 0.9140264976958525, + "grad_norm": 0.5044614672660828, + "learning_rate": 9.093730119111243e-07, + "loss": 0.0646, + "step": 6347 + }, + { + "epoch": 0.9141705069124424, + "grad_norm": 0.7225751876831055, + "learning_rate": 9.063526700498009e-07, + "loss": 0.0734, + "step": 6348 + }, + { + "epoch": 0.9143145161290323, + "grad_norm": 0.6747027635574341, + "learning_rate": 9.033372597319123e-07, + "loss": 0.0636, + "step": 6349 + }, + { + "epoch": 0.9144585253456221, + "grad_norm": 1.3550835847854614, + "learning_rate": 9.003267815746619e-07, + "loss": 0.1202, + "step": 6350 + }, + { + "epoch": 0.914602534562212, + "grad_norm": 1.1763837337493896, + "learning_rate": 8.973212361942401e-07, + "loss": 0.1144, + "step": 6351 + }, + { + "epoch": 0.9147465437788018, + "grad_norm": 0.8940527439117432, + "learning_rate": 8.9432062420583e-07, + "loss": 3.986, + "step": 6352 + }, + { + "epoch": 0.9148905529953917, + "grad_norm": 0.48230209946632385, + "learning_rate": 8.913249462236068e-07, + "loss": 0.0514, + "step": 6353 + }, + { + "epoch": 0.9150345622119815, + "grad_norm": 1.0339343547821045, + "learning_rate": 8.883342028607273e-07, + "loss": 0.0875, + "step": 6354 + }, + { + "epoch": 0.9151785714285714, + "grad_norm": 0.8201497793197632, + "learning_rate": 8.853483947293462e-07, + "loss": 0.0891, + "step": 6355 + }, + { + "epoch": 0.9153225806451613, + "grad_norm": 1.4099839925765991, + "learning_rate": 8.823675224406053e-07, + "loss": 0.1204, + "step": 6356 + }, + { + "epoch": 0.9154665898617511, + "grad_norm": 0.7900995016098022, + "learning_rate": 8.793915866046359e-07, + "loss": 0.0905, + "step": 6357 + }, + { + "epoch": 0.915610599078341, + "grad_norm": 0.42147836089134216, + "learning_rate": 8.76420587830562e-07, + "loss": 0.0422, + "step": 6358 + }, + { + "epoch": 0.9157546082949308, + "grad_norm": 0.4431954324245453, + "learning_rate": 8.734545267264916e-07, + "loss": 0.0692, + "step": 6359 + }, + { + "epoch": 0.9158986175115207, + "grad_norm": 3.238255739212036, + "learning_rate": 8.704934038995277e-07, + "loss": 0.7241, + "step": 6360 + }, + { + "epoch": 0.9160426267281107, + "grad_norm": 0.61680006980896, + "learning_rate": 8.675372199557552e-07, + "loss": 0.0716, + "step": 6361 + }, + { + "epoch": 0.9161866359447005, + "grad_norm": 0.7604984045028687, + "learning_rate": 8.645859755002567e-07, + "loss": 0.0813, + "step": 6362 + }, + { + "epoch": 0.9163306451612904, + "grad_norm": 0.7572020888328552, + "learning_rate": 8.616396711370989e-07, + "loss": 0.0926, + "step": 6363 + }, + { + "epoch": 0.9164746543778802, + "grad_norm": 0.904494047164917, + "learning_rate": 8.586983074693383e-07, + "loss": 0.1047, + "step": 6364 + }, + { + "epoch": 0.9166186635944701, + "grad_norm": 2.4131057262420654, + "learning_rate": 8.557618850990184e-07, + "loss": 0.1818, + "step": 6365 + }, + { + "epoch": 0.9167626728110599, + "grad_norm": 0.8358302116394043, + "learning_rate": 8.528304046271751e-07, + "loss": 0.1048, + "step": 6366 + }, + { + "epoch": 0.9169066820276498, + "grad_norm": 0.7353546619415283, + "learning_rate": 8.499038666538311e-07, + "loss": 0.0857, + "step": 6367 + }, + { + "epoch": 0.9170506912442397, + "grad_norm": 0.4443334937095642, + "learning_rate": 8.469822717779935e-07, + "loss": 0.069, + "step": 6368 + }, + { + "epoch": 0.9171947004608295, + "grad_norm": 3.296668529510498, + "learning_rate": 8.440656205976643e-07, + "loss": 1.327, + "step": 6369 + }, + { + "epoch": 0.9173387096774194, + "grad_norm": 1.0047543048858643, + "learning_rate": 8.411539137098274e-07, + "loss": 0.0941, + "step": 6370 + }, + { + "epoch": 0.9174827188940092, + "grad_norm": 0.5476846098899841, + "learning_rate": 8.382471517104612e-07, + "loss": 0.069, + "step": 6371 + }, + { + "epoch": 0.9176267281105991, + "grad_norm": 0.9272359609603882, + "learning_rate": 8.353453351945262e-07, + "loss": 0.1016, + "step": 6372 + }, + { + "epoch": 0.917770737327189, + "grad_norm": 0.5083015561103821, + "learning_rate": 8.324484647559749e-07, + "loss": 0.0694, + "step": 6373 + }, + { + "epoch": 0.9179147465437788, + "grad_norm": 0.9394140839576721, + "learning_rate": 8.295565409877415e-07, + "loss": 0.0619, + "step": 6374 + }, + { + "epoch": 0.9180587557603687, + "grad_norm": 0.8487826585769653, + "learning_rate": 8.266695644817552e-07, + "loss": 0.0756, + "step": 6375 + }, + { + "epoch": 0.9182027649769585, + "grad_norm": 3.899155855178833, + "learning_rate": 8.237875358289294e-07, + "loss": 1.1744, + "step": 6376 + }, + { + "epoch": 0.9183467741935484, + "grad_norm": 0.5501101016998291, + "learning_rate": 8.209104556191616e-07, + "loss": 0.0778, + "step": 6377 + }, + { + "epoch": 0.9184907834101382, + "grad_norm": 0.4727204740047455, + "learning_rate": 8.18038324441342e-07, + "loss": 0.0683, + "step": 6378 + }, + { + "epoch": 0.9186347926267281, + "grad_norm": 0.8646941781044006, + "learning_rate": 8.151711428833419e-07, + "loss": 0.1559, + "step": 6379 + }, + { + "epoch": 0.918778801843318, + "grad_norm": 2.138073682785034, + "learning_rate": 8.123089115320254e-07, + "loss": 0.1233, + "step": 6380 + }, + { + "epoch": 0.9189228110599078, + "grad_norm": 1.0259273052215576, + "learning_rate": 8.094516309732375e-07, + "loss": 0.099, + "step": 6381 + }, + { + "epoch": 0.9190668202764977, + "grad_norm": 5.122811317443848, + "learning_rate": 8.065993017918188e-07, + "loss": 2.4587, + "step": 6382 + }, + { + "epoch": 0.9192108294930875, + "grad_norm": 1.0587214231491089, + "learning_rate": 8.037519245715829e-07, + "loss": 0.1386, + "step": 6383 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 3.91579008102417, + "learning_rate": 8.009094998953443e-07, + "loss": 2.0524, + "step": 6384 + }, + { + "epoch": 0.9194988479262672, + "grad_norm": 1.0773937702178955, + "learning_rate": 7.980720283448956e-07, + "loss": 0.1035, + "step": 6385 + }, + { + "epoch": 0.9196428571428571, + "grad_norm": 0.4764016568660736, + "learning_rate": 7.952395105010113e-07, + "loss": 0.0669, + "step": 6386 + }, + { + "epoch": 0.919786866359447, + "grad_norm": 0.5155469179153442, + "learning_rate": 7.924119469434665e-07, + "loss": 0.0608, + "step": 6387 + }, + { + "epoch": 0.9199308755760369, + "grad_norm": 1.4742196798324585, + "learning_rate": 7.895893382510067e-07, + "loss": 0.0713, + "step": 6388 + }, + { + "epoch": 0.9200748847926268, + "grad_norm": 1.2457066774368286, + "learning_rate": 7.867716850013696e-07, + "loss": 0.1252, + "step": 6389 + }, + { + "epoch": 0.9202188940092166, + "grad_norm": 0.9247126579284668, + "learning_rate": 7.839589877712856e-07, + "loss": 0.101, + "step": 6390 + }, + { + "epoch": 0.9203629032258065, + "grad_norm": 1.459454894065857, + "learning_rate": 7.811512471364607e-07, + "loss": 0.1313, + "step": 6391 + }, + { + "epoch": 0.9205069124423964, + "grad_norm": 0.5805144309997559, + "learning_rate": 7.783484636715882e-07, + "loss": 0.0569, + "step": 6392 + }, + { + "epoch": 0.9206509216589862, + "grad_norm": 1.4943288564682007, + "learning_rate": 7.755506379503508e-07, + "loss": 0.2175, + "step": 6393 + }, + { + "epoch": 0.9207949308755761, + "grad_norm": 1.0005519390106201, + "learning_rate": 7.727577705454125e-07, + "loss": 0.0869, + "step": 6394 + }, + { + "epoch": 0.9209389400921659, + "grad_norm": 0.9903758764266968, + "learning_rate": 7.699698620284219e-07, + "loss": 0.1308, + "step": 6395 + }, + { + "epoch": 0.9210829493087558, + "grad_norm": 0.742578387260437, + "learning_rate": 7.671869129700165e-07, + "loss": 0.0824, + "step": 6396 + }, + { + "epoch": 0.9212269585253456, + "grad_norm": 7.134407043457031, + "learning_rate": 7.644089239398189e-07, + "loss": 2.4128, + "step": 6397 + }, + { + "epoch": 0.9213709677419355, + "grad_norm": 0.5449683666229248, + "learning_rate": 7.616358955064323e-07, + "loss": 0.0467, + "step": 6398 + }, + { + "epoch": 0.9215149769585254, + "grad_norm": 5.095037937164307, + "learning_rate": 7.588678282374445e-07, + "loss": 1.4138, + "step": 6399 + }, + { + "epoch": 0.9216589861751152, + "grad_norm": 0.5620525479316711, + "learning_rate": 7.561047226994328e-07, + "loss": 0.0644, + "step": 6400 + }, + { + "epoch": 0.9218029953917051, + "grad_norm": 0.6467030048370361, + "learning_rate": 7.533465794579558e-07, + "loss": 0.0756, + "step": 6401 + }, + { + "epoch": 0.9219470046082949, + "grad_norm": 0.7942535877227783, + "learning_rate": 7.505933990775565e-07, + "loss": 4.0733, + "step": 6402 + }, + { + "epoch": 0.9220910138248848, + "grad_norm": 1.3365429639816284, + "learning_rate": 7.478451821217591e-07, + "loss": 0.126, + "step": 6403 + }, + { + "epoch": 0.9222350230414746, + "grad_norm": 0.7991818189620972, + "learning_rate": 7.451019291530803e-07, + "loss": 0.1124, + "step": 6404 + }, + { + "epoch": 0.9223790322580645, + "grad_norm": 0.8053882718086243, + "learning_rate": 7.423636407330098e-07, + "loss": 0.0786, + "step": 6405 + }, + { + "epoch": 0.9225230414746544, + "grad_norm": 1.6954259872436523, + "learning_rate": 7.396303174220326e-07, + "loss": 0.1317, + "step": 6406 + }, + { + "epoch": 0.9226670506912442, + "grad_norm": 4.879339694976807, + "learning_rate": 7.369019597796068e-07, + "loss": 1.3663, + "step": 6407 + }, + { + "epoch": 0.9228110599078341, + "grad_norm": 3.943162202835083, + "learning_rate": 7.341785683641827e-07, + "loss": 1.5045, + "step": 6408 + }, + { + "epoch": 0.9229550691244239, + "grad_norm": 0.8395203351974487, + "learning_rate": 7.314601437331869e-07, + "loss": 0.0838, + "step": 6409 + }, + { + "epoch": 0.9230990783410138, + "grad_norm": 0.724405825138092, + "learning_rate": 7.287466864430353e-07, + "loss": 0.0871, + "step": 6410 + }, + { + "epoch": 0.9232430875576036, + "grad_norm": 0.9876272082328796, + "learning_rate": 7.260381970491253e-07, + "loss": 0.1018, + "step": 6411 + }, + { + "epoch": 0.9233870967741935, + "grad_norm": 1.1380808353424072, + "learning_rate": 7.23334676105833e-07, + "loss": 0.1089, + "step": 6412 + }, + { + "epoch": 0.9235311059907834, + "grad_norm": 0.7840459942817688, + "learning_rate": 7.206361241665266e-07, + "loss": 0.1092, + "step": 6413 + }, + { + "epoch": 0.9236751152073732, + "grad_norm": 0.6764335036277771, + "learning_rate": 7.179425417835451e-07, + "loss": 0.0779, + "step": 6414 + }, + { + "epoch": 0.9238191244239631, + "grad_norm": 0.643854558467865, + "learning_rate": 7.15253929508225e-07, + "loss": 0.0807, + "step": 6415 + }, + { + "epoch": 0.923963133640553, + "grad_norm": 3.9558770656585693, + "learning_rate": 7.125702878908708e-07, + "loss": 1.5451, + "step": 6416 + }, + { + "epoch": 0.9241071428571429, + "grad_norm": 1.7450734376907349, + "learning_rate": 7.098916174807763e-07, + "loss": 0.1435, + "step": 6417 + }, + { + "epoch": 0.9242511520737328, + "grad_norm": 0.7124820947647095, + "learning_rate": 7.072179188262251e-07, + "loss": 0.0926, + "step": 6418 + }, + { + "epoch": 0.9243951612903226, + "grad_norm": 0.8633823990821838, + "learning_rate": 7.04549192474474e-07, + "loss": 0.0727, + "step": 6419 + }, + { + "epoch": 0.9245391705069125, + "grad_norm": 1.0623916387557983, + "learning_rate": 7.018854389717582e-07, + "loss": 0.0984, + "step": 6420 + }, + { + "epoch": 0.9246831797235023, + "grad_norm": 0.7411383390426636, + "learning_rate": 6.992266588633084e-07, + "loss": 0.0838, + "step": 6421 + }, + { + "epoch": 0.9248271889400922, + "grad_norm": 0.7530263662338257, + "learning_rate": 6.965728526933224e-07, + "loss": 0.0658, + "step": 6422 + }, + { + "epoch": 0.924971198156682, + "grad_norm": 0.8651976585388184, + "learning_rate": 6.939240210049935e-07, + "loss": 0.0987, + "step": 6423 + }, + { + "epoch": 0.9251152073732719, + "grad_norm": 0.6491146683692932, + "learning_rate": 6.912801643404882e-07, + "loss": 0.0917, + "step": 6424 + }, + { + "epoch": 0.9252592165898618, + "grad_norm": 0.2950338125228882, + "learning_rate": 6.886412832409566e-07, + "loss": 0.0463, + "step": 6425 + }, + { + "epoch": 0.9254032258064516, + "grad_norm": 0.8692485690116882, + "learning_rate": 6.860073782465338e-07, + "loss": 0.0847, + "step": 6426 + }, + { + "epoch": 0.9255472350230415, + "grad_norm": 0.4833323359489441, + "learning_rate": 6.833784498963297e-07, + "loss": 0.0486, + "step": 6427 + }, + { + "epoch": 0.9256912442396313, + "grad_norm": 0.47471046447753906, + "learning_rate": 6.80754498728442e-07, + "loss": 0.0662, + "step": 6428 + }, + { + "epoch": 0.9258352534562212, + "grad_norm": 4.026058197021484, + "learning_rate": 6.781355252799465e-07, + "loss": 0.6598, + "step": 6429 + }, + { + "epoch": 0.925979262672811, + "grad_norm": 0.9416404366493225, + "learning_rate": 6.755215300869006e-07, + "loss": 0.0836, + "step": 6430 + }, + { + "epoch": 0.9261232718894009, + "grad_norm": 0.3640284538269043, + "learning_rate": 6.729125136843428e-07, + "loss": 0.0408, + "step": 6431 + }, + { + "epoch": 0.9262672811059908, + "grad_norm": 0.5491839647293091, + "learning_rate": 6.703084766062934e-07, + "loss": 0.0561, + "step": 6432 + }, + { + "epoch": 0.9264112903225806, + "grad_norm": 0.7322775721549988, + "learning_rate": 6.677094193857508e-07, + "loss": 0.069, + "step": 6433 + }, + { + "epoch": 0.9265552995391705, + "grad_norm": 0.8649881482124329, + "learning_rate": 6.65115342554698e-07, + "loss": 0.1553, + "step": 6434 + }, + { + "epoch": 0.9266993087557603, + "grad_norm": 0.29142704606056213, + "learning_rate": 6.625262466440934e-07, + "loss": 0.0462, + "step": 6435 + }, + { + "epoch": 0.9268433179723502, + "grad_norm": 5.550843238830566, + "learning_rate": 6.599421321838855e-07, + "loss": 1.8069, + "step": 6436 + }, + { + "epoch": 0.9269873271889401, + "grad_norm": 0.4167884886264801, + "learning_rate": 6.573629997029901e-07, + "loss": 0.0392, + "step": 6437 + }, + { + "epoch": 0.9271313364055299, + "grad_norm": 4.020675182342529, + "learning_rate": 6.547888497293153e-07, + "loss": 2.5998, + "step": 6438 + }, + { + "epoch": 0.9272753456221198, + "grad_norm": 5.876063346862793, + "learning_rate": 6.522196827897398e-07, + "loss": 1.4835, + "step": 6439 + }, + { + "epoch": 0.9274193548387096, + "grad_norm": 0.992565393447876, + "learning_rate": 6.496554994101289e-07, + "loss": 0.0877, + "step": 6440 + }, + { + "epoch": 0.9275633640552995, + "grad_norm": 1.1393488645553589, + "learning_rate": 6.470963001153269e-07, + "loss": 0.1036, + "step": 6441 + }, + { + "epoch": 0.9277073732718893, + "grad_norm": 0.636152446269989, + "learning_rate": 6.445420854291534e-07, + "loss": 0.0568, + "step": 6442 + }, + { + "epoch": 0.9278513824884793, + "grad_norm": 0.9237536191940308, + "learning_rate": 6.419928558744126e-07, + "loss": 0.0839, + "step": 6443 + }, + { + "epoch": 0.9279953917050692, + "grad_norm": 1.0171273946762085, + "learning_rate": 6.394486119728815e-07, + "loss": 0.0887, + "step": 6444 + }, + { + "epoch": 0.928139400921659, + "grad_norm": 5.22520637512207, + "learning_rate": 6.369093542453324e-07, + "loss": 1.5604, + "step": 6445 + }, + { + "epoch": 0.9282834101382489, + "grad_norm": 0.6387810111045837, + "learning_rate": 6.343750832114997e-07, + "loss": 0.0899, + "step": 6446 + }, + { + "epoch": 0.9284274193548387, + "grad_norm": 3.259471893310547, + "learning_rate": 6.318457993901072e-07, + "loss": 1.2683, + "step": 6447 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.3866783380508423, + "learning_rate": 6.293215032988492e-07, + "loss": 0.0595, + "step": 6448 + }, + { + "epoch": 0.9287154377880185, + "grad_norm": 6.3114848136901855, + "learning_rate": 6.268021954544096e-07, + "loss": 1.9706, + "step": 6449 + }, + { + "epoch": 0.9288594470046083, + "grad_norm": 1.0000113248825073, + "learning_rate": 6.242878763724452e-07, + "loss": 0.1061, + "step": 6450 + }, + { + "epoch": 0.9290034562211982, + "grad_norm": 4.441605091094971, + "learning_rate": 6.217785465675891e-07, + "loss": 1.0613, + "step": 6451 + }, + { + "epoch": 0.929147465437788, + "grad_norm": 0.5975131988525391, + "learning_rate": 6.192742065534607e-07, + "loss": 0.0578, + "step": 6452 + }, + { + "epoch": 0.9292914746543779, + "grad_norm": 1.010187029838562, + "learning_rate": 6.167748568426529e-07, + "loss": 0.1038, + "step": 6453 + }, + { + "epoch": 0.9294354838709677, + "grad_norm": 1.1115511655807495, + "learning_rate": 6.142804979467398e-07, + "loss": 0.0961, + "step": 6454 + }, + { + "epoch": 0.9295794930875576, + "grad_norm": 0.5164231657981873, + "learning_rate": 6.117911303762686e-07, + "loss": 0.0749, + "step": 6455 + }, + { + "epoch": 0.9297235023041475, + "grad_norm": 0.36419737339019775, + "learning_rate": 6.093067546407704e-07, + "loss": 0.0405, + "step": 6456 + }, + { + "epoch": 0.9298675115207373, + "grad_norm": 1.0656954050064087, + "learning_rate": 6.068273712487554e-07, + "loss": 0.115, + "step": 6457 + }, + { + "epoch": 0.9300115207373272, + "grad_norm": 10.190164566040039, + "learning_rate": 6.043529807077091e-07, + "loss": 3.4168, + "step": 6458 + }, + { + "epoch": 0.930155529953917, + "grad_norm": 0.9942910671234131, + "learning_rate": 6.018835835240905e-07, + "loss": 0.1275, + "step": 6459 + }, + { + "epoch": 0.9302995391705069, + "grad_norm": 1.242846131324768, + "learning_rate": 5.994191802033478e-07, + "loss": 0.1373, + "step": 6460 + }, + { + "epoch": 0.9304435483870968, + "grad_norm": 0.863568127155304, + "learning_rate": 5.96959771249897e-07, + "loss": 0.154, + "step": 6461 + }, + { + "epoch": 0.9305875576036866, + "grad_norm": 5.041396141052246, + "learning_rate": 5.945053571671383e-07, + "loss": 2.0882, + "step": 6462 + }, + { + "epoch": 0.9307315668202765, + "grad_norm": 0.34681418538093567, + "learning_rate": 5.920559384574448e-07, + "loss": 0.0481, + "step": 6463 + }, + { + "epoch": 0.9308755760368663, + "grad_norm": 0.8457270264625549, + "learning_rate": 5.89611515622171e-07, + "loss": 0.106, + "step": 6464 + }, + { + "epoch": 0.9310195852534562, + "grad_norm": 4.647346019744873, + "learning_rate": 5.871720891616444e-07, + "loss": 0.772, + "step": 6465 + }, + { + "epoch": 0.931163594470046, + "grad_norm": 1.0673258304595947, + "learning_rate": 5.847376595751714e-07, + "loss": 0.0958, + "step": 6466 + }, + { + "epoch": 0.9313076036866359, + "grad_norm": 0.7226126790046692, + "learning_rate": 5.82308227361042e-07, + "loss": 0.0909, + "step": 6467 + }, + { + "epoch": 0.9314516129032258, + "grad_norm": 1.0950428247451782, + "learning_rate": 5.798837930165141e-07, + "loss": 0.1096, + "step": 6468 + }, + { + "epoch": 0.9315956221198156, + "grad_norm": 6.247434139251709, + "learning_rate": 5.774643570378296e-07, + "loss": 1.5757, + "step": 6469 + }, + { + "epoch": 0.9317396313364056, + "grad_norm": 1.066108226776123, + "learning_rate": 5.750499199202008e-07, + "loss": 0.0954, + "step": 6470 + }, + { + "epoch": 0.9318836405529954, + "grad_norm": 0.3351675271987915, + "learning_rate": 5.726404821578185e-07, + "loss": 0.0478, + "step": 6471 + }, + { + "epoch": 0.9320276497695853, + "grad_norm": 0.5952367782592773, + "learning_rate": 5.702360442438576e-07, + "loss": 0.0599, + "step": 6472 + }, + { + "epoch": 0.9321716589861752, + "grad_norm": 3.6871917247772217, + "learning_rate": 5.678366066704632e-07, + "loss": 0.6451, + "step": 6473 + }, + { + "epoch": 0.932315668202765, + "grad_norm": 0.8877171874046326, + "learning_rate": 5.654421699287537e-07, + "loss": 0.0952, + "step": 6474 + }, + { + "epoch": 0.9324596774193549, + "grad_norm": 0.969255268573761, + "learning_rate": 5.630527345088316e-07, + "loss": 0.1008, + "step": 6475 + }, + { + "epoch": 0.9326036866359447, + "grad_norm": 5.665468692779541, + "learning_rate": 5.606683008997693e-07, + "loss": 1.5944, + "step": 6476 + }, + { + "epoch": 0.9327476958525346, + "grad_norm": 0.7970080375671387, + "learning_rate": 5.58288869589621e-07, + "loss": 0.069, + "step": 6477 + }, + { + "epoch": 0.9328917050691244, + "grad_norm": 0.2658809721469879, + "learning_rate": 5.559144410654138e-07, + "loss": 0.0449, + "step": 6478 + }, + { + "epoch": 0.9330357142857143, + "grad_norm": 0.9773562550544739, + "learning_rate": 5.535450158131506e-07, + "loss": 0.1132, + "step": 6479 + }, + { + "epoch": 0.9331797235023042, + "grad_norm": 1.3091464042663574, + "learning_rate": 5.5118059431781e-07, + "loss": 0.0947, + "step": 6480 + }, + { + "epoch": 0.933323732718894, + "grad_norm": 0.6748597621917725, + "learning_rate": 5.488211770633467e-07, + "loss": 0.0785, + "step": 6481 + }, + { + "epoch": 0.9334677419354839, + "grad_norm": 0.429298996925354, + "learning_rate": 5.46466764532691e-07, + "loss": 0.0671, + "step": 6482 + }, + { + "epoch": 0.9336117511520737, + "grad_norm": 0.4616495370864868, + "learning_rate": 5.441173572077546e-07, + "loss": 0.0639, + "step": 6483 + }, + { + "epoch": 0.9337557603686636, + "grad_norm": 6.8696136474609375, + "learning_rate": 5.41772955569414e-07, + "loss": 1.927, + "step": 6484 + }, + { + "epoch": 0.9338997695852534, + "grad_norm": 0.6811413764953613, + "learning_rate": 5.394335600975325e-07, + "loss": 0.0717, + "step": 6485 + }, + { + "epoch": 0.9340437788018433, + "grad_norm": 0.8169926404953003, + "learning_rate": 5.370991712709355e-07, + "loss": 0.0795, + "step": 6486 + }, + { + "epoch": 0.9341877880184332, + "grad_norm": 0.8212189674377441, + "learning_rate": 5.347697895674381e-07, + "loss": 0.096, + "step": 6487 + }, + { + "epoch": 0.934331797235023, + "grad_norm": 4.586338043212891, + "learning_rate": 5.324454154638198e-07, + "loss": 0.7546, + "step": 6488 + }, + { + "epoch": 0.9344758064516129, + "grad_norm": 0.6328390836715698, + "learning_rate": 5.30126049435839e-07, + "loss": 0.0739, + "step": 6489 + }, + { + "epoch": 0.9346198156682027, + "grad_norm": 0.6821079254150391, + "learning_rate": 5.278116919582299e-07, + "loss": 0.066, + "step": 6490 + }, + { + "epoch": 0.9347638248847926, + "grad_norm": 0.78536057472229, + "learning_rate": 5.255023435046996e-07, + "loss": 0.0995, + "step": 6491 + }, + { + "epoch": 0.9349078341013825, + "grad_norm": 0.26628825068473816, + "learning_rate": 5.231980045479312e-07, + "loss": 0.0452, + "step": 6492 + }, + { + "epoch": 0.9350518433179723, + "grad_norm": 0.6623007655143738, + "learning_rate": 5.208986755595807e-07, + "loss": 0.0783, + "step": 6493 + }, + { + "epoch": 0.9351958525345622, + "grad_norm": 0.5472280979156494, + "learning_rate": 5.186043570102828e-07, + "loss": 0.0682, + "step": 6494 + }, + { + "epoch": 0.935339861751152, + "grad_norm": 0.6098116040229797, + "learning_rate": 5.163150493696451e-07, + "loss": 0.0703, + "step": 6495 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 1.28338623046875, + "learning_rate": 5.140307531062455e-07, + "loss": 0.1203, + "step": 6496 + }, + { + "epoch": 0.9356278801843319, + "grad_norm": 1.0484437942504883, + "learning_rate": 5.117514686876379e-07, + "loss": 0.0926, + "step": 6497 + }, + { + "epoch": 0.9357718894009217, + "grad_norm": 0.8619883060455322, + "learning_rate": 5.094771965803546e-07, + "loss": 0.0906, + "step": 6498 + }, + { + "epoch": 0.9359158986175116, + "grad_norm": 1.0034738779067993, + "learning_rate": 5.072079372498983e-07, + "loss": 0.1281, + "step": 6499 + }, + { + "epoch": 0.9360599078341014, + "grad_norm": 1.3050228357315063, + "learning_rate": 5.049436911607447e-07, + "loss": 0.1177, + "step": 6500 + }, + { + "epoch": 0.9362039170506913, + "grad_norm": 0.6719378232955933, + "learning_rate": 5.026844587763452e-07, + "loss": 0.0757, + "step": 6501 + }, + { + "epoch": 0.9363479262672811, + "grad_norm": 4.337597846984863, + "learning_rate": 5.004302405591243e-07, + "loss": 1.4007, + "step": 6502 + }, + { + "epoch": 0.936491935483871, + "grad_norm": 0.7790647149085999, + "learning_rate": 4.981810369704853e-07, + "loss": 0.0913, + "step": 6503 + }, + { + "epoch": 0.9366359447004609, + "grad_norm": 0.7908790111541748, + "learning_rate": 4.959368484707932e-07, + "loss": 0.0685, + "step": 6504 + }, + { + "epoch": 0.9367799539170507, + "grad_norm": 3.5617425441741943, + "learning_rate": 4.936976755193973e-07, + "loss": 0.9893, + "step": 6505 + }, + { + "epoch": 0.9369239631336406, + "grad_norm": 1.2372599840164185, + "learning_rate": 4.914635185746197e-07, + "loss": 0.1217, + "step": 6506 + }, + { + "epoch": 0.9370679723502304, + "grad_norm": 0.935183584690094, + "learning_rate": 4.892343780937447e-07, + "loss": 0.096, + "step": 6507 + }, + { + "epoch": 0.9372119815668203, + "grad_norm": 1.3040456771850586, + "learning_rate": 4.870102545330463e-07, + "loss": 0.1064, + "step": 6508 + }, + { + "epoch": 0.9373559907834101, + "grad_norm": 0.7068073153495789, + "learning_rate": 4.847911483477601e-07, + "loss": 0.0765, + "step": 6509 + }, + { + "epoch": 0.9375, + "grad_norm": 0.8983472585678101, + "learning_rate": 4.825770599920953e-07, + "loss": 0.1034, + "step": 6510 + }, + { + "epoch": 0.9376440092165899, + "grad_norm": 0.5523391962051392, + "learning_rate": 4.803679899192392e-07, + "loss": 0.0627, + "step": 6511 + }, + { + "epoch": 0.9377880184331797, + "grad_norm": 0.7658730745315552, + "learning_rate": 4.781639385813497e-07, + "loss": 0.0672, + "step": 6512 + }, + { + "epoch": 0.9379320276497696, + "grad_norm": 1.0000334978103638, + "learning_rate": 4.759649064295546e-07, + "loss": 0.1254, + "step": 6513 + }, + { + "epoch": 0.9380760368663594, + "grad_norm": 5.873345375061035, + "learning_rate": 4.737708939139635e-07, + "loss": 1.3318, + "step": 6514 + }, + { + "epoch": 0.9382200460829493, + "grad_norm": 1.3134666681289673, + "learning_rate": 4.71581901483642e-07, + "loss": 0.1067, + "step": 6515 + }, + { + "epoch": 0.9383640552995391, + "grad_norm": 0.3869471251964569, + "learning_rate": 4.693979295866485e-07, + "loss": 0.0486, + "step": 6516 + }, + { + "epoch": 0.938508064516129, + "grad_norm": 0.6285882592201233, + "learning_rate": 4.672189786699949e-07, + "loss": 0.0558, + "step": 6517 + }, + { + "epoch": 0.9386520737327189, + "grad_norm": 0.8916806578636169, + "learning_rate": 4.6504504917967706e-07, + "loss": 0.1305, + "step": 6518 + }, + { + "epoch": 0.9387960829493087, + "grad_norm": 7.478692531585693, + "learning_rate": 4.628761415606614e-07, + "loss": 0.9456, + "step": 6519 + }, + { + "epoch": 0.9389400921658986, + "grad_norm": 0.4458000361919403, + "learning_rate": 4.607122562568844e-07, + "loss": 0.049, + "step": 6520 + }, + { + "epoch": 0.9390841013824884, + "grad_norm": 3.099358558654785, + "learning_rate": 4.5855339371125294e-07, + "loss": 2.179, + "step": 6521 + }, + { + "epoch": 0.9392281105990783, + "grad_norm": 0.6158254146575928, + "learning_rate": 4.563995543656496e-07, + "loss": 0.0701, + "step": 6522 + }, + { + "epoch": 0.9393721198156681, + "grad_norm": 1.0011011362075806, + "learning_rate": 4.542507386609274e-07, + "loss": 0.1039, + "step": 6523 + }, + { + "epoch": 0.9395161290322581, + "grad_norm": 0.6216937303543091, + "learning_rate": 4.5210694703691214e-07, + "loss": 0.0648, + "step": 6524 + }, + { + "epoch": 0.939660138248848, + "grad_norm": 0.8044523596763611, + "learning_rate": 4.499681799323946e-07, + "loss": 0.1042, + "step": 6525 + }, + { + "epoch": 0.9398041474654378, + "grad_norm": 0.9227869510650635, + "learning_rate": 4.478344377851496e-07, + "loss": 0.0913, + "step": 6526 + }, + { + "epoch": 0.9399481566820277, + "grad_norm": 0.8775557279586792, + "learning_rate": 4.45705721031911e-07, + "loss": 0.0691, + "step": 6527 + }, + { + "epoch": 0.9400921658986175, + "grad_norm": 0.789839506149292, + "learning_rate": 4.435820301083943e-07, + "loss": 0.0892, + "step": 6528 + }, + { + "epoch": 0.9402361751152074, + "grad_norm": 0.7342466711997986, + "learning_rate": 4.4146336544927667e-07, + "loss": 0.0656, + "step": 6529 + }, + { + "epoch": 0.9403801843317973, + "grad_norm": 0.9267168045043945, + "learning_rate": 4.393497274882141e-07, + "loss": 0.1022, + "step": 6530 + }, + { + "epoch": 0.9405241935483871, + "grad_norm": 0.5427201986312866, + "learning_rate": 4.3724111665782997e-07, + "loss": 0.0553, + "step": 6531 + }, + { + "epoch": 0.940668202764977, + "grad_norm": 1.1146256923675537, + "learning_rate": 4.351375333897206e-07, + "loss": 0.1808, + "step": 6532 + }, + { + "epoch": 0.9408122119815668, + "grad_norm": 0.8788318037986755, + "learning_rate": 4.3303897811445005e-07, + "loss": 0.0982, + "step": 6533 + }, + { + "epoch": 0.9409562211981567, + "grad_norm": 0.884023129940033, + "learning_rate": 4.3094545126155794e-07, + "loss": 0.099, + "step": 6534 + }, + { + "epoch": 0.9411002304147466, + "grad_norm": 0.561026394367218, + "learning_rate": 4.2885695325955435e-07, + "loss": 0.0495, + "step": 6535 + }, + { + "epoch": 0.9412442396313364, + "grad_norm": 5.167316913604736, + "learning_rate": 4.2677348453591117e-07, + "loss": 1.3919, + "step": 6536 + }, + { + "epoch": 0.9413882488479263, + "grad_norm": 1.1566896438598633, + "learning_rate": 4.246950455170817e-07, + "loss": 0.1104, + "step": 6537 + }, + { + "epoch": 0.9415322580645161, + "grad_norm": 0.9652101397514343, + "learning_rate": 4.2262163662848687e-07, + "loss": 0.0962, + "step": 6538 + }, + { + "epoch": 0.941676267281106, + "grad_norm": 0.6066707968711853, + "learning_rate": 4.205532582945121e-07, + "loss": 0.071, + "step": 6539 + }, + { + "epoch": 0.9418202764976958, + "grad_norm": 1.044021725654602, + "learning_rate": 4.184899109385243e-07, + "loss": 0.1334, + "step": 6540 + }, + { + "epoch": 0.9419642857142857, + "grad_norm": 1.4141733646392822, + "learning_rate": 4.1643159498284953e-07, + "loss": 0.1245, + "step": 6541 + }, + { + "epoch": 0.9421082949308756, + "grad_norm": 3.5381267070770264, + "learning_rate": 4.1437831084878974e-07, + "loss": 2.3862, + "step": 6542 + }, + { + "epoch": 0.9422523041474654, + "grad_norm": 1.4803664684295654, + "learning_rate": 4.123300589566143e-07, + "loss": 3.7803, + "step": 6543 + }, + { + "epoch": 0.9423963133640553, + "grad_norm": 7.125556468963623, + "learning_rate": 4.1028683972556824e-07, + "loss": 1.4382, + "step": 6544 + }, + { + "epoch": 0.9425403225806451, + "grad_norm": 5.95056676864624, + "learning_rate": 4.082486535738589e-07, + "loss": 1.4695, + "step": 6545 + }, + { + "epoch": 0.942684331797235, + "grad_norm": 1.0143465995788574, + "learning_rate": 4.062155009186691e-07, + "loss": 0.1024, + "step": 6546 + }, + { + "epoch": 0.9428283410138248, + "grad_norm": 0.9981006383895874, + "learning_rate": 4.041873821761466e-07, + "loss": 0.1244, + "step": 6547 + }, + { + "epoch": 0.9429723502304147, + "grad_norm": 6.608901500701904, + "learning_rate": 4.0216429776141207e-07, + "loss": 1.6793, + "step": 6548 + }, + { + "epoch": 0.9431163594470046, + "grad_norm": 0.9316585659980774, + "learning_rate": 4.001462480885593e-07, + "loss": 0.0941, + "step": 6549 + }, + { + "epoch": 0.9432603686635944, + "grad_norm": 0.8380259275436401, + "learning_rate": 3.9813323357064113e-07, + "loss": 0.0878, + "step": 6550 + }, + { + "epoch": 0.9434043778801844, + "grad_norm": 0.8414686322212219, + "learning_rate": 3.96125254619692e-07, + "loss": 0.0895, + "step": 6551 + }, + { + "epoch": 0.9435483870967742, + "grad_norm": 0.6220961809158325, + "learning_rate": 3.9412231164670246e-07, + "loss": 0.0685, + "step": 6552 + }, + { + "epoch": 0.9436923963133641, + "grad_norm": 0.9956130981445312, + "learning_rate": 3.921244050616446e-07, + "loss": 0.0648, + "step": 6553 + }, + { + "epoch": 0.943836405529954, + "grad_norm": 3.651357412338257, + "learning_rate": 3.9013153527345524e-07, + "loss": 0.4342, + "step": 6554 + }, + { + "epoch": 0.9439804147465438, + "grad_norm": 1.2966095209121704, + "learning_rate": 3.8814370269003864e-07, + "loss": 0.1159, + "step": 6555 + }, + { + "epoch": 0.9441244239631337, + "grad_norm": 0.9210472702980042, + "learning_rate": 3.8616090771826654e-07, + "loss": 3.9397, + "step": 6556 + }, + { + "epoch": 0.9442684331797235, + "grad_norm": 0.7806110978126526, + "learning_rate": 3.841831507639865e-07, + "loss": 0.0978, + "step": 6557 + }, + { + "epoch": 0.9444124423963134, + "grad_norm": 1.185314416885376, + "learning_rate": 3.8221043223200525e-07, + "loss": 0.1146, + "step": 6558 + }, + { + "epoch": 0.9445564516129032, + "grad_norm": 1.56231689453125, + "learning_rate": 3.802427525261054e-07, + "loss": 0.1349, + "step": 6559 + }, + { + "epoch": 0.9447004608294931, + "grad_norm": 0.7207516431808472, + "learning_rate": 3.7828011204903977e-07, + "loss": 0.0713, + "step": 6560 + }, + { + "epoch": 0.944844470046083, + "grad_norm": 0.7212814092636108, + "learning_rate": 3.7632251120252036e-07, + "loss": 0.0827, + "step": 6561 + }, + { + "epoch": 0.9449884792626728, + "grad_norm": 4.15453577041626, + "learning_rate": 3.74369950387235e-07, + "loss": 0.7031, + "step": 6562 + }, + { + "epoch": 0.9451324884792627, + "grad_norm": 3.8922243118286133, + "learning_rate": 3.724224300028417e-07, + "loss": 1.2624, + "step": 6563 + }, + { + "epoch": 0.9452764976958525, + "grad_norm": 1.1191190481185913, + "learning_rate": 3.7047995044796057e-07, + "loss": 4.0117, + "step": 6564 + }, + { + "epoch": 0.9454205069124424, + "grad_norm": 1.6063306331634521, + "learning_rate": 3.6854251212018465e-07, + "loss": 3.5914, + "step": 6565 + }, + { + "epoch": 0.9455645161290323, + "grad_norm": 1.0165891647338867, + "learning_rate": 3.6661011541606896e-07, + "loss": 0.1076, + "step": 6566 + }, + { + "epoch": 0.9457085253456221, + "grad_norm": 0.9802741408348083, + "learning_rate": 3.6468276073114705e-07, + "loss": 0.0986, + "step": 6567 + }, + { + "epoch": 0.945852534562212, + "grad_norm": 1.4688149690628052, + "learning_rate": 3.6276044845990896e-07, + "loss": 4.157, + "step": 6568 + }, + { + "epoch": 0.9459965437788018, + "grad_norm": 0.9276182055473328, + "learning_rate": 3.6084317899582057e-07, + "loss": 0.073, + "step": 6569 + }, + { + "epoch": 0.9461405529953917, + "grad_norm": 0.9741150140762329, + "learning_rate": 3.589309527313151e-07, + "loss": 0.0979, + "step": 6570 + }, + { + "epoch": 0.9462845622119815, + "grad_norm": 0.46647587418556213, + "learning_rate": 3.5702377005778773e-07, + "loss": 0.0506, + "step": 6571 + }, + { + "epoch": 0.9464285714285714, + "grad_norm": 0.7129952311515808, + "learning_rate": 3.5512163136560415e-07, + "loss": 0.08, + "step": 6572 + }, + { + "epoch": 0.9465725806451613, + "grad_norm": 1.2333449125289917, + "learning_rate": 3.5322453704410286e-07, + "loss": 0.1341, + "step": 6573 + }, + { + "epoch": 0.9467165898617511, + "grad_norm": 0.5604720711708069, + "learning_rate": 3.513324874815843e-07, + "loss": 0.0636, + "step": 6574 + }, + { + "epoch": 0.946860599078341, + "grad_norm": 0.7181684374809265, + "learning_rate": 3.4944548306531653e-07, + "loss": 0.0704, + "step": 6575 + }, + { + "epoch": 0.9470046082949308, + "grad_norm": 2.990478754043579, + "learning_rate": 3.4756352418153504e-07, + "loss": 0.2848, + "step": 6576 + }, + { + "epoch": 0.9471486175115207, + "grad_norm": 0.47407978773117065, + "learning_rate": 3.456866112154428e-07, + "loss": 0.0687, + "step": 6577 + }, + { + "epoch": 0.9472926267281107, + "grad_norm": 0.9808512330055237, + "learning_rate": 3.4381474455121575e-07, + "loss": 0.0852, + "step": 6578 + }, + { + "epoch": 0.9474366359447005, + "grad_norm": 0.9911962747573853, + "learning_rate": 3.419479245719864e-07, + "loss": 0.1067, + "step": 6579 + }, + { + "epoch": 0.9475806451612904, + "grad_norm": 0.8476753234863281, + "learning_rate": 3.40086151659863e-07, + "loss": 0.0749, + "step": 6580 + }, + { + "epoch": 0.9477246543778802, + "grad_norm": 0.49354153871536255, + "learning_rate": 3.3822942619591566e-07, + "loss": 0.0452, + "step": 6581 + }, + { + "epoch": 0.9478686635944701, + "grad_norm": 0.5564571619033813, + "learning_rate": 3.363777485601849e-07, + "loss": 0.0638, + "step": 6582 + }, + { + "epoch": 0.9480126728110599, + "grad_norm": 4.362706184387207, + "learning_rate": 3.3453111913167577e-07, + "loss": 2.2721, + "step": 6583 + }, + { + "epoch": 0.9481566820276498, + "grad_norm": 3.5149478912353516, + "learning_rate": 3.32689538288361e-07, + "loss": 1.0783, + "step": 6584 + }, + { + "epoch": 0.9483006912442397, + "grad_norm": 0.443703830242157, + "learning_rate": 3.30853006407178e-07, + "loss": 0.0484, + "step": 6585 + }, + { + "epoch": 0.9484447004608295, + "grad_norm": 1.0473214387893677, + "learning_rate": 3.290215238640343e-07, + "loss": 0.0966, + "step": 6586 + }, + { + "epoch": 0.9485887096774194, + "grad_norm": 1.6012871265411377, + "learning_rate": 3.271950910337995e-07, + "loss": 0.115, + "step": 6587 + }, + { + "epoch": 0.9487327188940092, + "grad_norm": 1.3954929113388062, + "learning_rate": 3.253737082903163e-07, + "loss": 0.1237, + "step": 6588 + }, + { + "epoch": 0.9488767281105991, + "grad_norm": 0.5502363443374634, + "learning_rate": 3.235573760063837e-07, + "loss": 0.0763, + "step": 6589 + }, + { + "epoch": 0.949020737327189, + "grad_norm": 1.402467131614685, + "learning_rate": 3.2174609455377923e-07, + "loss": 0.1223, + "step": 6590 + }, + { + "epoch": 0.9491647465437788, + "grad_norm": 0.9390389919281006, + "learning_rate": 3.1993986430323417e-07, + "loss": 0.0853, + "step": 6591 + }, + { + "epoch": 0.9493087557603687, + "grad_norm": 6.9760942459106445, + "learning_rate": 3.181386856244584e-07, + "loss": 1.1961, + "step": 6592 + }, + { + "epoch": 0.9494527649769585, + "grad_norm": 0.8923547267913818, + "learning_rate": 3.163425588861152e-07, + "loss": 0.1014, + "step": 6593 + }, + { + "epoch": 0.9495967741935484, + "grad_norm": 1.0368677377700806, + "learning_rate": 3.1455148445584116e-07, + "loss": 4.0385, + "step": 6594 + }, + { + "epoch": 0.9497407834101382, + "grad_norm": 0.9220501184463501, + "learning_rate": 3.127654627002402e-07, + "loss": 0.1016, + "step": 6595 + }, + { + "epoch": 0.9498847926267281, + "grad_norm": 5.043511390686035, + "learning_rate": 3.109844939848783e-07, + "loss": 2.2355, + "step": 6596 + }, + { + "epoch": 0.950028801843318, + "grad_norm": 0.9675378799438477, + "learning_rate": 3.0920857867428876e-07, + "loss": 0.1001, + "step": 6597 + }, + { + "epoch": 0.9501728110599078, + "grad_norm": 0.37486591935157776, + "learning_rate": 3.0743771713196703e-07, + "loss": 0.0447, + "step": 6598 + }, + { + "epoch": 0.9503168202764977, + "grad_norm": 0.38416290283203125, + "learning_rate": 3.056719097203814e-07, + "loss": 0.048, + "step": 6599 + }, + { + "epoch": 0.9504608294930875, + "grad_norm": 4.4810590744018555, + "learning_rate": 3.039111568009595e-07, + "loss": 2.1205, + "step": 6600 + }, + { + "epoch": 0.9506048387096774, + "grad_norm": 1.5497920513153076, + "learning_rate": 3.021554587340936e-07, + "loss": 0.1328, + "step": 6601 + }, + { + "epoch": 0.9507488479262672, + "grad_norm": 0.8234983086585999, + "learning_rate": 3.004048158791489e-07, + "loss": 0.0765, + "step": 6602 + }, + { + "epoch": 0.9508928571428571, + "grad_norm": 0.4643156826496124, + "learning_rate": 2.986592285944473e-07, + "loss": 0.0501, + "step": 6603 + }, + { + "epoch": 0.951036866359447, + "grad_norm": 1.2700607776641846, + "learning_rate": 2.969186972372806e-07, + "loss": 0.117, + "step": 6604 + }, + { + "epoch": 0.9511808755760369, + "grad_norm": 4.68686580657959, + "learning_rate": 2.951832221639056e-07, + "loss": 1.1287, + "step": 6605 + }, + { + "epoch": 0.9513248847926268, + "grad_norm": 0.7373579144477844, + "learning_rate": 2.934528037295409e-07, + "loss": 0.0821, + "step": 6606 + }, + { + "epoch": 0.9514688940092166, + "grad_norm": 0.8820493817329407, + "learning_rate": 2.917274422883781e-07, + "loss": 0.0928, + "step": 6607 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 5.013768672943115, + "learning_rate": 2.9000713819356263e-07, + "loss": 1.5531, + "step": 6608 + }, + { + "epoch": 0.9517569124423964, + "grad_norm": 1.0845385789871216, + "learning_rate": 2.8829189179721547e-07, + "loss": 0.0601, + "step": 6609 + }, + { + "epoch": 0.9519009216589862, + "grad_norm": 0.37004026770591736, + "learning_rate": 2.8658170345041146e-07, + "loss": 0.044, + "step": 6610 + }, + { + "epoch": 0.9520449308755761, + "grad_norm": 1.0143121480941772, + "learning_rate": 2.848765735031983e-07, + "loss": 0.1139, + "step": 6611 + }, + { + "epoch": 0.9521889400921659, + "grad_norm": 4.195336818695068, + "learning_rate": 2.831765023045885e-07, + "loss": 2.5364, + "step": 6612 + }, + { + "epoch": 0.9523329493087558, + "grad_norm": 1.2340540885925293, + "learning_rate": 2.814814902025509e-07, + "loss": 0.1314, + "step": 6613 + }, + { + "epoch": 0.9524769585253456, + "grad_norm": 1.0147005319595337, + "learning_rate": 2.797915375440302e-07, + "loss": 0.106, + "step": 6614 + }, + { + "epoch": 0.9526209677419355, + "grad_norm": 0.4918835759162903, + "learning_rate": 2.7810664467492755e-07, + "loss": 0.0632, + "step": 6615 + }, + { + "epoch": 0.9527649769585254, + "grad_norm": 1.0295203924179077, + "learning_rate": 2.7642681194010865e-07, + "loss": 0.1256, + "step": 6616 + }, + { + "epoch": 0.9529089861751152, + "grad_norm": 0.5488288402557373, + "learning_rate": 2.7475203968340967e-07, + "loss": 0.0476, + "step": 6617 + }, + { + "epoch": 0.9530529953917051, + "grad_norm": 0.6656190156936646, + "learning_rate": 2.73082328247623e-07, + "loss": 0.0705, + "step": 6618 + }, + { + "epoch": 0.9531970046082949, + "grad_norm": 0.7223609685897827, + "learning_rate": 2.7141767797451143e-07, + "loss": 0.0756, + "step": 6619 + }, + { + "epoch": 0.9533410138248848, + "grad_norm": 0.49485355615615845, + "learning_rate": 2.697580892047996e-07, + "loss": 0.0672, + "step": 6620 + }, + { + "epoch": 0.9534850230414746, + "grad_norm": 0.44283998012542725, + "learning_rate": 2.681035622781741e-07, + "loss": 0.0505, + "step": 6621 + }, + { + "epoch": 0.9536290322580645, + "grad_norm": 3.4538393020629883, + "learning_rate": 2.664540975332891e-07, + "loss": 0.8127, + "step": 6622 + }, + { + "epoch": 0.9537730414746544, + "grad_norm": 1.489531397819519, + "learning_rate": 2.648096953077578e-07, + "loss": 0.0988, + "step": 6623 + }, + { + "epoch": 0.9539170506912442, + "grad_norm": 0.7690650820732117, + "learning_rate": 2.631703559381665e-07, + "loss": 0.0886, + "step": 6624 + }, + { + "epoch": 0.9540610599078341, + "grad_norm": 0.9929276704788208, + "learning_rate": 2.6153607976005247e-07, + "loss": 0.0974, + "step": 6625 + }, + { + "epoch": 0.9542050691244239, + "grad_norm": 0.8119827508926392, + "learning_rate": 2.599068671079258e-07, + "loss": 0.0909, + "step": 6626 + }, + { + "epoch": 0.9543490783410138, + "grad_norm": 0.4993036985397339, + "learning_rate": 2.5828271831525864e-07, + "loss": 0.0675, + "step": 6627 + }, + { + "epoch": 0.9544930875576036, + "grad_norm": 1.005749225616455, + "learning_rate": 2.566636337144823e-07, + "loss": 0.1042, + "step": 6628 + }, + { + "epoch": 0.9546370967741935, + "grad_norm": 1.272089958190918, + "learning_rate": 2.550496136369984e-07, + "loss": 0.0928, + "step": 6629 + }, + { + "epoch": 0.9547811059907834, + "grad_norm": 0.38741210103034973, + "learning_rate": 2.534406584131649e-07, + "loss": 0.0472, + "step": 6630 + }, + { + "epoch": 0.9549251152073732, + "grad_norm": 0.8771342635154724, + "learning_rate": 2.5183676837231e-07, + "loss": 0.0926, + "step": 6631 + }, + { + "epoch": 0.9550691244239631, + "grad_norm": 0.5512773394584656, + "learning_rate": 2.5023794384271827e-07, + "loss": 0.0616, + "step": 6632 + }, + { + "epoch": 0.955213133640553, + "grad_norm": 0.6124026775360107, + "learning_rate": 2.4864418515164465e-07, + "loss": 0.0793, + "step": 6633 + }, + { + "epoch": 0.9553571428571429, + "grad_norm": 0.9111528992652893, + "learning_rate": 2.470554926252977e-07, + "loss": 0.1043, + "step": 6634 + }, + { + "epoch": 0.9555011520737328, + "grad_norm": 0.8112383484840393, + "learning_rate": 2.454718665888589e-07, + "loss": 0.0853, + "step": 6635 + }, + { + "epoch": 0.9556451612903226, + "grad_norm": 0.8250031471252441, + "learning_rate": 2.43893307366469e-07, + "loss": 0.0911, + "step": 6636 + }, + { + "epoch": 0.9557891705069125, + "grad_norm": 0.6312892436981201, + "learning_rate": 2.423198152812306e-07, + "loss": 0.1126, + "step": 6637 + }, + { + "epoch": 0.9559331797235023, + "grad_norm": 6.099482536315918, + "learning_rate": 2.4075139065520836e-07, + "loss": 1.5477, + "step": 6638 + }, + { + "epoch": 0.9560771889400922, + "grad_norm": 0.6328756809234619, + "learning_rate": 2.3918803380943154e-07, + "loss": 0.0487, + "step": 6639 + }, + { + "epoch": 0.956221198156682, + "grad_norm": 1.0242120027542114, + "learning_rate": 2.376297450638887e-07, + "loss": 0.1232, + "step": 6640 + }, + { + "epoch": 0.9563652073732719, + "grad_norm": 1.135146141052246, + "learning_rate": 2.3607652473754128e-07, + "loss": 0.1105, + "step": 6641 + }, + { + "epoch": 0.9565092165898618, + "grad_norm": 0.918692946434021, + "learning_rate": 2.345283731482989e-07, + "loss": 0.1078, + "step": 6642 + }, + { + "epoch": 0.9566532258064516, + "grad_norm": 0.4704742729663849, + "learning_rate": 2.3298529061304418e-07, + "loss": 0.0685, + "step": 6643 + }, + { + "epoch": 0.9567972350230415, + "grad_norm": 0.6159284114837646, + "learning_rate": 2.3144727744761895e-07, + "loss": 0.0703, + "step": 6644 + }, + { + "epoch": 0.9569412442396313, + "grad_norm": 0.4208681583404541, + "learning_rate": 2.2991433396682693e-07, + "loss": 0.0659, + "step": 6645 + }, + { + "epoch": 0.9570852534562212, + "grad_norm": 1.647375464439392, + "learning_rate": 2.283864604844338e-07, + "loss": 0.1226, + "step": 6646 + }, + { + "epoch": 0.957229262672811, + "grad_norm": 0.9975357055664062, + "learning_rate": 2.2686365731316718e-07, + "loss": 0.125, + "step": 6647 + }, + { + "epoch": 0.9573732718894009, + "grad_norm": 0.7209845185279846, + "learning_rate": 2.2534592476472215e-07, + "loss": 0.0711, + "step": 6648 + }, + { + "epoch": 0.9575172811059908, + "grad_norm": 1.2264796495437622, + "learning_rate": 2.238332631497475e-07, + "loss": 0.1298, + "step": 6649 + }, + { + "epoch": 0.9576612903225806, + "grad_norm": 4.532900333404541, + "learning_rate": 2.2232567277785942e-07, + "loss": 0.7691, + "step": 6650 + }, + { + "epoch": 0.9578052995391705, + "grad_norm": 1.6539093255996704, + "learning_rate": 2.208231539576361e-07, + "loss": 0.1231, + "step": 6651 + }, + { + "epoch": 0.9579493087557603, + "grad_norm": 3.130950689315796, + "learning_rate": 2.1932570699661482e-07, + "loss": 3.1152, + "step": 6652 + }, + { + "epoch": 0.9580933179723502, + "grad_norm": 0.5014954209327698, + "learning_rate": 2.1783333220129765e-07, + "loss": 0.0586, + "step": 6653 + }, + { + "epoch": 0.9582373271889401, + "grad_norm": 5.077042579650879, + "learning_rate": 2.163460298771486e-07, + "loss": 1.1893, + "step": 6654 + }, + { + "epoch": 0.9583813364055299, + "grad_norm": 0.925835371017456, + "learning_rate": 2.1486380032858798e-07, + "loss": 0.1471, + "step": 6655 + }, + { + "epoch": 0.9585253456221198, + "grad_norm": 1.892499327659607, + "learning_rate": 2.1338664385900653e-07, + "loss": 0.1142, + "step": 6656 + }, + { + "epoch": 0.9586693548387096, + "grad_norm": 0.54951411485672, + "learning_rate": 2.1191456077075122e-07, + "loss": 0.0619, + "step": 6657 + }, + { + "epoch": 0.9588133640552995, + "grad_norm": 6.399754524230957, + "learning_rate": 2.104475513651283e-07, + "loss": 1.5331, + "step": 6658 + }, + { + "epoch": 0.9589573732718893, + "grad_norm": 0.6329681277275085, + "learning_rate": 2.089856159424114e-07, + "loss": 0.0731, + "step": 6659 + }, + { + "epoch": 0.9591013824884793, + "grad_norm": 0.8552351593971252, + "learning_rate": 2.0752875480183065e-07, + "loss": 0.0945, + "step": 6660 + }, + { + "epoch": 0.9592453917050692, + "grad_norm": 0.9218329787254333, + "learning_rate": 2.0607696824158363e-07, + "loss": 0.0652, + "step": 6661 + }, + { + "epoch": 0.959389400921659, + "grad_norm": 0.914547324180603, + "learning_rate": 2.0463025655882152e-07, + "loss": 0.0713, + "step": 6662 + }, + { + "epoch": 0.9595334101382489, + "grad_norm": 0.7544251680374146, + "learning_rate": 2.03188620049663e-07, + "loss": 0.0807, + "step": 6663 + }, + { + "epoch": 0.9596774193548387, + "grad_norm": 1.7536948919296265, + "learning_rate": 2.0175205900918316e-07, + "loss": 0.1409, + "step": 6664 + }, + { + "epoch": 0.9598214285714286, + "grad_norm": 1.5751057863235474, + "learning_rate": 2.0032057373142454e-07, + "loss": 0.1262, + "step": 6665 + }, + { + "epoch": 0.9599654377880185, + "grad_norm": 1.1886732578277588, + "learning_rate": 1.9889416450938337e-07, + "loss": 0.0901, + "step": 6666 + }, + { + "epoch": 0.9601094470046083, + "grad_norm": 0.6622714400291443, + "learning_rate": 1.9747283163502328e-07, + "loss": 0.0922, + "step": 6667 + }, + { + "epoch": 0.9602534562211982, + "grad_norm": 4.23695182800293, + "learning_rate": 1.960565753992616e-07, + "loss": 1.6795, + "step": 6668 + }, + { + "epoch": 0.960397465437788, + "grad_norm": 0.5438573956489563, + "learning_rate": 1.9464539609198308e-07, + "loss": 0.0671, + "step": 6669 + }, + { + "epoch": 0.9605414746543779, + "grad_norm": 1.0934796333312988, + "learning_rate": 1.9323929400203445e-07, + "loss": 0.0599, + "step": 6670 + }, + { + "epoch": 0.9606854838709677, + "grad_norm": 0.6088090538978577, + "learning_rate": 1.9183826941721605e-07, + "loss": 0.0544, + "step": 6671 + }, + { + "epoch": 0.9608294930875576, + "grad_norm": 0.5957331657409668, + "learning_rate": 1.9044232262429296e-07, + "loss": 0.0803, + "step": 6672 + }, + { + "epoch": 0.9609735023041475, + "grad_norm": 1.2226102352142334, + "learning_rate": 1.8905145390899216e-07, + "loss": 0.1281, + "step": 6673 + }, + { + "epoch": 0.9611175115207373, + "grad_norm": 0.9906578063964844, + "learning_rate": 1.87665663555997e-07, + "loss": 0.1077, + "step": 6674 + }, + { + "epoch": 0.9612615207373272, + "grad_norm": 7.135962963104248, + "learning_rate": 1.8628495184896123e-07, + "loss": 2.3528, + "step": 6675 + }, + { + "epoch": 0.961405529953917, + "grad_norm": 0.47588104009628296, + "learning_rate": 1.849093190704837e-07, + "loss": 0.0617, + "step": 6676 + }, + { + "epoch": 0.9615495391705069, + "grad_norm": 0.6719211339950562, + "learning_rate": 1.8353876550213922e-07, + "loss": 0.0767, + "step": 6677 + }, + { + "epoch": 0.9616935483870968, + "grad_norm": 0.8792760968208313, + "learning_rate": 1.8217329142445061e-07, + "loss": 0.0942, + "step": 6678 + }, + { + "epoch": 0.9618375576036866, + "grad_norm": 1.5216933488845825, + "learning_rate": 1.808128971169082e-07, + "loss": 0.1465, + "step": 6679 + }, + { + "epoch": 0.9619815668202765, + "grad_norm": 0.9481956958770752, + "learning_rate": 1.7945758285796143e-07, + "loss": 0.0952, + "step": 6680 + }, + { + "epoch": 0.9621255760368663, + "grad_norm": 0.6805007457733154, + "learning_rate": 1.7810734892501624e-07, + "loss": 0.0663, + "step": 6681 + }, + { + "epoch": 0.9622695852534562, + "grad_norm": 3.27669095993042, + "learning_rate": 1.7676219559444595e-07, + "loss": 1.8986, + "step": 6682 + }, + { + "epoch": 0.962413594470046, + "grad_norm": 5.364407062530518, + "learning_rate": 1.7542212314157758e-07, + "loss": 1.5932, + "step": 6683 + }, + { + "epoch": 0.9625576036866359, + "grad_norm": 1.5149905681610107, + "learning_rate": 1.7408713184070001e-07, + "loss": 0.1455, + "step": 6684 + }, + { + "epoch": 0.9627016129032258, + "grad_norm": 1.3947943449020386, + "learning_rate": 1.727572219650614e-07, + "loss": 0.1207, + "step": 6685 + }, + { + "epoch": 0.9628456221198156, + "grad_norm": 0.5484460592269897, + "learning_rate": 1.714323937868745e-07, + "loss": 0.0521, + "step": 6686 + }, + { + "epoch": 0.9629896313364056, + "grad_norm": 0.694277286529541, + "learning_rate": 1.7011264757730295e-07, + "loss": 0.0806, + "step": 6687 + }, + { + "epoch": 0.9631336405529954, + "grad_norm": 1.177595853805542, + "learning_rate": 1.687979836064779e-07, + "loss": 0.0894, + "step": 6688 + }, + { + "epoch": 0.9632776497695853, + "grad_norm": 0.5021853446960449, + "learning_rate": 1.674884021434897e-07, + "loss": 0.0596, + "step": 6689 + }, + { + "epoch": 0.9634216589861752, + "grad_norm": 0.8931258320808411, + "learning_rate": 1.6618390345638225e-07, + "loss": 0.1051, + "step": 6690 + }, + { + "epoch": 0.963565668202765, + "grad_norm": 0.4477704167366028, + "learning_rate": 1.648844878121697e-07, + "loss": 0.0512, + "step": 6691 + }, + { + "epoch": 0.9637096774193549, + "grad_norm": 1.1622105836868286, + "learning_rate": 1.6359015547681433e-07, + "loss": 3.9593, + "step": 6692 + }, + { + "epoch": 0.9638536866359447, + "grad_norm": 0.6884612441062927, + "learning_rate": 1.623009067152431e-07, + "loss": 0.0995, + "step": 6693 + }, + { + "epoch": 0.9639976958525346, + "grad_norm": 0.7805122137069702, + "learning_rate": 1.6101674179134496e-07, + "loss": 0.1089, + "step": 6694 + }, + { + "epoch": 0.9641417050691244, + "grad_norm": 0.6340153217315674, + "learning_rate": 1.597376609679624e-07, + "loss": 0.0784, + "step": 6695 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 0.8145184516906738, + "learning_rate": 1.5846366450690542e-07, + "loss": 0.0804, + "step": 6696 + }, + { + "epoch": 0.9644297235023042, + "grad_norm": 0.9354131817817688, + "learning_rate": 1.571947526689349e-07, + "loss": 0.0992, + "step": 6697 + }, + { + "epoch": 0.964573732718894, + "grad_norm": 0.8126431703567505, + "learning_rate": 1.5593092571377644e-07, + "loss": 0.1001, + "step": 6698 + }, + { + "epoch": 0.9647177419354839, + "grad_norm": 0.7813040614128113, + "learning_rate": 1.5467218390011195e-07, + "loss": 0.097, + "step": 6699 + }, + { + "epoch": 0.9648617511520737, + "grad_norm": 0.741131603717804, + "learning_rate": 1.534185274855854e-07, + "loss": 0.0851, + "step": 6700 + }, + { + "epoch": 0.9650057603686636, + "grad_norm": 0.9763888120651245, + "learning_rate": 1.5216995672679423e-07, + "loss": 0.1013, + "step": 6701 + }, + { + "epoch": 0.9651497695852534, + "grad_norm": 0.7119227647781372, + "learning_rate": 1.5092647187930075e-07, + "loss": 0.0671, + "step": 6702 + }, + { + "epoch": 0.9652937788018433, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.4968807319762635e-07, + "loss": 0.0757, + "step": 6703 + }, + { + "epoch": 0.9654377880184332, + "grad_norm": 0.7665894627571106, + "learning_rate": 1.484547609352488e-07, + "loss": 0.0726, + "step": 6704 + }, + { + "epoch": 0.965581797235023, + "grad_norm": 0.6152949929237366, + "learning_rate": 1.4722653534460228e-07, + "loss": 0.0698, + "step": 6705 + }, + { + "epoch": 0.9657258064516129, + "grad_norm": 0.8717970848083496, + "learning_rate": 1.4600339667708573e-07, + "loss": 0.0932, + "step": 6706 + }, + { + "epoch": 0.9658698156682027, + "grad_norm": 1.3708994388580322, + "learning_rate": 1.4478534518305164e-07, + "loss": 0.121, + "step": 6707 + }, + { + "epoch": 0.9660138248847926, + "grad_norm": 0.7106877565383911, + "learning_rate": 1.4357238111181726e-07, + "loss": 0.0799, + "step": 6708 + }, + { + "epoch": 0.9661578341013825, + "grad_norm": 0.6497529149055481, + "learning_rate": 1.423645047116534e-07, + "loss": 0.0929, + "step": 6709 + }, + { + "epoch": 0.9663018433179723, + "grad_norm": 3.122237205505371, + "learning_rate": 1.4116171622978737e-07, + "loss": 1.9959, + "step": 6710 + }, + { + "epoch": 0.9664458525345622, + "grad_norm": 0.733344316482544, + "learning_rate": 1.399640159124138e-07, + "loss": 0.079, + "step": 6711 + }, + { + "epoch": 0.966589861751152, + "grad_norm": 0.822630763053894, + "learning_rate": 1.387714040046756e-07, + "loss": 0.115, + "step": 6712 + }, + { + "epoch": 0.9667338709677419, + "grad_norm": 7.082327842712402, + "learning_rate": 1.3758388075068574e-07, + "loss": 2.3444, + "step": 6713 + }, + { + "epoch": 0.9668778801843319, + "grad_norm": 0.8305864930152893, + "learning_rate": 1.364014463935054e-07, + "loss": 0.0971, + "step": 6714 + }, + { + "epoch": 0.9670218894009217, + "grad_norm": 0.6209385395050049, + "learning_rate": 1.3522410117515484e-07, + "loss": 0.0681, + "step": 6715 + }, + { + "epoch": 0.9671658986175116, + "grad_norm": 0.9688395857810974, + "learning_rate": 1.3405184533662186e-07, + "loss": 0.0881, + "step": 6716 + }, + { + "epoch": 0.9673099078341014, + "grad_norm": 0.4746958911418915, + "learning_rate": 1.328846791178451e-07, + "loss": 0.0689, + "step": 6717 + }, + { + "epoch": 0.9674539170506913, + "grad_norm": 1.1219205856323242, + "learning_rate": 1.3172260275771952e-07, + "loss": 0.0931, + "step": 6718 + }, + { + "epoch": 0.9675979262672811, + "grad_norm": 0.3073391914367676, + "learning_rate": 1.3056561649410493e-07, + "loss": 0.0405, + "step": 6719 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.38481301069259644, + "learning_rate": 1.2941372056381463e-07, + "loss": 0.0469, + "step": 6720 + }, + { + "epoch": 0.9678859447004609, + "grad_norm": 0.8265698552131653, + "learning_rate": 1.2826691520262114e-07, + "loss": 0.0889, + "step": 6721 + }, + { + "epoch": 0.9680299539170507, + "grad_norm": 0.6629499197006226, + "learning_rate": 1.2712520064525613e-07, + "loss": 0.0712, + "step": 6722 + }, + { + "epoch": 0.9681739631336406, + "grad_norm": 1.0074321031570435, + "learning_rate": 1.2598857712540768e-07, + "loss": 0.1041, + "step": 6723 + }, + { + "epoch": 0.9683179723502304, + "grad_norm": 0.9239470958709717, + "learning_rate": 1.2485704487572303e-07, + "loss": 0.0908, + "step": 6724 + }, + { + "epoch": 0.9684619815668203, + "grad_norm": 0.6424285173416138, + "learning_rate": 1.237306041278058e-07, + "loss": 0.0599, + "step": 6725 + }, + { + "epoch": 0.9686059907834101, + "grad_norm": 0.7873046398162842, + "learning_rate": 1.2260925511221877e-07, + "loss": 0.1025, + "step": 6726 + }, + { + "epoch": 0.96875, + "grad_norm": 0.8414080739021301, + "learning_rate": 1.214929980584839e-07, + "loss": 0.0841, + "step": 6727 + }, + { + "epoch": 0.9688940092165899, + "grad_norm": 0.9744022488594055, + "learning_rate": 1.2038183319507955e-07, + "loss": 0.0852, + "step": 6728 + }, + { + "epoch": 0.9690380184331797, + "grad_norm": 0.9843376874923706, + "learning_rate": 1.192757607494377e-07, + "loss": 0.0927, + "step": 6729 + }, + { + "epoch": 0.9691820276497696, + "grad_norm": 0.8413404226303101, + "learning_rate": 1.181747809479522e-07, + "loss": 0.0992, + "step": 6730 + }, + { + "epoch": 0.9693260368663594, + "grad_norm": 0.6687656044960022, + "learning_rate": 1.1707889401597893e-07, + "loss": 0.0755, + "step": 6731 + }, + { + "epoch": 0.9694700460829493, + "grad_norm": 1.2754795551300049, + "learning_rate": 1.1598810017782457e-07, + "loss": 0.1047, + "step": 6732 + }, + { + "epoch": 0.9696140552995391, + "grad_norm": 1.1086647510528564, + "learning_rate": 1.1490239965675221e-07, + "loss": 0.177, + "step": 6733 + }, + { + "epoch": 0.969758064516129, + "grad_norm": 1.5418813228607178, + "learning_rate": 1.1382179267498683e-07, + "loss": 0.1129, + "step": 6734 + }, + { + "epoch": 0.9699020737327189, + "grad_norm": 0.6707262992858887, + "learning_rate": 1.1274627945371263e-07, + "loss": 0.0754, + "step": 6735 + }, + { + "epoch": 0.9700460829493087, + "grad_norm": 0.9281526207923889, + "learning_rate": 1.1167586021306465e-07, + "loss": 4.0914, + "step": 6736 + }, + { + "epoch": 0.9701900921658986, + "grad_norm": 0.7984203100204468, + "learning_rate": 1.1061053517214259e-07, + "loss": 0.0705, + "step": 6737 + }, + { + "epoch": 0.9703341013824884, + "grad_norm": 1.797810435295105, + "learning_rate": 1.0955030454899428e-07, + "loss": 0.1531, + "step": 6738 + }, + { + "epoch": 0.9704781105990783, + "grad_norm": 0.7995812892913818, + "learning_rate": 1.0849516856063502e-07, + "loss": 0.0878, + "step": 6739 + }, + { + "epoch": 0.9706221198156681, + "grad_norm": 5.165927886962891, + "learning_rate": 1.0744512742302815e-07, + "loss": 1.6205, + "step": 6740 + }, + { + "epoch": 0.9707661290322581, + "grad_norm": 1.122226357460022, + "learning_rate": 1.0640018135110174e-07, + "loss": 0.093, + "step": 6741 + }, + { + "epoch": 0.970910138248848, + "grad_norm": 0.9897922277450562, + "learning_rate": 1.053603305587375e-07, + "loss": 0.0968, + "step": 6742 + }, + { + "epoch": 0.9710541474654378, + "grad_norm": 8.346633911132812, + "learning_rate": 1.0432557525877351e-07, + "loss": 1.5359, + "step": 6743 + }, + { + "epoch": 0.9711981566820277, + "grad_norm": 0.45162785053253174, + "learning_rate": 1.03295915663007e-07, + "loss": 0.0542, + "step": 6744 + }, + { + "epoch": 0.9713421658986175, + "grad_norm": 0.855884313583374, + "learning_rate": 1.0227135198218885e-07, + "loss": 0.0966, + "step": 6745 + }, + { + "epoch": 0.9714861751152074, + "grad_norm": 5.821923732757568, + "learning_rate": 1.0125188442603185e-07, + "loss": 2.2514, + "step": 6746 + }, + { + "epoch": 0.9716301843317973, + "grad_norm": 1.1170289516448975, + "learning_rate": 1.002375132032024e-07, + "loss": 0.0883, + "step": 6747 + }, + { + "epoch": 0.9717741935483871, + "grad_norm": 1.5510212182998657, + "learning_rate": 9.922823852132335e-08, + "loss": 0.1296, + "step": 6748 + }, + { + "epoch": 0.971918202764977, + "grad_norm": 0.5434585213661194, + "learning_rate": 9.822406058697664e-08, + "loss": 0.047, + "step": 6749 + }, + { + "epoch": 0.9720622119815668, + "grad_norm": 0.9164115190505981, + "learning_rate": 9.722497960569787e-08, + "loss": 0.0894, + "step": 6750 + }, + { + "epoch": 0.9722062211981567, + "grad_norm": 3.7723278999328613, + "learning_rate": 9.62309957819818e-08, + "loss": 0.8383, + "step": 6751 + }, + { + "epoch": 0.9723502304147466, + "grad_norm": 0.7343043088912964, + "learning_rate": 9.524210931927957e-08, + "loss": 0.0826, + "step": 6752 + }, + { + "epoch": 0.9724942396313364, + "grad_norm": 0.8361272215843201, + "learning_rate": 9.425832041999871e-08, + "loss": 0.0823, + "step": 6753 + }, + { + "epoch": 0.9726382488479263, + "grad_norm": 6.443047523498535, + "learning_rate": 9.327962928550315e-08, + "loss": 1.4752, + "step": 6754 + }, + { + "epoch": 0.9727822580645161, + "grad_norm": 0.3590412735939026, + "learning_rate": 9.230603611611599e-08, + "loss": 0.0399, + "step": 6755 + }, + { + "epoch": 0.972926267281106, + "grad_norm": 1.3677092790603638, + "learning_rate": 9.133754111111114e-08, + "loss": 0.119, + "step": 6756 + }, + { + "epoch": 0.9730702764976958, + "grad_norm": 1.0742532014846802, + "learning_rate": 9.03741444687245e-08, + "loss": 0.1018, + "step": 6757 + }, + { + "epoch": 0.9732142857142857, + "grad_norm": 6.454593181610107, + "learning_rate": 8.941584638614553e-08, + "loss": 1.3236, + "step": 6758 + }, + { + "epoch": 0.9733582949308756, + "grad_norm": 0.9683122634887695, + "learning_rate": 8.846264705952289e-08, + "loss": 0.0576, + "step": 6759 + }, + { + "epoch": 0.9735023041474654, + "grad_norm": 0.672667920589447, + "learning_rate": 8.751454668395608e-08, + "loss": 0.062, + "step": 6760 + }, + { + "epoch": 0.9736463133640553, + "grad_norm": 2.355233907699585, + "learning_rate": 8.657154545350654e-08, + "loss": 0.1697, + "step": 6761 + }, + { + "epoch": 0.9737903225806451, + "grad_norm": 0.709780216217041, + "learning_rate": 8.56336435611893e-08, + "loss": 0.0847, + "step": 6762 + }, + { + "epoch": 0.973934331797235, + "grad_norm": 0.9585935473442078, + "learning_rate": 8.470084119897581e-08, + "loss": 0.096, + "step": 6763 + }, + { + "epoch": 0.9740783410138248, + "grad_norm": 4.016872882843018, + "learning_rate": 8.377313855779668e-08, + "loss": 3.1213, + "step": 6764 + }, + { + "epoch": 0.9742223502304147, + "grad_norm": 0.4826772212982178, + "learning_rate": 8.285053582753332e-08, + "loss": 0.0475, + "step": 6765 + }, + { + "epoch": 0.9743663594470046, + "grad_norm": 6.027637004852295, + "learning_rate": 8.193303319702916e-08, + "loss": 1.0412, + "step": 6766 + }, + { + "epoch": 0.9745103686635944, + "grad_norm": 6.535375118255615, + "learning_rate": 8.102063085407563e-08, + "loss": 1.5551, + "step": 6767 + }, + { + "epoch": 0.9746543778801844, + "grad_norm": 0.7938113808631897, + "learning_rate": 8.011332898543167e-08, + "loss": 0.0777, + "step": 6768 + }, + { + "epoch": 0.9747983870967742, + "grad_norm": 0.6426879167556763, + "learning_rate": 7.92111277768015e-08, + "loss": 0.0789, + "step": 6769 + }, + { + "epoch": 0.9749423963133641, + "grad_norm": 1.531423568725586, + "learning_rate": 7.831402741285409e-08, + "loss": 0.1264, + "step": 6770 + }, + { + "epoch": 0.975086405529954, + "grad_norm": 0.9267395734786987, + "learning_rate": 7.742202807720366e-08, + "loss": 0.0967, + "step": 6771 + }, + { + "epoch": 0.9752304147465438, + "grad_norm": 0.842285692691803, + "learning_rate": 7.653512995243195e-08, + "loss": 0.0851, + "step": 6772 + }, + { + "epoch": 0.9753744239631337, + "grad_norm": 0.9809455871582031, + "learning_rate": 7.565333322006873e-08, + "loss": 0.1143, + "step": 6773 + }, + { + "epoch": 0.9755184331797235, + "grad_norm": 0.8313623070716858, + "learning_rate": 7.477663806060576e-08, + "loss": 0.0757, + "step": 6774 + }, + { + "epoch": 0.9756624423963134, + "grad_norm": 0.6127511262893677, + "learning_rate": 7.390504465348003e-08, + "loss": 0.0702, + "step": 6775 + }, + { + "epoch": 0.9758064516129032, + "grad_norm": 1.176688313484192, + "learning_rate": 7.303855317709884e-08, + "loss": 0.1151, + "step": 6776 + }, + { + "epoch": 0.9759504608294931, + "grad_norm": 1.1486831903457642, + "learning_rate": 7.217716380881479e-08, + "loss": 0.1017, + "step": 6777 + }, + { + "epoch": 0.976094470046083, + "grad_norm": 1.1494383811950684, + "learning_rate": 7.132087672493681e-08, + "loss": 0.1276, + "step": 6778 + }, + { + "epoch": 0.9762384792626728, + "grad_norm": 0.3444020450115204, + "learning_rate": 7.046969210073307e-08, + "loss": 0.0471, + "step": 6779 + }, + { + "epoch": 0.9763824884792627, + "grad_norm": 1.2511495351791382, + "learning_rate": 6.962361011042806e-08, + "loss": 0.1155, + "step": 6780 + }, + { + "epoch": 0.9765264976958525, + "grad_norm": 1.0839293003082275, + "learning_rate": 6.878263092719717e-08, + "loss": 0.1163, + "step": 6781 + }, + { + "epoch": 0.9766705069124424, + "grad_norm": 0.7861602902412415, + "learning_rate": 6.794675472317769e-08, + "loss": 0.1083, + "step": 6782 + }, + { + "epoch": 0.9768145161290323, + "grad_norm": 0.895510196685791, + "learning_rate": 6.711598166945221e-08, + "loss": 0.0972, + "step": 6783 + }, + { + "epoch": 0.9769585253456221, + "grad_norm": 0.9206796884536743, + "learning_rate": 6.629031193607082e-08, + "loss": 0.0933, + "step": 6784 + }, + { + "epoch": 0.977102534562212, + "grad_norm": 0.9865223169326782, + "learning_rate": 6.546974569203446e-08, + "loss": 0.0763, + "step": 6785 + }, + { + "epoch": 0.9772465437788018, + "grad_norm": 0.9689768552780151, + "learning_rate": 6.46542831052921e-08, + "loss": 0.0809, + "step": 6786 + }, + { + "epoch": 0.9773905529953917, + "grad_norm": 0.8001164197921753, + "learning_rate": 6.384392434276021e-08, + "loss": 0.0874, + "step": 6787 + }, + { + "epoch": 0.9775345622119815, + "grad_norm": 1.6492871046066284, + "learning_rate": 6.303866957030058e-08, + "loss": 0.1212, + "step": 6788 + }, + { + "epoch": 0.9776785714285714, + "grad_norm": 0.9315882921218872, + "learning_rate": 6.223851895273969e-08, + "loss": 0.0727, + "step": 6789 + }, + { + "epoch": 0.9778225806451613, + "grad_norm": 0.6280080080032349, + "learning_rate": 6.14434726538493e-08, + "loss": 0.0781, + "step": 6790 + }, + { + "epoch": 0.9779665898617511, + "grad_norm": 0.7705098986625671, + "learning_rate": 6.065353083636594e-08, + "loss": 0.0785, + "step": 6791 + }, + { + "epoch": 0.978110599078341, + "grad_norm": 0.2656269669532776, + "learning_rate": 5.986869366197412e-08, + "loss": 0.045, + "step": 6792 + }, + { + "epoch": 0.9782546082949308, + "grad_norm": 0.635819137096405, + "learning_rate": 5.9088961291314805e-08, + "loss": 0.079, + "step": 6793 + }, + { + "epoch": 0.9783986175115207, + "grad_norm": 0.8759706020355225, + "learning_rate": 5.831433388398811e-08, + "loss": 0.0936, + "step": 6794 + }, + { + "epoch": 0.9785426267281107, + "grad_norm": 1.4099797010421753, + "learning_rate": 5.7544811598544966e-08, + "loss": 0.0961, + "step": 6795 + }, + { + "epoch": 0.9786866359447005, + "grad_norm": 0.6875733733177185, + "learning_rate": 5.6780394592492733e-08, + "loss": 0.0985, + "step": 6796 + }, + { + "epoch": 0.9788306451612904, + "grad_norm": 0.8451463580131531, + "learning_rate": 5.6021083022297917e-08, + "loss": 0.0859, + "step": 6797 + }, + { + "epoch": 0.9789746543778802, + "grad_norm": 1.204497218132019, + "learning_rate": 5.52668770433723e-08, + "loss": 3.4547, + "step": 6798 + }, + { + "epoch": 0.9791186635944701, + "grad_norm": 0.38400599360466003, + "learning_rate": 5.4517776810089625e-08, + "loss": 0.0481, + "step": 6799 + }, + { + "epoch": 0.9792626728110599, + "grad_norm": 0.8395406603813171, + "learning_rate": 5.37737824757828e-08, + "loss": 0.1066, + "step": 6800 + }, + { + "epoch": 0.9794066820276498, + "grad_norm": 0.5504254102706909, + "learning_rate": 5.3034894192727224e-08, + "loss": 0.0796, + "step": 6801 + }, + { + "epoch": 0.9795506912442397, + "grad_norm": 0.504349946975708, + "learning_rate": 5.230111211216582e-08, + "loss": 0.0588, + "step": 6802 + }, + { + "epoch": 0.9796947004608295, + "grad_norm": 0.7115333676338196, + "learning_rate": 5.1572436384289544e-08, + "loss": 0.0801, + "step": 6803 + }, + { + "epoch": 0.9798387096774194, + "grad_norm": 4.769011974334717, + "learning_rate": 5.0848867158242995e-08, + "loss": 0.7941, + "step": 6804 + }, + { + "epoch": 0.9799827188940092, + "grad_norm": 0.8300862908363342, + "learning_rate": 5.013040458212714e-08, + "loss": 0.0875, + "step": 6805 + }, + { + "epoch": 0.9801267281105991, + "grad_norm": 1.1259530782699585, + "learning_rate": 4.94170488030049e-08, + "loss": 0.1048, + "step": 6806 + }, + { + "epoch": 0.980270737327189, + "grad_norm": 0.780486524105072, + "learning_rate": 4.870879996687894e-08, + "loss": 0.0772, + "step": 6807 + }, + { + "epoch": 0.9804147465437788, + "grad_norm": 0.9172807931900024, + "learning_rate": 4.8005658218724934e-08, + "loss": 0.094, + "step": 6808 + }, + { + "epoch": 0.9805587557603687, + "grad_norm": 0.7723053693771362, + "learning_rate": 4.730762370245556e-08, + "loss": 0.0943, + "step": 6809 + }, + { + "epoch": 0.9807027649769585, + "grad_norm": 0.9252755641937256, + "learning_rate": 4.661469656094819e-08, + "loss": 0.1022, + "step": 6810 + }, + { + "epoch": 0.9808467741935484, + "grad_norm": 0.643317699432373, + "learning_rate": 4.592687693603659e-08, + "loss": 0.0711, + "step": 6811 + }, + { + "epoch": 0.9809907834101382, + "grad_norm": 0.9628289937973022, + "learning_rate": 4.524416496849981e-08, + "loss": 0.096, + "step": 6812 + }, + { + "epoch": 0.9811347926267281, + "grad_norm": 0.8869218826293945, + "learning_rate": 4.456656079808163e-08, + "loss": 0.0951, + "step": 6813 + }, + { + "epoch": 0.981278801843318, + "grad_norm": 3.591003179550171, + "learning_rate": 4.3894064563471115e-08, + "loss": 1.7169, + "step": 6814 + }, + { + "epoch": 0.9814228110599078, + "grad_norm": 0.5478070378303528, + "learning_rate": 4.322667640232203e-08, + "loss": 0.0748, + "step": 6815 + }, + { + "epoch": 0.9815668202764977, + "grad_norm": 1.2825103998184204, + "learning_rate": 4.256439645123067e-08, + "loss": 0.0924, + "step": 6816 + }, + { + "epoch": 0.9817108294930875, + "grad_norm": 0.6459072232246399, + "learning_rate": 4.190722484575804e-08, + "loss": 0.071, + "step": 6817 + }, + { + "epoch": 0.9818548387096774, + "grad_norm": 0.9634479284286499, + "learning_rate": 4.125516172041322e-08, + "loss": 0.1014, + "step": 6818 + }, + { + "epoch": 0.9819988479262672, + "grad_norm": 1.1433266401290894, + "learning_rate": 4.060820720866443e-08, + "loss": 0.1021, + "step": 6819 + }, + { + "epoch": 0.9821428571428571, + "grad_norm": 3.2778615951538086, + "learning_rate": 3.9966361442930755e-08, + "loss": 0.9854, + "step": 6820 + }, + { + "epoch": 0.982286866359447, + "grad_norm": 4.981570243835449, + "learning_rate": 3.9329624554584884e-08, + "loss": 1.2444, + "step": 6821 + }, + { + "epoch": 0.9824308755760369, + "grad_norm": 0.7090340852737427, + "learning_rate": 3.869799667395868e-08, + "loss": 0.0922, + "step": 6822 + }, + { + "epoch": 0.9825748847926268, + "grad_norm": 0.5227774381637573, + "learning_rate": 3.807147793033483e-08, + "loss": 0.0694, + "step": 6823 + }, + { + "epoch": 0.9827188940092166, + "grad_norm": 0.8775211572647095, + "learning_rate": 3.745006845194687e-08, + "loss": 0.0982, + "step": 6824 + }, + { + "epoch": 0.9828629032258065, + "grad_norm": 2.9700841903686523, + "learning_rate": 3.683376836599029e-08, + "loss": 1.2453, + "step": 6825 + }, + { + "epoch": 0.9830069124423964, + "grad_norm": 1.0161906480789185, + "learning_rate": 3.6222577798611376e-08, + "loss": 0.1074, + "step": 6826 + }, + { + "epoch": 0.9831509216589862, + "grad_norm": 0.8374365568161011, + "learning_rate": 3.561649687490454e-08, + "loss": 0.0888, + "step": 6827 + }, + { + "epoch": 0.9832949308755761, + "grad_norm": 0.38606804609298706, + "learning_rate": 3.5015525718928854e-08, + "loss": 0.0482, + "step": 6828 + }, + { + "epoch": 0.9834389400921659, + "grad_norm": 0.7983390092849731, + "learning_rate": 3.4419664453694264e-08, + "loss": 0.1021, + "step": 6829 + }, + { + "epoch": 0.9835829493087558, + "grad_norm": 0.6800190806388855, + "learning_rate": 3.3828913201156e-08, + "loss": 0.0786, + "step": 6830 + }, + { + "epoch": 0.9837269585253456, + "grad_norm": 4.489196300506592, + "learning_rate": 3.3243272082236764e-08, + "loss": 1.7441, + "step": 6831 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 0.9224117994308472, + "learning_rate": 3.2662741216801795e-08, + "loss": 0.1065, + "step": 6832 + }, + { + "epoch": 0.9840149769585254, + "grad_norm": 0.9417597055435181, + "learning_rate": 3.208732072368104e-08, + "loss": 0.0622, + "step": 6833 + }, + { + "epoch": 0.9841589861751152, + "grad_norm": 3.376523017883301, + "learning_rate": 3.151701072064694e-08, + "loss": 0.3299, + "step": 6834 + }, + { + "epoch": 0.9843029953917051, + "grad_norm": 0.5116720199584961, + "learning_rate": 3.0951811324436695e-08, + "loss": 0.0601, + "step": 6835 + }, + { + "epoch": 0.9844470046082949, + "grad_norm": 0.8699091672897339, + "learning_rate": 3.039172265073553e-08, + "loss": 0.0682, + "step": 6836 + }, + { + "epoch": 0.9845910138248848, + "grad_norm": 0.6956921219825745, + "learning_rate": 2.9836744814182305e-08, + "loss": 0.0809, + "step": 6837 + }, + { + "epoch": 0.9847350230414746, + "grad_norm": 1.1887785196304321, + "learning_rate": 2.928687792836948e-08, + "loss": 0.084, + "step": 6838 + }, + { + "epoch": 0.9848790322580645, + "grad_norm": 5.5473222732543945, + "learning_rate": 2.8742122105851477e-08, + "loss": 0.8718, + "step": 6839 + }, + { + "epoch": 0.9850230414746544, + "grad_norm": 0.8678961992263794, + "learning_rate": 2.8202477458122435e-08, + "loss": 0.0676, + "step": 6840 + }, + { + "epoch": 0.9851670506912442, + "grad_norm": 0.7098332047462463, + "learning_rate": 2.7667944095643994e-08, + "loss": 0.088, + "step": 6841 + }, + { + "epoch": 0.9853110599078341, + "grad_norm": 0.983818531036377, + "learning_rate": 2.7138522127823084e-08, + "loss": 0.0863, + "step": 6842 + }, + { + "epoch": 0.9854550691244239, + "grad_norm": 0.29925814270973206, + "learning_rate": 2.6614211663023024e-08, + "loss": 0.0611, + "step": 6843 + }, + { + "epoch": 0.9855990783410138, + "grad_norm": 1.7967623472213745, + "learning_rate": 2.6095012808563523e-08, + "loss": 0.1531, + "step": 6844 + }, + { + "epoch": 0.9857430875576036, + "grad_norm": 5.331538677215576, + "learning_rate": 2.5580925670712354e-08, + "loss": 1.6179, + "step": 6845 + }, + { + "epoch": 0.9858870967741935, + "grad_norm": 0.9687462449073792, + "learning_rate": 2.5071950354693675e-08, + "loss": 0.0934, + "step": 6846 + }, + { + "epoch": 0.9860311059907834, + "grad_norm": 3.45524263381958, + "learning_rate": 2.4568086964685267e-08, + "loss": 1.9719, + "step": 6847 + }, + { + "epoch": 0.9861751152073732, + "grad_norm": 0.8311625719070435, + "learning_rate": 2.4069335603824072e-08, + "loss": 0.0747, + "step": 6848 + }, + { + "epoch": 0.9863191244239631, + "grad_norm": 0.26510298252105713, + "learning_rate": 2.3575696374189548e-08, + "loss": 0.0445, + "step": 6849 + }, + { + "epoch": 0.986463133640553, + "grad_norm": 1.3210736513137817, + "learning_rate": 2.3087169376825868e-08, + "loss": 0.1082, + "step": 6850 + }, + { + "epoch": 0.9866071428571429, + "grad_norm": 0.6394079327583313, + "learning_rate": 2.260375471172249e-08, + "loss": 0.0654, + "step": 6851 + }, + { + "epoch": 0.9867511520737328, + "grad_norm": 4.920920372009277, + "learning_rate": 2.2125452477828047e-08, + "loss": 1.5053, + "step": 6852 + }, + { + "epoch": 0.9868951612903226, + "grad_norm": 3.502390146255493, + "learning_rate": 2.165226277303922e-08, + "loss": 0.6068, + "step": 6853 + }, + { + "epoch": 0.9870391705069125, + "grad_norm": 0.9605260491371155, + "learning_rate": 2.1184185694214653e-08, + "loss": 0.0999, + "step": 6854 + }, + { + "epoch": 0.9871831797235023, + "grad_norm": 0.44490939378738403, + "learning_rate": 2.072122133715826e-08, + "loss": 0.0497, + "step": 6855 + }, + { + "epoch": 0.9873271889400922, + "grad_norm": 0.83362877368927, + "learning_rate": 2.026336979663035e-08, + "loss": 0.1115, + "step": 6856 + }, + { + "epoch": 0.987471198156682, + "grad_norm": 0.5649289488792419, + "learning_rate": 1.981063116634485e-08, + "loss": 0.0638, + "step": 6857 + }, + { + "epoch": 0.9876152073732719, + "grad_norm": 0.476527601480484, + "learning_rate": 1.9363005538972078e-08, + "loss": 0.0687, + "step": 6858 + }, + { + "epoch": 0.9877592165898618, + "grad_norm": 4.429252624511719, + "learning_rate": 1.8920493006130413e-08, + "loss": 1.0478, + "step": 6859 + }, + { + "epoch": 0.9879032258064516, + "grad_norm": 4.275373935699463, + "learning_rate": 1.8483093658394624e-08, + "loss": 2.4103, + "step": 6860 + }, + { + "epoch": 0.9880472350230415, + "grad_norm": 1.650784969329834, + "learning_rate": 1.8050807585293095e-08, + "loss": 0.1208, + "step": 6861 + }, + { + "epoch": 0.9881912442396313, + "grad_norm": 0.6140692234039307, + "learning_rate": 1.7623634875307826e-08, + "loss": 0.0696, + "step": 6862 + }, + { + "epoch": 0.9883352534562212, + "grad_norm": 1.0568695068359375, + "learning_rate": 1.7201575615871658e-08, + "loss": 0.1075, + "step": 6863 + }, + { + "epoch": 0.988479262672811, + "grad_norm": 4.941908359527588, + "learning_rate": 1.678462989337659e-08, + "loss": 0.8867, + "step": 6864 + }, + { + "epoch": 0.9886232718894009, + "grad_norm": 0.47531282901763916, + "learning_rate": 1.6372797793159923e-08, + "loss": 0.0584, + "step": 6865 + }, + { + "epoch": 0.9887672811059908, + "grad_norm": 0.8885167837142944, + "learning_rate": 1.596607939951811e-08, + "loss": 0.1288, + "step": 6866 + }, + { + "epoch": 0.9889112903225806, + "grad_norm": 3.484968900680542, + "learning_rate": 1.5564474795698448e-08, + "loss": 2.061, + "step": 6867 + }, + { + "epoch": 0.9890552995391705, + "grad_norm": 0.4211609661579132, + "learning_rate": 1.5167984063901852e-08, + "loss": 0.0423, + "step": 6868 + }, + { + "epoch": 0.9891993087557603, + "grad_norm": 1.0663167238235474, + "learning_rate": 1.4776607285285626e-08, + "loss": 0.1012, + "step": 6869 + }, + { + "epoch": 0.9893433179723502, + "grad_norm": 0.918316662311554, + "learning_rate": 1.4390344539955136e-08, + "loss": 0.0886, + "step": 6870 + }, + { + "epoch": 0.9894873271889401, + "grad_norm": 0.8756847977638245, + "learning_rate": 1.400919590697214e-08, + "loss": 0.0784, + "step": 6871 + }, + { + "epoch": 0.9896313364055299, + "grad_norm": 0.3713900148868561, + "learning_rate": 1.3633161464352007e-08, + "loss": 0.0442, + "step": 6872 + }, + { + "epoch": 0.9897753456221198, + "grad_norm": 0.8275884389877319, + "learning_rate": 1.326224128906095e-08, + "loss": 0.0962, + "step": 6873 + }, + { + "epoch": 0.9899193548387096, + "grad_norm": 0.8011140823364258, + "learning_rate": 1.2896435457021571e-08, + "loss": 0.0603, + "step": 6874 + }, + { + "epoch": 0.9900633640552995, + "grad_norm": 0.8407508134841919, + "learning_rate": 1.2535744043107312e-08, + "loss": 0.0842, + "step": 6875 + }, + { + "epoch": 0.9902073732718893, + "grad_norm": 0.8296141624450684, + "learning_rate": 1.218016712114245e-08, + "loss": 0.0744, + "step": 6876 + }, + { + "epoch": 0.9903513824884793, + "grad_norm": 5.102777481079102, + "learning_rate": 1.1829704763910432e-08, + "loss": 2.6028, + "step": 6877 + }, + { + "epoch": 0.9904953917050692, + "grad_norm": 4.753918170928955, + "learning_rate": 1.1484357043142768e-08, + "loss": 1.8983, + "step": 6878 + }, + { + "epoch": 0.990639400921659, + "grad_norm": 1.2891186475753784, + "learning_rate": 1.1144124029527359e-08, + "loss": 0.1183, + "step": 6879 + }, + { + "epoch": 0.9907834101382489, + "grad_norm": 0.645734965801239, + "learning_rate": 1.0809005792705717e-08, + "loss": 0.071, + "step": 6880 + }, + { + "epoch": 0.9909274193548387, + "grad_norm": 0.7106531262397766, + "learning_rate": 1.0479002401264648e-08, + "loss": 0.0924, + "step": 6881 + }, + { + "epoch": 0.9910714285714286, + "grad_norm": 1.297256588935852, + "learning_rate": 1.0154113922758446e-08, + "loss": 0.1198, + "step": 6882 + }, + { + "epoch": 0.9912154377880185, + "grad_norm": 1.224646806716919, + "learning_rate": 9.834340423678368e-09, + "loss": 0.1302, + "step": 6883 + }, + { + "epoch": 0.9913594470046083, + "grad_norm": 1.3297088146209717, + "learning_rate": 9.519681969480387e-09, + "loss": 0.1094, + "step": 6884 + }, + { + "epoch": 0.9915034562211982, + "grad_norm": 1.3527473211288452, + "learning_rate": 9.210138624568544e-09, + "loss": 0.1202, + "step": 6885 + }, + { + "epoch": 0.991647465437788, + "grad_norm": 0.9184957146644592, + "learning_rate": 8.905710452300487e-09, + "loss": 0.0891, + "step": 6886 + }, + { + "epoch": 0.9917914746543779, + "grad_norm": 0.775843620300293, + "learning_rate": 8.606397514987486e-09, + "loss": 0.0792, + "step": 6887 + }, + { + "epoch": 0.9919354838709677, + "grad_norm": 1.0912812948226929, + "learning_rate": 8.312199873894423e-09, + "loss": 0.1063, + "step": 6888 + }, + { + "epoch": 0.9920794930875576, + "grad_norm": 0.7073888778686523, + "learning_rate": 8.023117589237017e-09, + "loss": 0.0874, + "step": 6889 + }, + { + "epoch": 0.9922235023041475, + "grad_norm": 1.5545682907104492, + "learning_rate": 7.739150720187383e-09, + "loss": 0.1136, + "step": 6890 + }, + { + "epoch": 0.9923675115207373, + "grad_norm": 1.150858759880066, + "learning_rate": 7.460299324865694e-09, + "loss": 0.087, + "step": 6891 + }, + { + "epoch": 0.9925115207373272, + "grad_norm": 4.403290271759033, + "learning_rate": 7.186563460351292e-09, + "loss": 2.2913, + "step": 6892 + }, + { + "epoch": 0.992655529953917, + "grad_norm": 0.6643279194831848, + "learning_rate": 6.917943182668807e-09, + "loss": 0.0784, + "step": 6893 + }, + { + "epoch": 0.9927995391705069, + "grad_norm": 1.0125926733016968, + "learning_rate": 6.6544385468048085e-09, + "loss": 0.1125, + "step": 6894 + }, + { + "epoch": 0.9929435483870968, + "grad_norm": 4.672982215881348, + "learning_rate": 6.396049606688381e-09, + "loss": 1.213, + "step": 6895 + }, + { + "epoch": 0.9930875576036866, + "grad_norm": 0.7798107862472534, + "learning_rate": 6.1427764152133245e-09, + "loss": 0.0784, + "step": 6896 + }, + { + "epoch": 0.9932315668202765, + "grad_norm": 4.957879543304443, + "learning_rate": 5.8946190242159525e-09, + "loss": 1.3072, + "step": 6897 + }, + { + "epoch": 0.9933755760368663, + "grad_norm": 0.8339235782623291, + "learning_rate": 5.651577484491743e-09, + "loss": 0.1043, + "step": 6898 + }, + { + "epoch": 0.9935195852534562, + "grad_norm": 0.4913248121738434, + "learning_rate": 5.413651845787016e-09, + "loss": 0.0451, + "step": 6899 + }, + { + "epoch": 0.993663594470046, + "grad_norm": 0.37946733832359314, + "learning_rate": 5.180842156798926e-09, + "loss": 0.0427, + "step": 6900 + }, + { + "epoch": 0.9938076036866359, + "grad_norm": 0.6951727867126465, + "learning_rate": 4.953148465181023e-09, + "loss": 0.0806, + "step": 6901 + }, + { + "epoch": 0.9939516129032258, + "grad_norm": 0.8225105404853821, + "learning_rate": 4.730570817537694e-09, + "loss": 0.0958, + "step": 6902 + }, + { + "epoch": 0.9940956221198156, + "grad_norm": 1.090965747833252, + "learning_rate": 4.5131092594269396e-09, + "loss": 0.1056, + "step": 6903 + }, + { + "epoch": 0.9942396313364056, + "grad_norm": 3.356872797012329, + "learning_rate": 4.300763835360377e-09, + "loss": 0.5481, + "step": 6904 + }, + { + "epoch": 0.9943836405529954, + "grad_norm": 0.7273156642913818, + "learning_rate": 4.093534588797687e-09, + "loss": 0.0786, + "step": 6905 + }, + { + "epoch": 0.9945276497695853, + "grad_norm": 0.6715142130851746, + "learning_rate": 3.891421562160491e-09, + "loss": 0.062, + "step": 6906 + }, + { + "epoch": 0.9946716589861752, + "grad_norm": 0.6631482243537903, + "learning_rate": 3.694424796812923e-09, + "loss": 0.0929, + "step": 6907 + }, + { + "epoch": 0.994815668202765, + "grad_norm": 3.691619873046875, + "learning_rate": 3.502544333078284e-09, + "loss": 2.073, + "step": 6908 + }, + { + "epoch": 0.9949596774193549, + "grad_norm": 0.46154072880744934, + "learning_rate": 3.3157802102334877e-09, + "loss": 0.0546, + "step": 6909 + }, + { + "epoch": 0.9951036866359447, + "grad_norm": 3.5096936225891113, + "learning_rate": 3.1341324665035144e-09, + "loss": 0.6092, + "step": 6910 + }, + { + "epoch": 0.9952476958525346, + "grad_norm": 4.379936218261719, + "learning_rate": 2.9576011390669567e-09, + "loss": 1.4346, + "step": 6911 + }, + { + "epoch": 0.9953917050691244, + "grad_norm": 0.4555901288986206, + "learning_rate": 2.786186264058799e-09, + "loss": 0.05, + "step": 6912 + }, + { + "epoch": 0.9955357142857143, + "grad_norm": 0.8649787902832031, + "learning_rate": 2.619887876564864e-09, + "loss": 0.0959, + "step": 6913 + }, + { + "epoch": 0.9956797235023042, + "grad_norm": 0.7020791172981262, + "learning_rate": 2.4587060106245895e-09, + "loss": 0.0735, + "step": 6914 + }, + { + "epoch": 0.995823732718894, + "grad_norm": 5.843635559082031, + "learning_rate": 2.3026406992254777e-09, + "loss": 0.9467, + "step": 6915 + }, + { + "epoch": 0.9959677419354839, + "grad_norm": 0.38158074021339417, + "learning_rate": 2.151691974314196e-09, + "loss": 0.0428, + "step": 6916 + }, + { + "epoch": 0.9961117511520737, + "grad_norm": 7.797924518585205, + "learning_rate": 2.0058598667854756e-09, + "loss": 1.5339, + "step": 6917 + }, + { + "epoch": 0.9962557603686636, + "grad_norm": 0.6645235419273376, + "learning_rate": 1.8651444064904377e-09, + "loss": 0.0925, + "step": 6918 + }, + { + "epoch": 0.9963997695852534, + "grad_norm": 4.228973865509033, + "learning_rate": 1.729545622228268e-09, + "loss": 1.6053, + "step": 6919 + }, + { + "epoch": 0.9965437788018433, + "grad_norm": 1.1392529010772705, + "learning_rate": 1.5990635417573174e-09, + "loss": 0.1117, + "step": 6920 + }, + { + "epoch": 0.9966877880184332, + "grad_norm": 0.950932502746582, + "learning_rate": 1.4736981917812253e-09, + "loss": 0.0819, + "step": 6921 + }, + { + "epoch": 0.996831797235023, + "grad_norm": 1.089673638343811, + "learning_rate": 1.353449597962797e-09, + "loss": 0.1055, + "step": 6922 + }, + { + "epoch": 0.9969758064516129, + "grad_norm": 0.6373267769813538, + "learning_rate": 1.2383177849129013e-09, + "loss": 0.0889, + "step": 6923 + }, + { + "epoch": 0.9971198156682027, + "grad_norm": 0.47325238585472107, + "learning_rate": 1.1283027761987975e-09, + "loss": 0.0656, + "step": 6924 + }, + { + "epoch": 0.9972638248847926, + "grad_norm": 0.7107694745063782, + "learning_rate": 1.023404594338584e-09, + "loss": 0.0798, + "step": 6925 + }, + { + "epoch": 0.9974078341013825, + "grad_norm": 0.5448468923568726, + "learning_rate": 9.236232608011986e-10, + "loss": 0.067, + "step": 6926 + }, + { + "epoch": 0.9975518433179723, + "grad_norm": 1.058615803718567, + "learning_rate": 8.289587960119694e-10, + "loss": 0.1125, + "step": 6927 + }, + { + "epoch": 0.9976958525345622, + "grad_norm": 0.8051578402519226, + "learning_rate": 7.394112193470637e-10, + "loss": 0.0994, + "step": 6928 + }, + { + "epoch": 0.997839861751152, + "grad_norm": 0.9503149390220642, + "learning_rate": 6.549805491307126e-10, + "loss": 0.0788, + "step": 6929 + }, + { + "epoch": 0.9979838709677419, + "grad_norm": 0.8057255744934082, + "learning_rate": 5.756668026518642e-10, + "loss": 0.0989, + "step": 6930 + }, + { + "epoch": 0.9981278801843319, + "grad_norm": 0.9445319771766663, + "learning_rate": 5.014699961392033e-10, + "loss": 0.0857, + "step": 6931 + }, + { + "epoch": 0.9982718894009217, + "grad_norm": 0.9044331312179565, + "learning_rate": 4.3239014478058116e-10, + "loss": 0.0928, + "step": 6932 + }, + { + "epoch": 0.9984158986175116, + "grad_norm": 0.2885947823524475, + "learning_rate": 3.684272627174634e-10, + "loss": 0.0466, + "step": 6933 + }, + { + "epoch": 0.9985599078341014, + "grad_norm": 1.0779584646224976, + "learning_rate": 3.095813630421551e-10, + "loss": 0.1093, + "step": 6934 + }, + { + "epoch": 0.9987039170506913, + "grad_norm": 9.967235565185547, + "learning_rate": 2.5585245779502497e-10, + "loss": 1.7871, + "step": 6935 + }, + { + "epoch": 0.9988479262672811, + "grad_norm": 1.2622928619384766, + "learning_rate": 2.072405579756076e-10, + "loss": 4.0534, + "step": 6936 + }, + { + "epoch": 0.998991935483871, + "grad_norm": 0.7092199325561523, + "learning_rate": 1.637456735370524e-10, + "loss": 0.0753, + "step": 6937 + }, + { + "epoch": 0.9991359447004609, + "grad_norm": 1.7479561567306519, + "learning_rate": 1.253678133777969e-10, + "loss": 0.1163, + "step": 6938 + }, + { + "epoch": 0.9992799539170507, + "grad_norm": 4.601142406463623, + "learning_rate": 9.210698535266904e-11, + "loss": 0.6929, + "step": 6939 + }, + { + "epoch": 0.9994239631336406, + "grad_norm": 0.7757306098937988, + "learning_rate": 6.39631962756626e-11, + "loss": 0.0837, + "step": 6940 + }, + { + "epoch": 0.9995679723502304, + "grad_norm": 6.930806636810303, + "learning_rate": 4.093645190050843e-11, + "loss": 1.9002, + "step": 6941 + }, + { + "epoch": 0.9997119815668203, + "grad_norm": 0.5599818229675293, + "learning_rate": 2.3026756942878812e-11, + "loss": 0.049, + "step": 6942 + }, + { + "epoch": 0.9998559907834101, + "grad_norm": 1.1041451692581177, + "learning_rate": 1.0234115069285288e-11, + "loss": 0.1775, + "step": 6943 + }, + { + "epoch": 1.0, + "grad_norm": 3.9122467041015625, + "learning_rate": 2.558528897078638e-12, + "loss": 1.6896, + "step": 6944 + } + ], + "logging_steps": 1, + "max_steps": 6944, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.223820137849856e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}