{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 20, "global_step": 12178, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016424069473813874, "grad_norm": 0.5388180017471313, "learning_rate": 0.0002, "loss": 1.8932, "step": 20 }, { "epoch": 0.003284813894762775, "grad_norm": 0.46543794870376587, "learning_rate": 0.0002, "loss": 1.6701, "step": 40 }, { "epoch": 0.004927220842144162, "grad_norm": 0.45620647072792053, "learning_rate": 0.0002, "loss": 1.5541, "step": 60 }, { "epoch": 0.00656962778952555, "grad_norm": 0.4583057761192322, "learning_rate": 0.0002, "loss": 1.5777, "step": 80 }, { "epoch": 0.008212034736906937, "grad_norm": 0.5295430421829224, "learning_rate": 0.0002, "loss": 1.3046, "step": 100 }, { "epoch": 0.009854441684288324, "grad_norm": 0.44552722573280334, "learning_rate": 0.0002, "loss": 1.3053, "step": 120 }, { "epoch": 0.011496848631669712, "grad_norm": 0.45540332794189453, "learning_rate": 0.0002, "loss": 1.1971, "step": 140 }, { "epoch": 0.0131392555790511, "grad_norm": 0.4302205443382263, "learning_rate": 0.0002, "loss": 1.2143, "step": 160 }, { "epoch": 0.014781662526432487, "grad_norm": 0.4064156413078308, "learning_rate": 0.0002, "loss": 1.1695, "step": 180 }, { "epoch": 0.016424069473813873, "grad_norm": 0.43175607919692993, "learning_rate": 0.0002, "loss": 1.1836, "step": 200 }, { "epoch": 0.01806647642119526, "grad_norm": 0.5280532240867615, "learning_rate": 0.0002, "loss": 1.1627, "step": 220 }, { "epoch": 0.01970888336857665, "grad_norm": 0.4442996382713318, "learning_rate": 0.0002, "loss": 1.2294, "step": 240 }, { "epoch": 0.021351290315958036, "grad_norm": 0.4584205448627472, "learning_rate": 0.0002, "loss": 1.058, "step": 260 }, { "epoch": 0.022993697263339424, "grad_norm": 0.40979012846946716, "learning_rate": 0.0002, "loss": 1.0436, "step": 280 }, { "epoch": 0.02463610421072081, "grad_norm": 0.4241325557231903, "learning_rate": 0.0002, "loss": 1.1414, "step": 300 }, { "epoch": 0.0262785111581022, "grad_norm": 0.4106293022632599, "learning_rate": 0.0002, "loss": 1.0744, "step": 320 }, { "epoch": 0.027920918105483587, "grad_norm": 0.46253764629364014, "learning_rate": 0.0002, "loss": 1.0589, "step": 340 }, { "epoch": 0.029563325052864974, "grad_norm": 0.4244967997074127, "learning_rate": 0.0002, "loss": 1.0263, "step": 360 }, { "epoch": 0.031205732000246362, "grad_norm": 0.35677096247673035, "learning_rate": 0.0002, "loss": 1.0447, "step": 380 }, { "epoch": 0.032848138947627746, "grad_norm": 0.4948490262031555, "learning_rate": 0.0002, "loss": 1.0826, "step": 400 }, { "epoch": 0.034490545895009134, "grad_norm": 0.5756106972694397, "learning_rate": 0.0002, "loss": 0.948, "step": 420 }, { "epoch": 0.03613295284239052, "grad_norm": 0.5383228063583374, "learning_rate": 0.0002, "loss": 1.0025, "step": 440 }, { "epoch": 0.03777535978977191, "grad_norm": 0.3955784738063812, "learning_rate": 0.0002, "loss": 0.9027, "step": 460 }, { "epoch": 0.0394177667371533, "grad_norm": 0.37915533781051636, "learning_rate": 0.0002, "loss": 0.9936, "step": 480 }, { "epoch": 0.041060173684534684, "grad_norm": 0.5413188934326172, "learning_rate": 0.0002, "loss": 0.9077, "step": 500 }, { "epoch": 0.04270258063191607, "grad_norm": 0.5334627032279968, "learning_rate": 0.0002, "loss": 0.9009, "step": 520 }, { "epoch": 0.04434498757929746, "grad_norm": 0.5394805073738098, "learning_rate": 0.0002, "loss": 0.9542, "step": 540 }, { "epoch": 0.04598739452667885, "grad_norm": 0.532177746295929, "learning_rate": 0.0002, "loss": 0.8743, "step": 560 }, { "epoch": 0.047629801474060235, "grad_norm": 0.5266315937042236, "learning_rate": 0.0002, "loss": 0.8931, "step": 580 }, { "epoch": 0.04927220842144162, "grad_norm": 0.4725072979927063, "learning_rate": 0.0002, "loss": 0.908, "step": 600 }, { "epoch": 0.05091461536882301, "grad_norm": 0.6026243567466736, "learning_rate": 0.0002, "loss": 0.7898, "step": 620 }, { "epoch": 0.0525570223162044, "grad_norm": 0.4928111732006073, "learning_rate": 0.0002, "loss": 0.8406, "step": 640 }, { "epoch": 0.054199429263585785, "grad_norm": 0.4555020332336426, "learning_rate": 0.0002, "loss": 0.8222, "step": 660 }, { "epoch": 0.05584183621096717, "grad_norm": 0.6445655822753906, "learning_rate": 0.0002, "loss": 0.832, "step": 680 }, { "epoch": 0.05748424315834856, "grad_norm": 0.5854527950286865, "learning_rate": 0.0002, "loss": 0.8435, "step": 700 }, { "epoch": 0.05912665010572995, "grad_norm": 0.4609089195728302, "learning_rate": 0.0002, "loss": 0.748, "step": 720 }, { "epoch": 0.060769057053111336, "grad_norm": 0.5567362904548645, "learning_rate": 0.0002, "loss": 0.7777, "step": 740 }, { "epoch": 0.062411464000492724, "grad_norm": 0.5161166191101074, "learning_rate": 0.0002, "loss": 0.7597, "step": 760 }, { "epoch": 0.06405387094787411, "grad_norm": 0.5450626611709595, "learning_rate": 0.0002, "loss": 0.7337, "step": 780 }, { "epoch": 0.06569627789525549, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "loss": 0.7668, "step": 800 }, { "epoch": 0.06733868484263689, "grad_norm": 0.4653383493423462, "learning_rate": 0.0002, "loss": 0.7417, "step": 820 }, { "epoch": 0.06898109179001827, "grad_norm": 0.4846251308917999, "learning_rate": 0.0002, "loss": 0.7506, "step": 840 }, { "epoch": 0.07062349873739966, "grad_norm": 0.4887784719467163, "learning_rate": 0.0002, "loss": 0.7115, "step": 860 }, { "epoch": 0.07226590568478104, "grad_norm": 0.5024611949920654, "learning_rate": 0.0002, "loss": 0.7402, "step": 880 }, { "epoch": 0.07390831263216244, "grad_norm": 0.5007764101028442, "learning_rate": 0.0002, "loss": 0.6529, "step": 900 }, { "epoch": 0.07555071957954382, "grad_norm": 0.5097551345825195, "learning_rate": 0.0002, "loss": 0.7776, "step": 920 }, { "epoch": 0.07719312652692521, "grad_norm": 0.5517822504043579, "learning_rate": 0.0002, "loss": 0.6609, "step": 940 }, { "epoch": 0.0788355334743066, "grad_norm": 0.5290623307228088, "learning_rate": 0.0002, "loss": 0.7015, "step": 960 }, { "epoch": 0.08047794042168799, "grad_norm": 0.576545000076294, "learning_rate": 0.0002, "loss": 0.6752, "step": 980 }, { "epoch": 0.08212034736906937, "grad_norm": 0.4689784049987793, "learning_rate": 0.0002, "loss": 0.7047, "step": 1000 }, { "epoch": 0.08376275431645076, "grad_norm": 0.455814003944397, "learning_rate": 0.0002, "loss": 0.6378, "step": 1020 }, { "epoch": 0.08540516126383214, "grad_norm": 0.6452861428260803, "learning_rate": 0.0002, "loss": 0.6962, "step": 1040 }, { "epoch": 0.08704756821121354, "grad_norm": 0.5699702501296997, "learning_rate": 0.0002, "loss": 0.6508, "step": 1060 }, { "epoch": 0.08868997515859492, "grad_norm": 0.5086561441421509, "learning_rate": 0.0002, "loss": 0.6174, "step": 1080 }, { "epoch": 0.09033238210597631, "grad_norm": 0.48543211817741394, "learning_rate": 0.0002, "loss": 0.6261, "step": 1100 }, { "epoch": 0.0919747890533577, "grad_norm": 0.6361482739448547, "learning_rate": 0.0002, "loss": 0.6336, "step": 1120 }, { "epoch": 0.09361719600073909, "grad_norm": 0.5558167695999146, "learning_rate": 0.0002, "loss": 0.6678, "step": 1140 }, { "epoch": 0.09525960294812047, "grad_norm": 0.5599238872528076, "learning_rate": 0.0002, "loss": 0.6169, "step": 1160 }, { "epoch": 0.09690200989550186, "grad_norm": 0.5939186215400696, "learning_rate": 0.0002, "loss": 0.6059, "step": 1180 }, { "epoch": 0.09854441684288325, "grad_norm": 0.5663330554962158, "learning_rate": 0.0002, "loss": 0.5737, "step": 1200 }, { "epoch": 0.10018682379026464, "grad_norm": 0.49742865562438965, "learning_rate": 0.0002, "loss": 0.6013, "step": 1220 }, { "epoch": 0.10182923073764602, "grad_norm": 0.520782470703125, "learning_rate": 0.0002, "loss": 0.5929, "step": 1240 }, { "epoch": 0.1034716376850274, "grad_norm": 0.45269444584846497, "learning_rate": 0.0002, "loss": 0.5981, "step": 1260 }, { "epoch": 0.1051140446324088, "grad_norm": 0.5428550243377686, "learning_rate": 0.0002, "loss": 0.5814, "step": 1280 }, { "epoch": 0.10675645157979018, "grad_norm": 0.4782160818576813, "learning_rate": 0.0002, "loss": 0.5858, "step": 1300 }, { "epoch": 0.10839885852717157, "grad_norm": 0.5338163375854492, "learning_rate": 0.0002, "loss": 0.6255, "step": 1320 }, { "epoch": 0.11004126547455295, "grad_norm": 0.4596363306045532, "learning_rate": 0.0002, "loss": 0.5974, "step": 1340 }, { "epoch": 0.11168367242193435, "grad_norm": 0.5203448534011841, "learning_rate": 0.0002, "loss": 0.5452, "step": 1360 }, { "epoch": 0.11332607936931573, "grad_norm": 0.44463276863098145, "learning_rate": 0.0002, "loss": 0.576, "step": 1380 }, { "epoch": 0.11496848631669712, "grad_norm": 0.5106232762336731, "learning_rate": 0.0002, "loss": 0.5679, "step": 1400 }, { "epoch": 0.1166108932640785, "grad_norm": 0.5451502799987793, "learning_rate": 0.0002, "loss": 0.5673, "step": 1420 }, { "epoch": 0.1182533002114599, "grad_norm": 0.6638749837875366, "learning_rate": 0.0002, "loss": 0.543, "step": 1440 }, { "epoch": 0.11989570715884128, "grad_norm": 0.5045977830886841, "learning_rate": 0.0002, "loss": 0.5803, "step": 1460 }, { "epoch": 0.12153811410622267, "grad_norm": 0.5385071635246277, "learning_rate": 0.0002, "loss": 0.5357, "step": 1480 }, { "epoch": 0.12318052105360405, "grad_norm": 0.43107932806015015, "learning_rate": 0.0002, "loss": 0.5378, "step": 1500 }, { "epoch": 0.12482292800098545, "grad_norm": 0.5887011885643005, "learning_rate": 0.0002, "loss": 0.5594, "step": 1520 }, { "epoch": 0.12646533494836684, "grad_norm": 0.547126829624176, "learning_rate": 0.0002, "loss": 0.5574, "step": 1540 }, { "epoch": 0.12810774189574822, "grad_norm": 0.532454788684845, "learning_rate": 0.0002, "loss": 0.5506, "step": 1560 }, { "epoch": 0.1297501488431296, "grad_norm": 0.592251718044281, "learning_rate": 0.0002, "loss": 0.5206, "step": 1580 }, { "epoch": 0.13139255579051098, "grad_norm": 0.6189798712730408, "learning_rate": 0.0002, "loss": 0.516, "step": 1600 }, { "epoch": 0.1330349627378924, "grad_norm": 0.4614121913909912, "learning_rate": 0.0002, "loss": 0.4948, "step": 1620 }, { "epoch": 0.13467736968527377, "grad_norm": 0.6192139983177185, "learning_rate": 0.0002, "loss": 0.4924, "step": 1640 }, { "epoch": 0.13631977663265515, "grad_norm": 0.5383406281471252, "learning_rate": 0.0002, "loss": 0.4955, "step": 1660 }, { "epoch": 0.13796218358003653, "grad_norm": 0.681564450263977, "learning_rate": 0.0002, "loss": 0.5224, "step": 1680 }, { "epoch": 0.13960459052741794, "grad_norm": 0.51935875415802, "learning_rate": 0.0002, "loss": 0.508, "step": 1700 }, { "epoch": 0.14124699747479932, "grad_norm": 0.532661497592926, "learning_rate": 0.0002, "loss": 0.5362, "step": 1720 }, { "epoch": 0.1428894044221807, "grad_norm": 0.40774333477020264, "learning_rate": 0.0002, "loss": 0.4908, "step": 1740 }, { "epoch": 0.14453181136956209, "grad_norm": 0.6406064033508301, "learning_rate": 0.0002, "loss": 0.4891, "step": 1760 }, { "epoch": 0.1461742183169435, "grad_norm": 0.41497862339019775, "learning_rate": 0.0002, "loss": 0.5234, "step": 1780 }, { "epoch": 0.14781662526432487, "grad_norm": 0.502389132976532, "learning_rate": 0.0002, "loss": 0.459, "step": 1800 }, { "epoch": 0.14945903221170626, "grad_norm": 0.5248283743858337, "learning_rate": 0.0002, "loss": 0.4659, "step": 1820 }, { "epoch": 0.15110143915908764, "grad_norm": 0.5587234497070312, "learning_rate": 0.0002, "loss": 0.4877, "step": 1840 }, { "epoch": 0.15274384610646902, "grad_norm": 0.479913592338562, "learning_rate": 0.0002, "loss": 0.4598, "step": 1860 }, { "epoch": 0.15438625305385043, "grad_norm": 0.5423480272293091, "learning_rate": 0.0002, "loss": 0.4754, "step": 1880 }, { "epoch": 0.1560286600012318, "grad_norm": 0.5485461354255676, "learning_rate": 0.0002, "loss": 0.4681, "step": 1900 }, { "epoch": 0.1576710669486132, "grad_norm": 0.48511844873428345, "learning_rate": 0.0002, "loss": 0.4672, "step": 1920 }, { "epoch": 0.15931347389599457, "grad_norm": 0.49132347106933594, "learning_rate": 0.0002, "loss": 0.4694, "step": 1940 }, { "epoch": 0.16095588084337598, "grad_norm": 0.5654798746109009, "learning_rate": 0.0002, "loss": 0.5047, "step": 1960 }, { "epoch": 0.16259828779075736, "grad_norm": 0.571369469165802, "learning_rate": 0.0002, "loss": 0.4486, "step": 1980 }, { "epoch": 0.16424069473813874, "grad_norm": 0.5438801646232605, "learning_rate": 0.0002, "loss": 0.4756, "step": 2000 }, { "epoch": 0.16588310168552012, "grad_norm": 0.5384829044342041, "learning_rate": 0.0002, "loss": 0.4404, "step": 2020 }, { "epoch": 0.16752550863290153, "grad_norm": 0.5565232634544373, "learning_rate": 0.0002, "loss": 0.4672, "step": 2040 }, { "epoch": 0.1691679155802829, "grad_norm": 0.5227774381637573, "learning_rate": 0.0002, "loss": 0.4452, "step": 2060 }, { "epoch": 0.1708103225276643, "grad_norm": 0.47740334272384644, "learning_rate": 0.0002, "loss": 0.492, "step": 2080 }, { "epoch": 0.17245272947504567, "grad_norm": 0.4206157326698303, "learning_rate": 0.0002, "loss": 0.4517, "step": 2100 }, { "epoch": 0.17409513642242708, "grad_norm": 0.5148787498474121, "learning_rate": 0.0002, "loss": 0.4801, "step": 2120 }, { "epoch": 0.17573754336980846, "grad_norm": 0.4815204441547394, "learning_rate": 0.0002, "loss": 0.4415, "step": 2140 }, { "epoch": 0.17737995031718984, "grad_norm": 0.5302825570106506, "learning_rate": 0.0002, "loss": 0.4558, "step": 2160 }, { "epoch": 0.17902235726457122, "grad_norm": 0.574350118637085, "learning_rate": 0.0002, "loss": 0.4709, "step": 2180 }, { "epoch": 0.18066476421195263, "grad_norm": 0.5393965244293213, "learning_rate": 0.0002, "loss": 0.4528, "step": 2200 }, { "epoch": 0.182307171159334, "grad_norm": 0.43285471200942993, "learning_rate": 0.0002, "loss": 0.4294, "step": 2220 }, { "epoch": 0.1839495781067154, "grad_norm": 0.4550113081932068, "learning_rate": 0.0002, "loss": 0.4395, "step": 2240 }, { "epoch": 0.18559198505409677, "grad_norm": 0.586071789264679, "learning_rate": 0.0002, "loss": 0.4456, "step": 2260 }, { "epoch": 0.18723439200147818, "grad_norm": 0.5634139776229858, "learning_rate": 0.0002, "loss": 0.4295, "step": 2280 }, { "epoch": 0.18887679894885956, "grad_norm": 0.5095311403274536, "learning_rate": 0.0002, "loss": 0.4347, "step": 2300 }, { "epoch": 0.19051920589624094, "grad_norm": 0.6051989793777466, "learning_rate": 0.0002, "loss": 0.4278, "step": 2320 }, { "epoch": 0.19216161284362232, "grad_norm": 0.45743292570114136, "learning_rate": 0.0002, "loss": 0.4191, "step": 2340 }, { "epoch": 0.19380401979100373, "grad_norm": 0.6048611402511597, "learning_rate": 0.0002, "loss": 0.4512, "step": 2360 }, { "epoch": 0.1954464267383851, "grad_norm": 0.495731920003891, "learning_rate": 0.0002, "loss": 0.4087, "step": 2380 }, { "epoch": 0.1970888336857665, "grad_norm": 0.5746319890022278, "learning_rate": 0.0002, "loss": 0.4112, "step": 2400 }, { "epoch": 0.19873124063314787, "grad_norm": 0.4899024963378906, "learning_rate": 0.0002, "loss": 0.4403, "step": 2420 }, { "epoch": 0.20037364758052928, "grad_norm": 0.40732160210609436, "learning_rate": 0.0002, "loss": 0.4281, "step": 2440 }, { "epoch": 0.20201605452791066, "grad_norm": 0.4896198809146881, "learning_rate": 0.0002, "loss": 0.4533, "step": 2460 }, { "epoch": 0.20365846147529204, "grad_norm": 0.5733948349952698, "learning_rate": 0.0002, "loss": 0.4113, "step": 2480 }, { "epoch": 0.20530086842267342, "grad_norm": 0.4565046429634094, "learning_rate": 0.0002, "loss": 0.4237, "step": 2500 }, { "epoch": 0.2069432753700548, "grad_norm": 0.5932797789573669, "learning_rate": 0.0002, "loss": 0.4367, "step": 2520 }, { "epoch": 0.2085856823174362, "grad_norm": 0.5838333368301392, "learning_rate": 0.0002, "loss": 0.4331, "step": 2540 }, { "epoch": 0.2102280892648176, "grad_norm": 0.5022397637367249, "learning_rate": 0.0002, "loss": 0.4004, "step": 2560 }, { "epoch": 0.21187049621219897, "grad_norm": 0.5949686765670776, "learning_rate": 0.0002, "loss": 0.4119, "step": 2580 }, { "epoch": 0.21351290315958035, "grad_norm": 0.45230528712272644, "learning_rate": 0.0002, "loss": 0.4217, "step": 2600 }, { "epoch": 0.21515531010696176, "grad_norm": 0.4186144471168518, "learning_rate": 0.0002, "loss": 0.428, "step": 2620 }, { "epoch": 0.21679771705434314, "grad_norm": 0.5562434196472168, "learning_rate": 0.0002, "loss": 0.394, "step": 2640 }, { "epoch": 0.21844012400172452, "grad_norm": 0.5947513580322266, "learning_rate": 0.0002, "loss": 0.3998, "step": 2660 }, { "epoch": 0.2200825309491059, "grad_norm": 0.4886711835861206, "learning_rate": 0.0002, "loss": 0.389, "step": 2680 }, { "epoch": 0.2217249378964873, "grad_norm": 0.551491379737854, "learning_rate": 0.0002, "loss": 0.3952, "step": 2700 }, { "epoch": 0.2233673448438687, "grad_norm": 0.383627712726593, "learning_rate": 0.0002, "loss": 0.3733, "step": 2720 }, { "epoch": 0.22500975179125007, "grad_norm": 0.45694270730018616, "learning_rate": 0.0002, "loss": 0.4075, "step": 2740 }, { "epoch": 0.22665215873863145, "grad_norm": 0.46876367926597595, "learning_rate": 0.0002, "loss": 0.4135, "step": 2760 }, { "epoch": 0.22829456568601286, "grad_norm": 0.9062886238098145, "learning_rate": 0.0002, "loss": 0.3891, "step": 2780 }, { "epoch": 0.22993697263339424, "grad_norm": 0.47902002930641174, "learning_rate": 0.0002, "loss": 0.405, "step": 2800 }, { "epoch": 0.23157937958077562, "grad_norm": 0.6828575134277344, "learning_rate": 0.0002, "loss": 0.3985, "step": 2820 }, { "epoch": 0.233221786528157, "grad_norm": 0.5411036610603333, "learning_rate": 0.0002, "loss": 0.3658, "step": 2840 }, { "epoch": 0.2348641934755384, "grad_norm": 0.6698014736175537, "learning_rate": 0.0002, "loss": 0.4003, "step": 2860 }, { "epoch": 0.2365066004229198, "grad_norm": 0.5779656171798706, "learning_rate": 0.0002, "loss": 0.4003, "step": 2880 }, { "epoch": 0.23814900737030117, "grad_norm": 0.5321545004844666, "learning_rate": 0.0002, "loss": 0.3667, "step": 2900 }, { "epoch": 0.23979141431768256, "grad_norm": 0.43935510516166687, "learning_rate": 0.0002, "loss": 0.375, "step": 2920 }, { "epoch": 0.24143382126506396, "grad_norm": 0.67582768201828, "learning_rate": 0.0002, "loss": 0.3814, "step": 2940 }, { "epoch": 0.24307622821244534, "grad_norm": 0.6373169422149658, "learning_rate": 0.0002, "loss": 0.4079, "step": 2960 }, { "epoch": 0.24471863515982673, "grad_norm": 0.4568232595920563, "learning_rate": 0.0002, "loss": 0.3821, "step": 2980 }, { "epoch": 0.2463610421072081, "grad_norm": 0.5706847310066223, "learning_rate": 0.0002, "loss": 0.3745, "step": 3000 }, { "epoch": 0.24800344905458951, "grad_norm": 0.5293543338775635, "learning_rate": 0.0002, "loss": 0.3945, "step": 3020 }, { "epoch": 0.2496458560019709, "grad_norm": 0.5566920042037964, "learning_rate": 0.0002, "loss": 0.3739, "step": 3040 }, { "epoch": 0.2512882629493523, "grad_norm": 0.5758338570594788, "learning_rate": 0.0002, "loss": 0.4115, "step": 3060 }, { "epoch": 0.2529306698967337, "grad_norm": 0.5503116250038147, "learning_rate": 0.0002, "loss": 0.3841, "step": 3080 }, { "epoch": 0.25457307684411506, "grad_norm": 0.5829768776893616, "learning_rate": 0.0002, "loss": 0.3679, "step": 3100 }, { "epoch": 0.25621548379149645, "grad_norm": 0.4771459400653839, "learning_rate": 0.0002, "loss": 0.3787, "step": 3120 }, { "epoch": 0.2578578907388778, "grad_norm": 0.508679986000061, "learning_rate": 0.0002, "loss": 0.3424, "step": 3140 }, { "epoch": 0.2595002976862592, "grad_norm": 0.5478394031524658, "learning_rate": 0.0002, "loss": 0.3616, "step": 3160 }, { "epoch": 0.2611427046336406, "grad_norm": 0.48918816447257996, "learning_rate": 0.0002, "loss": 0.364, "step": 3180 }, { "epoch": 0.26278511158102197, "grad_norm": 0.6158058047294617, "learning_rate": 0.0002, "loss": 0.3563, "step": 3200 }, { "epoch": 0.26442751852840335, "grad_norm": 0.6302765607833862, "learning_rate": 0.0002, "loss": 0.3472, "step": 3220 }, { "epoch": 0.2660699254757848, "grad_norm": 0.42650097608566284, "learning_rate": 0.0002, "loss": 0.374, "step": 3240 }, { "epoch": 0.26771233242316617, "grad_norm": 0.5517419576644897, "learning_rate": 0.0002, "loss": 0.3747, "step": 3260 }, { "epoch": 0.26935473937054755, "grad_norm": 0.5887686014175415, "learning_rate": 0.0002, "loss": 0.3655, "step": 3280 }, { "epoch": 0.2709971463179289, "grad_norm": 0.5252538323402405, "learning_rate": 0.0002, "loss": 0.3864, "step": 3300 }, { "epoch": 0.2726395532653103, "grad_norm": 0.4829944968223572, "learning_rate": 0.0002, "loss": 0.3526, "step": 3320 }, { "epoch": 0.2742819602126917, "grad_norm": 0.4375133216381073, "learning_rate": 0.0002, "loss": 0.3536, "step": 3340 }, { "epoch": 0.27592436716007307, "grad_norm": 0.5371789336204529, "learning_rate": 0.0002, "loss": 0.3501, "step": 3360 }, { "epoch": 0.27756677410745445, "grad_norm": 0.44075456261634827, "learning_rate": 0.0002, "loss": 0.3584, "step": 3380 }, { "epoch": 0.2792091810548359, "grad_norm": 0.53825443983078, "learning_rate": 0.0002, "loss": 0.3304, "step": 3400 }, { "epoch": 0.28085158800221727, "grad_norm": 0.48521581292152405, "learning_rate": 0.0002, "loss": 0.3588, "step": 3420 }, { "epoch": 0.28249399494959865, "grad_norm": 0.4189339578151703, "learning_rate": 0.0002, "loss": 0.3556, "step": 3440 }, { "epoch": 0.28413640189698003, "grad_norm": 0.4011813700199127, "learning_rate": 0.0002, "loss": 0.3403, "step": 3460 }, { "epoch": 0.2857788088443614, "grad_norm": 0.4910661280155182, "learning_rate": 0.0002, "loss": 0.3897, "step": 3480 }, { "epoch": 0.2874212157917428, "grad_norm": 0.5664734840393066, "learning_rate": 0.0002, "loss": 0.3503, "step": 3500 }, { "epoch": 0.28906362273912417, "grad_norm": 0.45044422149658203, "learning_rate": 0.0002, "loss": 0.3357, "step": 3520 }, { "epoch": 0.29070602968650555, "grad_norm": 0.6162013411521912, "learning_rate": 0.0002, "loss": 0.3827, "step": 3540 }, { "epoch": 0.292348436633887, "grad_norm": 0.428659588098526, "learning_rate": 0.0002, "loss": 0.3418, "step": 3560 }, { "epoch": 0.29399084358126837, "grad_norm": 0.48843899369239807, "learning_rate": 0.0002, "loss": 0.3695, "step": 3580 }, { "epoch": 0.29563325052864975, "grad_norm": 0.5662574768066406, "learning_rate": 0.0002, "loss": 0.3418, "step": 3600 }, { "epoch": 0.29727565747603113, "grad_norm": 0.5488101243972778, "learning_rate": 0.0002, "loss": 0.3619, "step": 3620 }, { "epoch": 0.2989180644234125, "grad_norm": 0.4078102111816406, "learning_rate": 0.0002, "loss": 0.3339, "step": 3640 }, { "epoch": 0.3005604713707939, "grad_norm": 0.6991748213768005, "learning_rate": 0.0002, "loss": 0.3653, "step": 3660 }, { "epoch": 0.30220287831817527, "grad_norm": 0.4532040059566498, "learning_rate": 0.0002, "loss": 0.343, "step": 3680 }, { "epoch": 0.30384528526555665, "grad_norm": 0.47306913137435913, "learning_rate": 0.0002, "loss": 0.3551, "step": 3700 }, { "epoch": 0.30548769221293803, "grad_norm": 0.4408378303050995, "learning_rate": 0.0002, "loss": 0.3441, "step": 3720 }, { "epoch": 0.30713009916031947, "grad_norm": 0.5125454068183899, "learning_rate": 0.0002, "loss": 0.3578, "step": 3740 }, { "epoch": 0.30877250610770085, "grad_norm": 0.5483905076980591, "learning_rate": 0.0002, "loss": 0.3344, "step": 3760 }, { "epoch": 0.31041491305508223, "grad_norm": 0.3780999779701233, "learning_rate": 0.0002, "loss": 0.3491, "step": 3780 }, { "epoch": 0.3120573200024636, "grad_norm": 0.4443167746067047, "learning_rate": 0.0002, "loss": 0.3406, "step": 3800 }, { "epoch": 0.313699726949845, "grad_norm": 0.5337740182876587, "learning_rate": 0.0002, "loss": 0.3369, "step": 3820 }, { "epoch": 0.3153421338972264, "grad_norm": 0.5371155738830566, "learning_rate": 0.0002, "loss": 0.3579, "step": 3840 }, { "epoch": 0.31698454084460775, "grad_norm": 0.49183839559555054, "learning_rate": 0.0002, "loss": 0.3359, "step": 3860 }, { "epoch": 0.31862694779198913, "grad_norm": 0.5076944828033447, "learning_rate": 0.0002, "loss": 0.3604, "step": 3880 }, { "epoch": 0.32026935473937057, "grad_norm": 0.5076488256454468, "learning_rate": 0.0002, "loss": 0.3373, "step": 3900 }, { "epoch": 0.32191176168675195, "grad_norm": 0.519506573677063, "learning_rate": 0.0002, "loss": 0.3529, "step": 3920 }, { "epoch": 0.32355416863413333, "grad_norm": 0.3967176079750061, "learning_rate": 0.0002, "loss": 0.3203, "step": 3940 }, { "epoch": 0.3251965755815147, "grad_norm": 0.5084711313247681, "learning_rate": 0.0002, "loss": 0.3323, "step": 3960 }, { "epoch": 0.3268389825288961, "grad_norm": 0.5324501991271973, "learning_rate": 0.0002, "loss": 0.3351, "step": 3980 }, { "epoch": 0.3284813894762775, "grad_norm": 0.4679279923439026, "learning_rate": 0.0002, "loss": 0.322, "step": 4000 }, { "epoch": 0.33012379642365886, "grad_norm": 0.5273401737213135, "learning_rate": 0.0002, "loss": 0.358, "step": 4020 }, { "epoch": 0.33176620337104024, "grad_norm": 0.560130774974823, "learning_rate": 0.0002, "loss": 0.3252, "step": 4040 }, { "epoch": 0.33340861031842167, "grad_norm": 0.7334967851638794, "learning_rate": 0.0002, "loss": 0.3125, "step": 4060 }, { "epoch": 0.33505101726580305, "grad_norm": 0.448902428150177, "learning_rate": 0.0002, "loss": 0.3337, "step": 4080 }, { "epoch": 0.33669342421318443, "grad_norm": 0.42839765548706055, "learning_rate": 0.0002, "loss": 0.3332, "step": 4100 }, { "epoch": 0.3383358311605658, "grad_norm": 0.43117448687553406, "learning_rate": 0.0002, "loss": 0.3204, "step": 4120 }, { "epoch": 0.3399782381079472, "grad_norm": 0.4213992953300476, "learning_rate": 0.0002, "loss": 0.3421, "step": 4140 }, { "epoch": 0.3416206450553286, "grad_norm": 0.40054526925086975, "learning_rate": 0.0002, "loss": 0.3115, "step": 4160 }, { "epoch": 0.34326305200270996, "grad_norm": 0.5090795159339905, "learning_rate": 0.0002, "loss": 0.3324, "step": 4180 }, { "epoch": 0.34490545895009134, "grad_norm": 0.5156223177909851, "learning_rate": 0.0002, "loss": 0.3186, "step": 4200 }, { "epoch": 0.3465478658974728, "grad_norm": 0.4297846555709839, "learning_rate": 0.0002, "loss": 0.312, "step": 4220 }, { "epoch": 0.34819027284485415, "grad_norm": 0.4857240617275238, "learning_rate": 0.0002, "loss": 0.3202, "step": 4240 }, { "epoch": 0.34983267979223553, "grad_norm": 0.6078678965568542, "learning_rate": 0.0002, "loss": 0.3329, "step": 4260 }, { "epoch": 0.3514750867396169, "grad_norm": 0.5576339364051819, "learning_rate": 0.0002, "loss": 0.333, "step": 4280 }, { "epoch": 0.3531174936869983, "grad_norm": 0.5340404510498047, "learning_rate": 0.0002, "loss": 0.3367, "step": 4300 }, { "epoch": 0.3547599006343797, "grad_norm": 0.5187095999717712, "learning_rate": 0.0002, "loss": 0.3579, "step": 4320 }, { "epoch": 0.35640230758176106, "grad_norm": 0.4246378540992737, "learning_rate": 0.0002, "loss": 0.3281, "step": 4340 }, { "epoch": 0.35804471452914244, "grad_norm": 0.6137174963951111, "learning_rate": 0.0002, "loss": 0.3248, "step": 4360 }, { "epoch": 0.3596871214765238, "grad_norm": 0.44220972061157227, "learning_rate": 0.0002, "loss": 0.3267, "step": 4380 }, { "epoch": 0.36132952842390526, "grad_norm": 0.4254567325115204, "learning_rate": 0.0002, "loss": 0.315, "step": 4400 }, { "epoch": 0.36297193537128664, "grad_norm": 0.66693115234375, "learning_rate": 0.0002, "loss": 0.3354, "step": 4420 }, { "epoch": 0.364614342318668, "grad_norm": 0.5646852254867554, "learning_rate": 0.0002, "loss": 0.3275, "step": 4440 }, { "epoch": 0.3662567492660494, "grad_norm": 0.525794506072998, "learning_rate": 0.0002, "loss": 0.3095, "step": 4460 }, { "epoch": 0.3678991562134308, "grad_norm": 0.5454958081245422, "learning_rate": 0.0002, "loss": 0.3177, "step": 4480 }, { "epoch": 0.36954156316081216, "grad_norm": 0.5054097771644592, "learning_rate": 0.0002, "loss": 0.3291, "step": 4500 }, { "epoch": 0.37118397010819354, "grad_norm": 0.45259889960289, "learning_rate": 0.0002, "loss": 0.3309, "step": 4520 }, { "epoch": 0.3728263770555749, "grad_norm": 0.4160098135471344, "learning_rate": 0.0002, "loss": 0.3416, "step": 4540 }, { "epoch": 0.37446878400295636, "grad_norm": 0.36465033888816833, "learning_rate": 0.0002, "loss": 0.3244, "step": 4560 }, { "epoch": 0.37611119095033774, "grad_norm": 0.3822501301765442, "learning_rate": 0.0002, "loss": 0.3163, "step": 4580 }, { "epoch": 0.3777535978977191, "grad_norm": 0.4484947621822357, "learning_rate": 0.0002, "loss": 0.3186, "step": 4600 }, { "epoch": 0.3793960048451005, "grad_norm": 0.481303334236145, "learning_rate": 0.0002, "loss": 0.3202, "step": 4620 }, { "epoch": 0.3810384117924819, "grad_norm": 0.5275722742080688, "learning_rate": 0.0002, "loss": 0.319, "step": 4640 }, { "epoch": 0.38268081873986326, "grad_norm": 0.5782263278961182, "learning_rate": 0.0002, "loss": 0.327, "step": 4660 }, { "epoch": 0.38432322568724464, "grad_norm": 0.511466920375824, "learning_rate": 0.0002, "loss": 0.3176, "step": 4680 }, { "epoch": 0.385965632634626, "grad_norm": 0.5383144617080688, "learning_rate": 0.0002, "loss": 0.3215, "step": 4700 }, { "epoch": 0.38760803958200746, "grad_norm": 0.47731462121009827, "learning_rate": 0.0002, "loss": 0.3184, "step": 4720 }, { "epoch": 0.38925044652938884, "grad_norm": 0.43928396701812744, "learning_rate": 0.0002, "loss": 0.2998, "step": 4740 }, { "epoch": 0.3908928534767702, "grad_norm": 0.47170737385749817, "learning_rate": 0.0002, "loss": 0.3211, "step": 4760 }, { "epoch": 0.3925352604241516, "grad_norm": 0.39744389057159424, "learning_rate": 0.0002, "loss": 0.3119, "step": 4780 }, { "epoch": 0.394177667371533, "grad_norm": 0.4669509828090668, "learning_rate": 0.0002, "loss": 0.2965, "step": 4800 }, { "epoch": 0.39582007431891436, "grad_norm": 0.4926499128341675, "learning_rate": 0.0002, "loss": 0.2996, "step": 4820 }, { "epoch": 0.39746248126629574, "grad_norm": 0.4818594455718994, "learning_rate": 0.0002, "loss": 0.3116, "step": 4840 }, { "epoch": 0.3991048882136771, "grad_norm": 0.4344610571861267, "learning_rate": 0.0002, "loss": 0.2884, "step": 4860 }, { "epoch": 0.40074729516105856, "grad_norm": 0.3993249535560608, "learning_rate": 0.0002, "loss": 0.3096, "step": 4880 }, { "epoch": 0.40238970210843994, "grad_norm": 0.4467979967594147, "learning_rate": 0.0002, "loss": 0.2976, "step": 4900 }, { "epoch": 0.4040321090558213, "grad_norm": 0.5102105736732483, "learning_rate": 0.0002, "loss": 0.3005, "step": 4920 }, { "epoch": 0.4056745160032027, "grad_norm": 0.49601197242736816, "learning_rate": 0.0002, "loss": 0.2983, "step": 4940 }, { "epoch": 0.4073169229505841, "grad_norm": 0.39463695883750916, "learning_rate": 0.0002, "loss": 0.3071, "step": 4960 }, { "epoch": 0.40895932989796546, "grad_norm": 0.5963265299797058, "learning_rate": 0.0002, "loss": 0.3017, "step": 4980 }, { "epoch": 0.41060173684534684, "grad_norm": 0.5571741461753845, "learning_rate": 0.0002, "loss": 0.312, "step": 5000 }, { "epoch": 0.4122441437927282, "grad_norm": 0.430397629737854, "learning_rate": 0.0002, "loss": 0.3077, "step": 5020 }, { "epoch": 0.4138865507401096, "grad_norm": 0.5038132667541504, "learning_rate": 0.0002, "loss": 0.3065, "step": 5040 }, { "epoch": 0.41552895768749104, "grad_norm": 0.41420304775238037, "learning_rate": 0.0002, "loss": 0.3061, "step": 5060 }, { "epoch": 0.4171713646348724, "grad_norm": 0.6602872610092163, "learning_rate": 0.0002, "loss": 0.3101, "step": 5080 }, { "epoch": 0.4188137715822538, "grad_norm": 0.46677547693252563, "learning_rate": 0.0002, "loss": 0.3097, "step": 5100 }, { "epoch": 0.4204561785296352, "grad_norm": 0.5312944054603577, "learning_rate": 0.0002, "loss": 0.3136, "step": 5120 }, { "epoch": 0.42209858547701656, "grad_norm": 0.4542620778083801, "learning_rate": 0.0002, "loss": 0.3177, "step": 5140 }, { "epoch": 0.42374099242439794, "grad_norm": 0.5240755081176758, "learning_rate": 0.0002, "loss": 0.3121, "step": 5160 }, { "epoch": 0.4253833993717793, "grad_norm": 0.49393558502197266, "learning_rate": 0.0002, "loss": 0.3145, "step": 5180 }, { "epoch": 0.4270258063191607, "grad_norm": 0.3480128347873688, "learning_rate": 0.0002, "loss": 0.3047, "step": 5200 }, { "epoch": 0.42866821326654214, "grad_norm": 0.4269355833530426, "learning_rate": 0.0002, "loss": 0.3128, "step": 5220 }, { "epoch": 0.4303106202139235, "grad_norm": 0.46620428562164307, "learning_rate": 0.0002, "loss": 0.2892, "step": 5240 }, { "epoch": 0.4319530271613049, "grad_norm": 0.502040684223175, "learning_rate": 0.0002, "loss": 0.2977, "step": 5260 }, { "epoch": 0.4335954341086863, "grad_norm": 0.4725840091705322, "learning_rate": 0.0002, "loss": 0.2926, "step": 5280 }, { "epoch": 0.43523784105606766, "grad_norm": 0.4031844735145569, "learning_rate": 0.0002, "loss": 0.2931, "step": 5300 }, { "epoch": 0.43688024800344905, "grad_norm": 0.5044718384742737, "learning_rate": 0.0002, "loss": 0.2925, "step": 5320 }, { "epoch": 0.4385226549508304, "grad_norm": 0.43350791931152344, "learning_rate": 0.0002, "loss": 0.3064, "step": 5340 }, { "epoch": 0.4401650618982118, "grad_norm": 0.4503776431083679, "learning_rate": 0.0002, "loss": 0.2935, "step": 5360 }, { "epoch": 0.44180746884559324, "grad_norm": 0.4562300145626068, "learning_rate": 0.0002, "loss": 0.2908, "step": 5380 }, { "epoch": 0.4434498757929746, "grad_norm": 0.4543699026107788, "learning_rate": 0.0002, "loss": 0.2971, "step": 5400 }, { "epoch": 0.445092282740356, "grad_norm": 0.45582354068756104, "learning_rate": 0.0002, "loss": 0.3039, "step": 5420 }, { "epoch": 0.4467346896877374, "grad_norm": 0.535355269908905, "learning_rate": 0.0002, "loss": 0.3023, "step": 5440 }, { "epoch": 0.44837709663511877, "grad_norm": 0.6104617118835449, "learning_rate": 0.0002, "loss": 0.3001, "step": 5460 }, { "epoch": 0.45001950358250015, "grad_norm": 0.5111253261566162, "learning_rate": 0.0002, "loss": 0.281, "step": 5480 }, { "epoch": 0.4516619105298815, "grad_norm": 0.49691838026046753, "learning_rate": 0.0002, "loss": 0.3043, "step": 5500 }, { "epoch": 0.4533043174772629, "grad_norm": 0.5030774474143982, "learning_rate": 0.0002, "loss": 0.2963, "step": 5520 }, { "epoch": 0.4549467244246443, "grad_norm": 0.4874095320701599, "learning_rate": 0.0002, "loss": 0.3063, "step": 5540 }, { "epoch": 0.4565891313720257, "grad_norm": 0.4713788330554962, "learning_rate": 0.0002, "loss": 0.2997, "step": 5560 }, { "epoch": 0.4582315383194071, "grad_norm": 0.48497167229652405, "learning_rate": 0.0002, "loss": 0.2936, "step": 5580 }, { "epoch": 0.4598739452667885, "grad_norm": 0.5291727185249329, "learning_rate": 0.0002, "loss": 0.2863, "step": 5600 }, { "epoch": 0.46151635221416987, "grad_norm": 0.5845544934272766, "learning_rate": 0.0002, "loss": 0.2834, "step": 5620 }, { "epoch": 0.46315875916155125, "grad_norm": 0.5052700638771057, "learning_rate": 0.0002, "loss": 0.281, "step": 5640 }, { "epoch": 0.46480116610893263, "grad_norm": 0.47813382744789124, "learning_rate": 0.0002, "loss": 0.2859, "step": 5660 }, { "epoch": 0.466443573056314, "grad_norm": 0.4913572072982788, "learning_rate": 0.0002, "loss": 0.2765, "step": 5680 }, { "epoch": 0.4680859800036954, "grad_norm": 0.5044130086898804, "learning_rate": 0.0002, "loss": 0.3068, "step": 5700 }, { "epoch": 0.4697283869510768, "grad_norm": 0.45967990159988403, "learning_rate": 0.0002, "loss": 0.294, "step": 5720 }, { "epoch": 0.4713707938984582, "grad_norm": 0.4834402799606323, "learning_rate": 0.0002, "loss": 0.2902, "step": 5740 }, { "epoch": 0.4730132008458396, "grad_norm": 0.4889473617076874, "learning_rate": 0.0002, "loss": 0.2931, "step": 5760 }, { "epoch": 0.47465560779322097, "grad_norm": 0.37159985303878784, "learning_rate": 0.0002, "loss": 0.2836, "step": 5780 }, { "epoch": 0.47629801474060235, "grad_norm": 0.44428759813308716, "learning_rate": 0.0002, "loss": 0.2994, "step": 5800 }, { "epoch": 0.47794042168798373, "grad_norm": 0.5093443989753723, "learning_rate": 0.0002, "loss": 0.2943, "step": 5820 }, { "epoch": 0.4795828286353651, "grad_norm": 0.539089024066925, "learning_rate": 0.0002, "loss": 0.2968, "step": 5840 }, { "epoch": 0.4812252355827465, "grad_norm": 0.33726248145103455, "learning_rate": 0.0002, "loss": 0.283, "step": 5860 }, { "epoch": 0.4828676425301279, "grad_norm": 0.451824426651001, "learning_rate": 0.0002, "loss": 0.2824, "step": 5880 }, { "epoch": 0.4845100494775093, "grad_norm": 0.4333132207393646, "learning_rate": 0.0002, "loss": 0.2908, "step": 5900 }, { "epoch": 0.4861524564248907, "grad_norm": 0.4399010241031647, "learning_rate": 0.0002, "loss": 0.2857, "step": 5920 }, { "epoch": 0.48779486337227207, "grad_norm": 0.46633288264274597, "learning_rate": 0.0002, "loss": 0.2796, "step": 5940 }, { "epoch": 0.48943727031965345, "grad_norm": 0.6088176965713501, "learning_rate": 0.0002, "loss": 0.2868, "step": 5960 }, { "epoch": 0.49107967726703483, "grad_norm": 0.5191177129745483, "learning_rate": 0.0002, "loss": 0.2713, "step": 5980 }, { "epoch": 0.4927220842144162, "grad_norm": 0.6080117225646973, "learning_rate": 0.0002, "loss": 0.2925, "step": 6000 }, { "epoch": 0.4943644911617976, "grad_norm": 0.4405871629714966, "learning_rate": 0.0002, "loss": 0.2827, "step": 6020 }, { "epoch": 0.49600689810917903, "grad_norm": 0.44443821907043457, "learning_rate": 0.0002, "loss": 0.2641, "step": 6040 }, { "epoch": 0.4976493050565604, "grad_norm": 0.401265025138855, "learning_rate": 0.0002, "loss": 0.2908, "step": 6060 }, { "epoch": 0.4992917120039418, "grad_norm": 0.4125641882419586, "learning_rate": 0.0002, "loss": 0.2717, "step": 6080 }, { "epoch": 0.5009341189513231, "grad_norm": 0.4346245229244232, "learning_rate": 0.0002, "loss": 0.2706, "step": 6100 }, { "epoch": 0.5025765258987046, "grad_norm": 0.47208690643310547, "learning_rate": 0.0002, "loss": 0.2851, "step": 6120 }, { "epoch": 0.504218932846086, "grad_norm": 0.4369046986103058, "learning_rate": 0.0002, "loss": 0.2809, "step": 6140 }, { "epoch": 0.5058613397934674, "grad_norm": 0.5451960563659668, "learning_rate": 0.0002, "loss": 0.293, "step": 6160 }, { "epoch": 0.5075037467408487, "grad_norm": 0.6085506677627563, "learning_rate": 0.0002, "loss": 0.2748, "step": 6180 }, { "epoch": 0.5091461536882301, "grad_norm": 0.3898778259754181, "learning_rate": 0.0002, "loss": 0.276, "step": 6200 }, { "epoch": 0.5107885606356115, "grad_norm": 0.5069212317466736, "learning_rate": 0.0002, "loss": 0.2925, "step": 6220 }, { "epoch": 0.5124309675829929, "grad_norm": 0.48736870288848877, "learning_rate": 0.0002, "loss": 0.2718, "step": 6240 }, { "epoch": 0.5140733745303743, "grad_norm": 0.5182287693023682, "learning_rate": 0.0002, "loss": 0.2783, "step": 6260 }, { "epoch": 0.5157157814777557, "grad_norm": 0.5157051086425781, "learning_rate": 0.0002, "loss": 0.2828, "step": 6280 }, { "epoch": 0.517358188425137, "grad_norm": 0.4653798043727875, "learning_rate": 0.0002, "loss": 0.2802, "step": 6300 }, { "epoch": 0.5190005953725184, "grad_norm": 0.4838721454143524, "learning_rate": 0.0002, "loss": 0.2758, "step": 6320 }, { "epoch": 0.5206430023198998, "grad_norm": 0.47830331325531006, "learning_rate": 0.0002, "loss": 0.2999, "step": 6340 }, { "epoch": 0.5222854092672812, "grad_norm": 0.45021089911460876, "learning_rate": 0.0002, "loss": 0.2673, "step": 6360 }, { "epoch": 0.5239278162146626, "grad_norm": 0.4527071714401245, "learning_rate": 0.0002, "loss": 0.2624, "step": 6380 }, { "epoch": 0.5255702231620439, "grad_norm": 0.508590817451477, "learning_rate": 0.0002, "loss": 0.2555, "step": 6400 }, { "epoch": 0.5272126301094253, "grad_norm": 0.38745129108428955, "learning_rate": 0.0002, "loss": 0.2863, "step": 6420 }, { "epoch": 0.5288550370568067, "grad_norm": 0.6669766902923584, "learning_rate": 0.0002, "loss": 0.2813, "step": 6440 }, { "epoch": 0.5304974440041882, "grad_norm": 0.5111877918243408, "learning_rate": 0.0002, "loss": 0.2712, "step": 6460 }, { "epoch": 0.5321398509515696, "grad_norm": 0.5499460697174072, "learning_rate": 0.0002, "loss": 0.2656, "step": 6480 }, { "epoch": 0.533782257898951, "grad_norm": 0.5004873275756836, "learning_rate": 0.0002, "loss": 0.2873, "step": 6500 }, { "epoch": 0.5354246648463323, "grad_norm": 0.6010814309120178, "learning_rate": 0.0002, "loss": 0.3005, "step": 6520 }, { "epoch": 0.5370670717937137, "grad_norm": 0.4720690846443176, "learning_rate": 0.0002, "loss": 0.2675, "step": 6540 }, { "epoch": 0.5387094787410951, "grad_norm": 0.47902727127075195, "learning_rate": 0.0002, "loss": 0.2715, "step": 6560 }, { "epoch": 0.5403518856884765, "grad_norm": 0.46664199233055115, "learning_rate": 0.0002, "loss": 0.2713, "step": 6580 }, { "epoch": 0.5419942926358579, "grad_norm": 0.5385149121284485, "learning_rate": 0.0002, "loss": 0.2867, "step": 6600 }, { "epoch": 0.5436366995832392, "grad_norm": 0.3878926932811737, "learning_rate": 0.0002, "loss": 0.2802, "step": 6620 }, { "epoch": 0.5452791065306206, "grad_norm": 0.390656054019928, "learning_rate": 0.0002, "loss": 0.2676, "step": 6640 }, { "epoch": 0.546921513478002, "grad_norm": 0.4342198669910431, "learning_rate": 0.0002, "loss": 0.2874, "step": 6660 }, { "epoch": 0.5485639204253834, "grad_norm": 0.42557764053344727, "learning_rate": 0.0002, "loss": 0.2829, "step": 6680 }, { "epoch": 0.5502063273727648, "grad_norm": 0.5569108128547668, "learning_rate": 0.0002, "loss": 0.2929, "step": 6700 }, { "epoch": 0.5518487343201461, "grad_norm": 0.38765788078308105, "learning_rate": 0.0002, "loss": 0.2804, "step": 6720 }, { "epoch": 0.5534911412675275, "grad_norm": 0.5068329572677612, "learning_rate": 0.0002, "loss": 0.2629, "step": 6740 }, { "epoch": 0.5551335482149089, "grad_norm": 0.5097832083702087, "learning_rate": 0.0002, "loss": 0.2846, "step": 6760 }, { "epoch": 0.5567759551622903, "grad_norm": 0.37154141068458557, "learning_rate": 0.0002, "loss": 0.2625, "step": 6780 }, { "epoch": 0.5584183621096718, "grad_norm": 0.41640445590019226, "learning_rate": 0.0002, "loss": 0.2669, "step": 6800 }, { "epoch": 0.5600607690570532, "grad_norm": 0.45431575179100037, "learning_rate": 0.0002, "loss": 0.2644, "step": 6820 }, { "epoch": 0.5617031760044345, "grad_norm": 0.46759283542633057, "learning_rate": 0.0002, "loss": 0.2742, "step": 6840 }, { "epoch": 0.5633455829518159, "grad_norm": 0.4959569275379181, "learning_rate": 0.0002, "loss": 0.2746, "step": 6860 }, { "epoch": 0.5649879898991973, "grad_norm": 0.44646400213241577, "learning_rate": 0.0002, "loss": 0.2803, "step": 6880 }, { "epoch": 0.5666303968465787, "grad_norm": 0.5323026180267334, "learning_rate": 0.0002, "loss": 0.2685, "step": 6900 }, { "epoch": 0.5682728037939601, "grad_norm": 0.5455038547515869, "learning_rate": 0.0002, "loss": 0.2737, "step": 6920 }, { "epoch": 0.5699152107413414, "grad_norm": 0.429975301027298, "learning_rate": 0.0002, "loss": 0.2826, "step": 6940 }, { "epoch": 0.5715576176887228, "grad_norm": 0.5396720170974731, "learning_rate": 0.0002, "loss": 0.266, "step": 6960 }, { "epoch": 0.5732000246361042, "grad_norm": 0.45468002557754517, "learning_rate": 0.0002, "loss": 0.2676, "step": 6980 }, { "epoch": 0.5748424315834856, "grad_norm": 0.4196678698062897, "learning_rate": 0.0002, "loss": 0.2786, "step": 7000 }, { "epoch": 0.576484838530867, "grad_norm": 0.4681088328361511, "learning_rate": 0.0002, "loss": 0.2731, "step": 7020 }, { "epoch": 0.5781272454782483, "grad_norm": 0.4538247287273407, "learning_rate": 0.0002, "loss": 0.287, "step": 7040 }, { "epoch": 0.5797696524256297, "grad_norm": 0.4834930896759033, "learning_rate": 0.0002, "loss": 0.2808, "step": 7060 }, { "epoch": 0.5814120593730111, "grad_norm": 0.5876035690307617, "learning_rate": 0.0002, "loss": 0.2631, "step": 7080 }, { "epoch": 0.5830544663203925, "grad_norm": 0.5164270401000977, "learning_rate": 0.0002, "loss": 0.2502, "step": 7100 }, { "epoch": 0.584696873267774, "grad_norm": 0.46229973435401917, "learning_rate": 0.0002, "loss": 0.2575, "step": 7120 }, { "epoch": 0.5863392802151554, "grad_norm": 0.438803106546402, "learning_rate": 0.0002, "loss": 0.2625, "step": 7140 }, { "epoch": 0.5879816871625367, "grad_norm": 0.5476749539375305, "learning_rate": 0.0002, "loss": 0.2706, "step": 7160 }, { "epoch": 0.5896240941099181, "grad_norm": 0.5194425582885742, "learning_rate": 0.0002, "loss": 0.2766, "step": 7180 }, { "epoch": 0.5912665010572995, "grad_norm": 0.4764098525047302, "learning_rate": 0.0002, "loss": 0.2784, "step": 7200 }, { "epoch": 0.5929089080046809, "grad_norm": 0.4703931510448456, "learning_rate": 0.0002, "loss": 0.2652, "step": 7220 }, { "epoch": 0.5945513149520623, "grad_norm": 0.43372678756713867, "learning_rate": 0.0002, "loss": 0.2644, "step": 7240 }, { "epoch": 0.5961937218994436, "grad_norm": 0.40813469886779785, "learning_rate": 0.0002, "loss": 0.2721, "step": 7260 }, { "epoch": 0.597836128846825, "grad_norm": 0.5182124376296997, "learning_rate": 0.0002, "loss": 0.2741, "step": 7280 }, { "epoch": 0.5994785357942064, "grad_norm": 0.4767136573791504, "learning_rate": 0.0002, "loss": 0.277, "step": 7300 }, { "epoch": 0.6011209427415878, "grad_norm": 0.43762916326522827, "learning_rate": 0.0002, "loss": 0.2645, "step": 7320 }, { "epoch": 0.6027633496889692, "grad_norm": 0.44736623764038086, "learning_rate": 0.0002, "loss": 0.2639, "step": 7340 }, { "epoch": 0.6044057566363505, "grad_norm": 0.44404810667037964, "learning_rate": 0.0002, "loss": 0.269, "step": 7360 }, { "epoch": 0.6060481635837319, "grad_norm": 0.4380868673324585, "learning_rate": 0.0002, "loss": 0.2615, "step": 7380 }, { "epoch": 0.6076905705311133, "grad_norm": 0.4491208791732788, "learning_rate": 0.0002, "loss": 0.2462, "step": 7400 }, { "epoch": 0.6093329774784947, "grad_norm": 0.5080710053443909, "learning_rate": 0.0002, "loss": 0.2823, "step": 7420 }, { "epoch": 0.6109753844258761, "grad_norm": 0.47498422861099243, "learning_rate": 0.0002, "loss": 0.2706, "step": 7440 }, { "epoch": 0.6126177913732576, "grad_norm": 0.4133289158344269, "learning_rate": 0.0002, "loss": 0.2684, "step": 7460 }, { "epoch": 0.6142601983206389, "grad_norm": 0.4456469416618347, "learning_rate": 0.0002, "loss": 0.2542, "step": 7480 }, { "epoch": 0.6159026052680203, "grad_norm": 0.5421611070632935, "learning_rate": 0.0002, "loss": 0.2737, "step": 7500 }, { "epoch": 0.6175450122154017, "grad_norm": 0.4131532609462738, "learning_rate": 0.0002, "loss": 0.2507, "step": 7520 }, { "epoch": 0.6191874191627831, "grad_norm": 0.47127702832221985, "learning_rate": 0.0002, "loss": 0.2819, "step": 7540 }, { "epoch": 0.6208298261101645, "grad_norm": 0.43743231892585754, "learning_rate": 0.0002, "loss": 0.2822, "step": 7560 }, { "epoch": 0.6224722330575458, "grad_norm": 0.42425501346588135, "learning_rate": 0.0002, "loss": 0.2654, "step": 7580 }, { "epoch": 0.6241146400049272, "grad_norm": 0.4609832763671875, "learning_rate": 0.0002, "loss": 0.2466, "step": 7600 }, { "epoch": 0.6257570469523086, "grad_norm": 0.42701244354248047, "learning_rate": 0.0002, "loss": 0.255, "step": 7620 }, { "epoch": 0.62739945389969, "grad_norm": 0.5154401063919067, "learning_rate": 0.0002, "loss": 0.2705, "step": 7640 }, { "epoch": 0.6290418608470714, "grad_norm": 0.451377809047699, "learning_rate": 0.0002, "loss": 0.2586, "step": 7660 }, { "epoch": 0.6306842677944527, "grad_norm": 0.47166112065315247, "learning_rate": 0.0002, "loss": 0.2605, "step": 7680 }, { "epoch": 0.6323266747418341, "grad_norm": 0.3716096878051758, "learning_rate": 0.0002, "loss": 0.2539, "step": 7700 }, { "epoch": 0.6339690816892155, "grad_norm": 0.45413604378700256, "learning_rate": 0.0002, "loss": 0.2633, "step": 7720 }, { "epoch": 0.6356114886365969, "grad_norm": 0.48580700159072876, "learning_rate": 0.0002, "loss": 0.256, "step": 7740 }, { "epoch": 0.6372538955839783, "grad_norm": 0.40647098422050476, "learning_rate": 0.0002, "loss": 0.2655, "step": 7760 }, { "epoch": 0.6388963025313598, "grad_norm": 0.4718053638935089, "learning_rate": 0.0002, "loss": 0.261, "step": 7780 }, { "epoch": 0.6405387094787411, "grad_norm": 0.5230545401573181, "learning_rate": 0.0002, "loss": 0.2464, "step": 7800 }, { "epoch": 0.6421811164261225, "grad_norm": 0.5010546445846558, "learning_rate": 0.0002, "loss": 0.261, "step": 7820 }, { "epoch": 0.6438235233735039, "grad_norm": 0.41263461112976074, "learning_rate": 0.0002, "loss": 0.2626, "step": 7840 }, { "epoch": 0.6454659303208853, "grad_norm": 0.538346529006958, "learning_rate": 0.0002, "loss": 0.2557, "step": 7860 }, { "epoch": 0.6471083372682667, "grad_norm": 0.4800877869129181, "learning_rate": 0.0002, "loss": 0.2742, "step": 7880 }, { "epoch": 0.648750744215648, "grad_norm": 0.5247358083724976, "learning_rate": 0.0002, "loss": 0.2608, "step": 7900 }, { "epoch": 0.6503931511630294, "grad_norm": 0.5625537037849426, "learning_rate": 0.0002, "loss": 0.2445, "step": 7920 }, { "epoch": 0.6520355581104108, "grad_norm": 0.44077080488204956, "learning_rate": 0.0002, "loss": 0.2572, "step": 7940 }, { "epoch": 0.6536779650577922, "grad_norm": 0.4610736072063446, "learning_rate": 0.0002, "loss": 0.2645, "step": 7960 }, { "epoch": 0.6553203720051736, "grad_norm": 0.4790017008781433, "learning_rate": 0.0002, "loss": 0.2556, "step": 7980 }, { "epoch": 0.656962778952555, "grad_norm": 0.45367711782455444, "learning_rate": 0.0002, "loss": 0.253, "step": 8000 }, { "epoch": 0.6586051858999363, "grad_norm": 0.4644503593444824, "learning_rate": 0.0002, "loss": 0.25, "step": 8020 }, { "epoch": 0.6602475928473177, "grad_norm": 0.3938300311565399, "learning_rate": 0.0002, "loss": 0.2524, "step": 8040 }, { "epoch": 0.6618899997946991, "grad_norm": 0.4796749949455261, "learning_rate": 0.0002, "loss": 0.2643, "step": 8060 }, { "epoch": 0.6635324067420805, "grad_norm": 0.3965921700000763, "learning_rate": 0.0002, "loss": 0.252, "step": 8080 }, { "epoch": 0.6651748136894619, "grad_norm": 0.4033324420452118, "learning_rate": 0.0002, "loss": 0.2469, "step": 8100 }, { "epoch": 0.6668172206368433, "grad_norm": 0.5205174088478088, "learning_rate": 0.0002, "loss": 0.2479, "step": 8120 }, { "epoch": 0.6684596275842247, "grad_norm": 0.4026409685611725, "learning_rate": 0.0002, "loss": 0.2482, "step": 8140 }, { "epoch": 0.6701020345316061, "grad_norm": 0.33538395166397095, "learning_rate": 0.0002, "loss": 0.2452, "step": 8160 }, { "epoch": 0.6717444414789875, "grad_norm": 0.43549609184265137, "learning_rate": 0.0002, "loss": 0.2548, "step": 8180 }, { "epoch": 0.6733868484263689, "grad_norm": 0.5167241096496582, "learning_rate": 0.0002, "loss": 0.2664, "step": 8200 }, { "epoch": 0.6750292553737502, "grad_norm": 0.4824913740158081, "learning_rate": 0.0002, "loss": 0.2668, "step": 8220 }, { "epoch": 0.6766716623211316, "grad_norm": 0.49560844898223877, "learning_rate": 0.0002, "loss": 0.2639, "step": 8240 }, { "epoch": 0.678314069268513, "grad_norm": 0.43627840280532837, "learning_rate": 0.0002, "loss": 0.2536, "step": 8260 }, { "epoch": 0.6799564762158944, "grad_norm": 0.4371199905872345, "learning_rate": 0.0002, "loss": 0.259, "step": 8280 }, { "epoch": 0.6815988831632758, "grad_norm": 0.43210867047309875, "learning_rate": 0.0002, "loss": 0.2413, "step": 8300 }, { "epoch": 0.6832412901106572, "grad_norm": 0.4612789750099182, "learning_rate": 0.0002, "loss": 0.257, "step": 8320 }, { "epoch": 0.6848836970580385, "grad_norm": 0.5780384540557861, "learning_rate": 0.0002, "loss": 0.2497, "step": 8340 }, { "epoch": 0.6865261040054199, "grad_norm": 0.3581444323062897, "learning_rate": 0.0002, "loss": 0.2542, "step": 8360 }, { "epoch": 0.6881685109528013, "grad_norm": 0.5276636481285095, "learning_rate": 0.0002, "loss": 0.2482, "step": 8380 }, { "epoch": 0.6898109179001827, "grad_norm": 0.419548362493515, "learning_rate": 0.0002, "loss": 0.2778, "step": 8400 }, { "epoch": 0.691453324847564, "grad_norm": 0.5594448447227478, "learning_rate": 0.0002, "loss": 0.271, "step": 8420 }, { "epoch": 0.6930957317949455, "grad_norm": 0.4505052864551544, "learning_rate": 0.0002, "loss": 0.2531, "step": 8440 }, { "epoch": 0.6947381387423269, "grad_norm": 0.4273683726787567, "learning_rate": 0.0002, "loss": 0.2687, "step": 8460 }, { "epoch": 0.6963805456897083, "grad_norm": 0.41312068700790405, "learning_rate": 0.0002, "loss": 0.2535, "step": 8480 }, { "epoch": 0.6980229526370897, "grad_norm": 0.3998921811580658, "learning_rate": 0.0002, "loss": 0.2507, "step": 8500 }, { "epoch": 0.6996653595844711, "grad_norm": 0.4063471257686615, "learning_rate": 0.0002, "loss": 0.2604, "step": 8520 }, { "epoch": 0.7013077665318525, "grad_norm": 0.4816170036792755, "learning_rate": 0.0002, "loss": 0.2563, "step": 8540 }, { "epoch": 0.7029501734792338, "grad_norm": 0.47880151867866516, "learning_rate": 0.0002, "loss": 0.2582, "step": 8560 }, { "epoch": 0.7045925804266152, "grad_norm": 0.43934714794158936, "learning_rate": 0.0002, "loss": 0.2588, "step": 8580 }, { "epoch": 0.7062349873739966, "grad_norm": 0.5664840340614319, "learning_rate": 0.0002, "loss": 0.2361, "step": 8600 }, { "epoch": 0.707877394321378, "grad_norm": 0.4387499690055847, "learning_rate": 0.0002, "loss": 0.2784, "step": 8620 }, { "epoch": 0.7095198012687594, "grad_norm": 0.4497361183166504, "learning_rate": 0.0002, "loss": 0.2419, "step": 8640 }, { "epoch": 0.7111622082161407, "grad_norm": 0.36037716269493103, "learning_rate": 0.0002, "loss": 0.2479, "step": 8660 }, { "epoch": 0.7128046151635221, "grad_norm": 0.5163317918777466, "learning_rate": 0.0002, "loss": 0.2535, "step": 8680 }, { "epoch": 0.7144470221109035, "grad_norm": 0.466194748878479, "learning_rate": 0.0002, "loss": 0.2533, "step": 8700 }, { "epoch": 0.7160894290582849, "grad_norm": 0.328848272562027, "learning_rate": 0.0002, "loss": 0.254, "step": 8720 }, { "epoch": 0.7177318360056663, "grad_norm": 0.5417701005935669, "learning_rate": 0.0002, "loss": 0.2544, "step": 8740 }, { "epoch": 0.7193742429530476, "grad_norm": 0.5538254976272583, "learning_rate": 0.0002, "loss": 0.2453, "step": 8760 }, { "epoch": 0.7210166499004291, "grad_norm": 0.4739200174808502, "learning_rate": 0.0002, "loss": 0.258, "step": 8780 }, { "epoch": 0.7226590568478105, "grad_norm": 0.40133044123649597, "learning_rate": 0.0002, "loss": 0.2684, "step": 8800 }, { "epoch": 0.7243014637951919, "grad_norm": 0.4493289291858673, "learning_rate": 0.0002, "loss": 0.2565, "step": 8820 }, { "epoch": 0.7259438707425733, "grad_norm": 0.4970559775829315, "learning_rate": 0.0002, "loss": 0.2506, "step": 8840 }, { "epoch": 0.7275862776899547, "grad_norm": 0.5687580108642578, "learning_rate": 0.0002, "loss": 0.2511, "step": 8860 }, { "epoch": 0.729228684637336, "grad_norm": 0.5328338742256165, "learning_rate": 0.0002, "loss": 0.2428, "step": 8880 }, { "epoch": 0.7308710915847174, "grad_norm": 0.47104090452194214, "learning_rate": 0.0002, "loss": 0.2491, "step": 8900 }, { "epoch": 0.7325134985320988, "grad_norm": 0.4887702167034149, "learning_rate": 0.0002, "loss": 0.2532, "step": 8920 }, { "epoch": 0.7341559054794802, "grad_norm": 0.3589889705181122, "learning_rate": 0.0002, "loss": 0.2587, "step": 8940 }, { "epoch": 0.7357983124268616, "grad_norm": 0.4665176570415497, "learning_rate": 0.0002, "loss": 0.2407, "step": 8960 }, { "epoch": 0.7374407193742429, "grad_norm": 0.2580777108669281, "learning_rate": 0.0002, "loss": 0.2501, "step": 8980 }, { "epoch": 0.7390831263216243, "grad_norm": 0.5562865734100342, "learning_rate": 0.0002, "loss": 0.2589, "step": 9000 }, { "epoch": 0.7407255332690057, "grad_norm": 0.36843666434288025, "learning_rate": 0.0002, "loss": 0.2639, "step": 9020 }, { "epoch": 0.7423679402163871, "grad_norm": 0.433339387178421, "learning_rate": 0.0002, "loss": 0.239, "step": 9040 }, { "epoch": 0.7440103471637685, "grad_norm": 0.5565098524093628, "learning_rate": 0.0002, "loss": 0.2528, "step": 9060 }, { "epoch": 0.7456527541111498, "grad_norm": 0.39954161643981934, "learning_rate": 0.0002, "loss": 0.24, "step": 9080 }, { "epoch": 0.7472951610585313, "grad_norm": 0.43612274527549744, "learning_rate": 0.0002, "loss": 0.2373, "step": 9100 }, { "epoch": 0.7489375680059127, "grad_norm": 0.4511432945728302, "learning_rate": 0.0002, "loss": 0.2564, "step": 9120 }, { "epoch": 0.7505799749532941, "grad_norm": 0.3895890414714813, "learning_rate": 0.0002, "loss": 0.2469, "step": 9140 }, { "epoch": 0.7522223819006755, "grad_norm": 0.4349375069141388, "learning_rate": 0.0002, "loss": 0.2582, "step": 9160 }, { "epoch": 0.7538647888480569, "grad_norm": 0.39693930745124817, "learning_rate": 0.0002, "loss": 0.2576, "step": 9180 }, { "epoch": 0.7555071957954382, "grad_norm": 0.35806095600128174, "learning_rate": 0.0002, "loss": 0.235, "step": 9200 }, { "epoch": 0.7571496027428196, "grad_norm": 0.5650025010108948, "learning_rate": 0.0002, "loss": 0.2541, "step": 9220 }, { "epoch": 0.758792009690201, "grad_norm": 0.45522645115852356, "learning_rate": 0.0002, "loss": 0.2323, "step": 9240 }, { "epoch": 0.7604344166375824, "grad_norm": 0.45849525928497314, "learning_rate": 0.0002, "loss": 0.2459, "step": 9260 }, { "epoch": 0.7620768235849638, "grad_norm": 0.5666941404342651, "learning_rate": 0.0002, "loss": 0.2634, "step": 9280 }, { "epoch": 0.7637192305323451, "grad_norm": 0.43697381019592285, "learning_rate": 0.0002, "loss": 0.2482, "step": 9300 }, { "epoch": 0.7653616374797265, "grad_norm": 0.5133718848228455, "learning_rate": 0.0002, "loss": 0.2631, "step": 9320 }, { "epoch": 0.7670040444271079, "grad_norm": 0.5440112352371216, "learning_rate": 0.0002, "loss": 0.2593, "step": 9340 }, { "epoch": 0.7686464513744893, "grad_norm": 0.5012624263763428, "learning_rate": 0.0002, "loss": 0.243, "step": 9360 }, { "epoch": 0.7702888583218707, "grad_norm": 0.4387590289115906, "learning_rate": 0.0002, "loss": 0.2448, "step": 9380 }, { "epoch": 0.771931265269252, "grad_norm": 0.4327554702758789, "learning_rate": 0.0002, "loss": 0.2514, "step": 9400 }, { "epoch": 0.7735736722166334, "grad_norm": 0.4909968078136444, "learning_rate": 0.0002, "loss": 0.2503, "step": 9420 }, { "epoch": 0.7752160791640149, "grad_norm": 0.4279715120792389, "learning_rate": 0.0002, "loss": 0.2558, "step": 9440 }, { "epoch": 0.7768584861113963, "grad_norm": 0.4973134994506836, "learning_rate": 0.0002, "loss": 0.2412, "step": 9460 }, { "epoch": 0.7785008930587777, "grad_norm": 0.3873676359653473, "learning_rate": 0.0002, "loss": 0.2409, "step": 9480 }, { "epoch": 0.7801433000061591, "grad_norm": 0.40915995836257935, "learning_rate": 0.0002, "loss": 0.2322, "step": 9500 }, { "epoch": 0.7817857069535404, "grad_norm": 0.5738871693611145, "learning_rate": 0.0002, "loss": 0.2408, "step": 9520 }, { "epoch": 0.7834281139009218, "grad_norm": 0.49270549416542053, "learning_rate": 0.0002, "loss": 0.2477, "step": 9540 }, { "epoch": 0.7850705208483032, "grad_norm": 0.4603147804737091, "learning_rate": 0.0002, "loss": 0.2402, "step": 9560 }, { "epoch": 0.7867129277956846, "grad_norm": 0.47675642371177673, "learning_rate": 0.0002, "loss": 0.2528, "step": 9580 }, { "epoch": 0.788355334743066, "grad_norm": 0.41800156235694885, "learning_rate": 0.0002, "loss": 0.2571, "step": 9600 }, { "epoch": 0.7899977416904473, "grad_norm": 0.42527106404304504, "learning_rate": 0.0002, "loss": 0.2452, "step": 9620 }, { "epoch": 0.7916401486378287, "grad_norm": 0.5056847333908081, "learning_rate": 0.0002, "loss": 0.2511, "step": 9640 }, { "epoch": 0.7932825555852101, "grad_norm": 0.2951577305793762, "learning_rate": 0.0002, "loss": 0.233, "step": 9660 }, { "epoch": 0.7949249625325915, "grad_norm": 0.4254283010959625, "learning_rate": 0.0002, "loss": 0.2474, "step": 9680 }, { "epoch": 0.7965673694799729, "grad_norm": 0.5127973556518555, "learning_rate": 0.0002, "loss": 0.2655, "step": 9700 }, { "epoch": 0.7982097764273542, "grad_norm": 0.3507694900035858, "learning_rate": 0.0002, "loss": 0.227, "step": 9720 }, { "epoch": 0.7998521833747356, "grad_norm": 0.4255737364292145, "learning_rate": 0.0002, "loss": 0.2591, "step": 9740 }, { "epoch": 0.8014945903221171, "grad_norm": 0.44822582602500916, "learning_rate": 0.0002, "loss": 0.2287, "step": 9760 }, { "epoch": 0.8031369972694985, "grad_norm": 0.4737776517868042, "learning_rate": 0.0002, "loss": 0.2412, "step": 9780 }, { "epoch": 0.8047794042168799, "grad_norm": 0.4281519651412964, "learning_rate": 0.0002, "loss": 0.2559, "step": 9800 }, { "epoch": 0.8064218111642613, "grad_norm": 0.3413679301738739, "learning_rate": 0.0002, "loss": 0.2479, "step": 9820 }, { "epoch": 0.8080642181116426, "grad_norm": 0.4361155033111572, "learning_rate": 0.0002, "loss": 0.2539, "step": 9840 }, { "epoch": 0.809706625059024, "grad_norm": 0.48523005843162537, "learning_rate": 0.0002, "loss": 0.2534, "step": 9860 }, { "epoch": 0.8113490320064054, "grad_norm": 0.4045993685722351, "learning_rate": 0.0002, "loss": 0.2455, "step": 9880 }, { "epoch": 0.8129914389537868, "grad_norm": 0.5103000998497009, "learning_rate": 0.0002, "loss": 0.2535, "step": 9900 }, { "epoch": 0.8146338459011682, "grad_norm": 0.3670307397842407, "learning_rate": 0.0002, "loss": 0.2337, "step": 9920 }, { "epoch": 0.8162762528485495, "grad_norm": 0.3149369955062866, "learning_rate": 0.0002, "loss": 0.2586, "step": 9940 }, { "epoch": 0.8179186597959309, "grad_norm": 0.5316740274429321, "learning_rate": 0.0002, "loss": 0.2373, "step": 9960 }, { "epoch": 0.8195610667433123, "grad_norm": 0.5300164222717285, "learning_rate": 0.0002, "loss": 0.2399, "step": 9980 }, { "epoch": 0.8212034736906937, "grad_norm": 0.48414990305900574, "learning_rate": 0.0002, "loss": 0.2331, "step": 10000 }, { "epoch": 0.8228458806380751, "grad_norm": 0.41733840107917786, "learning_rate": 0.0002, "loss": 0.2454, "step": 10020 }, { "epoch": 0.8244882875854564, "grad_norm": 0.5048840045928955, "learning_rate": 0.0002, "loss": 0.2421, "step": 10040 }, { "epoch": 0.8261306945328378, "grad_norm": 0.4444895386695862, "learning_rate": 0.0002, "loss": 0.2537, "step": 10060 }, { "epoch": 0.8277731014802192, "grad_norm": 0.45051780343055725, "learning_rate": 0.0002, "loss": 0.2462, "step": 10080 }, { "epoch": 0.8294155084276007, "grad_norm": 0.3937041163444519, "learning_rate": 0.0002, "loss": 0.243, "step": 10100 }, { "epoch": 0.8310579153749821, "grad_norm": 0.45621591806411743, "learning_rate": 0.0002, "loss": 0.2469, "step": 10120 }, { "epoch": 0.8327003223223635, "grad_norm": 0.5431267619132996, "learning_rate": 0.0002, "loss": 0.2425, "step": 10140 }, { "epoch": 0.8343427292697448, "grad_norm": 0.5039596557617188, "learning_rate": 0.0002, "loss": 0.2379, "step": 10160 }, { "epoch": 0.8359851362171262, "grad_norm": 0.3915367126464844, "learning_rate": 0.0002, "loss": 0.241, "step": 10180 }, { "epoch": 0.8376275431645076, "grad_norm": 0.46073317527770996, "learning_rate": 0.0002, "loss": 0.2485, "step": 10200 }, { "epoch": 0.839269950111889, "grad_norm": 0.47057440876960754, "learning_rate": 0.0002, "loss": 0.2452, "step": 10220 }, { "epoch": 0.8409123570592704, "grad_norm": 0.6143821477890015, "learning_rate": 0.0002, "loss": 0.2394, "step": 10240 }, { "epoch": 0.8425547640066517, "grad_norm": 0.41434940695762634, "learning_rate": 0.0002, "loss": 0.2332, "step": 10260 }, { "epoch": 0.8441971709540331, "grad_norm": 0.467459499835968, "learning_rate": 0.0002, "loss": 0.2439, "step": 10280 }, { "epoch": 0.8458395779014145, "grad_norm": 0.49404439330101013, "learning_rate": 0.0002, "loss": 0.2378, "step": 10300 }, { "epoch": 0.8474819848487959, "grad_norm": 0.4313650131225586, "learning_rate": 0.0002, "loss": 0.2455, "step": 10320 }, { "epoch": 0.8491243917961773, "grad_norm": 0.34277698397636414, "learning_rate": 0.0002, "loss": 0.2396, "step": 10340 }, { "epoch": 0.8507667987435586, "grad_norm": 0.3649916350841522, "learning_rate": 0.0002, "loss": 0.2348, "step": 10360 }, { "epoch": 0.85240920569094, "grad_norm": 0.4841578006744385, "learning_rate": 0.0002, "loss": 0.2488, "step": 10380 }, { "epoch": 0.8540516126383214, "grad_norm": 0.5488325953483582, "learning_rate": 0.0002, "loss": 0.2399, "step": 10400 }, { "epoch": 0.8556940195857029, "grad_norm": 0.41103577613830566, "learning_rate": 0.0002, "loss": 0.2371, "step": 10420 }, { "epoch": 0.8573364265330843, "grad_norm": 0.42253378033638, "learning_rate": 0.0002, "loss": 0.2478, "step": 10440 }, { "epoch": 0.8589788334804657, "grad_norm": 0.43092676997184753, "learning_rate": 0.0002, "loss": 0.2316, "step": 10460 }, { "epoch": 0.860621240427847, "grad_norm": 0.5474075078964233, "learning_rate": 0.0002, "loss": 0.2734, "step": 10480 }, { "epoch": 0.8622636473752284, "grad_norm": 0.474618524312973, "learning_rate": 0.0002, "loss": 0.2378, "step": 10500 }, { "epoch": 0.8639060543226098, "grad_norm": 0.44008612632751465, "learning_rate": 0.0002, "loss": 0.236, "step": 10520 }, { "epoch": 0.8655484612699912, "grad_norm": 0.4194040894508362, "learning_rate": 0.0002, "loss": 0.2433, "step": 10540 }, { "epoch": 0.8671908682173726, "grad_norm": 0.3890872597694397, "learning_rate": 0.0002, "loss": 0.2308, "step": 10560 }, { "epoch": 0.868833275164754, "grad_norm": 0.41979917883872986, "learning_rate": 0.0002, "loss": 0.2417, "step": 10580 }, { "epoch": 0.8704756821121353, "grad_norm": 0.3800947666168213, "learning_rate": 0.0002, "loss": 0.244, "step": 10600 }, { "epoch": 0.8721180890595167, "grad_norm": 0.38609811663627625, "learning_rate": 0.0002, "loss": 0.2477, "step": 10620 }, { "epoch": 0.8737604960068981, "grad_norm": 0.514067530632019, "learning_rate": 0.0002, "loss": 0.2382, "step": 10640 }, { "epoch": 0.8754029029542795, "grad_norm": 0.47742265462875366, "learning_rate": 0.0002, "loss": 0.2298, "step": 10660 }, { "epoch": 0.8770453099016609, "grad_norm": 0.45849281549453735, "learning_rate": 0.0002, "loss": 0.2332, "step": 10680 }, { "epoch": 0.8786877168490422, "grad_norm": 0.39788320660591125, "learning_rate": 0.0002, "loss": 0.2363, "step": 10700 }, { "epoch": 0.8803301237964236, "grad_norm": 0.5124650597572327, "learning_rate": 0.0002, "loss": 0.2292, "step": 10720 }, { "epoch": 0.881972530743805, "grad_norm": 0.48688754439353943, "learning_rate": 0.0002, "loss": 0.2444, "step": 10740 }, { "epoch": 0.8836149376911865, "grad_norm": 0.46146026253700256, "learning_rate": 0.0002, "loss": 0.2473, "step": 10760 }, { "epoch": 0.8852573446385679, "grad_norm": 0.38401076197624207, "learning_rate": 0.0002, "loss": 0.2441, "step": 10780 }, { "epoch": 0.8868997515859492, "grad_norm": 0.4642081558704376, "learning_rate": 0.0002, "loss": 0.2338, "step": 10800 }, { "epoch": 0.8885421585333306, "grad_norm": 0.378845751285553, "learning_rate": 0.0002, "loss": 0.2203, "step": 10820 }, { "epoch": 0.890184565480712, "grad_norm": 0.3785631060600281, "learning_rate": 0.0002, "loss": 0.2474, "step": 10840 }, { "epoch": 0.8918269724280934, "grad_norm": 0.4151659309864044, "learning_rate": 0.0002, "loss": 0.2361, "step": 10860 }, { "epoch": 0.8934693793754748, "grad_norm": 0.3314524292945862, "learning_rate": 0.0002, "loss": 0.241, "step": 10880 }, { "epoch": 0.8951117863228562, "grad_norm": 0.4619898200035095, "learning_rate": 0.0002, "loss": 0.2426, "step": 10900 }, { "epoch": 0.8967541932702375, "grad_norm": 0.5724550485610962, "learning_rate": 0.0002, "loss": 0.2455, "step": 10920 }, { "epoch": 0.8983966002176189, "grad_norm": 0.3766199052333832, "learning_rate": 0.0002, "loss": 0.2319, "step": 10940 }, { "epoch": 0.9000390071650003, "grad_norm": 0.4241611659526825, "learning_rate": 0.0002, "loss": 0.2316, "step": 10960 }, { "epoch": 0.9016814141123817, "grad_norm": 0.35726866126060486, "learning_rate": 0.0002, "loss": 0.2343, "step": 10980 }, { "epoch": 0.903323821059763, "grad_norm": 0.5252423882484436, "learning_rate": 0.0002, "loss": 0.2431, "step": 11000 }, { "epoch": 0.9049662280071444, "grad_norm": 0.47167885303497314, "learning_rate": 0.0002, "loss": 0.2512, "step": 11020 }, { "epoch": 0.9066086349545258, "grad_norm": 0.4106541872024536, "learning_rate": 0.0002, "loss": 0.2397, "step": 11040 }, { "epoch": 0.9082510419019072, "grad_norm": 0.4804975390434265, "learning_rate": 0.0002, "loss": 0.2445, "step": 11060 }, { "epoch": 0.9098934488492886, "grad_norm": 0.4177796542644501, "learning_rate": 0.0002, "loss": 0.2302, "step": 11080 }, { "epoch": 0.9115358557966701, "grad_norm": 0.34781017899513245, "learning_rate": 0.0002, "loss": 0.2285, "step": 11100 }, { "epoch": 0.9131782627440514, "grad_norm": 0.34392043948173523, "learning_rate": 0.0002, "loss": 0.232, "step": 11120 }, { "epoch": 0.9148206696914328, "grad_norm": 0.46544018387794495, "learning_rate": 0.0002, "loss": 0.2332, "step": 11140 }, { "epoch": 0.9164630766388142, "grad_norm": 0.47958704829216003, "learning_rate": 0.0002, "loss": 0.2481, "step": 11160 }, { "epoch": 0.9181054835861956, "grad_norm": 0.4493333697319031, "learning_rate": 0.0002, "loss": 0.238, "step": 11180 }, { "epoch": 0.919747890533577, "grad_norm": 0.47599494457244873, "learning_rate": 0.0002, "loss": 0.2416, "step": 11200 }, { "epoch": 0.9213902974809584, "grad_norm": 0.39547592401504517, "learning_rate": 0.0002, "loss": 0.2456, "step": 11220 }, { "epoch": 0.9230327044283397, "grad_norm": 0.42187511920928955, "learning_rate": 0.0002, "loss": 0.2425, "step": 11240 }, { "epoch": 0.9246751113757211, "grad_norm": 0.3870528042316437, "learning_rate": 0.0002, "loss": 0.2366, "step": 11260 }, { "epoch": 0.9263175183231025, "grad_norm": 0.40943118929862976, "learning_rate": 0.0002, "loss": 0.2088, "step": 11280 }, { "epoch": 0.9279599252704839, "grad_norm": 0.3936561346054077, "learning_rate": 0.0002, "loss": 0.239, "step": 11300 }, { "epoch": 0.9296023322178653, "grad_norm": 0.4154857397079468, "learning_rate": 0.0002, "loss": 0.2413, "step": 11320 }, { "epoch": 0.9312447391652466, "grad_norm": 0.5544102191925049, "learning_rate": 0.0002, "loss": 0.2565, "step": 11340 }, { "epoch": 0.932887146112628, "grad_norm": 0.5494611263275146, "learning_rate": 0.0002, "loss": 0.2469, "step": 11360 }, { "epoch": 0.9345295530600094, "grad_norm": 0.41848114132881165, "learning_rate": 0.0002, "loss": 0.2333, "step": 11380 }, { "epoch": 0.9361719600073908, "grad_norm": 0.41343703866004944, "learning_rate": 0.0002, "loss": 0.2342, "step": 11400 }, { "epoch": 0.9378143669547723, "grad_norm": 0.6060330867767334, "learning_rate": 0.0002, "loss": 0.2507, "step": 11420 }, { "epoch": 0.9394567739021537, "grad_norm": 0.42079275846481323, "learning_rate": 0.0002, "loss": 0.2322, "step": 11440 }, { "epoch": 0.941099180849535, "grad_norm": 0.43053537607192993, "learning_rate": 0.0002, "loss": 0.2257, "step": 11460 }, { "epoch": 0.9427415877969164, "grad_norm": 0.41895121335983276, "learning_rate": 0.0002, "loss": 0.2501, "step": 11480 }, { "epoch": 0.9443839947442978, "grad_norm": 0.467018723487854, "learning_rate": 0.0002, "loss": 0.2282, "step": 11500 }, { "epoch": 0.9460264016916792, "grad_norm": 0.5707799196243286, "learning_rate": 0.0002, "loss": 0.2319, "step": 11520 }, { "epoch": 0.9476688086390606, "grad_norm": 0.4575120806694031, "learning_rate": 0.0002, "loss": 0.2291, "step": 11540 }, { "epoch": 0.9493112155864419, "grad_norm": 0.38349372148513794, "learning_rate": 0.0002, "loss": 0.2263, "step": 11560 }, { "epoch": 0.9509536225338233, "grad_norm": 0.4487491846084595, "learning_rate": 0.0002, "loss": 0.2505, "step": 11580 }, { "epoch": 0.9525960294812047, "grad_norm": 0.39065688848495483, "learning_rate": 0.0002, "loss": 0.239, "step": 11600 }, { "epoch": 0.9542384364285861, "grad_norm": 0.4473966658115387, "learning_rate": 0.0002, "loss": 0.2409, "step": 11620 }, { "epoch": 0.9558808433759675, "grad_norm": 0.39066895842552185, "learning_rate": 0.0002, "loss": 0.2431, "step": 11640 }, { "epoch": 0.9575232503233488, "grad_norm": 0.470277339220047, "learning_rate": 0.0002, "loss": 0.2419, "step": 11660 }, { "epoch": 0.9591656572707302, "grad_norm": 0.405834436416626, "learning_rate": 0.0002, "loss": 0.2408, "step": 11680 }, { "epoch": 0.9608080642181116, "grad_norm": 0.5717544555664062, "learning_rate": 0.0002, "loss": 0.2352, "step": 11700 }, { "epoch": 0.962450471165493, "grad_norm": 0.4837093651294708, "learning_rate": 0.0002, "loss": 0.2435, "step": 11720 }, { "epoch": 0.9640928781128744, "grad_norm": 0.4689130187034607, "learning_rate": 0.0002, "loss": 0.2324, "step": 11740 }, { "epoch": 0.9657352850602559, "grad_norm": 0.511249840259552, "learning_rate": 0.0002, "loss": 0.2394, "step": 11760 }, { "epoch": 0.9673776920076372, "grad_norm": 0.43555593490600586, "learning_rate": 0.0002, "loss": 0.2377, "step": 11780 }, { "epoch": 0.9690200989550186, "grad_norm": 0.41933077573776245, "learning_rate": 0.0002, "loss": 0.2355, "step": 11800 }, { "epoch": 0.9706625059024, "grad_norm": 0.41573819518089294, "learning_rate": 0.0002, "loss": 0.2345, "step": 11820 }, { "epoch": 0.9723049128497814, "grad_norm": 0.3951037526130676, "learning_rate": 0.0002, "loss": 0.2399, "step": 11840 }, { "epoch": 0.9739473197971628, "grad_norm": 0.477756142616272, "learning_rate": 0.0002, "loss": 0.2425, "step": 11860 }, { "epoch": 0.9755897267445441, "grad_norm": 0.5147901773452759, "learning_rate": 0.0002, "loss": 0.2354, "step": 11880 }, { "epoch": 0.9772321336919255, "grad_norm": 0.40053385496139526, "learning_rate": 0.0002, "loss": 0.2325, "step": 11900 }, { "epoch": 0.9788745406393069, "grad_norm": 0.4459463953971863, "learning_rate": 0.0002, "loss": 0.2492, "step": 11920 }, { "epoch": 0.9805169475866883, "grad_norm": 0.42749595642089844, "learning_rate": 0.0002, "loss": 0.2308, "step": 11940 }, { "epoch": 0.9821593545340697, "grad_norm": 0.4053783714771271, "learning_rate": 0.0002, "loss": 0.2263, "step": 11960 }, { "epoch": 0.983801761481451, "grad_norm": 0.43342533707618713, "learning_rate": 0.0002, "loss": 0.2348, "step": 11980 }, { "epoch": 0.9854441684288324, "grad_norm": 0.43272313475608826, "learning_rate": 0.0002, "loss": 0.2234, "step": 12000 }, { "epoch": 0.9870865753762138, "grad_norm": 0.3550325036048889, "learning_rate": 0.0002, "loss": 0.2186, "step": 12020 }, { "epoch": 0.9887289823235952, "grad_norm": 0.35271936655044556, "learning_rate": 0.0002, "loss": 0.2326, "step": 12040 }, { "epoch": 0.9903713892709766, "grad_norm": 0.37404924631118774, "learning_rate": 0.0002, "loss": 0.2483, "step": 12060 }, { "epoch": 0.9920137962183581, "grad_norm": 0.46686896681785583, "learning_rate": 0.0002, "loss": 0.2213, "step": 12080 }, { "epoch": 0.9936562031657394, "grad_norm": 0.37012913823127747, "learning_rate": 0.0002, "loss": 0.2415, "step": 12100 }, { "epoch": 0.9952986101131208, "grad_norm": 0.4403967559337616, "learning_rate": 0.0002, "loss": 0.2261, "step": 12120 }, { "epoch": 0.9969410170605022, "grad_norm": 0.36877259612083435, "learning_rate": 0.0002, "loss": 0.2295, "step": 12140 }, { "epoch": 0.9985834240078836, "grad_norm": 0.34526777267456055, "learning_rate": 0.0002, "loss": 0.2236, "step": 12160 }, { "epoch": 1.0, "eval_loss": 0.30336490273475647, "eval_runtime": 533.8677, "eval_samples_per_second": 7.092, "eval_steps_per_second": 0.888, "step": 12178 } ], "logging_steps": 20, "max_steps": 14000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 77, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.0518674601423667e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }