{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9977924944812362,
"eval_steps": 500,
"global_step": 113,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008830022075055188,
"grad_norm": 0.8537317514419556,
"learning_rate": 4.999033893736386e-05,
"loss": 0.9875,
"num_input_tokens_seen": 2097152,
"step": 1
},
{
"epoch": 0.017660044150110375,
"grad_norm": 0.7285566926002502,
"learning_rate": 4.99613632163459e-05,
"loss": 0.944,
"num_input_tokens_seen": 4194304,
"step": 2
},
{
"epoch": 0.026490066225165563,
"grad_norm": 0.5551783442497253,
"learning_rate": 4.991309523184661e-05,
"loss": 0.904,
"num_input_tokens_seen": 6291456,
"step": 3
},
{
"epoch": 0.03532008830022075,
"grad_norm": 0.4819079041481018,
"learning_rate": 4.98455722894677e-05,
"loss": 0.8913,
"num_input_tokens_seen": 8388608,
"step": 4
},
{
"epoch": 0.04415011037527594,
"grad_norm": 0.373820960521698,
"learning_rate": 4.975884657667922e-05,
"loss": 0.8511,
"num_input_tokens_seen": 10485760,
"step": 5
},
{
"epoch": 0.052980132450331126,
"grad_norm": 0.2974053621292114,
"learning_rate": 4.965298512248466e-05,
"loss": 0.8311,
"num_input_tokens_seen": 12582912,
"step": 6
},
{
"epoch": 0.06181015452538632,
"grad_norm": 0.25162145495414734,
"learning_rate": 4.952806974561518e-05,
"loss": 0.838,
"num_input_tokens_seen": 14680064,
"step": 7
},
{
"epoch": 0.0706401766004415,
"grad_norm": 0.20863372087478638,
"learning_rate": 4.9384196991293205e-05,
"loss": 0.8468,
"num_input_tokens_seen": 16777216,
"step": 8
},
{
"epoch": 0.07947019867549669,
"grad_norm": 0.18535731732845306,
"learning_rate": 4.922147805661402e-05,
"loss": 0.7884,
"num_input_tokens_seen": 18874368,
"step": 9
},
{
"epoch": 0.08830022075055188,
"grad_norm": 0.16335929930210114,
"learning_rate": 4.904003870460323e-05,
"loss": 0.8009,
"num_input_tokens_seen": 20971520,
"step": 10
},
{
"epoch": 0.09713024282560706,
"grad_norm": 0.1265847086906433,
"learning_rate": 4.884001916701639e-05,
"loss": 0.7764,
"num_input_tokens_seen": 23068672,
"step": 11
},
{
"epoch": 0.10596026490066225,
"grad_norm": 0.11690282076597214,
"learning_rate": 4.862157403595598e-05,
"loss": 0.7774,
"num_input_tokens_seen": 25165824,
"step": 12
},
{
"epoch": 0.11479028697571744,
"grad_norm": 0.10680913180112839,
"learning_rate": 4.838487214438951e-05,
"loss": 0.7808,
"num_input_tokens_seen": 27262976,
"step": 13
},
{
"epoch": 0.12362030905077263,
"grad_norm": 0.09127331525087357,
"learning_rate": 4.813009643566101e-05,
"loss": 0.8111,
"num_input_tokens_seen": 29360128,
"step": 14
},
{
"epoch": 0.13245033112582782,
"grad_norm": 0.08718351274728775,
"learning_rate": 4.7857443822096905e-05,
"loss": 0.7997,
"num_input_tokens_seen": 31457280,
"step": 15
},
{
"epoch": 0.141280353200883,
"grad_norm": 0.06960419565439224,
"learning_rate": 4.7567125032815394e-05,
"loss": 0.7537,
"num_input_tokens_seen": 33554432,
"step": 16
},
{
"epoch": 0.15011037527593818,
"grad_norm": 0.0721953734755516,
"learning_rate": 4.7259364450857096e-05,
"loss": 0.7645,
"num_input_tokens_seen": 35651584,
"step": 17
},
{
"epoch": 0.15894039735099338,
"grad_norm": 0.0680830106139183,
"learning_rate": 4.6934399939762746e-05,
"loss": 0.7712,
"num_input_tokens_seen": 37748736,
"step": 18
},
{
"epoch": 0.16777041942604856,
"grad_norm": 0.06572319567203522,
"learning_rate": 4.659248265973205e-05,
"loss": 0.74,
"num_input_tokens_seen": 39845888,
"step": 19
},
{
"epoch": 0.17660044150110377,
"grad_norm": 0.06348133087158203,
"learning_rate": 4.6233876873505694e-05,
"loss": 0.777,
"num_input_tokens_seen": 41943040,
"step": 20
},
{
"epoch": 0.18543046357615894,
"grad_norm": 0.06005046144127846,
"learning_rate": 4.585885974212068e-05,
"loss": 0.7537,
"num_input_tokens_seen": 44040192,
"step": 21
},
{
"epoch": 0.19426048565121412,
"grad_norm": 0.057457394897937775,
"learning_rate": 4.5467721110696685e-05,
"loss": 0.7453,
"num_input_tokens_seen": 46137344,
"step": 22
},
{
"epoch": 0.20309050772626933,
"grad_norm": 0.054359156638383865,
"learning_rate": 4.5060763284419114e-05,
"loss": 0.7573,
"num_input_tokens_seen": 48234496,
"step": 23
},
{
"epoch": 0.2119205298013245,
"grad_norm": 0.054271843284368515,
"learning_rate": 4.463830079489196e-05,
"loss": 0.7626,
"num_input_tokens_seen": 50331648,
"step": 24
},
{
"epoch": 0.22075055187637968,
"grad_norm": 0.048914600163698196,
"learning_rate": 4.420066015704105e-05,
"loss": 0.7558,
"num_input_tokens_seen": 52428800,
"step": 25
},
{
"epoch": 0.22958057395143489,
"grad_norm": 0.04995320364832878,
"learning_rate": 4.374817961675553e-05,
"loss": 0.7654,
"num_input_tokens_seen": 54525952,
"step": 26
},
{
"epoch": 0.23841059602649006,
"grad_norm": 0.04679872468113899,
"learning_rate": 4.3281208889462715e-05,
"loss": 0.7363,
"num_input_tokens_seen": 56623104,
"step": 27
},
{
"epoch": 0.24724061810154527,
"grad_norm": 0.04957474768161774,
"learning_rate": 4.2800108889838244e-05,
"loss": 0.7503,
"num_input_tokens_seen": 58720256,
"step": 28
},
{
"epoch": 0.2560706401766004,
"grad_norm": 0.04368240758776665,
"learning_rate": 4.230525145286057e-05,
"loss": 0.7474,
"num_input_tokens_seen": 60817408,
"step": 29
},
{
"epoch": 0.26490066225165565,
"grad_norm": 0.04365404695272446,
"learning_rate": 4.1797019046425264e-05,
"loss": 0.7442,
"num_input_tokens_seen": 62914560,
"step": 30
},
{
"epoch": 0.2737306843267108,
"grad_norm": 0.04533332213759422,
"learning_rate": 4.127580447574131e-05,
"loss": 0.7492,
"num_input_tokens_seen": 65011712,
"step": 31
},
{
"epoch": 0.282560706401766,
"grad_norm": 0.045380424708127975,
"learning_rate": 4.0742010579737855e-05,
"loss": 0.7601,
"num_input_tokens_seen": 67108864,
"step": 32
},
{
"epoch": 0.2913907284768212,
"grad_norm": 0.04397201910614967,
"learning_rate": 4.0196049919716004e-05,
"loss": 0.7381,
"num_input_tokens_seen": 69206016,
"step": 33
},
{
"epoch": 0.30022075055187636,
"grad_norm": 0.042593635618686676,
"learning_rate": 3.963834446048644e-05,
"loss": 0.7406,
"num_input_tokens_seen": 71303168,
"step": 34
},
{
"epoch": 0.3090507726269316,
"grad_norm": 0.04130704328417778,
"learning_rate": 3.9069325244239095e-05,
"loss": 0.7573,
"num_input_tokens_seen": 73400320,
"step": 35
},
{
"epoch": 0.31788079470198677,
"grad_norm": 0.04113290086388588,
"learning_rate": 3.848943205739711e-05,
"loss": 0.7419,
"num_input_tokens_seen": 75497472,
"step": 36
},
{
"epoch": 0.32671081677704195,
"grad_norm": 0.0397503562271595,
"learning_rate": 3.7899113090712526e-05,
"loss": 0.7357,
"num_input_tokens_seen": 77594624,
"step": 37
},
{
"epoch": 0.3355408388520971,
"grad_norm": 0.03884141892194748,
"learning_rate": 3.729882459286632e-05,
"loss": 0.7346,
"num_input_tokens_seen": 79691776,
"step": 38
},
{
"epoch": 0.3443708609271523,
"grad_norm": 0.03996223211288452,
"learning_rate": 3.66890305178407e-05,
"loss": 0.7437,
"num_input_tokens_seen": 81788928,
"step": 39
},
{
"epoch": 0.35320088300220753,
"grad_norm": 0.0395955815911293,
"learning_rate": 3.607020216633599e-05,
"loss": 0.742,
"num_input_tokens_seen": 83886080,
"step": 40
},
{
"epoch": 0.3620309050772627,
"grad_norm": 0.03895486891269684,
"learning_rate": 3.544281782150936e-05,
"loss": 0.7136,
"num_input_tokens_seen": 85983232,
"step": 41
},
{
"epoch": 0.3708609271523179,
"grad_norm": 0.03683311864733696,
"learning_rate": 3.4807362379317025e-05,
"loss": 0.7417,
"num_input_tokens_seen": 88080384,
"step": 42
},
{
"epoch": 0.37969094922737306,
"grad_norm": 0.037801578640937805,
"learning_rate": 3.416432697374533e-05,
"loss": 0.7102,
"num_input_tokens_seen": 90177536,
"step": 43
},
{
"epoch": 0.38852097130242824,
"grad_norm": 0.03917045146226883,
"learning_rate": 3.3514208597220705e-05,
"loss": 0.7685,
"num_input_tokens_seen": 92274688,
"step": 44
},
{
"epoch": 0.3973509933774834,
"grad_norm": 0.037231214344501495,
"learning_rate": 3.285750971649167e-05,
"loss": 0.7332,
"num_input_tokens_seen": 94371840,
"step": 45
},
{
"epoch": 0.40618101545253865,
"grad_norm": 0.03943084925413132,
"learning_rate": 3.219473788427984e-05,
"loss": 0.7387,
"num_input_tokens_seen": 96468992,
"step": 46
},
{
"epoch": 0.41501103752759383,
"grad_norm": 0.03614073246717453,
"learning_rate": 3.1526405346999946e-05,
"loss": 0.7096,
"num_input_tokens_seen": 98566144,
"step": 47
},
{
"epoch": 0.423841059602649,
"grad_norm": 0.03576625511050224,
"learning_rate": 3.085302864885235e-05,
"loss": 0.7242,
"num_input_tokens_seen": 100663296,
"step": 48
},
{
"epoch": 0.4326710816777042,
"grad_norm": 0.0361945666372776,
"learning_rate": 3.017512823259373e-05,
"loss": 0.7338,
"num_input_tokens_seen": 102760448,
"step": 49
},
{
"epoch": 0.44150110375275936,
"grad_norm": 0.03698369115591049,
"learning_rate": 2.9493228037294702e-05,
"loss": 0.7494,
"num_input_tokens_seen": 104857600,
"step": 50
},
{
"epoch": 0.4503311258278146,
"grad_norm": 0.03586776927113533,
"learning_rate": 2.8807855093395126e-05,
"loss": 0.7272,
"num_input_tokens_seen": 106954752,
"step": 51
},
{
"epoch": 0.45916114790286977,
"grad_norm": 0.03969455882906914,
"learning_rate": 2.8119539115370218e-05,
"loss": 0.7447,
"num_input_tokens_seen": 109051904,
"step": 52
},
{
"epoch": 0.46799116997792495,
"grad_norm": 0.037024304270744324,
"learning_rate": 2.742881209232215e-05,
"loss": 0.7219,
"num_input_tokens_seen": 111149056,
"step": 53
},
{
"epoch": 0.4768211920529801,
"grad_norm": 0.035661764442920685,
"learning_rate": 2.6736207876813646e-05,
"loss": 0.7484,
"num_input_tokens_seen": 113246208,
"step": 54
},
{
"epoch": 0.4856512141280353,
"grad_norm": 0.03690381348133087,
"learning_rate": 2.604226177226137e-05,
"loss": 0.7352,
"num_input_tokens_seen": 115343360,
"step": 55
},
{
"epoch": 0.49448123620309054,
"grad_norm": 0.03607625514268875,
"learning_rate": 2.5347510119207878e-05,
"loss": 0.7254,
"num_input_tokens_seen": 117440512,
"step": 56
},
{
"epoch": 0.5033112582781457,
"grad_norm": 0.036771535873413086,
"learning_rate": 2.4652489880792128e-05,
"loss": 0.724,
"num_input_tokens_seen": 119537664,
"step": 57
},
{
"epoch": 0.5121412803532008,
"grad_norm": 0.035380277782678604,
"learning_rate": 2.395773822773863e-05,
"loss": 0.747,
"num_input_tokens_seen": 121634816,
"step": 58
},
{
"epoch": 0.5209713024282561,
"grad_norm": 0.03334279730916023,
"learning_rate": 2.3263792123186353e-05,
"loss": 0.7246,
"num_input_tokens_seen": 123731968,
"step": 59
},
{
"epoch": 0.5298013245033113,
"grad_norm": 0.03534376993775368,
"learning_rate": 2.2571187907677853e-05,
"loss": 0.7424,
"num_input_tokens_seen": 125829120,
"step": 60
},
{
"epoch": 0.5386313465783664,
"grad_norm": 0.035675469785928726,
"learning_rate": 2.188046088462979e-05,
"loss": 0.7422,
"num_input_tokens_seen": 127926272,
"step": 61
},
{
"epoch": 0.5474613686534217,
"grad_norm": 0.03412646800279617,
"learning_rate": 2.1192144906604876e-05,
"loss": 0.7166,
"num_input_tokens_seen": 130023424,
"step": 62
},
{
"epoch": 0.5562913907284768,
"grad_norm": 0.03617184981703758,
"learning_rate": 2.0506771962705304e-05,
"loss": 0.7564,
"num_input_tokens_seen": 132120576,
"step": 63
},
{
"epoch": 0.565121412803532,
"grad_norm": 0.03402591496706009,
"learning_rate": 1.982487176740627e-05,
"loss": 0.744,
"num_input_tokens_seen": 134217728,
"step": 64
},
{
"epoch": 0.5739514348785872,
"grad_norm": 0.035137876868247986,
"learning_rate": 1.9146971351147655e-05,
"loss": 0.7523,
"num_input_tokens_seen": 136314880,
"step": 65
},
{
"epoch": 0.5827814569536424,
"grad_norm": 0.03335074707865715,
"learning_rate": 1.847359465300006e-05,
"loss": 0.7274,
"num_input_tokens_seen": 138412032,
"step": 66
},
{
"epoch": 0.5916114790286976,
"grad_norm": 0.032923776656389236,
"learning_rate": 1.780526211572016e-05,
"loss": 0.746,
"num_input_tokens_seen": 140509184,
"step": 67
},
{
"epoch": 0.6004415011037527,
"grad_norm": 0.038766708225011826,
"learning_rate": 1.7142490283508324e-05,
"loss": 0.7423,
"num_input_tokens_seen": 142606336,
"step": 68
},
{
"epoch": 0.609271523178808,
"grad_norm": 0.032216496765613556,
"learning_rate": 1.648579140277931e-05,
"loss": 0.7234,
"num_input_tokens_seen": 144703488,
"step": 69
},
{
"epoch": 0.6181015452538632,
"grad_norm": 0.03458288684487343,
"learning_rate": 1.583567302625469e-05,
"loss": 0.7495,
"num_input_tokens_seen": 146800640,
"step": 70
},
{
"epoch": 0.6269315673289183,
"grad_norm": 0.033755529671907425,
"learning_rate": 1.5192637620682981e-05,
"loss": 0.7227,
"num_input_tokens_seen": 148897792,
"step": 71
},
{
"epoch": 0.6357615894039735,
"grad_norm": 0.03497767448425293,
"learning_rate": 1.4557182178490636e-05,
"loss": 0.7607,
"num_input_tokens_seen": 150994944,
"step": 72
},
{
"epoch": 0.6445916114790287,
"grad_norm": 0.03499361500144005,
"learning_rate": 1.3929797833664013e-05,
"loss": 0.7296,
"num_input_tokens_seen": 153092096,
"step": 73
},
{
"epoch": 0.6534216335540839,
"grad_norm": 0.033749066293239594,
"learning_rate": 1.3310969482159297e-05,
"loss": 0.7361,
"num_input_tokens_seen": 155189248,
"step": 74
},
{
"epoch": 0.6622516556291391,
"grad_norm": 0.03505317121744156,
"learning_rate": 1.270117540713368e-05,
"loss": 0.7379,
"num_input_tokens_seen": 157286400,
"step": 75
},
{
"epoch": 0.6710816777041942,
"grad_norm": 0.035697367042303085,
"learning_rate": 1.2100886909287478e-05,
"loss": 0.7351,
"num_input_tokens_seen": 159383552,
"step": 76
},
{
"epoch": 0.6799116997792495,
"grad_norm": 0.035562630742788315,
"learning_rate": 1.151056794260289e-05,
"loss": 0.7295,
"num_input_tokens_seen": 161480704,
"step": 77
},
{
"epoch": 0.6887417218543046,
"grad_norm": 0.0363004133105278,
"learning_rate": 1.0930674755760908e-05,
"loss": 0.7333,
"num_input_tokens_seen": 163577856,
"step": 78
},
{
"epoch": 0.6975717439293598,
"grad_norm": 0.034324079751968384,
"learning_rate": 1.0361655539513565e-05,
"loss": 0.7334,
"num_input_tokens_seen": 165675008,
"step": 79
},
{
"epoch": 0.7064017660044151,
"grad_norm": 0.03306734561920166,
"learning_rate": 9.803950080284005e-06,
"loss": 0.7437,
"num_input_tokens_seen": 167772160,
"step": 80
},
{
"epoch": 0.7152317880794702,
"grad_norm": 0.03080984577536583,
"learning_rate": 9.257989420262151e-06,
"loss": 0.7133,
"num_input_tokens_seen": 169869312,
"step": 81
},
{
"epoch": 0.7240618101545254,
"grad_norm": 0.03477946296334267,
"learning_rate": 8.724195524258688e-06,
"loss": 0.7415,
"num_input_tokens_seen": 171966464,
"step": 82
},
{
"epoch": 0.7328918322295805,
"grad_norm": 0.03363949805498123,
"learning_rate": 8.202980953574735e-06,
"loss": 0.7423,
"num_input_tokens_seen": 174063616,
"step": 83
},
{
"epoch": 0.7417218543046358,
"grad_norm": 0.032141055911779404,
"learning_rate": 7.69474854713943e-06,
"loss": 0.7125,
"num_input_tokens_seen": 176160768,
"step": 84
},
{
"epoch": 0.7505518763796909,
"grad_norm": 0.03413500636816025,
"learning_rate": 7.1998911101617575e-06,
"loss": 0.7346,
"num_input_tokens_seen": 178257920,
"step": 85
},
{
"epoch": 0.7593818984547461,
"grad_norm": 0.032283272594213486,
"learning_rate": 6.718791110537287e-06,
"loss": 0.7235,
"num_input_tokens_seen": 180355072,
"step": 86
},
{
"epoch": 0.7682119205298014,
"grad_norm": 0.03352838382124901,
"learning_rate": 6.25182038324447e-06,
"loss": 0.7268,
"num_input_tokens_seen": 182452224,
"step": 87
},
{
"epoch": 0.7770419426048565,
"grad_norm": 0.033114444464445114,
"learning_rate": 5.7993398429589506e-06,
"loss": 0.7251,
"num_input_tokens_seen": 184549376,
"step": 88
},
{
"epoch": 0.7858719646799117,
"grad_norm": 0.03312879428267479,
"learning_rate": 5.361699205108042e-06,
"loss": 0.7216,
"num_input_tokens_seen": 186646528,
"step": 89
},
{
"epoch": 0.7947019867549668,
"grad_norm": 0.034220073372125626,
"learning_rate": 4.939236715580884e-06,
"loss": 0.7386,
"num_input_tokens_seen": 188743680,
"step": 90
},
{
"epoch": 0.8035320088300221,
"grad_norm": 0.03237886354327202,
"learning_rate": 4.5322788893033155e-06,
"loss": 0.734,
"num_input_tokens_seen": 190840832,
"step": 91
},
{
"epoch": 0.8123620309050773,
"grad_norm": 0.032567963004112244,
"learning_rate": 4.14114025787932e-06,
"loss": 0.7519,
"num_input_tokens_seen": 192937984,
"step": 92
},
{
"epoch": 0.8211920529801324,
"grad_norm": 0.032218772917985916,
"learning_rate": 3.7661231264943086e-06,
"loss": 0.7194,
"num_input_tokens_seen": 195035136,
"step": 93
},
{
"epoch": 0.8300220750551877,
"grad_norm": 0.0331319235265255,
"learning_rate": 3.4075173402679574e-06,
"loss": 0.7263,
"num_input_tokens_seen": 197132288,
"step": 94
},
{
"epoch": 0.8388520971302428,
"grad_norm": 0.03236314281821251,
"learning_rate": 3.0656000602372558e-06,
"loss": 0.7093,
"num_input_tokens_seen": 199229440,
"step": 95
},
{
"epoch": 0.847682119205298,
"grad_norm": 0.03465864434838295,
"learning_rate": 2.7406355491429086e-06,
"loss": 0.7654,
"num_input_tokens_seen": 201326592,
"step": 96
},
{
"epoch": 0.8565121412803532,
"grad_norm": 0.033873483538627625,
"learning_rate": 2.4328749671846116e-06,
"loss": 0.7365,
"num_input_tokens_seen": 203423744,
"step": 97
},
{
"epoch": 0.8653421633554084,
"grad_norm": 0.03332989290356636,
"learning_rate": 2.142556177903096e-06,
"loss": 0.7468,
"num_input_tokens_seen": 205520896,
"step": 98
},
{
"epoch": 0.8741721854304636,
"grad_norm": 0.033005475997924805,
"learning_rate": 1.8699035643389928e-06,
"loss": 0.7264,
"num_input_tokens_seen": 207618048,
"step": 99
},
{
"epoch": 0.8830022075055187,
"grad_norm": 0.034544438123703,
"learning_rate": 1.615127855610496e-06,
"loss": 0.7392,
"num_input_tokens_seen": 209715200,
"step": 100
},
{
"epoch": 0.891832229580574,
"grad_norm": 0.03568817302584648,
"learning_rate": 1.3784259640440279e-06,
"loss": 0.7378,
"num_input_tokens_seen": 211812352,
"step": 101
},
{
"epoch": 0.9006622516556292,
"grad_norm": 0.03453758731484413,
"learning_rate": 1.1599808329836177e-06,
"loss": 0.7384,
"num_input_tokens_seen": 213909504,
"step": 102
},
{
"epoch": 0.9094922737306843,
"grad_norm": 0.03320000693202019,
"learning_rate": 9.599612953967746e-07,
"loss": 0.7326,
"num_input_tokens_seen": 216006656,
"step": 103
},
{
"epoch": 0.9183222958057395,
"grad_norm": 0.033046990633010864,
"learning_rate": 7.785219433859847e-07,
"loss": 0.7319,
"num_input_tokens_seen": 218103808,
"step": 104
},
{
"epoch": 0.9271523178807947,
"grad_norm": 0.032943662256002426,
"learning_rate": 6.158030087068001e-07,
"loss": 0.7365,
"num_input_tokens_seen": 220200960,
"step": 105
},
{
"epoch": 0.9359823399558499,
"grad_norm": 0.03540504723787308,
"learning_rate": 4.719302543848225e-07,
"loss": 0.75,
"num_input_tokens_seen": 222298112,
"step": 106
},
{
"epoch": 0.9448123620309051,
"grad_norm": 0.03443380072712898,
"learning_rate": 3.470148775153448e-07,
"loss": 0.7468,
"num_input_tokens_seen": 224395264,
"step": 107
},
{
"epoch": 0.9536423841059603,
"grad_norm": 0.03330874443054199,
"learning_rate": 2.4115342332078074e-07,
"loss": 0.7425,
"num_input_tokens_seen": 226492416,
"step": 108
},
{
"epoch": 0.9624724061810155,
"grad_norm": 0.03456017002463341,
"learning_rate": 1.5442771053230665e-07,
"loss": 0.7314,
"num_input_tokens_seen": 228589568,
"step": 109
},
{
"epoch": 0.9713024282560706,
"grad_norm": 0.03423641249537468,
"learning_rate": 8.690476815339244e-08,
"loss": 0.7407,
"num_input_tokens_seen": 230686720,
"step": 110
},
{
"epoch": 0.9801324503311258,
"grad_norm": 0.03197889402508736,
"learning_rate": 3.8636783654100174e-08,
"loss": 0.729,
"num_input_tokens_seen": 232783872,
"step": 111
},
{
"epoch": 0.9889624724061811,
"grad_norm": 0.034751225262880325,
"learning_rate": 9.661062636148744e-09,
"loss": 0.7333,
"num_input_tokens_seen": 234881024,
"step": 112
},
{
"epoch": 0.9977924944812362,
"grad_norm": 0.03236055746674538,
"learning_rate": 0.0,
"loss": 0.7217,
"num_input_tokens_seen": 236978176,
"step": 113
},
{
"epoch": 0.9977924944812362,
"num_input_tokens_seen": 236978176,
"step": 113,
"total_flos": 1.0486889344470614e+19,
"train_loss": 0.7524756815581195,
"train_runtime": 19134.3763,
"train_samples_per_second": 3.027,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1.0,
"max_steps": 113,
"num_input_tokens_seen": 236978176,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0486889344470614e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}