{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.928870292887029,
  "eval_steps": 100,
  "global_step": 5600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05230125523012552,
      "eval_accuracy": 0.10215735262383098,
      "eval_loss": 5.751355171203613,
      "eval_runtime": 184.0542,
      "eval_samples_per_second": 80.194,
      "eval_steps_per_second": 0.63,
      "step": 100
    },
    {
      "epoch": 0.10460251046025104,
      "eval_accuracy": 0.12205444695787844,
      "eval_loss": 5.327404975891113,
      "eval_runtime": 185.0066,
      "eval_samples_per_second": 79.781,
      "eval_steps_per_second": 0.627,
      "step": 200
    },
    {
      "epoch": 0.15690376569037656,
      "eval_accuracy": 0.13593891242440487,
      "eval_loss": 5.167089939117432,
      "eval_runtime": 184.2672,
      "eval_samples_per_second": 80.101,
      "eval_steps_per_second": 0.63,
      "step": 300
    },
    {
      "epoch": 0.20920502092050208,
      "eval_accuracy": 0.14769799376050313,
      "eval_loss": 5.03983736038208,
      "eval_runtime": 184.4381,
      "eval_samples_per_second": 80.027,
      "eval_steps_per_second": 0.629,
      "step": 400
    },
    {
      "epoch": 0.2615062761506276,
      "grad_norm": 1.4375,
      "learning_rate": 4.564156206415621e-05,
      "loss": 5.3792,
      "step": 500
    },
    {
      "epoch": 0.2615062761506276,
      "eval_accuracy": 0.15595240798761018,
      "eval_loss": 4.976524353027344,
      "eval_runtime": 184.12,
      "eval_samples_per_second": 80.165,
      "eval_steps_per_second": 0.63,
      "step": 500
    },
    {
      "epoch": 0.3138075313807531,
      "eval_accuracy": 0.16177388065723808,
      "eval_loss": 4.927117347717285,
      "eval_runtime": 184.3296,
      "eval_samples_per_second": 80.074,
      "eval_steps_per_second": 0.629,
      "step": 600
    },
    {
      "epoch": 0.36610878661087864,
      "eval_accuracy": 0.16636529077842815,
      "eval_loss": 4.898035049438477,
      "eval_runtime": 184.3142,
      "eval_samples_per_second": 80.081,
      "eval_steps_per_second": 0.629,
      "step": 700
    },
    {
      "epoch": 0.41841004184100417,
      "eval_accuracy": 0.1694151293866953,
      "eval_loss": 4.875,
      "eval_runtime": 185.3011,
      "eval_samples_per_second": 79.654,
      "eval_steps_per_second": 0.626,
      "step": 800
    },
    {
      "epoch": 0.4707112970711297,
      "eval_accuracy": 0.17219956637036357,
      "eval_loss": 4.854589939117432,
      "eval_runtime": 184.8498,
      "eval_samples_per_second": 79.849,
      "eval_steps_per_second": 0.628,
      "step": 900
    },
    {
      "epoch": 0.5230125523012552,
      "grad_norm": 1.2265625,
      "learning_rate": 4.128312412831242e-05,
      "loss": 4.8385,
      "step": 1000
    },
    {
      "epoch": 0.5230125523012552,
      "eval_accuracy": 0.17474116808932638,
      "eval_loss": 4.833265781402588,
      "eval_runtime": 184.5693,
      "eval_samples_per_second": 79.97,
      "eval_steps_per_second": 0.628,
      "step": 1000
    },
    {
      "epoch": 0.5753138075313807,
      "eval_accuracy": 0.17643840085242252,
      "eval_loss": 4.817920207977295,
      "eval_runtime": 184.1405,
      "eval_samples_per_second": 80.156,
      "eval_steps_per_second": 0.63,
      "step": 1100
    },
    {
      "epoch": 0.6276150627615062,
      "eval_accuracy": 0.1773740269532399,
      "eval_loss": 4.811416149139404,
      "eval_runtime": 184.0435,
      "eval_samples_per_second": 80.198,
      "eval_steps_per_second": 0.63,
      "step": 1200
    },
    {
      "epoch": 0.6799163179916318,
      "eval_accuracy": 0.17846844188652472,
      "eval_loss": 4.802201747894287,
      "eval_runtime": 185.0497,
      "eval_samples_per_second": 79.762,
      "eval_steps_per_second": 0.627,
      "step": 1300
    },
    {
      "epoch": 0.7322175732217573,
      "eval_accuracy": 0.1789691379773809,
      "eval_loss": 4.7901930809021,
      "eval_runtime": 184.9855,
      "eval_samples_per_second": 79.79,
      "eval_steps_per_second": 0.627,
      "step": 1400
    },
    {
      "epoch": 0.7845188284518828,
      "grad_norm": 1.5703125,
      "learning_rate": 3.6924686192468624e-05,
      "loss": 4.7486,
      "step": 1500
    },
    {
      "epoch": 0.7845188284518828,
      "eval_accuracy": 0.18001693513769368,
      "eval_loss": 4.785585880279541,
      "eval_runtime": 183.9922,
      "eval_samples_per_second": 80.221,
      "eval_steps_per_second": 0.63,
      "step": 1500
    },
    {
      "epoch": 0.8368200836820083,
      "eval_accuracy": 0.18058760065826654,
      "eval_loss": 4.779539108276367,
      "eval_runtime": 184.1103,
      "eval_samples_per_second": 80.169,
      "eval_steps_per_second": 0.63,
      "step": 1600
    },
    {
      "epoch": 0.8891213389121339,
      "eval_accuracy": 0.18107188047030814,
      "eval_loss": 4.776101112365723,
      "eval_runtime": 184.1674,
      "eval_samples_per_second": 80.144,
      "eval_steps_per_second": 0.63,
      "step": 1700
    },
    {
      "epoch": 0.9414225941422594,
      "eval_accuracy": 0.18138128706525738,
      "eval_loss": 4.7754740715026855,
      "eval_runtime": 184.197,
      "eval_samples_per_second": 80.132,
      "eval_steps_per_second": 0.63,
      "step": 1800
    },
    {
      "epoch": 0.9937238493723849,
      "eval_accuracy": 0.18191716908996425,
      "eval_loss": 4.767343997955322,
      "eval_runtime": 184.0497,
      "eval_samples_per_second": 80.196,
      "eval_steps_per_second": 0.63,
      "step": 1900
    },
    {
      "epoch": 1.0460251046025104,
      "grad_norm": 1.203125,
      "learning_rate": 3.2566248256624825e-05,
      "loss": 4.7159,
      "step": 2000
    },
    {
      "epoch": 1.0460251046025104,
      "eval_accuracy": 0.18160189904678178,
      "eval_loss": 4.769783020019531,
      "eval_runtime": 185.2615,
      "eval_samples_per_second": 79.671,
      "eval_steps_per_second": 0.626,
      "step": 2000
    },
    {
      "epoch": 1.098326359832636,
      "eval_accuracy": 0.18223365637995678,
      "eval_loss": 4.763906002044678,
      "eval_runtime": 184.3991,
      "eval_samples_per_second": 80.044,
      "eval_steps_per_second": 0.629,
      "step": 2100
    },
    {
      "epoch": 1.1506276150627615,
      "eval_accuracy": 0.18256457647135646,
      "eval_loss": 4.761280536651611,
      "eval_runtime": 185.0808,
      "eval_samples_per_second": 79.749,
      "eval_steps_per_second": 0.627,
      "step": 2200
    },
    {
      "epoch": 1.202928870292887,
      "eval_accuracy": 0.18278985816542134,
      "eval_loss": 4.7557759284973145,
      "eval_runtime": 184.6355,
      "eval_samples_per_second": 79.941,
      "eval_steps_per_second": 0.628,
      "step": 2300
    },
    {
      "epoch": 1.2552301255230125,
      "eval_accuracy": 0.18294602847055624,
      "eval_loss": 4.75867223739624,
      "eval_runtime": 184.9626,
      "eval_samples_per_second": 79.8,
      "eval_steps_per_second": 0.627,
      "step": 2400
    },
    {
      "epoch": 1.3075313807531381,
      "grad_norm": 1.75,
      "learning_rate": 2.8207810320781032e-05,
      "loss": 4.6997,
      "step": 2500
    },
    {
      "epoch": 1.3075313807531381,
      "eval_accuracy": 0.18342178332552747,
      "eval_loss": 4.754149913787842,
      "eval_runtime": 185.1377,
      "eval_samples_per_second": 79.724,
      "eval_steps_per_second": 0.627,
      "step": 2500
    },
    {
      "epoch": 1.3598326359832635,
      "eval_accuracy": 0.1834138543952502,
      "eval_loss": 4.75181245803833,
      "eval_runtime": 184.6159,
      "eval_samples_per_second": 79.95,
      "eval_steps_per_second": 0.628,
      "step": 2600
    },
    {
      "epoch": 1.4121338912133892,
      "eval_accuracy": 0.1836629457159783,
      "eval_loss": 4.746477127075195,
      "eval_runtime": 185.4196,
      "eval_samples_per_second": 79.603,
      "eval_steps_per_second": 0.626,
      "step": 2700
    },
    {
      "epoch": 1.4644351464435146,
      "eval_accuracy": 0.18383634667304455,
      "eval_loss": 4.750728130340576,
      "eval_runtime": 184.604,
      "eval_samples_per_second": 79.955,
      "eval_steps_per_second": 0.628,
      "step": 2800
    },
    {
      "epoch": 1.5167364016736402,
      "eval_accuracy": 0.1834985251891202,
      "eval_loss": 4.751083850860596,
      "eval_runtime": 183.9829,
      "eval_samples_per_second": 80.225,
      "eval_steps_per_second": 0.63,
      "step": 2900
    },
    {
      "epoch": 1.5690376569037658,
      "grad_norm": 1.1328125,
      "learning_rate": 2.3849372384937242e-05,
      "loss": 4.6905,
      "step": 3000
    },
    {
      "epoch": 1.5690376569037658,
      "eval_accuracy": 0.18385013375825832,
      "eval_loss": 4.750813007354736,
      "eval_runtime": 184.3904,
      "eval_samples_per_second": 80.048,
      "eval_steps_per_second": 0.629,
      "step": 3000
    },
    {
      "epoch": 1.6213389121338913,
      "eval_accuracy": 0.18415784707342428,
      "eval_loss": 4.746849536895752,
      "eval_runtime": 184.2103,
      "eval_samples_per_second": 80.126,
      "eval_steps_per_second": 0.63,
      "step": 3100
    },
    {
      "epoch": 1.6736401673640167,
      "eval_accuracy": 0.1842263748450887,
      "eval_loss": 4.746747970581055,
      "eval_runtime": 184.1204,
      "eval_samples_per_second": 80.165,
      "eval_steps_per_second": 0.63,
      "step": 3200
    },
    {
      "epoch": 1.7259414225941423,
      "eval_accuracy": 0.18430792521293346,
      "eval_loss": 4.745037078857422,
      "eval_runtime": 184.2802,
      "eval_samples_per_second": 80.095,
      "eval_steps_per_second": 0.629,
      "step": 3300
    },
    {
      "epoch": 1.778242677824268,
      "eval_accuracy": 0.184430838994751,
      "eval_loss": 4.746375560760498,
      "eval_runtime": 184.234,
      "eval_samples_per_second": 80.116,
      "eval_steps_per_second": 0.63,
      "step": 3400
    },
    {
      "epoch": 1.8305439330543933,
      "grad_norm": 2.46875,
      "learning_rate": 1.9490934449093446e-05,
      "loss": 4.687,
      "step": 3500
    },
    {
      "epoch": 1.8305439330543933,
      "eval_accuracy": 0.18449599939000436,
      "eval_loss": 4.7423272132873535,
      "eval_runtime": 184.1596,
      "eval_samples_per_second": 80.148,
      "eval_steps_per_second": 0.63,
      "step": 3500
    },
    {
      "epoch": 1.8828451882845187,
      "eval_accuracy": 0.18466320271445863,
      "eval_loss": 4.74322509765625,
      "eval_runtime": 183.9907,
      "eval_samples_per_second": 80.221,
      "eval_steps_per_second": 0.63,
      "step": 3600
    },
    {
      "epoch": 1.9351464435146444,
      "eval_accuracy": 0.18432753476168612,
      "eval_loss": 4.744410514831543,
      "eval_runtime": 183.9473,
      "eval_samples_per_second": 80.24,
      "eval_steps_per_second": 0.631,
      "step": 3700
    },
    {
      "epoch": 1.98744769874477,
      "eval_accuracy": 0.18470474137931034,
      "eval_loss": 4.74097204208374,
      "eval_runtime": 184.0059,
      "eval_samples_per_second": 80.215,
      "eval_steps_per_second": 0.63,
      "step": 3800
    },
    {
      "epoch": 2.0397489539748954,
      "eval_accuracy": 0.1845896528685144,
      "eval_loss": 4.744495391845703,
      "eval_runtime": 184.1792,
      "eval_samples_per_second": 80.139,
      "eval_steps_per_second": 0.63,
      "step": 3900
    },
    {
      "epoch": 2.092050209205021,
      "grad_norm": 1.015625,
      "learning_rate": 1.5132496513249652e-05,
      "loss": 4.6822,
      "step": 4000
    },
    {
      "epoch": 2.092050209205021,
      "eval_accuracy": 0.1840621610356637,
      "eval_loss": 4.743766784667969,
      "eval_runtime": 184.0079,
      "eval_samples_per_second": 80.214,
      "eval_steps_per_second": 0.63,
      "step": 4000
    },
    {
      "epoch": 2.1443514644351462,
      "eval_accuracy": 0.18438914008407487,
      "eval_loss": 4.742242336273193,
      "eval_runtime": 184.052,
      "eval_samples_per_second": 80.195,
      "eval_steps_per_second": 0.63,
      "step": 4100
    },
    {
      "epoch": 2.196652719665272,
      "eval_accuracy": 0.18475792298529636,
      "eval_loss": 4.741429328918457,
      "eval_runtime": 184.719,
      "eval_samples_per_second": 79.905,
      "eval_steps_per_second": 0.628,
      "step": 4200
    },
    {
      "epoch": 2.2489539748953975,
      "eval_accuracy": 0.1848885987251326,
      "eval_loss": 4.740514755249023,
      "eval_runtime": 184.0592,
      "eval_samples_per_second": 80.192,
      "eval_steps_per_second": 0.63,
      "step": 4300
    },
    {
      "epoch": 2.301255230125523,
      "eval_accuracy": 0.18497058180948975,
      "eval_loss": 4.738888740539551,
      "eval_runtime": 184.7818,
      "eval_samples_per_second": 79.878,
      "eval_steps_per_second": 0.628,
      "step": 4400
    },
    {
      "epoch": 2.3535564853556483,
      "grad_norm": 1.5546875,
      "learning_rate": 1.0774058577405859e-05,
      "loss": 4.6787,
      "step": 4500
    },
    {
      "epoch": 2.3535564853556483,
      "eval_accuracy": 0.18458716879639453,
      "eval_loss": 4.743495941162109,
      "eval_runtime": 185.012,
      "eval_samples_per_second": 79.779,
      "eval_steps_per_second": 0.627,
      "step": 4500
    },
    {
      "epoch": 2.405857740585774,
      "eval_accuracy": 0.18485314383095383,
      "eval_loss": 4.742412090301514,
      "eval_runtime": 184.018,
      "eval_samples_per_second": 80.21,
      "eval_steps_per_second": 0.63,
      "step": 4600
    },
    {
      "epoch": 2.4581589958158996,
      "eval_accuracy": 0.18451762653729054,
      "eval_loss": 4.744495391845703,
      "eval_runtime": 184.1989,
      "eval_samples_per_second": 80.131,
      "eval_steps_per_second": 0.63,
      "step": 4700
    },
    {
      "epoch": 2.510460251046025,
      "eval_accuracy": 0.18499375944599727,
      "eval_loss": 4.742056369781494,
      "eval_runtime": 184.9544,
      "eval_samples_per_second": 79.803,
      "eval_steps_per_second": 0.627,
      "step": 4800
    },
    {
      "epoch": 2.562761506276151,
      "eval_accuracy": 0.1845635125940072,
      "eval_loss": 4.74495267868042,
      "eval_runtime": 184.45,
      "eval_samples_per_second": 80.022,
      "eval_steps_per_second": 0.629,
      "step": 4900
    },
    {
      "epoch": 2.6150627615062763,
      "grad_norm": 1.25,
      "learning_rate": 6.415620641562065e-06,
      "loss": 4.6809,
      "step": 5000
    },
    {
      "epoch": 2.6150627615062763,
      "eval_accuracy": 0.18461182940869522,
      "eval_loss": 4.739973068237305,
      "eval_runtime": 185.0928,
      "eval_samples_per_second": 79.744,
      "eval_steps_per_second": 0.627,
      "step": 5000
    },
    {
      "epoch": 2.6673640167364017,
      "eval_accuracy": 0.18471778348337312,
      "eval_loss": 4.740243911743164,
      "eval_runtime": 184.3979,
      "eval_samples_per_second": 80.044,
      "eval_steps_per_second": 0.629,
      "step": 5100
    },
    {
      "epoch": 2.719665271966527,
      "eval_accuracy": 0.18489124970131737,
      "eval_loss": 4.738804340362549,
      "eval_runtime": 184.6067,
      "eval_samples_per_second": 79.954,
      "eval_steps_per_second": 0.628,
      "step": 5200
    },
    {
      "epoch": 2.7719665271966525,
      "eval_accuracy": 0.18462476994845917,
      "eval_loss": 4.741514205932617,
      "eval_runtime": 184.2661,
      "eval_samples_per_second": 80.102,
      "eval_steps_per_second": 0.63,
      "step": 5300
    },
    {
      "epoch": 2.8242677824267783,
      "eval_accuracy": 0.18472434773026122,
      "eval_loss": 4.739160060882568,
      "eval_runtime": 185.2276,
      "eval_samples_per_second": 79.686,
      "eval_steps_per_second": 0.626,
      "step": 5400
    },
    {
      "epoch": 2.8765690376569037,
      "grad_norm": 1.6796875,
      "learning_rate": 2.057182705718271e-06,
      "loss": 4.6819,
      "step": 5500
    },
    {
      "epoch": 2.8765690376569037,
      "eval_accuracy": 0.18477532080101736,
      "eval_loss": 4.742056369781494,
      "eval_runtime": 184.4871,
      "eval_samples_per_second": 80.006,
      "eval_steps_per_second": 0.629,
      "step": 5500
    },
    {
      "epoch": 2.928870292887029,
      "eval_accuracy": 0.18478108276943064,
      "eval_loss": 4.740701198577881,
      "eval_runtime": 184.224,
      "eval_samples_per_second": 80.12,
      "eval_steps_per_second": 0.63,
      "step": 5600
    }
  ],
  "logging_steps": 500,
  "max_steps": 5736,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "total_flos": 6.743272090868122e+17,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}