{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 3118, "global_step": 31174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03207801372938988, "grad_norm": 0.8564678430557251, "learning_rate": 0.0007978657428198713, "loss": 0.5444, "step": 500 }, { "epoch": 0.06415602745877975, "grad_norm": 0.7482010722160339, "learning_rate": 0.0007957314856397425, "loss": 0.461, "step": 1000 }, { "epoch": 0.09623404118816963, "grad_norm": 0.9299066662788391, "learning_rate": 0.0007935929513911166, "loss": 0.4262, "step": 1500 }, { "epoch": 0.1283120549175595, "grad_norm": 0.742969810962677, "learning_rate": 0.0007914544171424906, "loss": 0.4121, "step": 2000 }, { "epoch": 0.16039006864694938, "grad_norm": 1.0129729509353638, "learning_rate": 0.0007893158828938645, "loss": 0.3987, "step": 2500 }, { "epoch": 0.19246808237633925, "grad_norm": 0.7069104313850403, "learning_rate": 0.0007871816257137359, "loss": 0.385, "step": 3000 }, { "epoch": 0.20003849361647527, "eval_loss": 0.36575084924697876, "eval_runtime": 5.7691, "eval_samples_per_second": 86.668, "eval_steps_per_second": 5.547, "step": 3118 }, { "epoch": 0.22454609610572912, "grad_norm": 0.8256860375404358, "learning_rate": 0.0007850473685336071, "loss": 0.3829, "step": 3500 }, { "epoch": 0.256624109835119, "grad_norm": 0.9610025882720947, "learning_rate": 0.0007829088342849811, "loss": 0.3745, "step": 4000 }, { "epoch": 0.28870212356450886, "grad_norm": 0.9205407500267029, "learning_rate": 0.0007807703000363551, "loss": 0.3767, "step": 4500 }, { "epoch": 0.32078013729389876, "grad_norm": 1.0542463064193726, "learning_rate": 0.0007786317657877291, "loss": 0.3699, "step": 5000 }, { "epoch": 0.35285815102328866, "grad_norm": 0.8851079344749451, "learning_rate": 0.0007764932315391031, "loss": 0.3684, "step": 5500 }, { "epoch": 0.3849361647526785, "grad_norm": 0.8109485507011414, "learning_rate": 0.0007743546972904772, "loss": 0.3677, "step": 6000 }, { "epoch": 0.40007698723295054, "eval_loss": 0.34624484181404114, "eval_runtime": 5.7422, "eval_samples_per_second": 87.074, "eval_steps_per_second": 5.573, "step": 6236 }, { "epoch": 0.4170141784820684, "grad_norm": 0.8642650246620178, "learning_rate": 0.0007722161630418511, "loss": 0.3578, "step": 6500 }, { "epoch": 0.44909219221145824, "grad_norm": 0.8750723600387573, "learning_rate": 0.0007700776287932251, "loss": 0.358, "step": 7000 }, { "epoch": 0.48117020594084814, "grad_norm": 0.9213278889656067, "learning_rate": 0.0007679390945445992, "loss": 0.3568, "step": 7500 }, { "epoch": 0.513248219670238, "grad_norm": 1.209458589553833, "learning_rate": 0.0007658048373644704, "loss": 0.3509, "step": 8000 }, { "epoch": 0.5453262333996279, "grad_norm": 0.8048808574676514, "learning_rate": 0.0007636663031158445, "loss": 0.3491, "step": 8500 }, { "epoch": 0.5774042471290177, "grad_norm": 0.8589063882827759, "learning_rate": 0.0007615277688672184, "loss": 0.3488, "step": 9000 }, { "epoch": 0.6001154808494258, "eval_loss": 0.3319118916988373, "eval_runtime": 5.941, "eval_samples_per_second": 84.161, "eval_steps_per_second": 5.386, "step": 9354 }, { "epoch": 0.6094822608584076, "grad_norm": 1.1071431636810303, "learning_rate": 0.0007593892346185924, "loss": 0.3475, "step": 9500 }, { "epoch": 0.6415602745877975, "grad_norm": 1.250051736831665, "learning_rate": 0.0007572549774384637, "loss": 0.3434, "step": 10000 }, { "epoch": 0.6736382883171874, "grad_norm": 0.9173659682273865, "learning_rate": 0.0007551164431898377, "loss": 0.3425, "step": 10500 }, { "epoch": 0.7057163020465773, "grad_norm": 0.9546225666999817, "learning_rate": 0.000752982186009709, "loss": 0.3415, "step": 11000 }, { "epoch": 0.7377943157759671, "grad_norm": 0.756817102432251, "learning_rate": 0.000750843651761083, "loss": 0.3366, "step": 11500 }, { "epoch": 0.769872329505357, "grad_norm": 0.7823662757873535, "learning_rate": 0.0007487051175124569, "loss": 0.3397, "step": 12000 }, { "epoch": 0.8001539744659011, "eval_loss": 0.33135783672332764, "eval_runtime": 5.8442, "eval_samples_per_second": 85.554, "eval_steps_per_second": 5.475, "step": 12472 }, { "epoch": 0.8019503432347469, "grad_norm": 1.3129873275756836, "learning_rate": 0.000746566583263831, "loss": 0.3342, "step": 12500 }, { "epoch": 0.8340283569641368, "grad_norm": 1.0603216886520386, "learning_rate": 0.000744428049015205, "loss": 0.3372, "step": 13000 }, { "epoch": 0.8661063706935267, "grad_norm": 0.9776498079299927, "learning_rate": 0.0007422895147665791, "loss": 0.3343, "step": 13500 }, { "epoch": 0.8981843844229165, "grad_norm": 0.9603497385978699, "learning_rate": 0.000740150980517953, "loss": 0.332, "step": 14000 }, { "epoch": 0.9302623981523064, "grad_norm": 1.0065163373947144, "learning_rate": 0.0007380124462693271, "loss": 0.335, "step": 14500 }, { "epoch": 0.9623404118816963, "grad_norm": 0.947246789932251, "learning_rate": 0.0007358739120207011, "loss": 0.3322, "step": 15000 }, { "epoch": 0.9944184256110862, "grad_norm": 1.138590693473816, "learning_rate": 0.0007337396548405722, "loss": 0.3329, "step": 15500 }, { "epoch": 1.0001924680823764, "eval_loss": 0.3191450238227844, "eval_runtime": 5.9188, "eval_samples_per_second": 84.477, "eval_steps_per_second": 5.407, "step": 15590 }, { "epoch": 1.026496439340476, "grad_norm": 1.0730034112930298, "learning_rate": 0.0007316053976604436, "loss": 0.3214, "step": 16000 }, { "epoch": 1.058574453069866, "grad_norm": 1.155540108680725, "learning_rate": 0.0007294668634118175, "loss": 0.3203, "step": 16500 }, { "epoch": 1.0906524667992559, "grad_norm": 1.322080373764038, "learning_rate": 0.0007273283291631916, "loss": 0.32, "step": 17000 }, { "epoch": 1.1227304805286458, "grad_norm": 1.028536319732666, "learning_rate": 0.0007251897949145656, "loss": 0.3206, "step": 17500 }, { "epoch": 1.1548084942580354, "grad_norm": 1.0141762495040894, "learning_rate": 0.0007230512606659396, "loss": 0.3246, "step": 18000 }, { "epoch": 1.1868865079874253, "grad_norm": 1.2617709636688232, "learning_rate": 0.0007209127264173135, "loss": 0.3179, "step": 18500 }, { "epoch": 1.2002309616988516, "eval_loss": 0.3040919303894043, "eval_runtime": 5.8325, "eval_samples_per_second": 85.727, "eval_steps_per_second": 5.487, "step": 18708 }, { "epoch": 1.2189645217168152, "grad_norm": 0.9643025398254395, "learning_rate": 0.0007187741921686877, "loss": 0.3131, "step": 19000 }, { "epoch": 1.2510425354462051, "grad_norm": 0.8644528388977051, "learning_rate": 0.0007166399349885588, "loss": 0.3197, "step": 19500 }, { "epoch": 1.283120549175595, "grad_norm": 1.0242154598236084, "learning_rate": 0.000714501400739933, "loss": 0.3189, "step": 20000 }, { "epoch": 1.315198562904985, "grad_norm": 0.7361490726470947, "learning_rate": 0.0007123628664913069, "loss": 0.3147, "step": 20500 }, { "epoch": 1.3472765766343748, "grad_norm": 0.9061699509620667, "learning_rate": 0.0007102243322426809, "loss": 0.3168, "step": 21000 }, { "epoch": 1.3793545903637647, "grad_norm": 0.7674645781517029, "learning_rate": 0.000708085797994055, "loss": 0.3144, "step": 21500 }, { "epoch": 1.400269455315327, "eval_loss": 0.303521990776062, "eval_runtime": 5.9543, "eval_samples_per_second": 83.973, "eval_steps_per_second": 5.374, "step": 21826 }, { "epoch": 1.4114326040931546, "grad_norm": 1.2573202848434448, "learning_rate": 0.0007059472637454289, "loss": 0.3182, "step": 22000 }, { "epoch": 1.4435106178225445, "grad_norm": 0.7668033838272095, "learning_rate": 0.0007038087294968029, "loss": 0.3087, "step": 22500 }, { "epoch": 1.4755886315519344, "grad_norm": 0.7923159003257751, "learning_rate": 0.0007016701952481769, "loss": 0.3136, "step": 23000 }, { "epoch": 1.5076666452813243, "grad_norm": 0.9079853296279907, "learning_rate": 0.000699531660999551, "loss": 0.3136, "step": 23500 }, { "epoch": 1.5397446590107142, "grad_norm": 0.807373583316803, "learning_rate": 0.0006973974038194221, "loss": 0.3129, "step": 24000 }, { "epoch": 1.571822672740104, "grad_norm": 1.0894283056259155, "learning_rate": 0.0006952588695707963, "loss": 0.3122, "step": 24500 }, { "epoch": 1.6003079489318022, "eval_loss": 0.30009227991104126, "eval_runtime": 5.9543, "eval_samples_per_second": 83.973, "eval_steps_per_second": 5.374, "step": 24944 }, { "epoch": 1.6039006864694938, "grad_norm": 0.8650055527687073, "learning_rate": 0.0006931203353221702, "loss": 0.3128, "step": 25000 }, { "epoch": 1.6359787001988837, "grad_norm": 1.0704525709152222, "learning_rate": 0.0006909818010735442, "loss": 0.3152, "step": 25500 }, { "epoch": 1.6680567139282736, "grad_norm": 1.6046242713928223, "learning_rate": 0.0006888518209619127, "loss": 0.3153, "step": 26000 }, { "epoch": 1.7001347276576635, "grad_norm": 0.891106367111206, "learning_rate": 0.0006867132867132868, "loss": 0.3123, "step": 26500 }, { "epoch": 1.7322127413870532, "grad_norm": 0.8591095805168152, "learning_rate": 0.0006845747524646608, "loss": 0.31, "step": 27000 }, { "epoch": 1.764290755116443, "grad_norm": 0.8793129920959473, "learning_rate": 0.0006824362182160348, "loss": 0.3135, "step": 27500 }, { "epoch": 1.796368768845833, "grad_norm": 0.9400936961174011, "learning_rate": 0.0006802976839674088, "loss": 0.3077, "step": 28000 }, { "epoch": 1.8003464425482774, "eval_loss": 0.29524701833724976, "eval_runtime": 5.9571, "eval_samples_per_second": 83.934, "eval_steps_per_second": 5.372, "step": 28062 }, { "epoch": 1.8284467825752229, "grad_norm": 0.7908840775489807, "learning_rate": 0.0006781591497187827, "loss": 0.309, "step": 28500 }, { "epoch": 1.8605247963046128, "grad_norm": 1.1478577852249146, "learning_rate": 0.0006760206154701568, "loss": 0.305, "step": 29000 }, { "epoch": 1.8926028100340027, "grad_norm": 0.7777372598648071, "learning_rate": 0.0006738820812215308, "loss": 0.3092, "step": 29500 }, { "epoch": 1.9246808237633926, "grad_norm": 0.8342514634132385, "learning_rate": 0.000671747824041402, "loss": 0.306, "step": 30000 }, { "epoch": 1.9567588374927825, "grad_norm": 0.9895392060279846, "learning_rate": 0.0006696092897927761, "loss": 0.3128, "step": 30500 }, { "epoch": 1.9888368512221724, "grad_norm": 1.0536723136901855, "learning_rate": 0.0006674750326126473, "loss": 0.3066, "step": 31000 } ], "logging_steps": 500, "max_steps": 187044, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0952725212375286e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }