| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 10.0, | |
| "eval_steps": 50, | |
| "global_step": 670, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.14925373134328357, | |
| "grad_norm": 35.596871044953, | |
| "learning_rate": 1.4925373134328358e-06, | |
| "loss": 1.9213, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.29850746268656714, | |
| "grad_norm": 20.609759250234525, | |
| "learning_rate": 2.9850746268656716e-06, | |
| "loss": 1.151, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.44776119402985076, | |
| "grad_norm": 17.51541675553714, | |
| "learning_rate": 4.477611940298508e-06, | |
| "loss": 0.9953, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.5970149253731343, | |
| "grad_norm": 12.529056819699033, | |
| "learning_rate": 5.970149253731343e-06, | |
| "loss": 1.0143, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.746268656716418, | |
| "grad_norm": 12.891928164600921, | |
| "learning_rate": 7.46268656716418e-06, | |
| "loss": 1.0534, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.746268656716418, | |
| "eval_loss": 1.2635215520858765, | |
| "eval_runtime": 0.9888, | |
| "eval_samples_per_second": 22.25, | |
| "eval_steps_per_second": 3.034, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.8955223880597015, | |
| "grad_norm": 13.18139035026108, | |
| "learning_rate": 8.955223880597016e-06, | |
| "loss": 1.0096, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.044776119402985, | |
| "grad_norm": 10.740181523405264, | |
| "learning_rate": 9.999389284703265e-06, | |
| "loss": 0.9688, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.1940298507462686, | |
| "grad_norm": 10.70923590419392, | |
| "learning_rate": 9.988536273658876e-06, | |
| "loss": 0.711, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.3432835820895521, | |
| "grad_norm": 11.258468673854201, | |
| "learning_rate": 9.964145714351633e-06, | |
| "loss": 0.7263, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.4925373134328357, | |
| "grad_norm": 13.780566009646858, | |
| "learning_rate": 9.926283796211796e-06, | |
| "loss": 0.8118, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.4925373134328357, | |
| "eval_loss": 1.3804577589035034, | |
| "eval_runtime": 0.9931, | |
| "eval_samples_per_second": 22.152, | |
| "eval_steps_per_second": 3.021, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.6417910447761193, | |
| "grad_norm": 9.18658078765986, | |
| "learning_rate": 9.87505326632108e-06, | |
| "loss": 0.8126, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.7910447761194028, | |
| "grad_norm": 10.993066748048813, | |
| "learning_rate": 9.810593150584658e-06, | |
| "loss": 0.8099, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.9402985074626866, | |
| "grad_norm": 8.891553991106202, | |
| "learning_rate": 9.733078376452172e-06, | |
| "loss": 0.804, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.08955223880597, | |
| "grad_norm": 8.268316338255294, | |
| "learning_rate": 9.642719298211602e-06, | |
| "loss": 0.4978, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.2388059701492535, | |
| "grad_norm": 8.48854187035418, | |
| "learning_rate": 9.539761126144193e-06, | |
| "loss": 0.3889, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.2388059701492535, | |
| "eval_loss": 1.6006965637207031, | |
| "eval_runtime": 0.9927, | |
| "eval_samples_per_second": 22.162, | |
| "eval_steps_per_second": 3.022, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.388059701492537, | |
| "grad_norm": 6.609934011045634, | |
| "learning_rate": 9.424483261089584e-06, | |
| "loss": 0.3846, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.5373134328358207, | |
| "grad_norm": 7.50682908737009, | |
| "learning_rate": 9.297198536226927e-06, | |
| "loss": 0.4125, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.6865671641791042, | |
| "grad_norm": 8.821510256598781, | |
| "learning_rate": 9.158252368129628e-06, | |
| "loss": 0.4261, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.835820895522388, | |
| "grad_norm": 7.416255028966862, | |
| "learning_rate": 9.008021819397488e-06, | |
| "loss": 0.4488, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.9850746268656714, | |
| "grad_norm": 8.233603986842171, | |
| "learning_rate": 8.846914575410035e-06, | |
| "loss": 0.4361, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.9850746268656714, | |
| "eval_loss": 1.532719612121582, | |
| "eval_runtime": 0.9841, | |
| "eval_samples_per_second": 22.354, | |
| "eval_steps_per_second": 3.048, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 3.1343283582089554, | |
| "grad_norm": 8.824258454974311, | |
| "learning_rate": 8.675367837977848e-06, | |
| "loss": 0.2399, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 3.283582089552239, | |
| "grad_norm": 7.063462594451812, | |
| "learning_rate": 8.49384713889421e-06, | |
| "loss": 0.2246, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 3.4328358208955225, | |
| "grad_norm": 7.331660049583603, | |
| "learning_rate": 8.302845076606786e-06, | |
| "loss": 0.2415, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 3.582089552238806, | |
| "grad_norm": 6.424375821012275, | |
| "learning_rate": 8.10287997943769e-06, | |
| "loss": 0.2615, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.7313432835820897, | |
| "grad_norm": 9.043879070705094, | |
| "learning_rate": 7.894494498979558e-06, | |
| "loss": 0.265, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.7313432835820897, | |
| "eval_loss": 1.6067496538162231, | |
| "eval_runtime": 0.99, | |
| "eval_samples_per_second": 22.223, | |
| "eval_steps_per_second": 3.03, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.8805970149253732, | |
| "grad_norm": 7.232856290764463, | |
| "learning_rate": 7.678254137484797e-06, | |
| "loss": 0.2371, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 4.029850746268656, | |
| "grad_norm": 3.885849230377616, | |
| "learning_rate": 7.4547457132442895e-06, | |
| "loss": 0.2234, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 4.17910447761194, | |
| "grad_norm": 8.201942272896055, | |
| "learning_rate": 7.2245757681200835e-06, | |
| "loss": 0.1273, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 4.3283582089552235, | |
| "grad_norm": 6.742950931031516, | |
| "learning_rate": 6.988368921553601e-06, | |
| "loss": 0.1232, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 4.477611940298507, | |
| "grad_norm": 7.18067667554869, | |
| "learning_rate": 6.746766175516159e-06, | |
| "loss": 0.1347, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.477611940298507, | |
| "eval_loss": 1.8177189826965332, | |
| "eval_runtime": 0.9881, | |
| "eval_samples_per_second": 22.265, | |
| "eval_steps_per_second": 3.036, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.6268656716417915, | |
| "grad_norm": 4.134448034439449, | |
| "learning_rate": 6.500423175001705e-06, | |
| "loss": 0.1326, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 4.776119402985074, | |
| "grad_norm": 5.984462599094163, | |
| "learning_rate": 6.2500084287822925e-06, | |
| "loss": 0.1424, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 4.925373134328359, | |
| "grad_norm": 7.603214916490548, | |
| "learning_rate": 5.996201495254757e-06, | |
| "loss": 0.1489, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 5.074626865671641, | |
| "grad_norm": 4.174840343630945, | |
| "learning_rate": 5.73969113830165e-06, | |
| "loss": 0.0922, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 5.223880597014926, | |
| "grad_norm": 4.1502544333430995, | |
| "learning_rate": 5.481173458170952e-06, | |
| "loss": 0.0857, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 5.223880597014926, | |
| "eval_loss": 1.977053165435791, | |
| "eval_runtime": 0.9844, | |
| "eval_samples_per_second": 22.349, | |
| "eval_steps_per_second": 3.048, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 5.373134328358209, | |
| "grad_norm": 6.862487683858245, | |
| "learning_rate": 5.221350002446882e-06, | |
| "loss": 0.0723, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 5.522388059701493, | |
| "grad_norm": 4.515093540059101, | |
| "learning_rate": 4.96092586223808e-06, | |
| "loss": 0.0693, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 5.6716417910447765, | |
| "grad_norm": 5.571199681266628, | |
| "learning_rate": 4.700607758749626e-06, | |
| "loss": 0.0978, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 5.82089552238806, | |
| "grad_norm": 4.718390811556805, | |
| "learning_rate": 4.441102125431398e-06, | |
| "loss": 0.0792, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 5.970149253731344, | |
| "grad_norm": 3.7184414288353422, | |
| "learning_rate": 4.183113190907349e-06, | |
| "loss": 0.0709, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5.970149253731344, | |
| "eval_loss": 1.9007922410964966, | |
| "eval_runtime": 0.9935, | |
| "eval_samples_per_second": 22.145, | |
| "eval_steps_per_second": 3.02, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 6.119402985074627, | |
| "grad_norm": 3.9943470043032603, | |
| "learning_rate": 3.927341067888065e-06, | |
| "loss": 0.0309, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 6.268656716417911, | |
| "grad_norm": 2.122106221543816, | |
| "learning_rate": 3.6744798532528137e-06, | |
| "loss": 0.0318, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 6.417910447761194, | |
| "grad_norm": 5.125755419214468, | |
| "learning_rate": 3.4252157444569478e-06, | |
| "loss": 0.0388, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 6.567164179104478, | |
| "grad_norm": 3.6926779235153173, | |
| "learning_rate": 3.1802251773762294e-06, | |
| "loss": 0.0549, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 6.7164179104477615, | |
| "grad_norm": 3.2890545865060345, | |
| "learning_rate": 2.9401729906414385e-06, | |
| "loss": 0.0474, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 6.7164179104477615, | |
| "eval_loss": 2.131742238998413, | |
| "eval_runtime": 0.9906, | |
| "eval_samples_per_second": 22.208, | |
| "eval_steps_per_second": 3.028, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 6.865671641791045, | |
| "grad_norm": 2.1811442357317303, | |
| "learning_rate": 2.7057106214448216e-06, | |
| "loss": 0.0502, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 7.014925373134329, | |
| "grad_norm": 1.2788531958401737, | |
| "learning_rate": 2.4774743377144265e-06, | |
| "loss": 0.0228, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 7.164179104477612, | |
| "grad_norm": 5.299430914553839, | |
| "learning_rate": 2.256083511453747e-06, | |
| "loss": 0.0145, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 7.313432835820896, | |
| "grad_norm": 2.916687972196201, | |
| "learning_rate": 2.042138937932388e-06, | |
| "loss": 0.019, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 7.462686567164179, | |
| "grad_norm": 0.1327510881497756, | |
| "learning_rate": 1.8362212052889827e-06, | |
| "loss": 0.0286, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 7.462686567164179, | |
| "eval_loss": 2.2198660373687744, | |
| "eval_runtime": 0.9905, | |
| "eval_samples_per_second": 22.21, | |
| "eval_steps_per_second": 3.029, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 7.611940298507463, | |
| "grad_norm": 2.6883922474481623, | |
| "learning_rate": 1.63888911897084e-06, | |
| "loss": 0.0177, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 7.7611940298507465, | |
| "grad_norm": 2.30912712845238, | |
| "learning_rate": 1.4506781852859836e-06, | |
| "loss": 0.0247, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 7.91044776119403, | |
| "grad_norm": 0.9713338532013991, | |
| "learning_rate": 1.2720991581827852e-06, | |
| "loss": 0.0129, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 8.059701492537313, | |
| "grad_norm": 0.21817578611988062, | |
| "learning_rate": 1.1036366532008552e-06, | |
| "loss": 0.0118, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 8.208955223880597, | |
| "grad_norm": 0.9407606204330533, | |
| "learning_rate": 9.457478323545749e-07, | |
| "loss": 0.0091, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 8.208955223880597, | |
| "eval_loss": 2.2086477279663086, | |
| "eval_runtime": 0.9909, | |
| "eval_samples_per_second": 22.202, | |
| "eval_steps_per_second": 3.028, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 8.35820895522388, | |
| "grad_norm": 0.49920720779292554, | |
| "learning_rate": 7.988611635181099e-07, | |
| "loss": 0.0065, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 8.507462686567164, | |
| "grad_norm": 0.4774032163274442, | |
| "learning_rate": 6.633752576786251e-07, | |
| "loss": 0.0104, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 8.656716417910447, | |
| "grad_norm": 0.5916059752548383, | |
| "learning_rate": 5.396577872130676e-07, | |
| "loss": 0.006, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 8.805970149253731, | |
| "grad_norm": 0.2966687083499466, | |
| "learning_rate": 4.2804448812404754e-07, | |
| "loss": 0.006, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 8.955223880597014, | |
| "grad_norm": 0.7007110357286702, | |
| "learning_rate": 3.288382489424502e-07, | |
| "loss": 0.0054, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 8.955223880597014, | |
| "eval_loss": 2.2865116596221924, | |
| "eval_runtime": 0.9893, | |
| "eval_samples_per_second": 22.239, | |
| "eval_steps_per_second": 3.033, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 9.104477611940299, | |
| "grad_norm": 1.48416033550746, | |
| "learning_rate": 2.4230828876927293e-07, | |
| "loss": 0.006, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 9.253731343283581, | |
| "grad_norm": 1.0330803943081195, | |
| "learning_rate": 1.6868942668726408e-07, | |
| "loss": 0.0033, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 9.402985074626866, | |
| "grad_norm": 0.9190962016297656, | |
| "learning_rate": 1.0818144452496293e-07, | |
| "loss": 0.0064, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 9.552238805970148, | |
| "grad_norm": 0.03995791087011875, | |
| "learning_rate": 6.094854470245326e-08, | |
| "loss": 0.0023, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 9.701492537313433, | |
| "grad_norm": 0.04136050995455753, | |
| "learning_rate": 2.711890463007405e-08, | |
| "loss": 0.0038, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 9.701492537313433, | |
| "eval_loss": 2.301581621170044, | |
| "eval_runtime": 0.9856, | |
| "eval_samples_per_second": 22.322, | |
| "eval_steps_per_second": 3.044, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 9.850746268656717, | |
| "grad_norm": 0.875079509506855, | |
| "learning_rate": 6.784328869339218e-09, | |
| "loss": 0.0037, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.27404452578171096, | |
| "learning_rate": 0.0, | |
| "loss": 0.0034, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "step": 670, | |
| "total_flos": 51054080163840.0, | |
| "train_loss": 0.28853574658174125, | |
| "train_runtime": 2679.9258, | |
| "train_samples_per_second": 3.989, | |
| "train_steps_per_second": 0.25 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 670, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 51054080163840.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |