| { | |
| "best_metric": 3.070533275604248, | |
| "best_model_checkpoint": "./distilled3/checkpoint-46000", | |
| "epoch": 1.7583705765990183, | |
| "eval_steps": 2000, | |
| "global_step": 48000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "combined_loss": 13.355602264404297, | |
| "distill_loss": 1.4010732173919678, | |
| "epoch": 0, | |
| "step": 0, | |
| "student_mlm_loss": 25.310131072998047 | |
| }, | |
| { | |
| "epoch": 0.003663272034581288, | |
| "grad_norm": 11.128765106201172, | |
| "learning_rate": 1e-05, | |
| "loss": 17.4544, | |
| "step": 100 | |
| }, | |
| { | |
| "combined_loss": 9.379831314086914, | |
| "distill_loss": 1.5227235555648804, | |
| "epoch": 0.003663272034581288, | |
| "step": 100, | |
| "student_mlm_loss": 17.2369384765625 | |
| }, | |
| { | |
| "epoch": 0.007326544069162576, | |
| "grad_norm": 14.151921272277832, | |
| "learning_rate": 2e-05, | |
| "loss": 16.0099, | |
| "step": 200 | |
| }, | |
| { | |
| "combined_loss": 28.136512756347656, | |
| "distill_loss": 1.571045160293579, | |
| "epoch": 0.007326544069162576, | |
| "step": 200, | |
| "student_mlm_loss": 54.70198059082031 | |
| }, | |
| { | |
| "epoch": 0.010989816103743864, | |
| "grad_norm": 11.68195915222168, | |
| "learning_rate": 3e-05, | |
| "loss": 18.8223, | |
| "step": 300 | |
| }, | |
| { | |
| "combined_loss": 15.699158668518066, | |
| "distill_loss": 1.5519400835037231, | |
| "epoch": 0.010989816103743864, | |
| "step": 300, | |
| "student_mlm_loss": 29.846376419067383 | |
| }, | |
| { | |
| "epoch": 0.014653088138325152, | |
| "grad_norm": 8.982569694519043, | |
| "learning_rate": 4e-05, | |
| "loss": 16.9008, | |
| "step": 400 | |
| }, | |
| { | |
| "combined_loss": 3.035900592803955, | |
| "distill_loss": 1.4880340099334717, | |
| "epoch": 0.014653088138325152, | |
| "step": 400, | |
| "student_mlm_loss": 4.583766937255859 | |
| }, | |
| { | |
| "epoch": 0.01831636017290644, | |
| "grad_norm": 7.045658111572266, | |
| "learning_rate": 5e-05, | |
| "loss": 8.812, | |
| "step": 500 | |
| }, | |
| { | |
| "combined_loss": 7.002770900726318, | |
| "distill_loss": 1.351847529411316, | |
| "epoch": 0.01831636017290644, | |
| "step": 500, | |
| "student_mlm_loss": 12.653694152832031 | |
| }, | |
| { | |
| "epoch": 0.021979632207487727, | |
| "grad_norm": 4.265043258666992, | |
| "learning_rate": 4.9938570410595373e-05, | |
| "loss": 16.8853, | |
| "step": 600 | |
| }, | |
| { | |
| "combined_loss": 3.2060928344726562, | |
| "distill_loss": 1.2962806224822998, | |
| "epoch": 0.021979632207487727, | |
| "step": 600, | |
| "student_mlm_loss": 5.115904808044434 | |
| }, | |
| { | |
| "epoch": 0.025642904242069015, | |
| "grad_norm": 7.744924545288086, | |
| "learning_rate": 4.987714082119075e-05, | |
| "loss": 7.1609, | |
| "step": 700 | |
| }, | |
| { | |
| "combined_loss": 2.2816712856292725, | |
| "distill_loss": 1.5105196237564087, | |
| "epoch": 0.025642904242069015, | |
| "step": 700, | |
| "student_mlm_loss": 3.052823066711426 | |
| }, | |
| { | |
| "epoch": 0.029306176276650303, | |
| "grad_norm": 12.44052791595459, | |
| "learning_rate": 4.981571123178613e-05, | |
| "loss": 13.0471, | |
| "step": 800 | |
| }, | |
| { | |
| "combined_loss": 3.225351095199585, | |
| "distill_loss": 1.5753816366195679, | |
| "epoch": 0.029306176276650303, | |
| "step": 800, | |
| "student_mlm_loss": 4.8753204345703125 | |
| }, | |
| { | |
| "epoch": 0.032969448311231594, | |
| "grad_norm": 6.2059645652771, | |
| "learning_rate": 4.975428164238151e-05, | |
| "loss": 6.2833, | |
| "step": 900 | |
| }, | |
| { | |
| "combined_loss": 8.580605506896973, | |
| "distill_loss": 1.530474066734314, | |
| "epoch": 0.032969448311231594, | |
| "step": 900, | |
| "student_mlm_loss": 15.630736351013184 | |
| }, | |
| { | |
| "epoch": 0.03663272034581288, | |
| "grad_norm": 14.731459617614746, | |
| "learning_rate": 4.969285205297688e-05, | |
| "loss": 5.8549, | |
| "step": 1000 | |
| }, | |
| { | |
| "combined_loss": 3.7085845470428467, | |
| "distill_loss": 1.4659323692321777, | |
| "epoch": 0.03663272034581288, | |
| "step": 1000, | |
| "student_mlm_loss": 5.951236724853516 | |
| }, | |
| { | |
| "epoch": 0.04029599238039417, | |
| "grad_norm": 9.745060920715332, | |
| "learning_rate": 4.9631422463572256e-05, | |
| "loss": 5.174, | |
| "step": 1100 | |
| }, | |
| { | |
| "combined_loss": 4.752764701843262, | |
| "distill_loss": 1.4000483751296997, | |
| "epoch": 0.04029599238039417, | |
| "step": 1100, | |
| "student_mlm_loss": 8.105481147766113 | |
| }, | |
| { | |
| "epoch": 0.043959264414975455, | |
| "grad_norm": 13.801424026489258, | |
| "learning_rate": 4.9569992874167634e-05, | |
| "loss": 19.8368, | |
| "step": 1200 | |
| }, | |
| { | |
| "combined_loss": 3.1324005126953125, | |
| "distill_loss": 1.404078483581543, | |
| "epoch": 0.043959264414975455, | |
| "step": 1200, | |
| "student_mlm_loss": 4.860722541809082 | |
| }, | |
| { | |
| "epoch": 0.047622536449556746, | |
| "grad_norm": 52.244632720947266, | |
| "learning_rate": 4.9508563284763005e-05, | |
| "loss": 5.547, | |
| "step": 1300 | |
| }, | |
| { | |
| "combined_loss": 3.1176328659057617, | |
| "distill_loss": 1.3057805299758911, | |
| "epoch": 0.047622536449556746, | |
| "step": 1300, | |
| "student_mlm_loss": 4.929485321044922 | |
| }, | |
| { | |
| "epoch": 0.05128580848413803, | |
| "grad_norm": 47.002349853515625, | |
| "learning_rate": 4.944713369535838e-05, | |
| "loss": 4.7784, | |
| "step": 1400 | |
| }, | |
| { | |
| "combined_loss": 3.871903657913208, | |
| "distill_loss": 1.5537463426589966, | |
| "epoch": 0.05128580848413803, | |
| "step": 1400, | |
| "student_mlm_loss": 6.190061092376709 | |
| }, | |
| { | |
| "epoch": 0.05494908051871932, | |
| "grad_norm": 11.417911529541016, | |
| "learning_rate": 4.9385704105953754e-05, | |
| "loss": 5.9593, | |
| "step": 1500 | |
| }, | |
| { | |
| "combined_loss": 6.293668270111084, | |
| "distill_loss": 1.3082151412963867, | |
| "epoch": 0.05494908051871932, | |
| "step": 1500, | |
| "student_mlm_loss": 11.279121398925781 | |
| }, | |
| { | |
| "epoch": 0.058612352553300606, | |
| "grad_norm": 24.519105911254883, | |
| "learning_rate": 4.932427451654914e-05, | |
| "loss": 7.2762, | |
| "step": 1600 | |
| }, | |
| { | |
| "combined_loss": 3.350501775741577, | |
| "distill_loss": 1.4593900442123413, | |
| "epoch": 0.058612352553300606, | |
| "step": 1600, | |
| "student_mlm_loss": 5.241613388061523 | |
| }, | |
| { | |
| "epoch": 0.0622756245878819, | |
| "grad_norm": 42.58499526977539, | |
| "learning_rate": 4.926284492714451e-05, | |
| "loss": 7.1364, | |
| "step": 1700 | |
| }, | |
| { | |
| "combined_loss": 10.976073265075684, | |
| "distill_loss": 1.594639539718628, | |
| "epoch": 0.0622756245878819, | |
| "step": 1700, | |
| "student_mlm_loss": 20.357507705688477 | |
| }, | |
| { | |
| "epoch": 0.06593889662246319, | |
| "grad_norm": 105.27689361572266, | |
| "learning_rate": 4.920141533773989e-05, | |
| "loss": 5.7662, | |
| "step": 1800 | |
| }, | |
| { | |
| "combined_loss": 4.272126197814941, | |
| "distill_loss": 1.4649100303649902, | |
| "epoch": 0.06593889662246319, | |
| "step": 1800, | |
| "student_mlm_loss": 7.079341888427734 | |
| }, | |
| { | |
| "epoch": 0.06960216865704447, | |
| "grad_norm": 9.272991180419922, | |
| "learning_rate": 4.913998574833526e-05, | |
| "loss": 4.9898, | |
| "step": 1900 | |
| }, | |
| { | |
| "combined_loss": 2.2884514331817627, | |
| "distill_loss": 1.5105092525482178, | |
| "epoch": 0.06960216865704447, | |
| "step": 1900, | |
| "student_mlm_loss": 3.0663936138153076 | |
| }, | |
| { | |
| "epoch": 0.07326544069162576, | |
| "grad_norm": 15.299578666687012, | |
| "learning_rate": 4.9078556158930636e-05, | |
| "loss": 6.8909, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.07326544069162576, | |
| "eval_loss": 6.166979789733887, | |
| "eval_runtime": 2.1158, | |
| "eval_samples_per_second": 3306.616, | |
| "eval_steps_per_second": 13.234, | |
| "step": 2000 | |
| }, | |
| { | |
| "combined_loss": 5.612101078033447, | |
| "distill_loss": 1.332657814025879, | |
| "epoch": 0.07326544069162576, | |
| "step": 2000, | |
| "student_mlm_loss": 9.891544342041016 | |
| }, | |
| { | |
| "epoch": 0.07692871272620705, | |
| "grad_norm": 12.242279052734375, | |
| "learning_rate": 4.9017126569526014e-05, | |
| "loss": 8.6608, | |
| "step": 2100 | |
| }, | |
| { | |
| "combined_loss": 2.035828113555908, | |
| "distill_loss": 1.3731106519699097, | |
| "epoch": 0.07692871272620705, | |
| "step": 2100, | |
| "student_mlm_loss": 2.6985456943511963 | |
| }, | |
| { | |
| "epoch": 0.08059198476078834, | |
| "grad_norm": 27.212379455566406, | |
| "learning_rate": 4.8955696980121385e-05, | |
| "loss": 9.4649, | |
| "step": 2200 | |
| }, | |
| { | |
| "combined_loss": 2.5593996047973633, | |
| "distill_loss": 1.5456775426864624, | |
| "epoch": 0.08059198476078834, | |
| "step": 2200, | |
| "student_mlm_loss": 3.5731217861175537 | |
| }, | |
| { | |
| "epoch": 0.08425525679536962, | |
| "grad_norm": 9.444129943847656, | |
| "learning_rate": 4.889426739071676e-05, | |
| "loss": 12.6304, | |
| "step": 2300 | |
| }, | |
| { | |
| "combined_loss": 3.0112435817718506, | |
| "distill_loss": 1.268593192100525, | |
| "epoch": 0.08425525679536962, | |
| "step": 2300, | |
| "student_mlm_loss": 4.753893852233887 | |
| }, | |
| { | |
| "epoch": 0.08791852882995091, | |
| "grad_norm": 6.72172212600708, | |
| "learning_rate": 4.8832837801312134e-05, | |
| "loss": 4.2453, | |
| "step": 2400 | |
| }, | |
| { | |
| "combined_loss": 2.3823843002319336, | |
| "distill_loss": 1.3674836158752441, | |
| "epoch": 0.08791852882995091, | |
| "step": 2400, | |
| "student_mlm_loss": 3.397284984588623 | |
| }, | |
| { | |
| "epoch": 0.0915818008645322, | |
| "grad_norm": 88.5478744506836, | |
| "learning_rate": 4.877140821190752e-05, | |
| "loss": 4.6849, | |
| "step": 2500 | |
| }, | |
| { | |
| "combined_loss": 3.8919034004211426, | |
| "distill_loss": 1.523806095123291, | |
| "epoch": 0.0915818008645322, | |
| "step": 2500, | |
| "student_mlm_loss": 6.260000705718994 | |
| }, | |
| { | |
| "epoch": 0.09524507289911349, | |
| "grad_norm": 11.671692848205566, | |
| "learning_rate": 4.870997862250289e-05, | |
| "loss": 4.8686, | |
| "step": 2600 | |
| }, | |
| { | |
| "combined_loss": 2.8186635971069336, | |
| "distill_loss": 1.313085913658142, | |
| "epoch": 0.09524507289911349, | |
| "step": 2600, | |
| "student_mlm_loss": 4.3242411613464355 | |
| }, | |
| { | |
| "epoch": 0.09890834493369477, | |
| "grad_norm": 7.681136131286621, | |
| "learning_rate": 4.864854903309827e-05, | |
| "loss": 14.7468, | |
| "step": 2700 | |
| }, | |
| { | |
| "combined_loss": 2.6350021362304688, | |
| "distill_loss": 1.5300695896148682, | |
| "epoch": 0.09890834493369477, | |
| "step": 2700, | |
| "student_mlm_loss": 3.7399346828460693 | |
| }, | |
| { | |
| "epoch": 0.10257161696827606, | |
| "grad_norm": 10.245522499084473, | |
| "learning_rate": 4.858711944369364e-05, | |
| "loss": 4.7465, | |
| "step": 2800 | |
| }, | |
| { | |
| "combined_loss": 1.9805179834365845, | |
| "distill_loss": 1.3671844005584717, | |
| "epoch": 0.10257161696827606, | |
| "step": 2800, | |
| "student_mlm_loss": 2.5938515663146973 | |
| }, | |
| { | |
| "epoch": 0.10623488900285735, | |
| "grad_norm": 51.705352783203125, | |
| "learning_rate": 4.8525689854289016e-05, | |
| "loss": 3.8985, | |
| "step": 2900 | |
| }, | |
| { | |
| "combined_loss": 1.9335501194000244, | |
| "distill_loss": 1.3294615745544434, | |
| "epoch": 0.10623488900285735, | |
| "step": 2900, | |
| "student_mlm_loss": 2.5376386642456055 | |
| }, | |
| { | |
| "epoch": 0.10989816103743864, | |
| "grad_norm": 7.661074161529541, | |
| "learning_rate": 4.8464260264884394e-05, | |
| "loss": 3.9846, | |
| "step": 3000 | |
| }, | |
| { | |
| "combined_loss": 2.815329074859619, | |
| "distill_loss": 1.5120948553085327, | |
| "epoch": 0.10989816103743864, | |
| "step": 3000, | |
| "student_mlm_loss": 4.118563175201416 | |
| }, | |
| { | |
| "epoch": 0.11356143307201993, | |
| "grad_norm": 3.9512596130371094, | |
| "learning_rate": 4.8402830675479765e-05, | |
| "loss": 5.6509, | |
| "step": 3100 | |
| }, | |
| { | |
| "combined_loss": 5.329846382141113, | |
| "distill_loss": 1.5839005708694458, | |
| "epoch": 0.11356143307201993, | |
| "step": 3100, | |
| "student_mlm_loss": 9.07579231262207 | |
| }, | |
| { | |
| "epoch": 0.11722470510660121, | |
| "grad_norm": 21.47922134399414, | |
| "learning_rate": 4.834140108607514e-05, | |
| "loss": 4.5437, | |
| "step": 3200 | |
| }, | |
| { | |
| "combined_loss": 3.32517147064209, | |
| "distill_loss": 1.4834882020950317, | |
| "epoch": 0.11722470510660121, | |
| "step": 3200, | |
| "student_mlm_loss": 5.1668548583984375 | |
| }, | |
| { | |
| "epoch": 0.1208879771411825, | |
| "grad_norm": 11.865033149719238, | |
| "learning_rate": 4.827997149667052e-05, | |
| "loss": 5.0218, | |
| "step": 3300 | |
| }, | |
| { | |
| "combined_loss": 2.84318208694458, | |
| "distill_loss": 1.302217960357666, | |
| "epoch": 0.1208879771411825, | |
| "step": 3300, | |
| "student_mlm_loss": 4.384146213531494 | |
| }, | |
| { | |
| "epoch": 0.1245512491757638, | |
| "grad_norm": 13.824487686157227, | |
| "learning_rate": 4.82185419072659e-05, | |
| "loss": 33.2949, | |
| "step": 3400 | |
| }, | |
| { | |
| "combined_loss": 2.065192937850952, | |
| "distill_loss": 1.3474924564361572, | |
| "epoch": 0.1245512491757638, | |
| "step": 3400, | |
| "student_mlm_loss": 2.782893419265747 | |
| }, | |
| { | |
| "epoch": 0.12821452121034507, | |
| "grad_norm": 34.21382522583008, | |
| "learning_rate": 4.815711231786127e-05, | |
| "loss": 12.5775, | |
| "step": 3500 | |
| }, | |
| { | |
| "combined_loss": 2.2148988246917725, | |
| "distill_loss": 1.616875171661377, | |
| "epoch": 0.12821452121034507, | |
| "step": 3500, | |
| "student_mlm_loss": 2.812922477722168 | |
| }, | |
| { | |
| "epoch": 0.13187779324492638, | |
| "grad_norm": 8.859841346740723, | |
| "learning_rate": 4.809568272845665e-05, | |
| "loss": 4.6975, | |
| "step": 3600 | |
| }, | |
| { | |
| "combined_loss": 4.478976726531982, | |
| "distill_loss": 1.3554083108901978, | |
| "epoch": 0.13187779324492638, | |
| "step": 3600, | |
| "student_mlm_loss": 7.602544784545898 | |
| }, | |
| { | |
| "epoch": 0.13554106527950766, | |
| "grad_norm": 12.680179595947266, | |
| "learning_rate": 4.803425313905202e-05, | |
| "loss": 4.5414, | |
| "step": 3700 | |
| }, | |
| { | |
| "combined_loss": 6.908867835998535, | |
| "distill_loss": 1.3570021390914917, | |
| "epoch": 0.13554106527950766, | |
| "step": 3700, | |
| "student_mlm_loss": 12.460733413696289 | |
| }, | |
| { | |
| "epoch": 0.13920433731408893, | |
| "grad_norm": 18.478200912475586, | |
| "learning_rate": 4.7972823549647396e-05, | |
| "loss": 35.1443, | |
| "step": 3800 | |
| }, | |
| { | |
| "combined_loss": 13.97608757019043, | |
| "distill_loss": 1.418832778930664, | |
| "epoch": 0.13920433731408893, | |
| "step": 3800, | |
| "student_mlm_loss": 26.533342361450195 | |
| }, | |
| { | |
| "epoch": 0.14286760934867024, | |
| "grad_norm": 10.53610897064209, | |
| "learning_rate": 4.7911393960242774e-05, | |
| "loss": 13.766, | |
| "step": 3900 | |
| }, | |
| { | |
| "combined_loss": 2.1997413635253906, | |
| "distill_loss": 1.4529953002929688, | |
| "epoch": 0.14286760934867024, | |
| "step": 3900, | |
| "student_mlm_loss": 2.9464874267578125 | |
| }, | |
| { | |
| "epoch": 0.14653088138325152, | |
| "grad_norm": 42.095558166503906, | |
| "learning_rate": 4.7849964370838145e-05, | |
| "loss": 3.297, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.14653088138325152, | |
| "eval_loss": 4.568027496337891, | |
| "eval_runtime": 2.0693, | |
| "eval_samples_per_second": 3380.818, | |
| "eval_steps_per_second": 13.531, | |
| "step": 4000 | |
| }, | |
| { | |
| "combined_loss": 2.278163433074951, | |
| "distill_loss": 1.5395259857177734, | |
| "epoch": 0.14653088138325152, | |
| "step": 4000, | |
| "student_mlm_loss": 3.016800880432129 | |
| }, | |
| { | |
| "epoch": 0.15019415341783282, | |
| "grad_norm": 15.655592918395996, | |
| "learning_rate": 4.778853478143352e-05, | |
| "loss": 4.5795, | |
| "step": 4100 | |
| }, | |
| { | |
| "combined_loss": 2.117962598800659, | |
| "distill_loss": 1.5073814392089844, | |
| "epoch": 0.15019415341783282, | |
| "step": 4100, | |
| "student_mlm_loss": 2.728543758392334 | |
| }, | |
| { | |
| "epoch": 0.1538574254524141, | |
| "grad_norm": 9.47999382019043, | |
| "learning_rate": 4.77271051920289e-05, | |
| "loss": 4.6384, | |
| "step": 4200 | |
| }, | |
| { | |
| "combined_loss": 2.2614216804504395, | |
| "distill_loss": 1.3999947309494019, | |
| "epoch": 0.1538574254524141, | |
| "step": 4200, | |
| "student_mlm_loss": 3.1228485107421875 | |
| }, | |
| { | |
| "epoch": 0.15752069748699538, | |
| "grad_norm": 12.137129783630371, | |
| "learning_rate": 4.766567560262428e-05, | |
| "loss": 3.6101, | |
| "step": 4300 | |
| }, | |
| { | |
| "combined_loss": 1.9776763916015625, | |
| "distill_loss": 1.4785245656967163, | |
| "epoch": 0.15752069748699538, | |
| "step": 4300, | |
| "student_mlm_loss": 2.476828098297119 | |
| }, | |
| { | |
| "epoch": 0.16118396952157668, | |
| "grad_norm": 74.8094253540039, | |
| "learning_rate": 4.760424601321965e-05, | |
| "loss": 4.9111, | |
| "step": 4400 | |
| }, | |
| { | |
| "combined_loss": 3.0158274173736572, | |
| "distill_loss": 1.2940564155578613, | |
| "epoch": 0.16118396952157668, | |
| "step": 4400, | |
| "student_mlm_loss": 4.737598419189453 | |
| }, | |
| { | |
| "epoch": 0.16484724155615796, | |
| "grad_norm": 5.339694499969482, | |
| "learning_rate": 4.754281642381502e-05, | |
| "loss": 3.4013, | |
| "step": 4500 | |
| }, | |
| { | |
| "combined_loss": 2.176065683364868, | |
| "distill_loss": 1.5688632726669312, | |
| "epoch": 0.16484724155615796, | |
| "step": 4500, | |
| "student_mlm_loss": 2.7832682132720947 | |
| }, | |
| { | |
| "epoch": 0.16851051359073924, | |
| "grad_norm": 12.745500564575195, | |
| "learning_rate": 4.74813868344104e-05, | |
| "loss": 3.1244, | |
| "step": 4600 | |
| }, | |
| { | |
| "combined_loss": 2.4230682849884033, | |
| "distill_loss": 1.46636962890625, | |
| "epoch": 0.16851051359073924, | |
| "step": 4600, | |
| "student_mlm_loss": 3.3797669410705566 | |
| }, | |
| { | |
| "epoch": 0.17217378562532054, | |
| "grad_norm": 14.515507698059082, | |
| "learning_rate": 4.7419957245005777e-05, | |
| "loss": 4.9862, | |
| "step": 4700 | |
| }, | |
| { | |
| "combined_loss": 6.772428512573242, | |
| "distill_loss": 1.6445391178131104, | |
| "epoch": 0.17217378562532054, | |
| "step": 4700, | |
| "student_mlm_loss": 11.900318145751953 | |
| }, | |
| { | |
| "epoch": 0.17583705765990182, | |
| "grad_norm": 10.036664962768555, | |
| "learning_rate": 4.7358527655601154e-05, | |
| "loss": 3.72, | |
| "step": 4800 | |
| }, | |
| { | |
| "combined_loss": 27.606048583984375, | |
| "distill_loss": 1.4302338361740112, | |
| "epoch": 0.17583705765990182, | |
| "step": 4800, | |
| "student_mlm_loss": 53.781864166259766 | |
| }, | |
| { | |
| "epoch": 0.17950032969448312, | |
| "grad_norm": 14.220582008361816, | |
| "learning_rate": 4.7297098066196525e-05, | |
| "loss": 9.0684, | |
| "step": 4900 | |
| }, | |
| { | |
| "combined_loss": 7.97739839553833, | |
| "distill_loss": 1.4764257669448853, | |
| "epoch": 0.17950032969448312, | |
| "step": 4900, | |
| "student_mlm_loss": 14.478370666503906 | |
| }, | |
| { | |
| "epoch": 0.1831636017290644, | |
| "grad_norm": 8.734748840332031, | |
| "learning_rate": 4.72356684767919e-05, | |
| "loss": 13.2974, | |
| "step": 5000 | |
| }, | |
| { | |
| "combined_loss": 3.3007736206054688, | |
| "distill_loss": 1.5111989974975586, | |
| "epoch": 0.1831636017290644, | |
| "step": 5000, | |
| "student_mlm_loss": 5.090348243713379 | |
| }, | |
| { | |
| "epoch": 0.18682687376364568, | |
| "grad_norm": 23.457653045654297, | |
| "learning_rate": 4.717423888738728e-05, | |
| "loss": 4.4811, | |
| "step": 5100 | |
| }, | |
| { | |
| "combined_loss": 2.695789337158203, | |
| "distill_loss": 1.4495799541473389, | |
| "epoch": 0.18682687376364568, | |
| "step": 5100, | |
| "student_mlm_loss": 3.9419989585876465 | |
| }, | |
| { | |
| "epoch": 0.19049014579822698, | |
| "grad_norm": 11.504470825195312, | |
| "learning_rate": 4.711280929798265e-05, | |
| "loss": 3.2576, | |
| "step": 5200 | |
| }, | |
| { | |
| "combined_loss": 3.5765743255615234, | |
| "distill_loss": 1.3500127792358398, | |
| "epoch": 0.19049014579822698, | |
| "step": 5200, | |
| "student_mlm_loss": 5.803135871887207 | |
| }, | |
| { | |
| "epoch": 0.19415341783280826, | |
| "grad_norm": 34.68207550048828, | |
| "learning_rate": 4.705137970857803e-05, | |
| "loss": 5.8403, | |
| "step": 5300 | |
| }, | |
| { | |
| "combined_loss": 4.304483413696289, | |
| "distill_loss": 1.4075747728347778, | |
| "epoch": 0.19415341783280826, | |
| "step": 5300, | |
| "student_mlm_loss": 7.20139217376709 | |
| }, | |
| { | |
| "epoch": 0.19781668986738954, | |
| "grad_norm": 22.416582107543945, | |
| "learning_rate": 4.69899501191734e-05, | |
| "loss": 4.045, | |
| "step": 5400 | |
| }, | |
| { | |
| "combined_loss": 1.9111289978027344, | |
| "distill_loss": 1.321276307106018, | |
| "epoch": 0.19781668986738954, | |
| "step": 5400, | |
| "student_mlm_loss": 2.500981569290161 | |
| }, | |
| { | |
| "epoch": 0.20147996190197084, | |
| "grad_norm": 27.66775894165039, | |
| "learning_rate": 4.6928520529768786e-05, | |
| "loss": 3.8896, | |
| "step": 5500 | |
| }, | |
| { | |
| "combined_loss": 2.142390251159668, | |
| "distill_loss": 1.4025957584381104, | |
| "epoch": 0.20147996190197084, | |
| "step": 5500, | |
| "student_mlm_loss": 2.8821845054626465 | |
| }, | |
| { | |
| "epoch": 0.20514323393655212, | |
| "grad_norm": 35.84339141845703, | |
| "learning_rate": 4.686709094036416e-05, | |
| "loss": 4.94, | |
| "step": 5600 | |
| }, | |
| { | |
| "combined_loss": 2.1642816066741943, | |
| "distill_loss": 1.392912745475769, | |
| "epoch": 0.20514323393655212, | |
| "step": 5600, | |
| "student_mlm_loss": 2.935650587081909 | |
| }, | |
| { | |
| "epoch": 0.20880650597113343, | |
| "grad_norm": 18.43452262878418, | |
| "learning_rate": 4.6805661350959535e-05, | |
| "loss": 7.4575, | |
| "step": 5700 | |
| }, | |
| { | |
| "combined_loss": 2.354356288909912, | |
| "distill_loss": 1.3411612510681152, | |
| "epoch": 0.20880650597113343, | |
| "step": 5700, | |
| "student_mlm_loss": 3.36755108833313 | |
| }, | |
| { | |
| "epoch": 0.2124697780057147, | |
| "grad_norm": 5.364467144012451, | |
| "learning_rate": 4.6744231761554906e-05, | |
| "loss": 3.2172, | |
| "step": 5800 | |
| }, | |
| { | |
| "combined_loss": 2.129748821258545, | |
| "distill_loss": 1.4555408954620361, | |
| "epoch": 0.2124697780057147, | |
| "step": 5800, | |
| "student_mlm_loss": 2.8039567470550537 | |
| }, | |
| { | |
| "epoch": 0.21613305004029598, | |
| "grad_norm": 12.704414367675781, | |
| "learning_rate": 4.6682802172150283e-05, | |
| "loss": 9.9214, | |
| "step": 5900 | |
| }, | |
| { | |
| "combined_loss": 5.396609783172607, | |
| "distill_loss": 1.3954136371612549, | |
| "epoch": 0.21613305004029598, | |
| "step": 5900, | |
| "student_mlm_loss": 9.397806167602539 | |
| }, | |
| { | |
| "epoch": 0.2197963220748773, | |
| "grad_norm": 9.411243438720703, | |
| "learning_rate": 4.662137258274566e-05, | |
| "loss": 4.6268, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.2197963220748773, | |
| "eval_loss": 4.474331855773926, | |
| "eval_runtime": 2.0765, | |
| "eval_samples_per_second": 3369.116, | |
| "eval_steps_per_second": 13.484, | |
| "step": 6000 | |
| }, | |
| { | |
| "combined_loss": 2.3863794803619385, | |
| "distill_loss": 1.4665789604187012, | |
| "epoch": 0.2197963220748773, | |
| "step": 6000, | |
| "student_mlm_loss": 3.306180000305176 | |
| }, | |
| { | |
| "epoch": 0.22345959410945856, | |
| "grad_norm": 15.34604263305664, | |
| "learning_rate": 4.655994299334103e-05, | |
| "loss": 3.586, | |
| "step": 6100 | |
| }, | |
| { | |
| "combined_loss": 2.5740702152252197, | |
| "distill_loss": 1.5186127424240112, | |
| "epoch": 0.22345959410945856, | |
| "step": 6100, | |
| "student_mlm_loss": 3.6295275688171387 | |
| }, | |
| { | |
| "epoch": 0.22712286614403987, | |
| "grad_norm": 10.821826934814453, | |
| "learning_rate": 4.649851340393641e-05, | |
| "loss": 5.516, | |
| "step": 6200 | |
| }, | |
| { | |
| "combined_loss": 4.770940780639648, | |
| "distill_loss": 1.5328683853149414, | |
| "epoch": 0.22712286614403987, | |
| "step": 6200, | |
| "student_mlm_loss": 8.009013175964355 | |
| }, | |
| { | |
| "epoch": 0.23078613817862115, | |
| "grad_norm": 45.33203887939453, | |
| "learning_rate": 4.643708381453178e-05, | |
| "loss": 6.4937, | |
| "step": 6300 | |
| }, | |
| { | |
| "combined_loss": 2.257235050201416, | |
| "distill_loss": 1.4594223499298096, | |
| "epoch": 0.23078613817862115, | |
| "step": 6300, | |
| "student_mlm_loss": 3.0550475120544434 | |
| }, | |
| { | |
| "epoch": 0.23444941021320242, | |
| "grad_norm": 24.137001037597656, | |
| "learning_rate": 4.6375654225127166e-05, | |
| "loss": 2.8761, | |
| "step": 6400 | |
| }, | |
| { | |
| "combined_loss": 3.673408031463623, | |
| "distill_loss": 1.5113860368728638, | |
| "epoch": 0.23444941021320242, | |
| "step": 6400, | |
| "student_mlm_loss": 5.835430145263672 | |
| }, | |
| { | |
| "epoch": 0.23811268224778373, | |
| "grad_norm": 89.53437042236328, | |
| "learning_rate": 4.631422463572254e-05, | |
| "loss": 4.9469, | |
| "step": 6500 | |
| }, | |
| { | |
| "combined_loss": 2.289175271987915, | |
| "distill_loss": 1.6255369186401367, | |
| "epoch": 0.23811268224778373, | |
| "step": 6500, | |
| "student_mlm_loss": 2.9528136253356934 | |
| }, | |
| { | |
| "epoch": 0.241775954282365, | |
| "grad_norm": 29.47341537475586, | |
| "learning_rate": 4.6252795046317915e-05, | |
| "loss": 3.2857, | |
| "step": 6600 | |
| }, | |
| { | |
| "combined_loss": 2.986036777496338, | |
| "distill_loss": 1.3628634214401245, | |
| "epoch": 0.241775954282365, | |
| "step": 6600, | |
| "student_mlm_loss": 4.609210014343262 | |
| }, | |
| { | |
| "epoch": 0.24543922631694629, | |
| "grad_norm": 8.413643836975098, | |
| "learning_rate": 4.6191365456913286e-05, | |
| "loss": 4.1874, | |
| "step": 6700 | |
| }, | |
| { | |
| "combined_loss": 4.9381103515625, | |
| "distill_loss": 1.5604116916656494, | |
| "epoch": 0.24543922631694629, | |
| "step": 6700, | |
| "student_mlm_loss": 8.31580924987793 | |
| }, | |
| { | |
| "epoch": 0.2491024983515276, | |
| "grad_norm": 19.279678344726562, | |
| "learning_rate": 4.6129935867508664e-05, | |
| "loss": 5.5581, | |
| "step": 6800 | |
| }, | |
| { | |
| "combined_loss": 4.7175493240356445, | |
| "distill_loss": 1.5657355785369873, | |
| "epoch": 0.2491024983515276, | |
| "step": 6800, | |
| "student_mlm_loss": 7.869362831115723 | |
| }, | |
| { | |
| "epoch": 0.25276577038610887, | |
| "grad_norm": 14.9283447265625, | |
| "learning_rate": 4.606850627810404e-05, | |
| "loss": 4.6319, | |
| "step": 6900 | |
| }, | |
| { | |
| "combined_loss": 5.707411766052246, | |
| "distill_loss": 1.566019058227539, | |
| "epoch": 0.25276577038610887, | |
| "step": 6900, | |
| "student_mlm_loss": 9.848804473876953 | |
| }, | |
| { | |
| "epoch": 0.25642904242069015, | |
| "grad_norm": 5.006555557250977, | |
| "learning_rate": 4.600707668869941e-05, | |
| "loss": 6.1192, | |
| "step": 7000 | |
| }, | |
| { | |
| "combined_loss": 4.373297691345215, | |
| "distill_loss": 1.4654217958450317, | |
| "epoch": 0.25642904242069015, | |
| "step": 7000, | |
| "student_mlm_loss": 7.281173229217529 | |
| }, | |
| { | |
| "epoch": 0.2600923144552714, | |
| "grad_norm": 15.025683403015137, | |
| "learning_rate": 4.594564709929479e-05, | |
| "loss": 3.472, | |
| "step": 7100 | |
| }, | |
| { | |
| "combined_loss": 5.1388630867004395, | |
| "distill_loss": 1.5254905223846436, | |
| "epoch": 0.2600923144552714, | |
| "step": 7100, | |
| "student_mlm_loss": 8.752235412597656 | |
| }, | |
| { | |
| "epoch": 0.26375558648985276, | |
| "grad_norm": 44.157169342041016, | |
| "learning_rate": 4.588421750989017e-05, | |
| "loss": 8.8482, | |
| "step": 7200 | |
| }, | |
| { | |
| "combined_loss": 2.1565892696380615, | |
| "distill_loss": 1.2985585927963257, | |
| "epoch": 0.26375558648985276, | |
| "step": 7200, | |
| "student_mlm_loss": 3.014619827270508 | |
| }, | |
| { | |
| "epoch": 0.26741885852443403, | |
| "grad_norm": 5.755523204803467, | |
| "learning_rate": 4.5822787920485546e-05, | |
| "loss": 5.7829, | |
| "step": 7300 | |
| }, | |
| { | |
| "combined_loss": 2.5404441356658936, | |
| "distill_loss": 1.5058717727661133, | |
| "epoch": 0.26741885852443403, | |
| "step": 7300, | |
| "student_mlm_loss": 3.575016498565674 | |
| }, | |
| { | |
| "epoch": 0.2710821305590153, | |
| "grad_norm": 15.252013206481934, | |
| "learning_rate": 4.576135833108092e-05, | |
| "loss": 7.9361, | |
| "step": 7400 | |
| }, | |
| { | |
| "combined_loss": 2.5752511024475098, | |
| "distill_loss": 1.5916697978973389, | |
| "epoch": 0.2710821305590153, | |
| "step": 7400, | |
| "student_mlm_loss": 3.5588326454162598 | |
| }, | |
| { | |
| "epoch": 0.2747454025935966, | |
| "grad_norm": 26.218740463256836, | |
| "learning_rate": 4.5699928741676295e-05, | |
| "loss": 4.8534, | |
| "step": 7500 | |
| }, | |
| { | |
| "combined_loss": 2.1656486988067627, | |
| "distill_loss": 1.4179739952087402, | |
| "epoch": 0.2747454025935966, | |
| "step": 7500, | |
| "student_mlm_loss": 2.913323402404785 | |
| }, | |
| { | |
| "epoch": 0.27840867462817787, | |
| "grad_norm": 6.031148910522461, | |
| "learning_rate": 4.5638499152271666e-05, | |
| "loss": 6.4535, | |
| "step": 7600 | |
| }, | |
| { | |
| "combined_loss": 2.8603813648223877, | |
| "distill_loss": 1.5837383270263672, | |
| "epoch": 0.27840867462817787, | |
| "step": 7600, | |
| "student_mlm_loss": 4.137024402618408 | |
| }, | |
| { | |
| "epoch": 0.2820719466627592, | |
| "grad_norm": 107.95591735839844, | |
| "learning_rate": 4.5577069562867044e-05, | |
| "loss": 3.2702, | |
| "step": 7700 | |
| }, | |
| { | |
| "combined_loss": 1.8474111557006836, | |
| "distill_loss": 1.437280297279358, | |
| "epoch": 0.2820719466627592, | |
| "step": 7700, | |
| "student_mlm_loss": 2.257542133331299 | |
| }, | |
| { | |
| "epoch": 0.2857352186973405, | |
| "grad_norm": 5.394913673400879, | |
| "learning_rate": 4.551563997346242e-05, | |
| "loss": 2.8998, | |
| "step": 7800 | |
| }, | |
| { | |
| "combined_loss": 4.77987813949585, | |
| "distill_loss": 1.5358555316925049, | |
| "epoch": 0.2857352186973405, | |
| "step": 7800, | |
| "student_mlm_loss": 8.023900985717773 | |
| }, | |
| { | |
| "epoch": 0.28939849073192175, | |
| "grad_norm": 7.790286540985107, | |
| "learning_rate": 4.545421038405779e-05, | |
| "loss": 2.9018, | |
| "step": 7900 | |
| }, | |
| { | |
| "combined_loss": 3.34071946144104, | |
| "distill_loss": 1.3893283605575562, | |
| "epoch": 0.28939849073192175, | |
| "step": 7900, | |
| "student_mlm_loss": 5.292110443115234 | |
| }, | |
| { | |
| "epoch": 0.29306176276650303, | |
| "grad_norm": 10.3685941696167, | |
| "learning_rate": 4.539278079465317e-05, | |
| "loss": 3.5884, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.29306176276650303, | |
| "eval_loss": 3.7581117153167725, | |
| "eval_runtime": 2.0302, | |
| "eval_samples_per_second": 3446.049, | |
| "eval_steps_per_second": 13.792, | |
| "step": 8000 | |
| }, | |
| { | |
| "combined_loss": 2.8955559730529785, | |
| "distill_loss": 1.3627426624298096, | |
| "epoch": 0.29306176276650303, | |
| "step": 8000, | |
| "student_mlm_loss": 4.428369522094727 | |
| }, | |
| { | |
| "epoch": 0.2967250348010843, | |
| "grad_norm": 49.06619644165039, | |
| "learning_rate": 4.533135120524855e-05, | |
| "loss": 3.5788, | |
| "step": 8100 | |
| }, | |
| { | |
| "combined_loss": 4.52724552154541, | |
| "distill_loss": 1.3924285173416138, | |
| "epoch": 0.2967250348010843, | |
| "step": 8100, | |
| "student_mlm_loss": 7.662062644958496 | |
| }, | |
| { | |
| "epoch": 0.30038830683566564, | |
| "grad_norm": 27.40319061279297, | |
| "learning_rate": 4.5269921615843926e-05, | |
| "loss": 3.9229, | |
| "step": 8200 | |
| }, | |
| { | |
| "combined_loss": 3.3075461387634277, | |
| "distill_loss": 1.5311795473098755, | |
| "epoch": 0.30038830683566564, | |
| "step": 8200, | |
| "student_mlm_loss": 5.0839128494262695 | |
| }, | |
| { | |
| "epoch": 0.3040515788702469, | |
| "grad_norm": 31.07562255859375, | |
| "learning_rate": 4.52084920264393e-05, | |
| "loss": 3.9566, | |
| "step": 8300 | |
| }, | |
| { | |
| "combined_loss": 1.9784274101257324, | |
| "distill_loss": 1.41036057472229, | |
| "epoch": 0.3040515788702469, | |
| "step": 8300, | |
| "student_mlm_loss": 2.546494245529175 | |
| }, | |
| { | |
| "epoch": 0.3077148509048282, | |
| "grad_norm": 4.548298358917236, | |
| "learning_rate": 4.514706243703467e-05, | |
| "loss": 5.1591, | |
| "step": 8400 | |
| }, | |
| { | |
| "combined_loss": 1.9796760082244873, | |
| "distill_loss": 1.408158302307129, | |
| "epoch": 0.3077148509048282, | |
| "step": 8400, | |
| "student_mlm_loss": 2.5511937141418457 | |
| }, | |
| { | |
| "epoch": 0.3113781229394095, | |
| "grad_norm": 8.897561073303223, | |
| "learning_rate": 4.5085632847630046e-05, | |
| "loss": 5.7057, | |
| "step": 8500 | |
| }, | |
| { | |
| "combined_loss": 2.080671548843384, | |
| "distill_loss": 1.4321857690811157, | |
| "epoch": 0.3113781229394095, | |
| "step": 8500, | |
| "student_mlm_loss": 2.7291574478149414 | |
| }, | |
| { | |
| "epoch": 0.31504139497399075, | |
| "grad_norm": 10.005053520202637, | |
| "learning_rate": 4.5024203258225424e-05, | |
| "loss": 7.7928, | |
| "step": 8600 | |
| }, | |
| { | |
| "combined_loss": 2.6395342350006104, | |
| "distill_loss": 1.5675503015518188, | |
| "epoch": 0.31504139497399075, | |
| "step": 8600, | |
| "student_mlm_loss": 3.7115182876586914 | |
| }, | |
| { | |
| "epoch": 0.31870466700857203, | |
| "grad_norm": 5.425146579742432, | |
| "learning_rate": 4.49627736688208e-05, | |
| "loss": 3.7716, | |
| "step": 8700 | |
| }, | |
| { | |
| "combined_loss": 2.9848690032958984, | |
| "distill_loss": 1.592170000076294, | |
| "epoch": 0.31870466700857203, | |
| "step": 8700, | |
| "student_mlm_loss": 4.377568244934082 | |
| }, | |
| { | |
| "epoch": 0.32236793904315336, | |
| "grad_norm": 5.64302396774292, | |
| "learning_rate": 4.490134407941617e-05, | |
| "loss": 6.8888, | |
| "step": 8800 | |
| }, | |
| { | |
| "combined_loss": 4.167844772338867, | |
| "distill_loss": 1.4308810234069824, | |
| "epoch": 0.32236793904315336, | |
| "step": 8800, | |
| "student_mlm_loss": 6.904808044433594 | |
| }, | |
| { | |
| "epoch": 0.32603121107773464, | |
| "grad_norm": 99.88166809082031, | |
| "learning_rate": 4.483991449001155e-05, | |
| "loss": 3.988, | |
| "step": 8900 | |
| }, | |
| { | |
| "combined_loss": 2.484290599822998, | |
| "distill_loss": 1.3509743213653564, | |
| "epoch": 0.32603121107773464, | |
| "step": 8900, | |
| "student_mlm_loss": 3.6176071166992188 | |
| }, | |
| { | |
| "epoch": 0.3296944831123159, | |
| "grad_norm": 74.52608489990234, | |
| "learning_rate": 4.477848490060693e-05, | |
| "loss": 7.0959, | |
| "step": 9000 | |
| }, | |
| { | |
| "combined_loss": 3.0457074642181396, | |
| "distill_loss": 1.3116565942764282, | |
| "epoch": 0.3296944831123159, | |
| "step": 9000, | |
| "student_mlm_loss": 4.779758453369141 | |
| }, | |
| { | |
| "epoch": 0.3333577551468972, | |
| "grad_norm": 11.735849380493164, | |
| "learning_rate": 4.47170553112023e-05, | |
| "loss": 3.3274, | |
| "step": 9100 | |
| }, | |
| { | |
| "combined_loss": 4.452191352844238, | |
| "distill_loss": 1.3943032026290894, | |
| "epoch": 0.3333577551468972, | |
| "step": 9100, | |
| "student_mlm_loss": 7.510079860687256 | |
| }, | |
| { | |
| "epoch": 0.33702102718147847, | |
| "grad_norm": 9.601778030395508, | |
| "learning_rate": 4.465562572179768e-05, | |
| "loss": 3.8928, | |
| "step": 9200 | |
| }, | |
| { | |
| "combined_loss": 4.875356197357178, | |
| "distill_loss": 1.4536867141723633, | |
| "epoch": 0.33702102718147847, | |
| "step": 9200, | |
| "student_mlm_loss": 8.297025680541992 | |
| }, | |
| { | |
| "epoch": 0.3406842992160598, | |
| "grad_norm": 9.49219799041748, | |
| "learning_rate": 4.459419613239305e-05, | |
| "loss": 3.7362, | |
| "step": 9300 | |
| }, | |
| { | |
| "combined_loss": 2.9027719497680664, | |
| "distill_loss": 1.3480241298675537, | |
| "epoch": 0.3406842992160598, | |
| "step": 9300, | |
| "student_mlm_loss": 4.45751953125 | |
| }, | |
| { | |
| "epoch": 0.3443475712506411, | |
| "grad_norm": 7.6804728507995605, | |
| "learning_rate": 4.453276654298843e-05, | |
| "loss": 4.4018, | |
| "step": 9400 | |
| }, | |
| { | |
| "combined_loss": 2.7022647857666016, | |
| "distill_loss": 1.3614214658737183, | |
| "epoch": 0.3443475712506411, | |
| "step": 9400, | |
| "student_mlm_loss": 4.043107986450195 | |
| }, | |
| { | |
| "epoch": 0.34801084328522236, | |
| "grad_norm": 38.41388702392578, | |
| "learning_rate": 4.4471336953583804e-05, | |
| "loss": 3.0632, | |
| "step": 9500 | |
| }, | |
| { | |
| "combined_loss": 1.9494025707244873, | |
| "distill_loss": 1.3876396417617798, | |
| "epoch": 0.34801084328522236, | |
| "step": 9500, | |
| "student_mlm_loss": 2.5111656188964844 | |
| }, | |
| { | |
| "epoch": 0.35167411531980364, | |
| "grad_norm": 37.10932540893555, | |
| "learning_rate": 4.440990736417918e-05, | |
| "loss": 3.3258, | |
| "step": 9600 | |
| }, | |
| { | |
| "combined_loss": 2.6435036659240723, | |
| "distill_loss": 1.3941702842712402, | |
| "epoch": 0.35167411531980364, | |
| "step": 9600, | |
| "student_mlm_loss": 3.8928370475769043 | |
| }, | |
| { | |
| "epoch": 0.3553373873543849, | |
| "grad_norm": 17.652099609375, | |
| "learning_rate": 4.434847777477455e-05, | |
| "loss": 8.3854, | |
| "step": 9700 | |
| }, | |
| { | |
| "combined_loss": 2.336359977722168, | |
| "distill_loss": 1.5497583150863647, | |
| "epoch": 0.3553373873543849, | |
| "step": 9700, | |
| "student_mlm_loss": 3.1229615211486816 | |
| }, | |
| { | |
| "epoch": 0.35900065938896625, | |
| "grad_norm": 58.41902160644531, | |
| "learning_rate": 4.428704818536993e-05, | |
| "loss": 6.9624, | |
| "step": 9800 | |
| }, | |
| { | |
| "combined_loss": 2.6561923027038574, | |
| "distill_loss": 1.5154696702957153, | |
| "epoch": 0.35900065938896625, | |
| "step": 9800, | |
| "student_mlm_loss": 3.796915054321289 | |
| }, | |
| { | |
| "epoch": 0.3626639314235475, | |
| "grad_norm": 23.230680465698242, | |
| "learning_rate": 4.422561859596531e-05, | |
| "loss": 3.4226, | |
| "step": 9900 | |
| }, | |
| { | |
| "combined_loss": 1.9643871784210205, | |
| "distill_loss": 1.3770619630813599, | |
| "epoch": 0.3626639314235475, | |
| "step": 9900, | |
| "student_mlm_loss": 2.5517125129699707 | |
| }, | |
| { | |
| "epoch": 0.3663272034581288, | |
| "grad_norm": 11.580951690673828, | |
| "learning_rate": 4.416418900656068e-05, | |
| "loss": 4.7414, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.3663272034581288, | |
| "eval_loss": 3.8432743549346924, | |
| "eval_runtime": 2.2879, | |
| "eval_samples_per_second": 3057.772, | |
| "eval_steps_per_second": 12.238, | |
| "step": 10000 | |
| }, | |
| { | |
| "combined_loss": 2.395519971847534, | |
| "distill_loss": 1.382614254951477, | |
| "epoch": 0.3663272034581288, | |
| "step": 10000, | |
| "student_mlm_loss": 3.408425807952881 | |
| }, | |
| { | |
| "epoch": 0.3699904754927101, | |
| "grad_norm": 19.014955520629883, | |
| "learning_rate": 4.410275941715606e-05, | |
| "loss": 6.6365, | |
| "step": 10100 | |
| }, | |
| { | |
| "combined_loss": 2.1697921752929688, | |
| "distill_loss": 1.5128508806228638, | |
| "epoch": 0.3699904754927101, | |
| "step": 10100, | |
| "student_mlm_loss": 2.8267335891723633 | |
| }, | |
| { | |
| "epoch": 0.37365374752729136, | |
| "grad_norm": 6.532296180725098, | |
| "learning_rate": 4.404132982775143e-05, | |
| "loss": 3.199, | |
| "step": 10200 | |
| }, | |
| { | |
| "combined_loss": 1.8516874313354492, | |
| "distill_loss": 1.413927674293518, | |
| "epoch": 0.37365374752729136, | |
| "step": 10200, | |
| "student_mlm_loss": 2.289447069168091 | |
| }, | |
| { | |
| "epoch": 0.3773170195618727, | |
| "grad_norm": 25.607181549072266, | |
| "learning_rate": 4.397990023834681e-05, | |
| "loss": 3.822, | |
| "step": 10300 | |
| }, | |
| { | |
| "combined_loss": 3.3827946186065674, | |
| "distill_loss": 1.4635933637619019, | |
| "epoch": 0.3773170195618727, | |
| "step": 10300, | |
| "student_mlm_loss": 5.301995754241943 | |
| }, | |
| { | |
| "epoch": 0.38098029159645397, | |
| "grad_norm": 12.52314567565918, | |
| "learning_rate": 4.3918470648942184e-05, | |
| "loss": 6.9491, | |
| "step": 10400 | |
| }, | |
| { | |
| "combined_loss": 1.9748457670211792, | |
| "distill_loss": 1.445707082748413, | |
| "epoch": 0.38098029159645397, | |
| "step": 10400, | |
| "student_mlm_loss": 2.5039844512939453 | |
| }, | |
| { | |
| "epoch": 0.38464356363103525, | |
| "grad_norm": 12.69713306427002, | |
| "learning_rate": 4.385704105953756e-05, | |
| "loss": 9.4794, | |
| "step": 10500 | |
| }, | |
| { | |
| "combined_loss": 3.5582261085510254, | |
| "distill_loss": 1.4324952363967896, | |
| "epoch": 0.38464356363103525, | |
| "step": 10500, | |
| "student_mlm_loss": 5.683957099914551 | |
| }, | |
| { | |
| "epoch": 0.3883068356656165, | |
| "grad_norm": 9.131495475769043, | |
| "learning_rate": 4.379561147013293e-05, | |
| "loss": 7.1932, | |
| "step": 10600 | |
| }, | |
| { | |
| "combined_loss": 6.080216407775879, | |
| "distill_loss": 1.477283000946045, | |
| "epoch": 0.3883068356656165, | |
| "step": 10600, | |
| "student_mlm_loss": 10.683149337768555 | |
| }, | |
| { | |
| "epoch": 0.3919701077001978, | |
| "grad_norm": 24.739810943603516, | |
| "learning_rate": 4.373418188072831e-05, | |
| "loss": 5.6399, | |
| "step": 10700 | |
| }, | |
| { | |
| "combined_loss": 3.7993698120117188, | |
| "distill_loss": 1.452317476272583, | |
| "epoch": 0.3919701077001978, | |
| "step": 10700, | |
| "student_mlm_loss": 6.146422386169434 | |
| }, | |
| { | |
| "epoch": 0.3956333797347791, | |
| "grad_norm": 42.44218063354492, | |
| "learning_rate": 4.367275229132369e-05, | |
| "loss": 4.2291, | |
| "step": 10800 | |
| }, | |
| { | |
| "combined_loss": 2.037079095840454, | |
| "distill_loss": 1.4349570274353027, | |
| "epoch": 0.3956333797347791, | |
| "step": 10800, | |
| "student_mlm_loss": 2.6392011642456055 | |
| }, | |
| { | |
| "epoch": 0.3992966517693604, | |
| "grad_norm": 231.26116943359375, | |
| "learning_rate": 4.361132270191906e-05, | |
| "loss": 4.6188, | |
| "step": 10900 | |
| }, | |
| { | |
| "combined_loss": 182.1781768798828, | |
| "distill_loss": 1.4427307844161987, | |
| "epoch": 0.3992966517693604, | |
| "step": 10900, | |
| "student_mlm_loss": 362.91363525390625 | |
| }, | |
| { | |
| "epoch": 0.4029599238039417, | |
| "grad_norm": 16.01262092590332, | |
| "learning_rate": 4.354989311251444e-05, | |
| "loss": 4.8535, | |
| "step": 11000 | |
| }, | |
| { | |
| "combined_loss": 3.2922308444976807, | |
| "distill_loss": 1.7308834791183472, | |
| "epoch": 0.4029599238039417, | |
| "step": 11000, | |
| "student_mlm_loss": 4.853578090667725 | |
| }, | |
| { | |
| "epoch": 0.40662319583852297, | |
| "grad_norm": 23.69573974609375, | |
| "learning_rate": 4.3488463523109816e-05, | |
| "loss": 2.8692, | |
| "step": 11100 | |
| }, | |
| { | |
| "combined_loss": 2.1010890007019043, | |
| "distill_loss": 1.3140019178390503, | |
| "epoch": 0.40662319583852297, | |
| "step": 11100, | |
| "student_mlm_loss": 2.888176202774048 | |
| }, | |
| { | |
| "epoch": 0.41028646787310424, | |
| "grad_norm": 9.695125579833984, | |
| "learning_rate": 4.3427033933705193e-05, | |
| "loss": 7.6829, | |
| "step": 11200 | |
| }, | |
| { | |
| "combined_loss": 2.24194598197937, | |
| "distill_loss": 1.560063362121582, | |
| "epoch": 0.41028646787310424, | |
| "step": 11200, | |
| "student_mlm_loss": 2.923828601837158 | |
| }, | |
| { | |
| "epoch": 0.4139497399076855, | |
| "grad_norm": 37.06310272216797, | |
| "learning_rate": 4.3365604344300565e-05, | |
| "loss": 3.5562, | |
| "step": 11300 | |
| }, | |
| { | |
| "combined_loss": 9.297407150268555, | |
| "distill_loss": 1.2328678369522095, | |
| "epoch": 0.4139497399076855, | |
| "step": 11300, | |
| "student_mlm_loss": 17.36194610595703 | |
| }, | |
| { | |
| "epoch": 0.41761301194226685, | |
| "grad_norm": 6.411166667938232, | |
| "learning_rate": 4.330417475489594e-05, | |
| "loss": 4.0543, | |
| "step": 11400 | |
| }, | |
| { | |
| "combined_loss": 2.141500949859619, | |
| "distill_loss": 1.467064380645752, | |
| "epoch": 0.41761301194226685, | |
| "step": 11400, | |
| "student_mlm_loss": 2.8159377574920654 | |
| }, | |
| { | |
| "epoch": 0.42127628397684813, | |
| "grad_norm": 5.802677154541016, | |
| "learning_rate": 4.3242745165491313e-05, | |
| "loss": 14.3215, | |
| "step": 11500 | |
| }, | |
| { | |
| "combined_loss": 6.576130390167236, | |
| "distill_loss": 1.46802818775177, | |
| "epoch": 0.42127628397684813, | |
| "step": 11500, | |
| "student_mlm_loss": 11.684232711791992 | |
| }, | |
| { | |
| "epoch": 0.4249395560114294, | |
| "grad_norm": 15.660844802856445, | |
| "learning_rate": 4.318131557608669e-05, | |
| "loss": 30.5877, | |
| "step": 11600 | |
| }, | |
| { | |
| "combined_loss": 1.9305293560028076, | |
| "distill_loss": 1.405720591545105, | |
| "epoch": 0.4249395560114294, | |
| "step": 11600, | |
| "student_mlm_loss": 2.4553380012512207 | |
| }, | |
| { | |
| "epoch": 0.4286028280460107, | |
| "grad_norm": 3.041947603225708, | |
| "learning_rate": 4.311988598668207e-05, | |
| "loss": 3.7156, | |
| "step": 11700 | |
| }, | |
| { | |
| "combined_loss": 2.78572940826416, | |
| "distill_loss": 1.45219886302948, | |
| "epoch": 0.4286028280460107, | |
| "step": 11700, | |
| "student_mlm_loss": 4.119259834289551 | |
| }, | |
| { | |
| "epoch": 0.43226610008059196, | |
| "grad_norm": 20.6744384765625, | |
| "learning_rate": 4.305845639727744e-05, | |
| "loss": 3.3939, | |
| "step": 11800 | |
| }, | |
| { | |
| "combined_loss": 2.0835349559783936, | |
| "distill_loss": 1.4508671760559082, | |
| "epoch": 0.43226610008059196, | |
| "step": 11800, | |
| "student_mlm_loss": 2.716202735900879 | |
| }, | |
| { | |
| "epoch": 0.4359293721151733, | |
| "grad_norm": 5.804731369018555, | |
| "learning_rate": 4.299702680787282e-05, | |
| "loss": 6.1951, | |
| "step": 11900 | |
| }, | |
| { | |
| "combined_loss": 3.1048030853271484, | |
| "distill_loss": 1.455564260482788, | |
| "epoch": 0.4359293721151733, | |
| "step": 11900, | |
| "student_mlm_loss": 4.75404167175293 | |
| }, | |
| { | |
| "epoch": 0.4395926441497546, | |
| "grad_norm": 33.689720153808594, | |
| "learning_rate": 4.2935597218468196e-05, | |
| "loss": 3.6583, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.4395926441497546, | |
| "eval_loss": 3.919630527496338, | |
| "eval_runtime": 2.0425, | |
| "eval_samples_per_second": 3425.261, | |
| "eval_steps_per_second": 13.709, | |
| "step": 12000 | |
| }, | |
| { | |
| "combined_loss": 2.315965175628662, | |
| "distill_loss": 1.3009124994277954, | |
| "epoch": 0.4395926441497546, | |
| "step": 12000, | |
| "student_mlm_loss": 3.3310179710388184 | |
| }, | |
| { | |
| "epoch": 0.44325591618433585, | |
| "grad_norm": 24.73545265197754, | |
| "learning_rate": 4.2874167629063574e-05, | |
| "loss": 2.9828, | |
| "step": 12100 | |
| }, | |
| { | |
| "combined_loss": 5.060952186584473, | |
| "distill_loss": 1.3712559938430786, | |
| "epoch": 0.44325591618433585, | |
| "step": 12100, | |
| "student_mlm_loss": 8.750648498535156 | |
| }, | |
| { | |
| "epoch": 0.44691918821891713, | |
| "grad_norm": 19.548921585083008, | |
| "learning_rate": 4.2812738039658945e-05, | |
| "loss": 3.1716, | |
| "step": 12200 | |
| }, | |
| { | |
| "combined_loss": 2.3697307109832764, | |
| "distill_loss": 1.480096459388733, | |
| "epoch": 0.44691918821891713, | |
| "step": 12200, | |
| "student_mlm_loss": 3.2593650817871094 | |
| }, | |
| { | |
| "epoch": 0.4505824602534984, | |
| "grad_norm": 6.217925548553467, | |
| "learning_rate": 4.2751308450254316e-05, | |
| "loss": 5.1037, | |
| "step": 12300 | |
| }, | |
| { | |
| "combined_loss": 1.9682085514068604, | |
| "distill_loss": 1.3534774780273438, | |
| "epoch": 0.4505824602534984, | |
| "step": 12300, | |
| "student_mlm_loss": 2.582939624786377 | |
| }, | |
| { | |
| "epoch": 0.45424573228807974, | |
| "grad_norm": 53.592735290527344, | |
| "learning_rate": 4.2689878860849694e-05, | |
| "loss": 5.3409, | |
| "step": 12400 | |
| }, | |
| { | |
| "combined_loss": 2.413550853729248, | |
| "distill_loss": 1.3951433897018433, | |
| "epoch": 0.45424573228807974, | |
| "step": 12400, | |
| "student_mlm_loss": 3.4319584369659424 | |
| }, | |
| { | |
| "epoch": 0.457909004322661, | |
| "grad_norm": 13.716507911682129, | |
| "learning_rate": 4.262844927144507e-05, | |
| "loss": 3.2261, | |
| "step": 12500 | |
| }, | |
| { | |
| "combined_loss": 3.6318020820617676, | |
| "distill_loss": 1.3529082536697388, | |
| "epoch": 0.457909004322661, | |
| "step": 12500, | |
| "student_mlm_loss": 5.910696029663086 | |
| }, | |
| { | |
| "epoch": 0.4615722763572423, | |
| "grad_norm": 16.206933975219727, | |
| "learning_rate": 4.256701968204045e-05, | |
| "loss": 3.1534, | |
| "step": 12600 | |
| }, | |
| { | |
| "combined_loss": 15.371432304382324, | |
| "distill_loss": 1.4290032386779785, | |
| "epoch": 0.4615722763572423, | |
| "step": 12600, | |
| "student_mlm_loss": 29.313861846923828 | |
| }, | |
| { | |
| "epoch": 0.4652355483918236, | |
| "grad_norm": 8.626960754394531, | |
| "learning_rate": 4.250559009263582e-05, | |
| "loss": 3.0824, | |
| "step": 12700 | |
| }, | |
| { | |
| "combined_loss": 2.0715112686157227, | |
| "distill_loss": 1.3553932905197144, | |
| "epoch": 0.4652355483918236, | |
| "step": 12700, | |
| "student_mlm_loss": 2.7876293659210205 | |
| }, | |
| { | |
| "epoch": 0.46889882042640485, | |
| "grad_norm": 8.153878211975098, | |
| "learning_rate": 4.24441605032312e-05, | |
| "loss": 3.8805, | |
| "step": 12800 | |
| }, | |
| { | |
| "combined_loss": 2.0972392559051514, | |
| "distill_loss": 1.2276250123977661, | |
| "epoch": 0.46889882042640485, | |
| "step": 12800, | |
| "student_mlm_loss": 2.966853618621826 | |
| }, | |
| { | |
| "epoch": 0.4725620924609861, | |
| "grad_norm": 12.068700790405273, | |
| "learning_rate": 4.2382730913826576e-05, | |
| "loss": 2.8937, | |
| "step": 12900 | |
| }, | |
| { | |
| "combined_loss": 2.9497852325439453, | |
| "distill_loss": 1.314728021621704, | |
| "epoch": 0.4725620924609861, | |
| "step": 12900, | |
| "student_mlm_loss": 4.584842681884766 | |
| }, | |
| { | |
| "epoch": 0.47622536449556746, | |
| "grad_norm": 12.260379791259766, | |
| "learning_rate": 4.232130132442195e-05, | |
| "loss": 5.581, | |
| "step": 13000 | |
| }, | |
| { | |
| "combined_loss": 1.8658246994018555, | |
| "distill_loss": 1.2703187465667725, | |
| "epoch": 0.47622536449556746, | |
| "step": 13000, | |
| "student_mlm_loss": 2.4613306522369385 | |
| }, | |
| { | |
| "epoch": 0.47988863653014874, | |
| "grad_norm": 22.688852310180664, | |
| "learning_rate": 4.2259871735017325e-05, | |
| "loss": 7.0059, | |
| "step": 13100 | |
| }, | |
| { | |
| "combined_loss": 3.673346519470215, | |
| "distill_loss": 1.397099256515503, | |
| "epoch": 0.47988863653014874, | |
| "step": 13100, | |
| "student_mlm_loss": 5.949593544006348 | |
| }, | |
| { | |
| "epoch": 0.48355190856473, | |
| "grad_norm": 28.811817169189453, | |
| "learning_rate": 4.2198442145612696e-05, | |
| "loss": 9.6395, | |
| "step": 13200 | |
| }, | |
| { | |
| "combined_loss": 2.036362409591675, | |
| "distill_loss": 1.3239866495132446, | |
| "epoch": 0.48355190856473, | |
| "step": 13200, | |
| "student_mlm_loss": 2.7487380504608154 | |
| }, | |
| { | |
| "epoch": 0.4872151805993113, | |
| "grad_norm": 6.380947589874268, | |
| "learning_rate": 4.213701255620808e-05, | |
| "loss": 2.7095, | |
| "step": 13300 | |
| }, | |
| { | |
| "combined_loss": 2.2547478675842285, | |
| "distill_loss": 1.4122509956359863, | |
| "epoch": 0.4872151805993113, | |
| "step": 13300, | |
| "student_mlm_loss": 3.09724497795105 | |
| }, | |
| { | |
| "epoch": 0.49087845263389257, | |
| "grad_norm": 83.60982513427734, | |
| "learning_rate": 4.207558296680345e-05, | |
| "loss": 3.2917, | |
| "step": 13400 | |
| }, | |
| { | |
| "combined_loss": 2.009040355682373, | |
| "distill_loss": 1.4236946105957031, | |
| "epoch": 0.49087845263389257, | |
| "step": 13400, | |
| "student_mlm_loss": 2.594385862350464 | |
| }, | |
| { | |
| "epoch": 0.4945417246684739, | |
| "grad_norm": 10.06588077545166, | |
| "learning_rate": 4.201415337739883e-05, | |
| "loss": 12.3205, | |
| "step": 13500 | |
| }, | |
| { | |
| "combined_loss": 2.9317073822021484, | |
| "distill_loss": 1.4229042530059814, | |
| "epoch": 0.4945417246684739, | |
| "step": 13500, | |
| "student_mlm_loss": 4.440510272979736 | |
| }, | |
| { | |
| "epoch": 0.4982049967030552, | |
| "grad_norm": 4.126479625701904, | |
| "learning_rate": 4.19527237879942e-05, | |
| "loss": 3.8077, | |
| "step": 13600 | |
| }, | |
| { | |
| "combined_loss": 1.9033926725387573, | |
| "distill_loss": 1.357490062713623, | |
| "epoch": 0.4982049967030552, | |
| "step": 13600, | |
| "student_mlm_loss": 2.4492952823638916 | |
| }, | |
| { | |
| "epoch": 0.5018682687376365, | |
| "grad_norm": 18.483203887939453, | |
| "learning_rate": 4.189129419858958e-05, | |
| "loss": 11.6361, | |
| "step": 13700 | |
| }, | |
| { | |
| "combined_loss": 3.165005683898926, | |
| "distill_loss": 1.3812006711959839, | |
| "epoch": 0.5018682687376365, | |
| "step": 13700, | |
| "student_mlm_loss": 4.948810577392578 | |
| }, | |
| { | |
| "epoch": 0.5055315407722177, | |
| "grad_norm": 7.388655662536621, | |
| "learning_rate": 4.1829864609184956e-05, | |
| "loss": 3.875, | |
| "step": 13800 | |
| }, | |
| { | |
| "combined_loss": 1.8155145645141602, | |
| "distill_loss": 1.3641600608825684, | |
| "epoch": 0.5055315407722177, | |
| "step": 13800, | |
| "student_mlm_loss": 2.266869068145752 | |
| }, | |
| { | |
| "epoch": 0.509194812806799, | |
| "grad_norm": 9.352982521057129, | |
| "learning_rate": 4.176843501978033e-05, | |
| "loss": 9.268, | |
| "step": 13900 | |
| }, | |
| { | |
| "combined_loss": 2.3618173599243164, | |
| "distill_loss": 1.3162891864776611, | |
| "epoch": 0.509194812806799, | |
| "step": 13900, | |
| "student_mlm_loss": 3.4073452949523926 | |
| }, | |
| { | |
| "epoch": 0.5128580848413803, | |
| "grad_norm": 8.513871192932129, | |
| "learning_rate": 4.1707005430375705e-05, | |
| "loss": 3.3999, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.5128580848413803, | |
| "eval_loss": 3.5987370014190674, | |
| "eval_runtime": 2.2869, | |
| "eval_samples_per_second": 3059.222, | |
| "eval_steps_per_second": 12.244, | |
| "step": 14000 | |
| }, | |
| { | |
| "combined_loss": 2.6841559410095215, | |
| "distill_loss": 1.401199460029602, | |
| "epoch": 0.5128580848413803, | |
| "step": 14000, | |
| "student_mlm_loss": 3.9671125411987305 | |
| }, | |
| { | |
| "epoch": 0.5165213568759616, | |
| "grad_norm": 30.661813735961914, | |
| "learning_rate": 4.1645575840971076e-05, | |
| "loss": 18.3341, | |
| "step": 14100 | |
| }, | |
| { | |
| "combined_loss": 4.752758026123047, | |
| "distill_loss": 1.247560977935791, | |
| "epoch": 0.5165213568759616, | |
| "step": 14100, | |
| "student_mlm_loss": 8.257954597473145 | |
| }, | |
| { | |
| "epoch": 0.5201846289105428, | |
| "grad_norm": 40.303707122802734, | |
| "learning_rate": 4.158414625156646e-05, | |
| "loss": 3.1057, | |
| "step": 14200 | |
| }, | |
| { | |
| "combined_loss": 1.988144874572754, | |
| "distill_loss": 1.2577546834945679, | |
| "epoch": 0.5201846289105428, | |
| "step": 14200, | |
| "student_mlm_loss": 2.7185349464416504 | |
| }, | |
| { | |
| "epoch": 0.5238479009451242, | |
| "grad_norm": 19.77947235107422, | |
| "learning_rate": 4.152271666216183e-05, | |
| "loss": 7.3457, | |
| "step": 14300 | |
| }, | |
| { | |
| "combined_loss": 4.299380779266357, | |
| "distill_loss": 1.2770593166351318, | |
| "epoch": 0.5238479009451242, | |
| "step": 14300, | |
| "student_mlm_loss": 7.321702480316162 | |
| }, | |
| { | |
| "epoch": 0.5275111729797055, | |
| "grad_norm": 7.412100315093994, | |
| "learning_rate": 4.146128707275721e-05, | |
| "loss": 4.8104, | |
| "step": 14400 | |
| }, | |
| { | |
| "combined_loss": 10.650766372680664, | |
| "distill_loss": 1.3233892917633057, | |
| "epoch": 0.5275111729797055, | |
| "step": 14400, | |
| "student_mlm_loss": 19.9781436920166 | |
| }, | |
| { | |
| "epoch": 0.5311744450142868, | |
| "grad_norm": 5.799710750579834, | |
| "learning_rate": 4.139985748335258e-05, | |
| "loss": 3.4765, | |
| "step": 14500 | |
| }, | |
| { | |
| "combined_loss": 2.4540774822235107, | |
| "distill_loss": 1.319036841392517, | |
| "epoch": 0.5311744450142868, | |
| "step": 14500, | |
| "student_mlm_loss": 3.589118003845215 | |
| }, | |
| { | |
| "epoch": 0.5348377170488681, | |
| "grad_norm": 7.147758483886719, | |
| "learning_rate": 4.133842789394796e-05, | |
| "loss": 3.12, | |
| "step": 14600 | |
| }, | |
| { | |
| "combined_loss": 1.8580541610717773, | |
| "distill_loss": 1.3114832639694214, | |
| "epoch": 0.5348377170488681, | |
| "step": 14600, | |
| "student_mlm_loss": 2.4046249389648438 | |
| }, | |
| { | |
| "epoch": 0.5385009890834493, | |
| "grad_norm": 5.120487213134766, | |
| "learning_rate": 4.1276998304543336e-05, | |
| "loss": 6.7029, | |
| "step": 14700 | |
| }, | |
| { | |
| "combined_loss": 1.9685258865356445, | |
| "distill_loss": 1.2455390691757202, | |
| "epoch": 0.5385009890834493, | |
| "step": 14700, | |
| "student_mlm_loss": 2.6915125846862793 | |
| }, | |
| { | |
| "epoch": 0.5421642611180306, | |
| "grad_norm": 6.225675106048584, | |
| "learning_rate": 4.121556871513871e-05, | |
| "loss": 7.1336, | |
| "step": 14800 | |
| }, | |
| { | |
| "combined_loss": 1.8886613845825195, | |
| "distill_loss": 1.2913726568222046, | |
| "epoch": 0.5421642611180306, | |
| "step": 14800, | |
| "student_mlm_loss": 2.485949993133545 | |
| }, | |
| { | |
| "epoch": 0.5458275331526119, | |
| "grad_norm": 11.508244514465332, | |
| "learning_rate": 4.1154139125734085e-05, | |
| "loss": 11.8719, | |
| "step": 14900 | |
| }, | |
| { | |
| "combined_loss": 2.1455585956573486, | |
| "distill_loss": 1.3711117506027222, | |
| "epoch": 0.5458275331526119, | |
| "step": 14900, | |
| "student_mlm_loss": 2.9200053215026855 | |
| }, | |
| { | |
| "epoch": 0.5494908051871932, | |
| "grad_norm": 17.030780792236328, | |
| "learning_rate": 4.109270953632946e-05, | |
| "loss": 3.091, | |
| "step": 15000 | |
| }, | |
| { | |
| "combined_loss": 1.9433504343032837, | |
| "distill_loss": 1.538583517074585, | |
| "epoch": 0.5494908051871932, | |
| "step": 15000, | |
| "student_mlm_loss": 2.3481173515319824 | |
| }, | |
| { | |
| "epoch": 0.5531540772217745, | |
| "grad_norm": 4.692992687225342, | |
| "learning_rate": 4.103127994692484e-05, | |
| "loss": 3.2488, | |
| "step": 15100 | |
| }, | |
| { | |
| "combined_loss": 2.820077657699585, | |
| "distill_loss": 1.2906769514083862, | |
| "epoch": 0.5531540772217745, | |
| "step": 15100, | |
| "student_mlm_loss": 4.349478244781494 | |
| }, | |
| { | |
| "epoch": 0.5568173492563557, | |
| "grad_norm": 49.70892333984375, | |
| "learning_rate": 4.096985035752021e-05, | |
| "loss": 10.6593, | |
| "step": 15200 | |
| }, | |
| { | |
| "combined_loss": 1.857104778289795, | |
| "distill_loss": 1.4106833934783936, | |
| "epoch": 0.5568173492563557, | |
| "step": 15200, | |
| "student_mlm_loss": 2.3035261631011963 | |
| }, | |
| { | |
| "epoch": 0.5604806212909371, | |
| "grad_norm": 7.913967609405518, | |
| "learning_rate": 4.090842076811558e-05, | |
| "loss": 3.3056, | |
| "step": 15300 | |
| }, | |
| { | |
| "combined_loss": 3.2144076824188232, | |
| "distill_loss": 1.3917032480239868, | |
| "epoch": 0.5604806212909371, | |
| "step": 15300, | |
| "student_mlm_loss": 5.037112236022949 | |
| }, | |
| { | |
| "epoch": 0.5641438933255184, | |
| "grad_norm": 10.575057983398438, | |
| "learning_rate": 4.084699117871096e-05, | |
| "loss": 10.0757, | |
| "step": 15400 | |
| }, | |
| { | |
| "combined_loss": 5.352452754974365, | |
| "distill_loss": 1.3542910814285278, | |
| "epoch": 0.5641438933255184, | |
| "step": 15400, | |
| "student_mlm_loss": 9.350614547729492 | |
| }, | |
| { | |
| "epoch": 0.5678071653600997, | |
| "grad_norm": 119.92784118652344, | |
| "learning_rate": 4.078556158930634e-05, | |
| "loss": 3.4463, | |
| "step": 15500 | |
| }, | |
| { | |
| "combined_loss": 1.7753610610961914, | |
| "distill_loss": 1.3875095844268799, | |
| "epoch": 0.5678071653600997, | |
| "step": 15500, | |
| "student_mlm_loss": 2.163212537765503 | |
| }, | |
| { | |
| "epoch": 0.571470437394681, | |
| "grad_norm": 4.203140735626221, | |
| "learning_rate": 4.0724131999901717e-05, | |
| "loss": 4.8205, | |
| "step": 15600 | |
| }, | |
| { | |
| "combined_loss": 1.8941802978515625, | |
| "distill_loss": 1.3584777116775513, | |
| "epoch": 0.571470437394681, | |
| "step": 15600, | |
| "student_mlm_loss": 2.4298830032348633 | |
| }, | |
| { | |
| "epoch": 0.5751337094292622, | |
| "grad_norm": 16.848825454711914, | |
| "learning_rate": 4.066270241049709e-05, | |
| "loss": 7.7339, | |
| "step": 15700 | |
| }, | |
| { | |
| "combined_loss": 1.9499808549880981, | |
| "distill_loss": 1.3122260570526123, | |
| "epoch": 0.5751337094292622, | |
| "step": 15700, | |
| "student_mlm_loss": 2.587735652923584 | |
| }, | |
| { | |
| "epoch": 0.5787969814638435, | |
| "grad_norm": 2.9838955402374268, | |
| "learning_rate": 4.0601272821092465e-05, | |
| "loss": 3.4354, | |
| "step": 15800 | |
| }, | |
| { | |
| "combined_loss": 1.9672229290008545, | |
| "distill_loss": 1.3119910955429077, | |
| "epoch": 0.5787969814638435, | |
| "step": 15800, | |
| "student_mlm_loss": 2.622454881668091 | |
| }, | |
| { | |
| "epoch": 0.5824602534984248, | |
| "grad_norm": 6.6938676834106445, | |
| "learning_rate": 4.053984323168784e-05, | |
| "loss": 5.2244, | |
| "step": 15900 | |
| }, | |
| { | |
| "combined_loss": 2.8469321727752686, | |
| "distill_loss": 1.361178994178772, | |
| "epoch": 0.5824602534984248, | |
| "step": 15900, | |
| "student_mlm_loss": 4.332685470581055 | |
| }, | |
| { | |
| "epoch": 0.5861235255330061, | |
| "grad_norm": 31.440717697143555, | |
| "learning_rate": 4.047841364228322e-05, | |
| "loss": 8.7168, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.5861235255330061, | |
| "eval_loss": 3.480536937713623, | |
| "eval_runtime": 2.1572, | |
| "eval_samples_per_second": 3243.154, | |
| "eval_steps_per_second": 12.98, | |
| "step": 16000 | |
| }, | |
| { | |
| "combined_loss": 2.0847339630126953, | |
| "distill_loss": 1.4640412330627441, | |
| "epoch": 0.5861235255330061, | |
| "step": 16000, | |
| "student_mlm_loss": 2.7054266929626465 | |
| }, | |
| { | |
| "epoch": 0.5897867975675873, | |
| "grad_norm": 6.238570690155029, | |
| "learning_rate": 4.041698405287859e-05, | |
| "loss": 3.2375, | |
| "step": 16100 | |
| }, | |
| { | |
| "combined_loss": 2.2635374069213867, | |
| "distill_loss": 1.5188945531845093, | |
| "epoch": 0.5897867975675873, | |
| "step": 16100, | |
| "student_mlm_loss": 3.0081801414489746 | |
| }, | |
| { | |
| "epoch": 0.5934500696021686, | |
| "grad_norm": 11.832098960876465, | |
| "learning_rate": 4.035555446347396e-05, | |
| "loss": 3.3115, | |
| "step": 16200 | |
| }, | |
| { | |
| "combined_loss": 2.2285714149475098, | |
| "distill_loss": 1.4724992513656616, | |
| "epoch": 0.5934500696021686, | |
| "step": 16200, | |
| "student_mlm_loss": 2.9846436977386475 | |
| }, | |
| { | |
| "epoch": 0.5971133416367499, | |
| "grad_norm": 8.876389503479004, | |
| "learning_rate": 4.029412487406934e-05, | |
| "loss": 4.1388, | |
| "step": 16300 | |
| }, | |
| { | |
| "combined_loss": 2.0907256603240967, | |
| "distill_loss": 1.2955131530761719, | |
| "epoch": 0.5971133416367499, | |
| "step": 16300, | |
| "student_mlm_loss": 2.8859381675720215 | |
| }, | |
| { | |
| "epoch": 0.6007766136713313, | |
| "grad_norm": 4.118688106536865, | |
| "learning_rate": 4.023269528466472e-05, | |
| "loss": 5.4036, | |
| "step": 16400 | |
| }, | |
| { | |
| "combined_loss": 5.190587997436523, | |
| "distill_loss": 1.502519965171814, | |
| "epoch": 0.6007766136713313, | |
| "step": 16400, | |
| "student_mlm_loss": 8.878656387329102 | |
| }, | |
| { | |
| "epoch": 0.6044398857059126, | |
| "grad_norm": 17.806203842163086, | |
| "learning_rate": 4.01712656952601e-05, | |
| "loss": 3.4529, | |
| "step": 16500 | |
| }, | |
| { | |
| "combined_loss": 2.0771563053131104, | |
| "distill_loss": 1.5032036304473877, | |
| "epoch": 0.6044398857059126, | |
| "step": 16500, | |
| "student_mlm_loss": 2.651108980178833 | |
| }, | |
| { | |
| "epoch": 0.6081031577404938, | |
| "grad_norm": 11.406692504882812, | |
| "learning_rate": 4.010983610585547e-05, | |
| "loss": 2.9157, | |
| "step": 16600 | |
| }, | |
| { | |
| "combined_loss": 2.0262105464935303, | |
| "distill_loss": 1.406888723373413, | |
| "epoch": 0.6081031577404938, | |
| "step": 16600, | |
| "student_mlm_loss": 2.6455323696136475 | |
| }, | |
| { | |
| "epoch": 0.6117664297750751, | |
| "grad_norm": 9.248611450195312, | |
| "learning_rate": 4.0048406516450846e-05, | |
| "loss": 3.7273, | |
| "step": 16700 | |
| }, | |
| { | |
| "combined_loss": 9.912755966186523, | |
| "distill_loss": 1.3654385805130005, | |
| "epoch": 0.6117664297750751, | |
| "step": 16700, | |
| "student_mlm_loss": 18.460073471069336 | |
| }, | |
| { | |
| "epoch": 0.6154297018096564, | |
| "grad_norm": 7.337488651275635, | |
| "learning_rate": 3.9986976927046223e-05, | |
| "loss": 3.5316, | |
| "step": 16800 | |
| }, | |
| { | |
| "combined_loss": 2.2111759185791016, | |
| "distill_loss": 1.410059928894043, | |
| "epoch": 0.6154297018096564, | |
| "step": 16800, | |
| "student_mlm_loss": 3.012291669845581 | |
| }, | |
| { | |
| "epoch": 0.6190929738442377, | |
| "grad_norm": 3.7927513122558594, | |
| "learning_rate": 3.9925547337641595e-05, | |
| "loss": 2.942, | |
| "step": 16900 | |
| }, | |
| { | |
| "combined_loss": 1.9941096305847168, | |
| "distill_loss": 1.3353883028030396, | |
| "epoch": 0.6190929738442377, | |
| "step": 16900, | |
| "student_mlm_loss": 2.6528310775756836 | |
| }, | |
| { | |
| "epoch": 0.622756245878819, | |
| "grad_norm": 8.092863082885742, | |
| "learning_rate": 3.986411774823697e-05, | |
| "loss": 8.3194, | |
| "step": 17000 | |
| }, | |
| { | |
| "combined_loss": 1.8197941780090332, | |
| "distill_loss": 1.2830308675765991, | |
| "epoch": 0.622756245878819, | |
| "step": 17000, | |
| "student_mlm_loss": 2.356557607650757 | |
| }, | |
| { | |
| "epoch": 0.6264195179134002, | |
| "grad_norm": 21.95607566833496, | |
| "learning_rate": 3.9802688158832343e-05, | |
| "loss": 3.6842, | |
| "step": 17100 | |
| }, | |
| { | |
| "combined_loss": 1.967858076095581, | |
| "distill_loss": 1.3744505643844604, | |
| "epoch": 0.6264195179134002, | |
| "step": 17100, | |
| "student_mlm_loss": 2.561265707015991 | |
| }, | |
| { | |
| "epoch": 0.6300827899479815, | |
| "grad_norm": 17.734630584716797, | |
| "learning_rate": 3.974125856942773e-05, | |
| "loss": 3.4446, | |
| "step": 17200 | |
| }, | |
| { | |
| "combined_loss": 3.56831955909729, | |
| "distill_loss": 1.4127169847488403, | |
| "epoch": 0.6300827899479815, | |
| "step": 17200, | |
| "student_mlm_loss": 5.723922252655029 | |
| }, | |
| { | |
| "epoch": 0.6337460619825628, | |
| "grad_norm": 14.227143287658691, | |
| "learning_rate": 3.96798289800231e-05, | |
| "loss": 4.3058, | |
| "step": 17300 | |
| }, | |
| { | |
| "combined_loss": 6.485238552093506, | |
| "distill_loss": 1.3285768032073975, | |
| "epoch": 0.6337460619825628, | |
| "step": 17300, | |
| "student_mlm_loss": 11.641900062561035 | |
| }, | |
| { | |
| "epoch": 0.6374093340171441, | |
| "grad_norm": 27.379819869995117, | |
| "learning_rate": 3.961839939061848e-05, | |
| "loss": 3.3666, | |
| "step": 17400 | |
| }, | |
| { | |
| "combined_loss": 3.212083339691162, | |
| "distill_loss": 1.3358004093170166, | |
| "epoch": 0.6374093340171441, | |
| "step": 17400, | |
| "student_mlm_loss": 5.088366508483887 | |
| }, | |
| { | |
| "epoch": 0.6410726060517254, | |
| "grad_norm": 6.261890411376953, | |
| "learning_rate": 3.955696980121385e-05, | |
| "loss": 6.3216, | |
| "step": 17500 | |
| }, | |
| { | |
| "combined_loss": 1.8787257671356201, | |
| "distill_loss": 1.3068917989730835, | |
| "epoch": 0.6410726060517254, | |
| "step": 17500, | |
| "student_mlm_loss": 2.4505598545074463 | |
| }, | |
| { | |
| "epoch": 0.6447358780863067, | |
| "grad_norm": 4.643723011016846, | |
| "learning_rate": 3.9495540211809226e-05, | |
| "loss": 6.3659, | |
| "step": 17600 | |
| }, | |
| { | |
| "combined_loss": 1.9111711978912354, | |
| "distill_loss": 1.315952181816101, | |
| "epoch": 0.6447358780863067, | |
| "step": 17600, | |
| "student_mlm_loss": 2.506390333175659 | |
| }, | |
| { | |
| "epoch": 0.648399150120888, | |
| "grad_norm": 209.94358825683594, | |
| "learning_rate": 3.9434110622404604e-05, | |
| "loss": 3.1778, | |
| "step": 17700 | |
| }, | |
| { | |
| "combined_loss": 2.7990779876708984, | |
| "distill_loss": 1.360758662223816, | |
| "epoch": 0.648399150120888, | |
| "step": 17700, | |
| "student_mlm_loss": 4.237397193908691 | |
| }, | |
| { | |
| "epoch": 0.6520624221554693, | |
| "grad_norm": 25.861230850219727, | |
| "learning_rate": 3.9372681032999975e-05, | |
| "loss": 6.5636, | |
| "step": 17800 | |
| }, | |
| { | |
| "combined_loss": 3.8194119930267334, | |
| "distill_loss": 1.45068359375, | |
| "epoch": 0.6520624221554693, | |
| "step": 17800, | |
| "student_mlm_loss": 6.188140392303467 | |
| }, | |
| { | |
| "epoch": 0.6557256941900506, | |
| "grad_norm": 46.81015396118164, | |
| "learning_rate": 3.931125144359535e-05, | |
| "loss": 6.4281, | |
| "step": 17900 | |
| }, | |
| { | |
| "combined_loss": 1.8790740966796875, | |
| "distill_loss": 1.2603598833084106, | |
| "epoch": 0.6557256941900506, | |
| "step": 17900, | |
| "student_mlm_loss": 2.497788429260254 | |
| }, | |
| { | |
| "epoch": 0.6593889662246318, | |
| "grad_norm": 3.634798049926758, | |
| "learning_rate": 3.924982185419073e-05, | |
| "loss": 3.7705, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.6593889662246318, | |
| "eval_loss": 3.4686477184295654, | |
| "eval_runtime": 2.0476, | |
| "eval_samples_per_second": 3416.619, | |
| "eval_steps_per_second": 13.674, | |
| "step": 18000 | |
| }, | |
| { | |
| "combined_loss": 1.8001245260238647, | |
| "distill_loss": 1.358407735824585, | |
| "epoch": 0.6593889662246318, | |
| "step": 18000, | |
| "student_mlm_loss": 2.2418413162231445 | |
| }, | |
| { | |
| "epoch": 0.6630522382592131, | |
| "grad_norm": 14.09543514251709, | |
| "learning_rate": 3.918839226478611e-05, | |
| "loss": 7.2198, | |
| "step": 18100 | |
| }, | |
| { | |
| "combined_loss": 2.165346622467041, | |
| "distill_loss": 1.3290469646453857, | |
| "epoch": 0.6630522382592131, | |
| "step": 18100, | |
| "student_mlm_loss": 3.0016462802886963 | |
| }, | |
| { | |
| "epoch": 0.6667155102937944, | |
| "grad_norm": 4.29142951965332, | |
| "learning_rate": 3.912696267538148e-05, | |
| "loss": 4.3053, | |
| "step": 18200 | |
| }, | |
| { | |
| "combined_loss": 1.8569279909133911, | |
| "distill_loss": 1.355130910873413, | |
| "epoch": 0.6667155102937944, | |
| "step": 18200, | |
| "student_mlm_loss": 2.358725070953369 | |
| }, | |
| { | |
| "epoch": 0.6703787823283757, | |
| "grad_norm": 4.424899101257324, | |
| "learning_rate": 3.906553308597686e-05, | |
| "loss": 3.2385, | |
| "step": 18300 | |
| }, | |
| { | |
| "combined_loss": 2.083707094192505, | |
| "distill_loss": 1.307104229927063, | |
| "epoch": 0.6703787823283757, | |
| "step": 18300, | |
| "student_mlm_loss": 2.8603098392486572 | |
| }, | |
| { | |
| "epoch": 0.6740420543629569, | |
| "grad_norm": 8.061409950256348, | |
| "learning_rate": 3.900410349657223e-05, | |
| "loss": 2.9075, | |
| "step": 18400 | |
| }, | |
| { | |
| "combined_loss": 1.9213597774505615, | |
| "distill_loss": 1.434320330619812, | |
| "epoch": 0.6740420543629569, | |
| "step": 18400, | |
| "student_mlm_loss": 2.4083993434906006 | |
| }, | |
| { | |
| "epoch": 0.6777053263975383, | |
| "grad_norm": 55.50898361206055, | |
| "learning_rate": 3.8942673907167606e-05, | |
| "loss": 13.4077, | |
| "step": 18500 | |
| }, | |
| { | |
| "combined_loss": 2.01340389251709, | |
| "distill_loss": 1.3991159200668335, | |
| "epoch": 0.6777053263975383, | |
| "step": 18500, | |
| "student_mlm_loss": 2.6276917457580566 | |
| }, | |
| { | |
| "epoch": 0.6813685984321196, | |
| "grad_norm": 5.348477840423584, | |
| "learning_rate": 3.8881244317762984e-05, | |
| "loss": 6.8559, | |
| "step": 18600 | |
| }, | |
| { | |
| "combined_loss": 2.5955307483673096, | |
| "distill_loss": 1.4375801086425781, | |
| "epoch": 0.6813685984321196, | |
| "step": 18600, | |
| "student_mlm_loss": 3.753481388092041 | |
| }, | |
| { | |
| "epoch": 0.6850318704667009, | |
| "grad_norm": 26.911954879760742, | |
| "learning_rate": 3.8819814728358355e-05, | |
| "loss": 9.8471, | |
| "step": 18700 | |
| }, | |
| { | |
| "combined_loss": 2.3086562156677246, | |
| "distill_loss": 1.4082762002944946, | |
| "epoch": 0.6850318704667009, | |
| "step": 18700, | |
| "student_mlm_loss": 3.209036350250244 | |
| }, | |
| { | |
| "epoch": 0.6886951425012822, | |
| "grad_norm": 8.086039543151855, | |
| "learning_rate": 3.875838513895373e-05, | |
| "loss": 3.841, | |
| "step": 18800 | |
| }, | |
| { | |
| "combined_loss": 4.487699031829834, | |
| "distill_loss": 1.4052667617797852, | |
| "epoch": 0.6886951425012822, | |
| "step": 18800, | |
| "student_mlm_loss": 7.570131301879883 | |
| }, | |
| { | |
| "epoch": 0.6923584145358634, | |
| "grad_norm": 10.749812126159668, | |
| "learning_rate": 3.869695554954911e-05, | |
| "loss": 9.7279, | |
| "step": 18900 | |
| }, | |
| { | |
| "combined_loss": 3.3014779090881348, | |
| "distill_loss": 1.246164083480835, | |
| "epoch": 0.6923584145358634, | |
| "step": 18900, | |
| "student_mlm_loss": 5.3567914962768555 | |
| }, | |
| { | |
| "epoch": 0.6960216865704447, | |
| "grad_norm": 11.313789367675781, | |
| "learning_rate": 3.863552596014449e-05, | |
| "loss": 28.0849, | |
| "step": 19000 | |
| }, | |
| { | |
| "combined_loss": 4.825923919677734, | |
| "distill_loss": 1.377113938331604, | |
| "epoch": 0.6960216865704447, | |
| "step": 19000, | |
| "student_mlm_loss": 8.274733543395996 | |
| }, | |
| { | |
| "epoch": 0.699684958605026, | |
| "grad_norm": 3.8648459911346436, | |
| "learning_rate": 3.857409637073986e-05, | |
| "loss": 5.8981, | |
| "step": 19100 | |
| }, | |
| { | |
| "combined_loss": 3.4921586513519287, | |
| "distill_loss": 1.4171725511550903, | |
| "epoch": 0.699684958605026, | |
| "step": 19100, | |
| "student_mlm_loss": 5.567144870758057 | |
| }, | |
| { | |
| "epoch": 0.7033482306396073, | |
| "grad_norm": 18.98455238342285, | |
| "learning_rate": 3.851266678133523e-05, | |
| "loss": 2.5944, | |
| "step": 19200 | |
| }, | |
| { | |
| "combined_loss": 1.8949182033538818, | |
| "distill_loss": 1.3743678331375122, | |
| "epoch": 0.7033482306396073, | |
| "step": 19200, | |
| "student_mlm_loss": 2.415468692779541 | |
| }, | |
| { | |
| "epoch": 0.7070115026741886, | |
| "grad_norm": 27.53456687927246, | |
| "learning_rate": 3.845123719193061e-05, | |
| "loss": 2.8462, | |
| "step": 19300 | |
| }, | |
| { | |
| "combined_loss": 1.8077284097671509, | |
| "distill_loss": 1.2764451503753662, | |
| "epoch": 0.7070115026741886, | |
| "step": 19300, | |
| "student_mlm_loss": 2.3390116691589355 | |
| }, | |
| { | |
| "epoch": 0.7106747747087698, | |
| "grad_norm": 8.815896987915039, | |
| "learning_rate": 3.8389807602525986e-05, | |
| "loss": 3.403, | |
| "step": 19400 | |
| }, | |
| { | |
| "combined_loss": 2.2496674060821533, | |
| "distill_loss": 1.408218264579773, | |
| "epoch": 0.7106747747087698, | |
| "step": 19400, | |
| "student_mlm_loss": 3.091116428375244 | |
| }, | |
| { | |
| "epoch": 0.7143380467433511, | |
| "grad_norm": 20.02590560913086, | |
| "learning_rate": 3.8328378013121364e-05, | |
| "loss": 3.7767, | |
| "step": 19500 | |
| }, | |
| { | |
| "combined_loss": 2.6540353298187256, | |
| "distill_loss": 1.451707124710083, | |
| "epoch": 0.7143380467433511, | |
| "step": 19500, | |
| "student_mlm_loss": 3.856363534927368 | |
| }, | |
| { | |
| "epoch": 0.7180013187779325, | |
| "grad_norm": 48.139583587646484, | |
| "learning_rate": 3.8266948423716735e-05, | |
| "loss": 3.4148, | |
| "step": 19600 | |
| }, | |
| { | |
| "combined_loss": 3.5710411071777344, | |
| "distill_loss": 1.2874888181686401, | |
| "epoch": 0.7180013187779325, | |
| "step": 19600, | |
| "student_mlm_loss": 5.854593276977539 | |
| }, | |
| { | |
| "epoch": 0.7216645908125138, | |
| "grad_norm": 5.810763835906982, | |
| "learning_rate": 3.820551883431211e-05, | |
| "loss": 11.1815, | |
| "step": 19700 | |
| }, | |
| { | |
| "combined_loss": 2.022658586502075, | |
| "distill_loss": 1.408826231956482, | |
| "epoch": 0.7216645908125138, | |
| "step": 19700, | |
| "student_mlm_loss": 2.636491060256958 | |
| }, | |
| { | |
| "epoch": 0.725327862847095, | |
| "grad_norm": 5.03505277633667, | |
| "learning_rate": 3.814408924490749e-05, | |
| "loss": 3.5792, | |
| "step": 19800 | |
| }, | |
| { | |
| "combined_loss": 2.450950860977173, | |
| "distill_loss": 1.3786026239395142, | |
| "epoch": 0.725327862847095, | |
| "step": 19800, | |
| "student_mlm_loss": 3.523299217224121 | |
| }, | |
| { | |
| "epoch": 0.7289911348816763, | |
| "grad_norm": 44.703548431396484, | |
| "learning_rate": 3.808265965550287e-05, | |
| "loss": 14.0822, | |
| "step": 19900 | |
| }, | |
| { | |
| "combined_loss": 1.8448269367218018, | |
| "distill_loss": 1.3061137199401855, | |
| "epoch": 0.7289911348816763, | |
| "step": 19900, | |
| "student_mlm_loss": 2.383540153503418 | |
| }, | |
| { | |
| "epoch": 0.7326544069162576, | |
| "grad_norm": 73.46593475341797, | |
| "learning_rate": 3.802123006609824e-05, | |
| "loss": 3.5648, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.7326544069162576, | |
| "eval_loss": 3.689605474472046, | |
| "eval_runtime": 2.2951, | |
| "eval_samples_per_second": 3048.261, | |
| "eval_steps_per_second": 12.2, | |
| "step": 20000 | |
| }, | |
| { | |
| "combined_loss": 5.831945896148682, | |
| "distill_loss": 1.2505719661712646, | |
| "epoch": 0.7326544069162576, | |
| "step": 20000, | |
| "student_mlm_loss": 10.41331958770752 | |
| }, | |
| { | |
| "epoch": 0.7363176789508389, | |
| "grad_norm": 7.289074897766113, | |
| "learning_rate": 3.795980047669361e-05, | |
| "loss": 5.9452, | |
| "step": 20100 | |
| }, | |
| { | |
| "combined_loss": 14.608942985534668, | |
| "distill_loss": 1.4141182899475098, | |
| "epoch": 0.7363176789508389, | |
| "step": 20100, | |
| "student_mlm_loss": 27.803768157958984 | |
| }, | |
| { | |
| "epoch": 0.7399809509854202, | |
| "grad_norm": 15.717759132385254, | |
| "learning_rate": 3.7898370887288995e-05, | |
| "loss": 5.3196, | |
| "step": 20200 | |
| }, | |
| { | |
| "combined_loss": 2.34932279586792, | |
| "distill_loss": 1.2641239166259766, | |
| "epoch": 0.7399809509854202, | |
| "step": 20200, | |
| "student_mlm_loss": 3.434521436691284 | |
| }, | |
| { | |
| "epoch": 0.7436442230200014, | |
| "grad_norm": 75.113037109375, | |
| "learning_rate": 3.7836941297884366e-05, | |
| "loss": 3.4868, | |
| "step": 20300 | |
| }, | |
| { | |
| "combined_loss": 2.0885400772094727, | |
| "distill_loss": 1.3560060262680054, | |
| "epoch": 0.7436442230200014, | |
| "step": 20300, | |
| "student_mlm_loss": 2.8210740089416504 | |
| }, | |
| { | |
| "epoch": 0.7473074950545827, | |
| "grad_norm": 12.071985244750977, | |
| "learning_rate": 3.7775511708479744e-05, | |
| "loss": 3.1594, | |
| "step": 20400 | |
| }, | |
| { | |
| "combined_loss": 2.104968309402466, | |
| "distill_loss": 1.456742286682129, | |
| "epoch": 0.7473074950545827, | |
| "step": 20400, | |
| "student_mlm_loss": 2.7531943321228027 | |
| }, | |
| { | |
| "epoch": 0.750970767089164, | |
| "grad_norm": 49.17687225341797, | |
| "learning_rate": 3.7714082119075115e-05, | |
| "loss": 5.0772, | |
| "step": 20500 | |
| }, | |
| { | |
| "combined_loss": 1.9532296657562256, | |
| "distill_loss": 1.2734321355819702, | |
| "epoch": 0.750970767089164, | |
| "step": 20500, | |
| "student_mlm_loss": 2.6330270767211914 | |
| }, | |
| { | |
| "epoch": 0.7546340391237454, | |
| "grad_norm": 4.601011753082275, | |
| "learning_rate": 3.765265252967049e-05, | |
| "loss": 8.0874, | |
| "step": 20600 | |
| }, | |
| { | |
| "combined_loss": 1.8828588724136353, | |
| "distill_loss": 1.35260009765625, | |
| "epoch": 0.7546340391237454, | |
| "step": 20600, | |
| "student_mlm_loss": 2.4131176471710205 | |
| }, | |
| { | |
| "epoch": 0.7582973111583267, | |
| "grad_norm": 3.9183883666992188, | |
| "learning_rate": 3.759122294026587e-05, | |
| "loss": 3.1836, | |
| "step": 20700 | |
| }, | |
| { | |
| "combined_loss": 3.261841058731079, | |
| "distill_loss": 1.35749351978302, | |
| "epoch": 0.7582973111583267, | |
| "step": 20700, | |
| "student_mlm_loss": 5.166188716888428 | |
| }, | |
| { | |
| "epoch": 0.7619605831929079, | |
| "grad_norm": 59.35635757446289, | |
| "learning_rate": 3.752979335086124e-05, | |
| "loss": 3.446, | |
| "step": 20800 | |
| }, | |
| { | |
| "combined_loss": 2.0783181190490723, | |
| "distill_loss": 1.3386023044586182, | |
| "epoch": 0.7619605831929079, | |
| "step": 20800, | |
| "student_mlm_loss": 2.8180341720581055 | |
| }, | |
| { | |
| "epoch": 0.7656238552274892, | |
| "grad_norm": 14.875, | |
| "learning_rate": 3.746836376145662e-05, | |
| "loss": 8.5798, | |
| "step": 20900 | |
| }, | |
| { | |
| "combined_loss": 1.926416039466858, | |
| "distill_loss": 1.3077542781829834, | |
| "epoch": 0.7656238552274892, | |
| "step": 20900, | |
| "student_mlm_loss": 2.5450778007507324 | |
| }, | |
| { | |
| "epoch": 0.7692871272620705, | |
| "grad_norm": 23.419870376586914, | |
| "learning_rate": 3.740693417205199e-05, | |
| "loss": 5.2177, | |
| "step": 21000 | |
| }, | |
| { | |
| "combined_loss": 1.7290170192718506, | |
| "distill_loss": 1.2258715629577637, | |
| "epoch": 0.7692871272620705, | |
| "step": 21000, | |
| "student_mlm_loss": 2.2321624755859375 | |
| }, | |
| { | |
| "epoch": 0.7729503992966518, | |
| "grad_norm": 29.292964935302734, | |
| "learning_rate": 3.7345504582647375e-05, | |
| "loss": 13.8021, | |
| "step": 21100 | |
| }, | |
| { | |
| "combined_loss": 1.9402461051940918, | |
| "distill_loss": 1.2749103307724, | |
| "epoch": 0.7729503992966518, | |
| "step": 21100, | |
| "student_mlm_loss": 2.6055819988250732 | |
| }, | |
| { | |
| "epoch": 0.776613671331233, | |
| "grad_norm": 9.03995418548584, | |
| "learning_rate": 3.7284074993242747e-05, | |
| "loss": 6.547, | |
| "step": 21200 | |
| }, | |
| { | |
| "combined_loss": 2.2710204124450684, | |
| "distill_loss": 1.312924861907959, | |
| "epoch": 0.776613671331233, | |
| "step": 21200, | |
| "student_mlm_loss": 3.229116201400757 | |
| }, | |
| { | |
| "epoch": 0.7802769433658143, | |
| "grad_norm": 11.86938190460205, | |
| "learning_rate": 3.7222645403838124e-05, | |
| "loss": 12.9682, | |
| "step": 21300 | |
| }, | |
| { | |
| "combined_loss": 3.114459991455078, | |
| "distill_loss": 1.318755865097046, | |
| "epoch": 0.7802769433658143, | |
| "step": 21300, | |
| "student_mlm_loss": 4.910163879394531 | |
| }, | |
| { | |
| "epoch": 0.7839402154003956, | |
| "grad_norm": 14.11950969696045, | |
| "learning_rate": 3.7161215814433495e-05, | |
| "loss": 3.1257, | |
| "step": 21400 | |
| }, | |
| { | |
| "combined_loss": 3.882293224334717, | |
| "distill_loss": 1.1930829286575317, | |
| "epoch": 0.7839402154003956, | |
| "step": 21400, | |
| "student_mlm_loss": 6.571503639221191 | |
| }, | |
| { | |
| "epoch": 0.7876034874349769, | |
| "grad_norm": 22.7275447845459, | |
| "learning_rate": 3.709978622502887e-05, | |
| "loss": 3.1395, | |
| "step": 21500 | |
| }, | |
| { | |
| "combined_loss": 2.00057315826416, | |
| "distill_loss": 1.3134089708328247, | |
| "epoch": 0.7876034874349769, | |
| "step": 21500, | |
| "student_mlm_loss": 2.687737226486206 | |
| }, | |
| { | |
| "epoch": 0.7912667594695582, | |
| "grad_norm": 56.84143829345703, | |
| "learning_rate": 3.703835663562425e-05, | |
| "loss": 13.1799, | |
| "step": 21600 | |
| }, | |
| { | |
| "combined_loss": 2.094574213027954, | |
| "distill_loss": 1.3792191743850708, | |
| "epoch": 0.7912667594695582, | |
| "step": 21600, | |
| "student_mlm_loss": 2.809929370880127 | |
| }, | |
| { | |
| "epoch": 0.7949300315041395, | |
| "grad_norm": 30.655105590820312, | |
| "learning_rate": 3.697692704621962e-05, | |
| "loss": 4.1563, | |
| "step": 21700 | |
| }, | |
| { | |
| "combined_loss": 2.167109489440918, | |
| "distill_loss": 1.3041900396347046, | |
| "epoch": 0.7949300315041395, | |
| "step": 21700, | |
| "student_mlm_loss": 3.030029058456421 | |
| }, | |
| { | |
| "epoch": 0.7985933035387208, | |
| "grad_norm": 7.400668144226074, | |
| "learning_rate": 3.6915497456815e-05, | |
| "loss": 9.7848, | |
| "step": 21800 | |
| }, | |
| { | |
| "combined_loss": 2.2639806270599365, | |
| "distill_loss": 1.3241550922393799, | |
| "epoch": 0.7985933035387208, | |
| "step": 21800, | |
| "student_mlm_loss": 3.203806161880493 | |
| }, | |
| { | |
| "epoch": 0.8022565755733021, | |
| "grad_norm": 28.212512969970703, | |
| "learning_rate": 3.685406786741038e-05, | |
| "loss": 2.7595, | |
| "step": 21900 | |
| }, | |
| { | |
| "combined_loss": 1.9249264001846313, | |
| "distill_loss": 1.337939739227295, | |
| "epoch": 0.8022565755733021, | |
| "step": 21900, | |
| "student_mlm_loss": 2.5119130611419678 | |
| }, | |
| { | |
| "epoch": 0.8059198476078834, | |
| "grad_norm": 5.998919486999512, | |
| "learning_rate": 3.6792638278005756e-05, | |
| "loss": 5.9041, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.8059198476078834, | |
| "eval_loss": 3.310230016708374, | |
| "eval_runtime": 1.9252, | |
| "eval_samples_per_second": 3633.98, | |
| "eval_steps_per_second": 14.544, | |
| "step": 22000 | |
| }, | |
| { | |
| "combined_loss": 2.208944320678711, | |
| "distill_loss": 1.2883169651031494, | |
| "epoch": 0.8059198476078834, | |
| "step": 22000, | |
| "student_mlm_loss": 3.1295716762542725 | |
| }, | |
| { | |
| "epoch": 0.8095831196424647, | |
| "grad_norm": 42.16996383666992, | |
| "learning_rate": 3.673120868860113e-05, | |
| "loss": 10.4166, | |
| "step": 22100 | |
| }, | |
| { | |
| "combined_loss": 2.089421510696411, | |
| "distill_loss": 1.3541114330291748, | |
| "epoch": 0.8095831196424647, | |
| "step": 22100, | |
| "student_mlm_loss": 2.8247315883636475 | |
| }, | |
| { | |
| "epoch": 0.8132463916770459, | |
| "grad_norm": 10.702394485473633, | |
| "learning_rate": 3.6669779099196505e-05, | |
| "loss": 3.5812, | |
| "step": 22200 | |
| }, | |
| { | |
| "combined_loss": 1.8974239826202393, | |
| "distill_loss": 1.3954590559005737, | |
| "epoch": 0.8132463916770459, | |
| "step": 22200, | |
| "student_mlm_loss": 2.3993890285491943 | |
| }, | |
| { | |
| "epoch": 0.8169096637116272, | |
| "grad_norm": 149.82179260253906, | |
| "learning_rate": 3.6608349509791876e-05, | |
| "loss": 3.229, | |
| "step": 22300 | |
| }, | |
| { | |
| "combined_loss": 2.0663747787475586, | |
| "distill_loss": 1.3880882263183594, | |
| "epoch": 0.8169096637116272, | |
| "step": 22300, | |
| "student_mlm_loss": 2.7446610927581787 | |
| }, | |
| { | |
| "epoch": 0.8205729357462085, | |
| "grad_norm": 5.735169410705566, | |
| "learning_rate": 3.6546919920387253e-05, | |
| "loss": 13.0135, | |
| "step": 22400 | |
| }, | |
| { | |
| "combined_loss": 2.3801686763763428, | |
| "distill_loss": 1.2296876907348633, | |
| "epoch": 0.8205729357462085, | |
| "step": 22400, | |
| "student_mlm_loss": 3.5306496620178223 | |
| }, | |
| { | |
| "epoch": 0.8242362077807898, | |
| "grad_norm": 3.9154951572418213, | |
| "learning_rate": 3.648549033098263e-05, | |
| "loss": 3.0256, | |
| "step": 22500 | |
| }, | |
| { | |
| "combined_loss": 2.619138240814209, | |
| "distill_loss": 1.369718313217163, | |
| "epoch": 0.8242362077807898, | |
| "step": 22500, | |
| "student_mlm_loss": 3.868557929992676 | |
| }, | |
| { | |
| "epoch": 0.827899479815371, | |
| "grad_norm": 6.706686019897461, | |
| "learning_rate": 3.6424060741578e-05, | |
| "loss": 6.8373, | |
| "step": 22600 | |
| }, | |
| { | |
| "combined_loss": 3.571559429168701, | |
| "distill_loss": 1.360285758972168, | |
| "epoch": 0.827899479815371, | |
| "step": 22600, | |
| "student_mlm_loss": 5.782833099365234 | |
| }, | |
| { | |
| "epoch": 0.8315627518499524, | |
| "grad_norm": 63.70609664916992, | |
| "learning_rate": 3.636263115217338e-05, | |
| "loss": 3.1874, | |
| "step": 22700 | |
| }, | |
| { | |
| "combined_loss": 6.645792007446289, | |
| "distill_loss": 1.3381716012954712, | |
| "epoch": 0.8315627518499524, | |
| "step": 22700, | |
| "student_mlm_loss": 11.953412055969238 | |
| }, | |
| { | |
| "epoch": 0.8352260238845337, | |
| "grad_norm": 112.02607727050781, | |
| "learning_rate": 3.630120156276876e-05, | |
| "loss": 4.1698, | |
| "step": 22800 | |
| }, | |
| { | |
| "combined_loss": 2.399282455444336, | |
| "distill_loss": 1.2190183401107788, | |
| "epoch": 0.8352260238845337, | |
| "step": 22800, | |
| "student_mlm_loss": 3.5795464515686035 | |
| }, | |
| { | |
| "epoch": 0.838889295919115, | |
| "grad_norm": 319.05230712890625, | |
| "learning_rate": 3.6239771973364136e-05, | |
| "loss": 3.351, | |
| "step": 22900 | |
| }, | |
| { | |
| "combined_loss": 5.626018047332764, | |
| "distill_loss": 1.3532286882400513, | |
| "epoch": 0.838889295919115, | |
| "step": 22900, | |
| "student_mlm_loss": 9.898807525634766 | |
| }, | |
| { | |
| "epoch": 0.8425525679536963, | |
| "grad_norm": 4.46912956237793, | |
| "learning_rate": 3.617834238395951e-05, | |
| "loss": 3.1926, | |
| "step": 23000 | |
| }, | |
| { | |
| "combined_loss": 1.8462562561035156, | |
| "distill_loss": 1.339337944984436, | |
| "epoch": 0.8425525679536963, | |
| "step": 23000, | |
| "student_mlm_loss": 2.3531746864318848 | |
| }, | |
| { | |
| "epoch": 0.8462158399882775, | |
| "grad_norm": 15.756026268005371, | |
| "learning_rate": 3.611691279455488e-05, | |
| "loss": 11.7086, | |
| "step": 23100 | |
| }, | |
| { | |
| "combined_loss": 3.4101529121398926, | |
| "distill_loss": 1.3407546281814575, | |
| "epoch": 0.8462158399882775, | |
| "step": 23100, | |
| "student_mlm_loss": 5.479551315307617 | |
| }, | |
| { | |
| "epoch": 0.8498791120228588, | |
| "grad_norm": 12.350069046020508, | |
| "learning_rate": 3.6055483205150256e-05, | |
| "loss": 3.1203, | |
| "step": 23200 | |
| }, | |
| { | |
| "combined_loss": 2.5675039291381836, | |
| "distill_loss": 1.2296205759048462, | |
| "epoch": 0.8498791120228588, | |
| "step": 23200, | |
| "student_mlm_loss": 3.9053874015808105 | |
| }, | |
| { | |
| "epoch": 0.8535423840574401, | |
| "grad_norm": 11.17212963104248, | |
| "learning_rate": 3.5994053615745634e-05, | |
| "loss": 6.2935, | |
| "step": 23300 | |
| }, | |
| { | |
| "combined_loss": 2.901674270629883, | |
| "distill_loss": 1.318871021270752, | |
| "epoch": 0.8535423840574401, | |
| "step": 23300, | |
| "student_mlm_loss": 4.484477519989014 | |
| }, | |
| { | |
| "epoch": 0.8572056560920214, | |
| "grad_norm": 11.69430160522461, | |
| "learning_rate": 3.593262402634101e-05, | |
| "loss": 6.1123, | |
| "step": 23400 | |
| }, | |
| { | |
| "combined_loss": 1.962475061416626, | |
| "distill_loss": 1.3837331533432007, | |
| "epoch": 0.8572056560920214, | |
| "step": 23400, | |
| "student_mlm_loss": 2.541217088699341 | |
| }, | |
| { | |
| "epoch": 0.8608689281266027, | |
| "grad_norm": 6.221428394317627, | |
| "learning_rate": 3.587119443693638e-05, | |
| "loss": 5.0621, | |
| "step": 23500 | |
| }, | |
| { | |
| "combined_loss": 2.3063066005706787, | |
| "distill_loss": 1.364685297012329, | |
| "epoch": 0.8608689281266027, | |
| "step": 23500, | |
| "student_mlm_loss": 3.2479279041290283 | |
| }, | |
| { | |
| "epoch": 0.8645322001611839, | |
| "grad_norm": 3.200302839279175, | |
| "learning_rate": 3.580976484753176e-05, | |
| "loss": 3.1679, | |
| "step": 23600 | |
| }, | |
| { | |
| "combined_loss": 14.653901100158691, | |
| "distill_loss": 1.3521461486816406, | |
| "epoch": 0.8645322001611839, | |
| "step": 23600, | |
| "student_mlm_loss": 27.955656051635742 | |
| }, | |
| { | |
| "epoch": 0.8681954721957652, | |
| "grad_norm": 18.003841400146484, | |
| "learning_rate": 3.574833525812714e-05, | |
| "loss": 4.2524, | |
| "step": 23700 | |
| }, | |
| { | |
| "combined_loss": 2.05013108253479, | |
| "distill_loss": 1.473749041557312, | |
| "epoch": 0.8681954721957652, | |
| "step": 23700, | |
| "student_mlm_loss": 2.6265130043029785 | |
| }, | |
| { | |
| "epoch": 0.8718587442303466, | |
| "grad_norm": 16.64165687561035, | |
| "learning_rate": 3.5686905668722516e-05, | |
| "loss": 3.4139, | |
| "step": 23800 | |
| }, | |
| { | |
| "combined_loss": 3.8039913177490234, | |
| "distill_loss": 1.3022387027740479, | |
| "epoch": 0.8718587442303466, | |
| "step": 23800, | |
| "student_mlm_loss": 6.305744171142578 | |
| }, | |
| { | |
| "epoch": 0.8755220162649279, | |
| "grad_norm": 6.90595817565918, | |
| "learning_rate": 3.562547607931789e-05, | |
| "loss": 5.4512, | |
| "step": 23900 | |
| }, | |
| { | |
| "combined_loss": 2.0175633430480957, | |
| "distill_loss": 1.2362921237945557, | |
| "epoch": 0.8755220162649279, | |
| "step": 23900, | |
| "student_mlm_loss": 2.7988343238830566 | |
| }, | |
| { | |
| "epoch": 0.8791852882995091, | |
| "grad_norm": 26.792980194091797, | |
| "learning_rate": 3.556404648991326e-05, | |
| "loss": 6.622, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.8791852882995091, | |
| "eval_loss": 3.643918991088867, | |
| "eval_runtime": 1.9198, | |
| "eval_samples_per_second": 3644.043, | |
| "eval_steps_per_second": 14.585, | |
| "step": 24000 | |
| }, | |
| { | |
| "combined_loss": 2.1716020107269287, | |
| "distill_loss": 1.3234556913375854, | |
| "epoch": 0.8791852882995091, | |
| "step": 24000, | |
| "student_mlm_loss": 3.0197484493255615 | |
| }, | |
| { | |
| "epoch": 0.8828485603340904, | |
| "grad_norm": 4.8087568283081055, | |
| "learning_rate": 3.550261690050864e-05, | |
| "loss": 4.0542, | |
| "step": 24100 | |
| }, | |
| { | |
| "combined_loss": 13.035262107849121, | |
| "distill_loss": 1.353433609008789, | |
| "epoch": 0.8828485603340904, | |
| "step": 24100, | |
| "student_mlm_loss": 24.717090606689453 | |
| }, | |
| { | |
| "epoch": 0.8865118323686717, | |
| "grad_norm": 10.60560417175293, | |
| "learning_rate": 3.5441187311104014e-05, | |
| "loss": 3.1068, | |
| "step": 24200 | |
| }, | |
| { | |
| "combined_loss": 1.8867456912994385, | |
| "distill_loss": 1.2289210557937622, | |
| "epoch": 0.8865118323686717, | |
| "step": 24200, | |
| "student_mlm_loss": 2.544570207595825 | |
| }, | |
| { | |
| "epoch": 0.890175104403253, | |
| "grad_norm": 11.34473705291748, | |
| "learning_rate": 3.537975772169939e-05, | |
| "loss": 2.9801, | |
| "step": 24300 | |
| }, | |
| { | |
| "combined_loss": 1.7472858428955078, | |
| "distill_loss": 1.229453206062317, | |
| "epoch": 0.890175104403253, | |
| "step": 24300, | |
| "student_mlm_loss": 2.265118360519409 | |
| }, | |
| { | |
| "epoch": 0.8938383764378343, | |
| "grad_norm": 17.742507934570312, | |
| "learning_rate": 3.531832813229476e-05, | |
| "loss": 4.6617, | |
| "step": 24400 | |
| }, | |
| { | |
| "combined_loss": 1.9173786640167236, | |
| "distill_loss": 1.3212807178497314, | |
| "epoch": 0.8938383764378343, | |
| "step": 24400, | |
| "student_mlm_loss": 2.513476610183716 | |
| }, | |
| { | |
| "epoch": 0.8975016484724155, | |
| "grad_norm": 14.223791122436523, | |
| "learning_rate": 3.525689854289014e-05, | |
| "loss": 3.0537, | |
| "step": 24500 | |
| }, | |
| { | |
| "combined_loss": 1.7878549098968506, | |
| "distill_loss": 1.2908958196640015, | |
| "epoch": 0.8975016484724155, | |
| "step": 24500, | |
| "student_mlm_loss": 2.28481388092041 | |
| }, | |
| { | |
| "epoch": 0.9011649205069968, | |
| "grad_norm": 4.241771697998047, | |
| "learning_rate": 3.519546895348552e-05, | |
| "loss": 7.9255, | |
| "step": 24600 | |
| }, | |
| { | |
| "combined_loss": 1.8853719234466553, | |
| "distill_loss": 1.3350555896759033, | |
| "epoch": 0.9011649205069968, | |
| "step": 24600, | |
| "student_mlm_loss": 2.4356882572174072 | |
| }, | |
| { | |
| "epoch": 0.9048281925415781, | |
| "grad_norm": 5.793640613555908, | |
| "learning_rate": 3.513403936408089e-05, | |
| "loss": 2.9971, | |
| "step": 24700 | |
| }, | |
| { | |
| "combined_loss": 9.072087287902832, | |
| "distill_loss": 1.2805593013763428, | |
| "epoch": 0.9048281925415781, | |
| "step": 24700, | |
| "student_mlm_loss": 16.863615036010742 | |
| }, | |
| { | |
| "epoch": 0.9084914645761595, | |
| "grad_norm": 4.500351905822754, | |
| "learning_rate": 3.507260977467627e-05, | |
| "loss": 2.9841, | |
| "step": 24800 | |
| }, | |
| { | |
| "combined_loss": 4.229645252227783, | |
| "distill_loss": 1.231893539428711, | |
| "epoch": 0.9084914645761595, | |
| "step": 24800, | |
| "student_mlm_loss": 7.2273969650268555 | |
| }, | |
| { | |
| "epoch": 0.9121547366107408, | |
| "grad_norm": 24.93678855895996, | |
| "learning_rate": 3.501118018527164e-05, | |
| "loss": 5.2865, | |
| "step": 24900 | |
| }, | |
| { | |
| "combined_loss": 4.519498825073242, | |
| "distill_loss": 1.35053288936615, | |
| "epoch": 0.9121547366107408, | |
| "step": 24900, | |
| "student_mlm_loss": 7.688465118408203 | |
| }, | |
| { | |
| "epoch": 0.915818008645322, | |
| "grad_norm": 9.416017532348633, | |
| "learning_rate": 3.494975059586702e-05, | |
| "loss": 2.9688, | |
| "step": 25000 | |
| }, | |
| { | |
| "combined_loss": 4.33969783782959, | |
| "distill_loss": 1.2811079025268555, | |
| "epoch": 0.915818008645322, | |
| "step": 25000, | |
| "student_mlm_loss": 7.398288249969482 | |
| }, | |
| { | |
| "epoch": 0.9194812806799033, | |
| "grad_norm": 41.79585266113281, | |
| "learning_rate": 3.4888321006462394e-05, | |
| "loss": 12.352, | |
| "step": 25100 | |
| }, | |
| { | |
| "combined_loss": 2.398942232131958, | |
| "distill_loss": 1.3129199743270874, | |
| "epoch": 0.9194812806799033, | |
| "step": 25100, | |
| "student_mlm_loss": 3.484964609146118 | |
| }, | |
| { | |
| "epoch": 0.9231445527144846, | |
| "grad_norm": 27.67843246459961, | |
| "learning_rate": 3.482689141705777e-05, | |
| "loss": 4.6291, | |
| "step": 25200 | |
| }, | |
| { | |
| "combined_loss": 1.8275630474090576, | |
| "distill_loss": 1.1290583610534668, | |
| "epoch": 0.9231445527144846, | |
| "step": 25200, | |
| "student_mlm_loss": 2.5260677337646484 | |
| }, | |
| { | |
| "epoch": 0.9268078247490659, | |
| "grad_norm": 57.03019332885742, | |
| "learning_rate": 3.476546182765314e-05, | |
| "loss": 3.8226, | |
| "step": 25300 | |
| }, | |
| { | |
| "combined_loss": 1.8621808290481567, | |
| "distill_loss": 1.3249785900115967, | |
| "epoch": 0.9268078247490659, | |
| "step": 25300, | |
| "student_mlm_loss": 2.399383068084717 | |
| }, | |
| { | |
| "epoch": 0.9304710967836471, | |
| "grad_norm": 5.4275007247924805, | |
| "learning_rate": 3.470403223824852e-05, | |
| "loss": 3.7803, | |
| "step": 25400 | |
| }, | |
| { | |
| "combined_loss": 5.317490100860596, | |
| "distill_loss": 1.3810964822769165, | |
| "epoch": 0.9304710967836471, | |
| "step": 25400, | |
| "student_mlm_loss": 9.253883361816406 | |
| }, | |
| { | |
| "epoch": 0.9341343688182284, | |
| "grad_norm": 6.36318302154541, | |
| "learning_rate": 3.46426026488439e-05, | |
| "loss": 17.9114, | |
| "step": 25500 | |
| }, | |
| { | |
| "combined_loss": 4.816742897033691, | |
| "distill_loss": 1.274537444114685, | |
| "epoch": 0.9341343688182284, | |
| "step": 25500, | |
| "student_mlm_loss": 8.358948707580566 | |
| }, | |
| { | |
| "epoch": 0.9377976408528097, | |
| "grad_norm": 4.670822620391846, | |
| "learning_rate": 3.458117305943927e-05, | |
| "loss": 3.4352, | |
| "step": 25600 | |
| }, | |
| { | |
| "combined_loss": 1.7166364192962646, | |
| "distill_loss": 1.2876447439193726, | |
| "epoch": 0.9377976408528097, | |
| "step": 25600, | |
| "student_mlm_loss": 2.145627975463867 | |
| }, | |
| { | |
| "epoch": 0.941460912887391, | |
| "grad_norm": 16.301795959472656, | |
| "learning_rate": 3.451974347003465e-05, | |
| "loss": 2.591, | |
| "step": 25700 | |
| }, | |
| { | |
| "combined_loss": 1.8349076509475708, | |
| "distill_loss": 1.3192713260650635, | |
| "epoch": 0.941460912887391, | |
| "step": 25700, | |
| "student_mlm_loss": 2.350543975830078 | |
| }, | |
| { | |
| "epoch": 0.9451241849219723, | |
| "grad_norm": 4.464934349060059, | |
| "learning_rate": 3.4458313880630025e-05, | |
| "loss": 5.3202, | |
| "step": 25800 | |
| }, | |
| { | |
| "combined_loss": 2.022656202316284, | |
| "distill_loss": 1.4582451581954956, | |
| "epoch": 0.9451241849219723, | |
| "step": 25800, | |
| "student_mlm_loss": 2.587067127227783 | |
| }, | |
| { | |
| "epoch": 0.9487874569565536, | |
| "grad_norm": 13.280508041381836, | |
| "learning_rate": 3.43968842912254e-05, | |
| "loss": 3.2685, | |
| "step": 25900 | |
| }, | |
| { | |
| "combined_loss": 1.7409727573394775, | |
| "distill_loss": 1.2449432611465454, | |
| "epoch": 0.9487874569565536, | |
| "step": 25900, | |
| "student_mlm_loss": 2.23700213432312 | |
| }, | |
| { | |
| "epoch": 0.9524507289911349, | |
| "grad_norm": 34.54155349731445, | |
| "learning_rate": 3.4335454701820774e-05, | |
| "loss": 4.4614, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.9524507289911349, | |
| "eval_loss": 3.371135950088501, | |
| "eval_runtime": 1.9026, | |
| "eval_samples_per_second": 3677.064, | |
| "eval_steps_per_second": 14.717, | |
| "step": 26000 | |
| }, | |
| { | |
| "combined_loss": 2.1200222969055176, | |
| "distill_loss": 1.4147942066192627, | |
| "epoch": 0.9524507289911349, | |
| "step": 26000, | |
| "student_mlm_loss": 2.8252503871917725 | |
| }, | |
| { | |
| "epoch": 0.9561140010257162, | |
| "grad_norm": 12.063314437866211, | |
| "learning_rate": 3.427402511241615e-05, | |
| "loss": 3.8605, | |
| "step": 26100 | |
| }, | |
| { | |
| "combined_loss": 2.440842866897583, | |
| "distill_loss": 1.4115891456604004, | |
| "epoch": 0.9561140010257162, | |
| "step": 26100, | |
| "student_mlm_loss": 3.4700965881347656 | |
| }, | |
| { | |
| "epoch": 0.9597772730602975, | |
| "grad_norm": 3.154322862625122, | |
| "learning_rate": 3.421259552301152e-05, | |
| "loss": 3.4216, | |
| "step": 26200 | |
| }, | |
| { | |
| "combined_loss": 2.0511860847473145, | |
| "distill_loss": 1.2086646556854248, | |
| "epoch": 0.9597772730602975, | |
| "step": 26200, | |
| "student_mlm_loss": 2.893707752227783 | |
| }, | |
| { | |
| "epoch": 0.9634405450948788, | |
| "grad_norm": 4.469895839691162, | |
| "learning_rate": 3.41511659336069e-05, | |
| "loss": 8.4313, | |
| "step": 26300 | |
| }, | |
| { | |
| "combined_loss": 1.9184556007385254, | |
| "distill_loss": 1.311684489250183, | |
| "epoch": 0.9634405450948788, | |
| "step": 26300, | |
| "student_mlm_loss": 2.525226593017578 | |
| }, | |
| { | |
| "epoch": 0.96710381712946, | |
| "grad_norm": 37.47445297241211, | |
| "learning_rate": 3.408973634420228e-05, | |
| "loss": 3.33, | |
| "step": 26400 | |
| }, | |
| { | |
| "combined_loss": 1.8568530082702637, | |
| "distill_loss": 1.3435510396957397, | |
| "epoch": 0.96710381712946, | |
| "step": 26400, | |
| "student_mlm_loss": 2.370154857635498 | |
| }, | |
| { | |
| "epoch": 0.9707670891640413, | |
| "grad_norm": 5.385250091552734, | |
| "learning_rate": 3.402830675479765e-05, | |
| "loss": 3.0353, | |
| "step": 26500 | |
| }, | |
| { | |
| "combined_loss": 2.078137159347534, | |
| "distill_loss": 1.4688613414764404, | |
| "epoch": 0.9707670891640413, | |
| "step": 26500, | |
| "student_mlm_loss": 2.687412977218628 | |
| }, | |
| { | |
| "epoch": 0.9744303611986226, | |
| "grad_norm": 20.363506317138672, | |
| "learning_rate": 3.396687716539303e-05, | |
| "loss": 5.5902, | |
| "step": 26600 | |
| }, | |
| { | |
| "combined_loss": 2.420652151107788, | |
| "distill_loss": 1.3566147089004517, | |
| "epoch": 0.9744303611986226, | |
| "step": 26600, | |
| "student_mlm_loss": 3.484689474105835 | |
| }, | |
| { | |
| "epoch": 0.9780936332332039, | |
| "grad_norm": 5.678069591522217, | |
| "learning_rate": 3.3905447575988405e-05, | |
| "loss": 3.1063, | |
| "step": 26700 | |
| }, | |
| { | |
| "combined_loss": 2.2643003463745117, | |
| "distill_loss": 1.3446204662322998, | |
| "epoch": 0.9780936332332039, | |
| "step": 26700, | |
| "student_mlm_loss": 3.1839799880981445 | |
| }, | |
| { | |
| "epoch": 0.9817569052677851, | |
| "grad_norm": 8.722668647766113, | |
| "learning_rate": 3.384401798658378e-05, | |
| "loss": 9.3685, | |
| "step": 26800 | |
| }, | |
| { | |
| "combined_loss": 8.34331226348877, | |
| "distill_loss": 1.3864542245864868, | |
| "epoch": 0.9817569052677851, | |
| "step": 26800, | |
| "student_mlm_loss": 15.3001708984375 | |
| }, | |
| { | |
| "epoch": 0.9854201773023665, | |
| "grad_norm": 5.101404190063477, | |
| "learning_rate": 3.3782588397179154e-05, | |
| "loss": 3.1112, | |
| "step": 26900 | |
| }, | |
| { | |
| "combined_loss": 30.241453170776367, | |
| "distill_loss": 1.3818217515945435, | |
| "epoch": 0.9854201773023665, | |
| "step": 26900, | |
| "student_mlm_loss": 59.1010856628418 | |
| }, | |
| { | |
| "epoch": 0.9890834493369478, | |
| "grad_norm": 3.8359858989715576, | |
| "learning_rate": 3.3721158807774525e-05, | |
| "loss": 3.348, | |
| "step": 27000 | |
| }, | |
| { | |
| "combined_loss": 1.8264105319976807, | |
| "distill_loss": 1.2956147193908691, | |
| "epoch": 0.9890834493369478, | |
| "step": 27000, | |
| "student_mlm_loss": 2.357206344604492 | |
| }, | |
| { | |
| "epoch": 0.9927467213715291, | |
| "grad_norm": 33.43736267089844, | |
| "learning_rate": 3.36597292183699e-05, | |
| "loss": 3.5437, | |
| "step": 27100 | |
| }, | |
| { | |
| "combined_loss": 2.331777572631836, | |
| "distill_loss": 1.3274433612823486, | |
| "epoch": 0.9927467213715291, | |
| "step": 27100, | |
| "student_mlm_loss": 3.3361120223999023 | |
| }, | |
| { | |
| "epoch": 0.9964099934061104, | |
| "grad_norm": 2.9736690521240234, | |
| "learning_rate": 3.359829962896528e-05, | |
| "loss": 2.828, | |
| "step": 27200 | |
| }, | |
| { | |
| "combined_loss": 2.0438201427459717, | |
| "distill_loss": 1.334372639656067, | |
| "epoch": 0.9964099934061104, | |
| "step": 27200, | |
| "student_mlm_loss": 2.753267526626587 | |
| }, | |
| { | |
| "epoch": 1.0000732654406916, | |
| "grad_norm": 3.6774871349334717, | |
| "learning_rate": 3.353687003956066e-05, | |
| "loss": 3.168, | |
| "step": 27300 | |
| }, | |
| { | |
| "combined_loss": 3.4676733016967773, | |
| "distill_loss": 1.2681790590286255, | |
| "epoch": 1.0000732654406916, | |
| "step": 27300, | |
| "student_mlm_loss": 5.667167663574219 | |
| }, | |
| { | |
| "epoch": 1.003736537475273, | |
| "grad_norm": 20.265796661376953, | |
| "learning_rate": 3.347544045015603e-05, | |
| "loss": 4.9071, | |
| "step": 27400 | |
| }, | |
| { | |
| "combined_loss": 1.740236520767212, | |
| "distill_loss": 1.1595730781555176, | |
| "epoch": 1.003736537475273, | |
| "step": 27400, | |
| "student_mlm_loss": 2.3208999633789062 | |
| }, | |
| { | |
| "epoch": 1.0073998095098542, | |
| "grad_norm": 14.427675247192383, | |
| "learning_rate": 3.341401086075141e-05, | |
| "loss": 3.1375, | |
| "step": 27500 | |
| }, | |
| { | |
| "combined_loss": 2.0229873657226562, | |
| "distill_loss": 1.3961925506591797, | |
| "epoch": 1.0073998095098542, | |
| "step": 27500, | |
| "student_mlm_loss": 2.6497819423675537 | |
| }, | |
| { | |
| "epoch": 1.0110630815444355, | |
| "grad_norm": 3.032438039779663, | |
| "learning_rate": 3.3352581271346786e-05, | |
| "loss": 2.7581, | |
| "step": 27600 | |
| }, | |
| { | |
| "combined_loss": 1.9314367771148682, | |
| "distill_loss": 1.2618595361709595, | |
| "epoch": 1.0110630815444355, | |
| "step": 27600, | |
| "student_mlm_loss": 2.6010141372680664 | |
| }, | |
| { | |
| "epoch": 1.0147263535790167, | |
| "grad_norm": 6.167496681213379, | |
| "learning_rate": 3.3291151681942163e-05, | |
| "loss": 6.7788, | |
| "step": 27700 | |
| }, | |
| { | |
| "combined_loss": 2.247697353363037, | |
| "distill_loss": 1.4385483264923096, | |
| "epoch": 1.0147263535790167, | |
| "step": 27700, | |
| "student_mlm_loss": 3.0568461418151855 | |
| }, | |
| { | |
| "epoch": 1.018389625613598, | |
| "grad_norm": 4.82693338394165, | |
| "learning_rate": 3.3229722092537534e-05, | |
| "loss": 5.9229, | |
| "step": 27800 | |
| }, | |
| { | |
| "combined_loss": 3.4328160285949707, | |
| "distill_loss": 1.319059133529663, | |
| "epoch": 1.018389625613598, | |
| "step": 27800, | |
| "student_mlm_loss": 5.546572685241699 | |
| }, | |
| { | |
| "epoch": 1.0220528976481793, | |
| "grad_norm": 13.18911361694336, | |
| "learning_rate": 3.3168292503132906e-05, | |
| "loss": 3.5041, | |
| "step": 27900 | |
| }, | |
| { | |
| "combined_loss": 3.720487594604492, | |
| "distill_loss": 1.233067274093628, | |
| "epoch": 1.0220528976481793, | |
| "step": 27900, | |
| "student_mlm_loss": 6.207907676696777 | |
| }, | |
| { | |
| "epoch": 1.0257161696827606, | |
| "grad_norm": 10.725250244140625, | |
| "learning_rate": 3.310686291372829e-05, | |
| "loss": 2.9279, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.0257161696827606, | |
| "eval_loss": 3.3177244663238525, | |
| "eval_runtime": 2.0821, | |
| "eval_samples_per_second": 3360.034, | |
| "eval_steps_per_second": 13.448, | |
| "step": 28000 | |
| }, | |
| { | |
| "combined_loss": 2.0106987953186035, | |
| "distill_loss": 1.3163011074066162, | |
| "epoch": 1.0257161696827606, | |
| "step": 28000, | |
| "student_mlm_loss": 2.70509672164917 | |
| }, | |
| { | |
| "epoch": 1.0293794417173419, | |
| "grad_norm": 5.406506538391113, | |
| "learning_rate": 3.304543332432366e-05, | |
| "loss": 3.2149, | |
| "step": 28100 | |
| }, | |
| { | |
| "combined_loss": 2.042628288269043, | |
| "distill_loss": 1.3173636198043823, | |
| "epoch": 1.0293794417173419, | |
| "step": 28100, | |
| "student_mlm_loss": 2.767892837524414 | |
| }, | |
| { | |
| "epoch": 1.0330427137519231, | |
| "grad_norm": 3.2733256816864014, | |
| "learning_rate": 3.298400373491904e-05, | |
| "loss": 6.3856, | |
| "step": 28200 | |
| }, | |
| { | |
| "combined_loss": 1.9145760536193848, | |
| "distill_loss": 1.438834309577942, | |
| "epoch": 1.0330427137519231, | |
| "step": 28200, | |
| "student_mlm_loss": 2.390317916870117 | |
| }, | |
| { | |
| "epoch": 1.0367059857865044, | |
| "grad_norm": 10.546121597290039, | |
| "learning_rate": 3.292257414551441e-05, | |
| "loss": 3.5422, | |
| "step": 28300 | |
| }, | |
| { | |
| "combined_loss": 2.6431736946105957, | |
| "distill_loss": 1.367489218711853, | |
| "epoch": 1.0367059857865044, | |
| "step": 28300, | |
| "student_mlm_loss": 3.918858289718628 | |
| }, | |
| { | |
| "epoch": 1.0403692578210857, | |
| "grad_norm": 25.674352645874023, | |
| "learning_rate": 3.286114455610979e-05, | |
| "loss": 6.2258, | |
| "step": 28400 | |
| }, | |
| { | |
| "combined_loss": 1.8416577577590942, | |
| "distill_loss": 1.2867157459259033, | |
| "epoch": 1.0403692578210857, | |
| "step": 28400, | |
| "student_mlm_loss": 2.396599769592285 | |
| }, | |
| { | |
| "epoch": 1.044032529855667, | |
| "grad_norm": 3.6745688915252686, | |
| "learning_rate": 3.2799714966705166e-05, | |
| "loss": 5.0647, | |
| "step": 28500 | |
| }, | |
| { | |
| "combined_loss": 1.9693520069122314, | |
| "distill_loss": 1.3039644956588745, | |
| "epoch": 1.044032529855667, | |
| "step": 28500, | |
| "student_mlm_loss": 2.634739637374878 | |
| }, | |
| { | |
| "epoch": 1.0476958018902485, | |
| "grad_norm": 40.79129409790039, | |
| "learning_rate": 3.273828537730054e-05, | |
| "loss": 2.6424, | |
| "step": 28600 | |
| }, | |
| { | |
| "combined_loss": 2.4251365661621094, | |
| "distill_loss": 1.3121291399002075, | |
| "epoch": 1.0476958018902485, | |
| "step": 28600, | |
| "student_mlm_loss": 3.5381438732147217 | |
| }, | |
| { | |
| "epoch": 1.0513590739248297, | |
| "grad_norm": 7.185906410217285, | |
| "learning_rate": 3.2676855787895915e-05, | |
| "loss": 2.9095, | |
| "step": 28700 | |
| }, | |
| { | |
| "combined_loss": 5.781175136566162, | |
| "distill_loss": 1.3236074447631836, | |
| "epoch": 1.0513590739248297, | |
| "step": 28700, | |
| "student_mlm_loss": 10.23874282836914 | |
| }, | |
| { | |
| "epoch": 1.055022345959411, | |
| "grad_norm": 7.2639079093933105, | |
| "learning_rate": 3.2615426198491286e-05, | |
| "loss": 3.0536, | |
| "step": 28800 | |
| }, | |
| { | |
| "combined_loss": 1.8534462451934814, | |
| "distill_loss": 1.433970332145691, | |
| "epoch": 1.055022345959411, | |
| "step": 28800, | |
| "student_mlm_loss": 2.2729220390319824 | |
| }, | |
| { | |
| "epoch": 1.0586856179939923, | |
| "grad_norm": 82.9974365234375, | |
| "learning_rate": 3.255399660908667e-05, | |
| "loss": 3.4605, | |
| "step": 28900 | |
| }, | |
| { | |
| "combined_loss": 2.385720729827881, | |
| "distill_loss": 1.319982647895813, | |
| "epoch": 1.0586856179939923, | |
| "step": 28900, | |
| "student_mlm_loss": 3.4514589309692383 | |
| }, | |
| { | |
| "epoch": 1.0623488900285736, | |
| "grad_norm": 8.101861000061035, | |
| "learning_rate": 3.249256701968204e-05, | |
| "loss": 2.9531, | |
| "step": 29000 | |
| }, | |
| { | |
| "combined_loss": 1.9569958448410034, | |
| "distill_loss": 1.350255012512207, | |
| "epoch": 1.0623488900285736, | |
| "step": 29000, | |
| "student_mlm_loss": 2.5637366771698 | |
| }, | |
| { | |
| "epoch": 1.0660121620631549, | |
| "grad_norm": 42.843135833740234, | |
| "learning_rate": 3.243113743027742e-05, | |
| "loss": 3.5336, | |
| "step": 29100 | |
| }, | |
| { | |
| "combined_loss": 2.0199599266052246, | |
| "distill_loss": 1.1558183431625366, | |
| "epoch": 1.0660121620631549, | |
| "step": 29100, | |
| "student_mlm_loss": 2.884101390838623 | |
| }, | |
| { | |
| "epoch": 1.0696754340977361, | |
| "grad_norm": 10.401261329650879, | |
| "learning_rate": 3.236970784087279e-05, | |
| "loss": 2.6909, | |
| "step": 29200 | |
| }, | |
| { | |
| "combined_loss": 1.898897409439087, | |
| "distill_loss": 1.2361267805099487, | |
| "epoch": 1.0696754340977361, | |
| "step": 29200, | |
| "student_mlm_loss": 2.5616679191589355 | |
| }, | |
| { | |
| "epoch": 1.0733387061323174, | |
| "grad_norm": 13.08026123046875, | |
| "learning_rate": 3.230827825146817e-05, | |
| "loss": 10.7499, | |
| "step": 29300 | |
| }, | |
| { | |
| "combined_loss": 2.385263442993164, | |
| "distill_loss": 1.2960166931152344, | |
| "epoch": 1.0733387061323174, | |
| "step": 29300, | |
| "student_mlm_loss": 3.4745099544525146 | |
| }, | |
| { | |
| "epoch": 1.0770019781668987, | |
| "grad_norm": 6.8822431564331055, | |
| "learning_rate": 3.2246848662063546e-05, | |
| "loss": 3.0651, | |
| "step": 29400 | |
| }, | |
| { | |
| "combined_loss": 2.1257505416870117, | |
| "distill_loss": 1.3224972486495972, | |
| "epoch": 1.0770019781668987, | |
| "step": 29400, | |
| "student_mlm_loss": 2.929003953933716 | |
| }, | |
| { | |
| "epoch": 1.08066525020148, | |
| "grad_norm": 3.4312744140625, | |
| "learning_rate": 3.218541907265892e-05, | |
| "loss": 3.1323, | |
| "step": 29500 | |
| }, | |
| { | |
| "combined_loss": 2.0117716789245605, | |
| "distill_loss": 1.2447552680969238, | |
| "epoch": 1.08066525020148, | |
| "step": 29500, | |
| "student_mlm_loss": 2.7787880897521973 | |
| }, | |
| { | |
| "epoch": 1.0843285222360612, | |
| "grad_norm": 3.970820426940918, | |
| "learning_rate": 3.2123989483254295e-05, | |
| "loss": 3.7427, | |
| "step": 29600 | |
| }, | |
| { | |
| "combined_loss": 2.493256092071533, | |
| "distill_loss": 1.27970290184021, | |
| "epoch": 1.0843285222360612, | |
| "step": 29600, | |
| "student_mlm_loss": 3.7068092823028564 | |
| }, | |
| { | |
| "epoch": 1.0879917942706425, | |
| "grad_norm": 5.8632426261901855, | |
| "learning_rate": 3.206255989384967e-05, | |
| "loss": 3.0698, | |
| "step": 29700 | |
| }, | |
| { | |
| "combined_loss": 2.017867088317871, | |
| "distill_loss": 1.408115029335022, | |
| "epoch": 1.0879917942706425, | |
| "step": 29700, | |
| "student_mlm_loss": 2.6276190280914307 | |
| }, | |
| { | |
| "epoch": 1.0916550663052238, | |
| "grad_norm": 7.350955963134766, | |
| "learning_rate": 3.200113030444505e-05, | |
| "loss": 10.1517, | |
| "step": 29800 | |
| }, | |
| { | |
| "combined_loss": 3.020230770111084, | |
| "distill_loss": 1.1870992183685303, | |
| "epoch": 1.0916550663052238, | |
| "step": 29800, | |
| "student_mlm_loss": 4.853362083435059 | |
| }, | |
| { | |
| "epoch": 1.095318338339805, | |
| "grad_norm": 14.347647666931152, | |
| "learning_rate": 3.193970071504042e-05, | |
| "loss": 2.8345, | |
| "step": 29900 | |
| }, | |
| { | |
| "combined_loss": 1.8037035465240479, | |
| "distill_loss": 1.2421637773513794, | |
| "epoch": 1.095318338339805, | |
| "step": 29900, | |
| "student_mlm_loss": 2.365243434906006 | |
| }, | |
| { | |
| "epoch": 1.0989816103743864, | |
| "grad_norm": 8.716060638427734, | |
| "learning_rate": 3.18782711256358e-05, | |
| "loss": 4.9073, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.0989816103743864, | |
| "eval_loss": 3.289705753326416, | |
| "eval_runtime": 2.6398, | |
| "eval_samples_per_second": 2650.179, | |
| "eval_steps_per_second": 10.607, | |
| "step": 30000 | |
| }, | |
| { | |
| "combined_loss": 3.3838839530944824, | |
| "distill_loss": 1.2657897472381592, | |
| "epoch": 1.0989816103743864, | |
| "step": 30000, | |
| "student_mlm_loss": 5.501977920532227 | |
| }, | |
| { | |
| "epoch": 1.1026448824089676, | |
| "grad_norm": 9.78013801574707, | |
| "learning_rate": 3.181684153623117e-05, | |
| "loss": 6.1366, | |
| "step": 30100 | |
| }, | |
| { | |
| "combined_loss": 1.8116616010665894, | |
| "distill_loss": 1.3585631847381592, | |
| "epoch": 1.1026448824089676, | |
| "step": 30100, | |
| "student_mlm_loss": 2.2647600173950195 | |
| }, | |
| { | |
| "epoch": 1.106308154443549, | |
| "grad_norm": 20.41010856628418, | |
| "learning_rate": 3.175541194682655e-05, | |
| "loss": 4.7028, | |
| "step": 30200 | |
| }, | |
| { | |
| "combined_loss": 1.9074151515960693, | |
| "distill_loss": 1.119224190711975, | |
| "epoch": 1.106308154443549, | |
| "step": 30200, | |
| "student_mlm_loss": 2.695605993270874 | |
| }, | |
| { | |
| "epoch": 1.1099714264781302, | |
| "grad_norm": 7.005733966827393, | |
| "learning_rate": 3.1693982357421926e-05, | |
| "loss": 4.9073, | |
| "step": 30300 | |
| }, | |
| { | |
| "combined_loss": 1.7690558433532715, | |
| "distill_loss": 1.2762707471847534, | |
| "epoch": 1.1099714264781302, | |
| "step": 30300, | |
| "student_mlm_loss": 2.2618408203125 | |
| }, | |
| { | |
| "epoch": 1.1136346985127115, | |
| "grad_norm": 4.290195465087891, | |
| "learning_rate": 3.16325527680173e-05, | |
| "loss": 4.1257, | |
| "step": 30400 | |
| }, | |
| { | |
| "combined_loss": 15.505983352661133, | |
| "distill_loss": 1.252361536026001, | |
| "epoch": 1.1136346985127115, | |
| "step": 30400, | |
| "student_mlm_loss": 29.759605407714844 | |
| }, | |
| { | |
| "epoch": 1.1172979705472927, | |
| "grad_norm": 27.59025764465332, | |
| "learning_rate": 3.1571123178612675e-05, | |
| "loss": 3.6319, | |
| "step": 30500 | |
| }, | |
| { | |
| "combined_loss": 3.190175771713257, | |
| "distill_loss": 1.237632155418396, | |
| "epoch": 1.1172979705472927, | |
| "step": 30500, | |
| "student_mlm_loss": 5.142719268798828 | |
| }, | |
| { | |
| "epoch": 1.120961242581874, | |
| "grad_norm": 35.681365966796875, | |
| "learning_rate": 3.150969358920805e-05, | |
| "loss": 5.2866, | |
| "step": 30600 | |
| }, | |
| { | |
| "combined_loss": 2.1486501693725586, | |
| "distill_loss": 1.3570821285247803, | |
| "epoch": 1.120961242581874, | |
| "step": 30600, | |
| "student_mlm_loss": 2.940218448638916 | |
| }, | |
| { | |
| "epoch": 1.1246245146164555, | |
| "grad_norm": 28.920949935913086, | |
| "learning_rate": 3.144826399980343e-05, | |
| "loss": 11.35, | |
| "step": 30700 | |
| }, | |
| { | |
| "combined_loss": 3.544619560241699, | |
| "distill_loss": 1.3219174146652222, | |
| "epoch": 1.1246245146164555, | |
| "step": 30700, | |
| "student_mlm_loss": 5.767321586608887 | |
| }, | |
| { | |
| "epoch": 1.1282877866510368, | |
| "grad_norm": 36.29865264892578, | |
| "learning_rate": 3.13868344103988e-05, | |
| "loss": 8.8748, | |
| "step": 30800 | |
| }, | |
| { | |
| "combined_loss": 3.136960744857788, | |
| "distill_loss": 1.4069170951843262, | |
| "epoch": 1.1282877866510368, | |
| "step": 30800, | |
| "student_mlm_loss": 4.86700439453125 | |
| }, | |
| { | |
| "epoch": 1.131951058685618, | |
| "grad_norm": 8.498424530029297, | |
| "learning_rate": 3.132540482099417e-05, | |
| "loss": 2.6175, | |
| "step": 30900 | |
| }, | |
| { | |
| "combined_loss": 2.584123373031616, | |
| "distill_loss": 1.3318666219711304, | |
| "epoch": 1.131951058685618, | |
| "step": 30900, | |
| "student_mlm_loss": 3.8363800048828125 | |
| }, | |
| { | |
| "epoch": 1.1356143307201993, | |
| "grad_norm": 8.784627914428711, | |
| "learning_rate": 3.126397523158955e-05, | |
| "loss": 3.7912, | |
| "step": 31000 | |
| }, | |
| { | |
| "combined_loss": 4.065792083740234, | |
| "distill_loss": 1.279055118560791, | |
| "epoch": 1.1356143307201993, | |
| "step": 31000, | |
| "student_mlm_loss": 6.8525285720825195 | |
| }, | |
| { | |
| "epoch": 1.1392776027547806, | |
| "grad_norm": 15.763399124145508, | |
| "learning_rate": 3.120254564218493e-05, | |
| "loss": 7.3671, | |
| "step": 31100 | |
| }, | |
| { | |
| "combined_loss": 1.9532334804534912, | |
| "distill_loss": 1.2137418985366821, | |
| "epoch": 1.1392776027547806, | |
| "step": 31100, | |
| "student_mlm_loss": 2.6927249431610107 | |
| }, | |
| { | |
| "epoch": 1.142940874789362, | |
| "grad_norm": 6.777341842651367, | |
| "learning_rate": 3.1141116052780306e-05, | |
| "loss": 2.8877, | |
| "step": 31200 | |
| }, | |
| { | |
| "combined_loss": 3.5847015380859375, | |
| "distill_loss": 1.3712694644927979, | |
| "epoch": 1.142940874789362, | |
| "step": 31200, | |
| "student_mlm_loss": 5.798133850097656 | |
| }, | |
| { | |
| "epoch": 1.1466041468239432, | |
| "grad_norm": 6.115112781524658, | |
| "learning_rate": 3.107968646337568e-05, | |
| "loss": 3.3763, | |
| "step": 31300 | |
| }, | |
| { | |
| "combined_loss": 1.899533748626709, | |
| "distill_loss": 1.2805981636047363, | |
| "epoch": 1.1466041468239432, | |
| "step": 31300, | |
| "student_mlm_loss": 2.5184693336486816 | |
| }, | |
| { | |
| "epoch": 1.1502674188585245, | |
| "grad_norm": 3.3896713256835938, | |
| "learning_rate": 3.1018256873971055e-05, | |
| "loss": 3.2932, | |
| "step": 31400 | |
| }, | |
| { | |
| "combined_loss": 1.9794254302978516, | |
| "distill_loss": 1.3896270990371704, | |
| "epoch": 1.1502674188585245, | |
| "step": 31400, | |
| "student_mlm_loss": 2.5692238807678223 | |
| }, | |
| { | |
| "epoch": 1.1539306908931057, | |
| "grad_norm": 12.824034690856934, | |
| "learning_rate": 3.095682728456643e-05, | |
| "loss": 3.5341, | |
| "step": 31500 | |
| }, | |
| { | |
| "combined_loss": 2.5983529090881348, | |
| "distill_loss": 1.2135576009750366, | |
| "epoch": 1.1539306908931057, | |
| "step": 31500, | |
| "student_mlm_loss": 3.9831480979919434 | |
| }, | |
| { | |
| "epoch": 1.157593962927687, | |
| "grad_norm": 73.47982025146484, | |
| "learning_rate": 3.089539769516181e-05, | |
| "loss": 2.9879, | |
| "step": 31600 | |
| }, | |
| { | |
| "combined_loss": 1.8584779500961304, | |
| "distill_loss": 1.3214514255523682, | |
| "epoch": 1.157593962927687, | |
| "step": 31600, | |
| "student_mlm_loss": 2.3955044746398926 | |
| }, | |
| { | |
| "epoch": 1.1612572349622683, | |
| "grad_norm": 5.6778340339660645, | |
| "learning_rate": 3.083396810575718e-05, | |
| "loss": 2.9781, | |
| "step": 31700 | |
| }, | |
| { | |
| "combined_loss": 4.854001045227051, | |
| "distill_loss": 1.2088978290557861, | |
| "epoch": 1.1612572349622683, | |
| "step": 31700, | |
| "student_mlm_loss": 8.499104499816895 | |
| }, | |
| { | |
| "epoch": 1.1649205069968496, | |
| "grad_norm": 17.93754768371582, | |
| "learning_rate": 3.077253851635255e-05, | |
| "loss": 3.5773, | |
| "step": 31800 | |
| }, | |
| { | |
| "combined_loss": 1.9064607620239258, | |
| "distill_loss": 1.363638997077942, | |
| "epoch": 1.1649205069968496, | |
| "step": 31800, | |
| "student_mlm_loss": 2.449282646179199 | |
| }, | |
| { | |
| "epoch": 1.1685837790314308, | |
| "grad_norm": 8.912027359008789, | |
| "learning_rate": 3.071110892694794e-05, | |
| "loss": 3.0949, | |
| "step": 31900 | |
| }, | |
| { | |
| "combined_loss": 1.9666361808776855, | |
| "distill_loss": 1.3997029066085815, | |
| "epoch": 1.1685837790314308, | |
| "step": 31900, | |
| "student_mlm_loss": 2.5335693359375 | |
| }, | |
| { | |
| "epoch": 1.1722470510660121, | |
| "grad_norm": 21.05866050720215, | |
| "learning_rate": 3.064967933754331e-05, | |
| "loss": 2.965, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.1722470510660121, | |
| "eval_loss": 3.516061544418335, | |
| "eval_runtime": 2.6391, | |
| "eval_samples_per_second": 2650.903, | |
| "eval_steps_per_second": 10.61, | |
| "step": 32000 | |
| }, | |
| { | |
| "combined_loss": 2.466904640197754, | |
| "distill_loss": 1.2619636058807373, | |
| "epoch": 1.1722470510660121, | |
| "step": 32000, | |
| "student_mlm_loss": 3.6718459129333496 | |
| }, | |
| { | |
| "epoch": 1.1759103231005934, | |
| "grad_norm": 14.288066864013672, | |
| "learning_rate": 3.0588249748138686e-05, | |
| "loss": 6.5656, | |
| "step": 32100 | |
| }, | |
| { | |
| "combined_loss": 5.987391471862793, | |
| "distill_loss": 1.3964972496032715, | |
| "epoch": 1.1759103231005934, | |
| "step": 32100, | |
| "student_mlm_loss": 10.578286170959473 | |
| }, | |
| { | |
| "epoch": 1.1795735951351747, | |
| "grad_norm": 10.953961372375488, | |
| "learning_rate": 3.052682015873406e-05, | |
| "loss": 7.1246, | |
| "step": 32200 | |
| }, | |
| { | |
| "combined_loss": 1.758845567703247, | |
| "distill_loss": 1.2731348276138306, | |
| "epoch": 1.1795735951351747, | |
| "step": 32200, | |
| "student_mlm_loss": 2.244556188583374 | |
| }, | |
| { | |
| "epoch": 1.183236867169756, | |
| "grad_norm": 17.076087951660156, | |
| "learning_rate": 3.046539056932944e-05, | |
| "loss": 7.3734, | |
| "step": 32300 | |
| }, | |
| { | |
| "combined_loss": 1.7941749095916748, | |
| "distill_loss": 1.282630205154419, | |
| "epoch": 1.183236867169756, | |
| "step": 32300, | |
| "student_mlm_loss": 2.3057196140289307 | |
| }, | |
| { | |
| "epoch": 1.1869001392043372, | |
| "grad_norm": 11.33812427520752, | |
| "learning_rate": 3.040396097992481e-05, | |
| "loss": 5.4979, | |
| "step": 32400 | |
| }, | |
| { | |
| "combined_loss": 2.379426956176758, | |
| "distill_loss": 1.2975032329559326, | |
| "epoch": 1.1869001392043372, | |
| "step": 32400, | |
| "student_mlm_loss": 3.461350917816162 | |
| }, | |
| { | |
| "epoch": 1.1905634112389185, | |
| "grad_norm": 3.6378591060638428, | |
| "learning_rate": 3.0342531390520184e-05, | |
| "loss": 5.077, | |
| "step": 32500 | |
| }, | |
| { | |
| "combined_loss": 1.835166573524475, | |
| "distill_loss": 1.294168472290039, | |
| "epoch": 1.1905634112389185, | |
| "step": 32500, | |
| "student_mlm_loss": 2.376164674758911 | |
| }, | |
| { | |
| "epoch": 1.1942266832735, | |
| "grad_norm": 23.017444610595703, | |
| "learning_rate": 3.0281101801115562e-05, | |
| "loss": 3.1428, | |
| "step": 32600 | |
| }, | |
| { | |
| "combined_loss": 1.8867619037628174, | |
| "distill_loss": 1.2372292280197144, | |
| "epoch": 1.1942266832735, | |
| "step": 32600, | |
| "student_mlm_loss": 2.536294460296631 | |
| }, | |
| { | |
| "epoch": 1.197889955308081, | |
| "grad_norm": 7.055652141571045, | |
| "learning_rate": 3.0219672211710937e-05, | |
| "loss": 8.7118, | |
| "step": 32700 | |
| }, | |
| { | |
| "combined_loss": 6.59044075012207, | |
| "distill_loss": 1.3554973602294922, | |
| "epoch": 1.197889955308081, | |
| "step": 32700, | |
| "student_mlm_loss": 11.825384140014648 | |
| }, | |
| { | |
| "epoch": 1.2015532273426626, | |
| "grad_norm": 6.935373783111572, | |
| "learning_rate": 3.0158242622306314e-05, | |
| "loss": 7.5763, | |
| "step": 32800 | |
| }, | |
| { | |
| "combined_loss": 2.4971964359283447, | |
| "distill_loss": 1.2960432767868042, | |
| "epoch": 1.2015532273426626, | |
| "step": 32800, | |
| "student_mlm_loss": 3.698349714279175 | |
| }, | |
| { | |
| "epoch": 1.2052164993772438, | |
| "grad_norm": 19.48725700378418, | |
| "learning_rate": 3.009681303290169e-05, | |
| "loss": 5.1993, | |
| "step": 32900 | |
| }, | |
| { | |
| "combined_loss": 2.639206886291504, | |
| "distill_loss": 1.2536990642547607, | |
| "epoch": 1.2052164993772438, | |
| "step": 32900, | |
| "student_mlm_loss": 4.024714469909668 | |
| }, | |
| { | |
| "epoch": 1.2088797714118251, | |
| "grad_norm": 215.4875946044922, | |
| "learning_rate": 3.0035383443497067e-05, | |
| "loss": 3.9297, | |
| "step": 33000 | |
| }, | |
| { | |
| "combined_loss": 2.1888670921325684, | |
| "distill_loss": 1.4587746858596802, | |
| "epoch": 1.2088797714118251, | |
| "step": 33000, | |
| "student_mlm_loss": 2.918959379196167 | |
| }, | |
| { | |
| "epoch": 1.2125430434464064, | |
| "grad_norm": 5.346382141113281, | |
| "learning_rate": 2.997395385409244e-05, | |
| "loss": 3.3704, | |
| "step": 33100 | |
| }, | |
| { | |
| "combined_loss": 2.5722949504852295, | |
| "distill_loss": 1.2250982522964478, | |
| "epoch": 1.2125430434464064, | |
| "step": 33100, | |
| "student_mlm_loss": 3.9194915294647217 | |
| }, | |
| { | |
| "epoch": 1.2162063154809877, | |
| "grad_norm": 21.193038940429688, | |
| "learning_rate": 2.991252426468782e-05, | |
| "loss": 3.22, | |
| "step": 33200 | |
| }, | |
| { | |
| "combined_loss": 1.8822517395019531, | |
| "distill_loss": 1.264020323753357, | |
| "epoch": 1.2162063154809877, | |
| "step": 33200, | |
| "student_mlm_loss": 2.5004830360412598 | |
| }, | |
| { | |
| "epoch": 1.219869587515569, | |
| "grad_norm": 8.840603828430176, | |
| "learning_rate": 2.9851094675283193e-05, | |
| "loss": 13.091, | |
| "step": 33300 | |
| }, | |
| { | |
| "combined_loss": 2.0461645126342773, | |
| "distill_loss": 1.3376085758209229, | |
| "epoch": 1.219869587515569, | |
| "step": 33300, | |
| "student_mlm_loss": 2.7547202110290527 | |
| }, | |
| { | |
| "epoch": 1.2235328595501502, | |
| "grad_norm": 16.414852142333984, | |
| "learning_rate": 2.9789665085878564e-05, | |
| "loss": 3.6096, | |
| "step": 33400 | |
| }, | |
| { | |
| "combined_loss": 1.8437246084213257, | |
| "distill_loss": 1.2731173038482666, | |
| "epoch": 1.2235328595501502, | |
| "step": 33400, | |
| "student_mlm_loss": 2.4143319129943848 | |
| }, | |
| { | |
| "epoch": 1.2271961315847315, | |
| "grad_norm": 5.047356605529785, | |
| "learning_rate": 2.9728235496473946e-05, | |
| "loss": 10.6014, | |
| "step": 33500 | |
| }, | |
| { | |
| "combined_loss": 2.0613672733306885, | |
| "distill_loss": 1.1784592866897583, | |
| "epoch": 1.2271961315847315, | |
| "step": 33500, | |
| "student_mlm_loss": 2.944275140762329 | |
| }, | |
| { | |
| "epoch": 1.2308594036193128, | |
| "grad_norm": 8.502574920654297, | |
| "learning_rate": 2.9666805907069317e-05, | |
| "loss": 12.6532, | |
| "step": 33600 | |
| }, | |
| { | |
| "combined_loss": 2.301725149154663, | |
| "distill_loss": 1.2482868432998657, | |
| "epoch": 1.2308594036193128, | |
| "step": 33600, | |
| "student_mlm_loss": 3.355163335800171 | |
| }, | |
| { | |
| "epoch": 1.234522675653894, | |
| "grad_norm": 25.97445297241211, | |
| "learning_rate": 2.9605376317664695e-05, | |
| "loss": 3.1296, | |
| "step": 33700 | |
| }, | |
| { | |
| "combined_loss": 1.8135402202606201, | |
| "distill_loss": 1.309229850769043, | |
| "epoch": 1.234522675653894, | |
| "step": 33700, | |
| "student_mlm_loss": 2.3178505897521973 | |
| }, | |
| { | |
| "epoch": 1.2381859476884753, | |
| "grad_norm": 7.912507057189941, | |
| "learning_rate": 2.954394672826007e-05, | |
| "loss": 2.9749, | |
| "step": 33800 | |
| }, | |
| { | |
| "combined_loss": 1.9506487846374512, | |
| "distill_loss": 1.3808802366256714, | |
| "epoch": 1.2381859476884753, | |
| "step": 33800, | |
| "student_mlm_loss": 2.5204174518585205 | |
| }, | |
| { | |
| "epoch": 1.2418492197230566, | |
| "grad_norm": 28.239988327026367, | |
| "learning_rate": 2.9482517138855447e-05, | |
| "loss": 5.7527, | |
| "step": 33900 | |
| }, | |
| { | |
| "combined_loss": 1.881349802017212, | |
| "distill_loss": 1.3489292860031128, | |
| "epoch": 1.2418492197230566, | |
| "step": 33900, | |
| "student_mlm_loss": 2.4137701988220215 | |
| }, | |
| { | |
| "epoch": 1.245512491757638, | |
| "grad_norm": 25.953353881835938, | |
| "learning_rate": 2.942108754945082e-05, | |
| "loss": 4.0339, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.245512491757638, | |
| "eval_loss": 3.297154188156128, | |
| "eval_runtime": 2.3826, | |
| "eval_samples_per_second": 2936.248, | |
| "eval_steps_per_second": 11.752, | |
| "step": 34000 | |
| }, | |
| { | |
| "combined_loss": 2.5429787635803223, | |
| "distill_loss": 1.2718520164489746, | |
| "epoch": 1.245512491757638, | |
| "step": 34000, | |
| "student_mlm_loss": 3.814105272293091 | |
| }, | |
| { | |
| "epoch": 1.2491757637922192, | |
| "grad_norm": 48.45500183105469, | |
| "learning_rate": 2.9359657960046196e-05, | |
| "loss": 6.1408, | |
| "step": 34100 | |
| }, | |
| { | |
| "combined_loss": 4.794422626495361, | |
| "distill_loss": 1.3052036762237549, | |
| "epoch": 1.2491757637922192, | |
| "step": 34100, | |
| "student_mlm_loss": 8.283641815185547 | |
| }, | |
| { | |
| "epoch": 1.2528390358268005, | |
| "grad_norm": 6.028234004974365, | |
| "learning_rate": 2.9298228370641574e-05, | |
| "loss": 2.9116, | |
| "step": 34200 | |
| }, | |
| { | |
| "combined_loss": 2.125443458557129, | |
| "distill_loss": 1.25053071975708, | |
| "epoch": 1.2528390358268005, | |
| "step": 34200, | |
| "student_mlm_loss": 3.0003561973571777 | |
| }, | |
| { | |
| "epoch": 1.2565023078613817, | |
| "grad_norm": 15.824817657470703, | |
| "learning_rate": 2.9236798781236945e-05, | |
| "loss": 3.5834, | |
| "step": 34300 | |
| }, | |
| { | |
| "combined_loss": 2.156796932220459, | |
| "distill_loss": 1.1805670261383057, | |
| "epoch": 1.2565023078613817, | |
| "step": 34300, | |
| "student_mlm_loss": 3.1330268383026123 | |
| }, | |
| { | |
| "epoch": 1.260165579895963, | |
| "grad_norm": 8.438326835632324, | |
| "learning_rate": 2.9175369191832326e-05, | |
| "loss": 5.0724, | |
| "step": 34400 | |
| }, | |
| { | |
| "combined_loss": 3.144615888595581, | |
| "distill_loss": 1.2467416524887085, | |
| "epoch": 1.260165579895963, | |
| "step": 34400, | |
| "student_mlm_loss": 5.042490005493164 | |
| }, | |
| { | |
| "epoch": 1.2638288519305443, | |
| "grad_norm": 3.7252449989318848, | |
| "learning_rate": 2.9113939602427697e-05, | |
| "loss": 2.9306, | |
| "step": 34500 | |
| }, | |
| { | |
| "combined_loss": 4.309004783630371, | |
| "distill_loss": 1.2629985809326172, | |
| "epoch": 1.2638288519305443, | |
| "step": 34500, | |
| "student_mlm_loss": 7.355010986328125 | |
| }, | |
| { | |
| "epoch": 1.2674921239651256, | |
| "grad_norm": 14.86426067352295, | |
| "learning_rate": 2.9052510013023078e-05, | |
| "loss": 3.059, | |
| "step": 34600 | |
| }, | |
| { | |
| "combined_loss": 2.128227472305298, | |
| "distill_loss": 1.3674236536026, | |
| "epoch": 1.2674921239651256, | |
| "step": 34600, | |
| "student_mlm_loss": 2.889031171798706 | |
| }, | |
| { | |
| "epoch": 1.271155395999707, | |
| "grad_norm": 14.947731018066406, | |
| "learning_rate": 2.899108042361845e-05, | |
| "loss": 3.0461, | |
| "step": 34700 | |
| }, | |
| { | |
| "combined_loss": 1.9557018280029297, | |
| "distill_loss": 1.3122907876968384, | |
| "epoch": 1.271155395999707, | |
| "step": 34700, | |
| "student_mlm_loss": 2.5991127490997314 | |
| }, | |
| { | |
| "epoch": 1.2748186680342881, | |
| "grad_norm": 4.714714527130127, | |
| "learning_rate": 2.8929650834213824e-05, | |
| "loss": 3.0221, | |
| "step": 34800 | |
| }, | |
| { | |
| "combined_loss": 1.7830932140350342, | |
| "distill_loss": 1.278725028038025, | |
| "epoch": 1.2748186680342881, | |
| "step": 34800, | |
| "student_mlm_loss": 2.287461519241333 | |
| }, | |
| { | |
| "epoch": 1.2784819400688696, | |
| "grad_norm": 13.885130882263184, | |
| "learning_rate": 2.88682212448092e-05, | |
| "loss": 8.529, | |
| "step": 34900 | |
| }, | |
| { | |
| "combined_loss": 4.974426746368408, | |
| "distill_loss": 1.4173694849014282, | |
| "epoch": 1.2784819400688696, | |
| "step": 34900, | |
| "student_mlm_loss": 8.53148365020752 | |
| }, | |
| { | |
| "epoch": 1.2821452121034507, | |
| "grad_norm": 6.786545753479004, | |
| "learning_rate": 2.8806791655404576e-05, | |
| "loss": 3.563, | |
| "step": 35000 | |
| }, | |
| { | |
| "combined_loss": 1.7134695053100586, | |
| "distill_loss": 1.2251827716827393, | |
| "epoch": 1.2821452121034507, | |
| "step": 35000, | |
| "student_mlm_loss": 2.201756238937378 | |
| }, | |
| { | |
| "epoch": 1.2858084841380322, | |
| "grad_norm": 18.235891342163086, | |
| "learning_rate": 2.8745362065999954e-05, | |
| "loss": 6.9188, | |
| "step": 35100 | |
| }, | |
| { | |
| "combined_loss": 6.00921106338501, | |
| "distill_loss": 1.3103188276290894, | |
| "epoch": 1.2858084841380322, | |
| "step": 35100, | |
| "student_mlm_loss": 10.70810317993164 | |
| }, | |
| { | |
| "epoch": 1.2894717561726134, | |
| "grad_norm": 6.3708696365356445, | |
| "learning_rate": 2.8683932476595328e-05, | |
| "loss": 6.7695, | |
| "step": 35200 | |
| }, | |
| { | |
| "combined_loss": 2.2400052547454834, | |
| "distill_loss": 1.3289698362350464, | |
| "epoch": 1.2894717561726134, | |
| "step": 35200, | |
| "student_mlm_loss": 3.151040554046631 | |
| }, | |
| { | |
| "epoch": 1.2931350282071947, | |
| "grad_norm": 7.5602946281433105, | |
| "learning_rate": 2.8622502887190706e-05, | |
| "loss": 9.8005, | |
| "step": 35300 | |
| }, | |
| { | |
| "combined_loss": 1.848390817642212, | |
| "distill_loss": 1.2897430658340454, | |
| "epoch": 1.2931350282071947, | |
| "step": 35300, | |
| "student_mlm_loss": 2.407038688659668 | |
| }, | |
| { | |
| "epoch": 1.296798300241776, | |
| "grad_norm": 24.799640655517578, | |
| "learning_rate": 2.8561073297786077e-05, | |
| "loss": 3.2996, | |
| "step": 35400 | |
| }, | |
| { | |
| "combined_loss": 4.894403457641602, | |
| "distill_loss": 1.282358169555664, | |
| "epoch": 1.296798300241776, | |
| "step": 35400, | |
| "student_mlm_loss": 8.506448745727539 | |
| }, | |
| { | |
| "epoch": 1.3004615722763573, | |
| "grad_norm": 34.4364013671875, | |
| "learning_rate": 2.849964370838146e-05, | |
| "loss": 3.399, | |
| "step": 35500 | |
| }, | |
| { | |
| "combined_loss": 1.7965787649154663, | |
| "distill_loss": 1.3232142925262451, | |
| "epoch": 1.3004615722763573, | |
| "step": 35500, | |
| "student_mlm_loss": 2.2699432373046875 | |
| }, | |
| { | |
| "epoch": 1.3041248443109386, | |
| "grad_norm": 7.9551825523376465, | |
| "learning_rate": 2.843821411897683e-05, | |
| "loss": 3.1887, | |
| "step": 35600 | |
| }, | |
| { | |
| "combined_loss": 1.855729579925537, | |
| "distill_loss": 1.2217527627944946, | |
| "epoch": 1.3041248443109386, | |
| "step": 35600, | |
| "student_mlm_loss": 2.48970627784729 | |
| }, | |
| { | |
| "epoch": 1.3077881163455198, | |
| "grad_norm": 5.838754177093506, | |
| "learning_rate": 2.8376784529572204e-05, | |
| "loss": 3.1524, | |
| "step": 35700 | |
| }, | |
| { | |
| "combined_loss": 2.3417129516601562, | |
| "distill_loss": 1.2872867584228516, | |
| "epoch": 1.3077881163455198, | |
| "step": 35700, | |
| "student_mlm_loss": 3.39613938331604 | |
| }, | |
| { | |
| "epoch": 1.3114513883801011, | |
| "grad_norm": 4.118559837341309, | |
| "learning_rate": 2.831535494016758e-05, | |
| "loss": 7.9754, | |
| "step": 35800 | |
| }, | |
| { | |
| "combined_loss": 3.906961679458618, | |
| "distill_loss": 1.2905327081680298, | |
| "epoch": 1.3114513883801011, | |
| "step": 35800, | |
| "student_mlm_loss": 6.523390769958496 | |
| }, | |
| { | |
| "epoch": 1.3151146604146824, | |
| "grad_norm": 5.229255199432373, | |
| "learning_rate": 2.8253925350762956e-05, | |
| "loss": 3.6586, | |
| "step": 35900 | |
| }, | |
| { | |
| "combined_loss": 2.6259002685546875, | |
| "distill_loss": 1.217278003692627, | |
| "epoch": 1.3151146604146824, | |
| "step": 35900, | |
| "student_mlm_loss": 4.034522533416748 | |
| }, | |
| { | |
| "epoch": 1.3187779324492637, | |
| "grad_norm": 9.182631492614746, | |
| "learning_rate": 2.8192495761358334e-05, | |
| "loss": 8.5789, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.3187779324492637, | |
| "eval_loss": 3.3097567558288574, | |
| "eval_runtime": 1.9861, | |
| "eval_samples_per_second": 3522.525, | |
| "eval_steps_per_second": 14.098, | |
| "step": 36000 | |
| }, | |
| { | |
| "combined_loss": 15.921034812927246, | |
| "distill_loss": 1.2575896978378296, | |
| "epoch": 1.3187779324492637, | |
| "step": 36000, | |
| "student_mlm_loss": 30.58448028564453 | |
| }, | |
| { | |
| "epoch": 1.322441204483845, | |
| "grad_norm": 5.999209880828857, | |
| "learning_rate": 2.813106617195371e-05, | |
| "loss": 3.6109, | |
| "step": 36100 | |
| }, | |
| { | |
| "combined_loss": 204.92184448242188, | |
| "distill_loss": 1.2291535139083862, | |
| "epoch": 1.322441204483845, | |
| "step": 36100, | |
| "student_mlm_loss": 408.6145324707031 | |
| }, | |
| { | |
| "epoch": 1.3261044765184262, | |
| "grad_norm": 8.351846694946289, | |
| "learning_rate": 2.8069636582549086e-05, | |
| "loss": 5.9753, | |
| "step": 36200 | |
| }, | |
| { | |
| "combined_loss": 3.7332310676574707, | |
| "distill_loss": 1.377110481262207, | |
| "epoch": 1.3261044765184262, | |
| "step": 36200, | |
| "student_mlm_loss": 6.089351654052734 | |
| }, | |
| { | |
| "epoch": 1.3297677485530075, | |
| "grad_norm": 4.738751411437988, | |
| "learning_rate": 2.800820699314446e-05, | |
| "loss": 2.8706, | |
| "step": 36300 | |
| }, | |
| { | |
| "combined_loss": 1.949210286140442, | |
| "distill_loss": 1.1820151805877686, | |
| "epoch": 1.3297677485530075, | |
| "step": 36300, | |
| "student_mlm_loss": 2.7164053916931152 | |
| }, | |
| { | |
| "epoch": 1.3334310205875888, | |
| "grad_norm": 3.7835421562194824, | |
| "learning_rate": 2.7946777403739832e-05, | |
| "loss": 3.5794, | |
| "step": 36400 | |
| }, | |
| { | |
| "combined_loss": 1.7922800779342651, | |
| "distill_loss": 1.2455928325653076, | |
| "epoch": 1.3334310205875888, | |
| "step": 36400, | |
| "student_mlm_loss": 2.3389673233032227 | |
| }, | |
| { | |
| "epoch": 1.33709429262217, | |
| "grad_norm": 22.528881072998047, | |
| "learning_rate": 2.788534781433521e-05, | |
| "loss": 3.8623, | |
| "step": 36500 | |
| }, | |
| { | |
| "combined_loss": 1.788147211074829, | |
| "distill_loss": 1.2254056930541992, | |
| "epoch": 1.33709429262217, | |
| "step": 36500, | |
| "student_mlm_loss": 2.350888729095459 | |
| }, | |
| { | |
| "epoch": 1.3407575646567513, | |
| "grad_norm": 5.876169681549072, | |
| "learning_rate": 2.7823918224930584e-05, | |
| "loss": 8.4137, | |
| "step": 36600 | |
| }, | |
| { | |
| "combined_loss": 2.0377962589263916, | |
| "distill_loss": 1.2204126119613647, | |
| "epoch": 1.3407575646567513, | |
| "step": 36600, | |
| "student_mlm_loss": 2.855179786682129 | |
| }, | |
| { | |
| "epoch": 1.3444208366913326, | |
| "grad_norm": 20.921276092529297, | |
| "learning_rate": 2.7762488635525962e-05, | |
| "loss": 3.5857, | |
| "step": 36700 | |
| }, | |
| { | |
| "combined_loss": 1.9521321058273315, | |
| "distill_loss": 1.249513864517212, | |
| "epoch": 1.3444208366913326, | |
| "step": 36700, | |
| "student_mlm_loss": 2.654750347137451 | |
| }, | |
| { | |
| "epoch": 1.348084108725914, | |
| "grad_norm": 13.851704597473145, | |
| "learning_rate": 2.7701059046121336e-05, | |
| "loss": 3.8678, | |
| "step": 36800 | |
| }, | |
| { | |
| "combined_loss": 2.2560389041900635, | |
| "distill_loss": 1.2315130233764648, | |
| "epoch": 1.348084108725914, | |
| "step": 36800, | |
| "student_mlm_loss": 3.280564785003662 | |
| }, | |
| { | |
| "epoch": 1.3517473807604952, | |
| "grad_norm": 16.56214714050293, | |
| "learning_rate": 2.7639629456716714e-05, | |
| "loss": 3.3998, | |
| "step": 36900 | |
| }, | |
| { | |
| "combined_loss": 3.098896026611328, | |
| "distill_loss": 1.3377043008804321, | |
| "epoch": 1.3517473807604952, | |
| "step": 36900, | |
| "student_mlm_loss": 4.860087871551514 | |
| }, | |
| { | |
| "epoch": 1.3554106527950767, | |
| "grad_norm": 35.91291809082031, | |
| "learning_rate": 2.757819986731209e-05, | |
| "loss": 3.761, | |
| "step": 37000 | |
| }, | |
| { | |
| "combined_loss": 1.9794631004333496, | |
| "distill_loss": 1.3087836503982544, | |
| "epoch": 1.3554106527950767, | |
| "step": 37000, | |
| "student_mlm_loss": 2.6501426696777344 | |
| }, | |
| { | |
| "epoch": 1.3590739248296577, | |
| "grad_norm": 11.776296615600586, | |
| "learning_rate": 2.7516770277907466e-05, | |
| "loss": 3.9886, | |
| "step": 37100 | |
| }, | |
| { | |
| "combined_loss": 2.3107573986053467, | |
| "distill_loss": 1.268768310546875, | |
| "epoch": 1.3590739248296577, | |
| "step": 37100, | |
| "student_mlm_loss": 3.3527464866638184 | |
| }, | |
| { | |
| "epoch": 1.3627371968642392, | |
| "grad_norm": 13.237029075622559, | |
| "learning_rate": 2.745534068850284e-05, | |
| "loss": 5.3161, | |
| "step": 37200 | |
| }, | |
| { | |
| "combined_loss": 4.210747718811035, | |
| "distill_loss": 1.4009877443313599, | |
| "epoch": 1.3627371968642392, | |
| "step": 37200, | |
| "student_mlm_loss": 7.0205078125 | |
| }, | |
| { | |
| "epoch": 1.3664004688988205, | |
| "grad_norm": 18.256624221801758, | |
| "learning_rate": 2.7393911099098212e-05, | |
| "loss": 3.3122, | |
| "step": 37300 | |
| }, | |
| { | |
| "combined_loss": 2.467655658721924, | |
| "distill_loss": 1.3313319683074951, | |
| "epoch": 1.3664004688988205, | |
| "step": 37300, | |
| "student_mlm_loss": 3.6039793491363525 | |
| }, | |
| { | |
| "epoch": 1.3700637409334018, | |
| "grad_norm": 3.6821129322052, | |
| "learning_rate": 2.7332481509693593e-05, | |
| "loss": 2.5638, | |
| "step": 37400 | |
| }, | |
| { | |
| "combined_loss": 4.0961503982543945, | |
| "distill_loss": 1.2590566873550415, | |
| "epoch": 1.3700637409334018, | |
| "step": 37400, | |
| "student_mlm_loss": 6.933243751525879 | |
| }, | |
| { | |
| "epoch": 1.373727012967983, | |
| "grad_norm": 9.491351127624512, | |
| "learning_rate": 2.7271051920288964e-05, | |
| "loss": 5.2572, | |
| "step": 37500 | |
| }, | |
| { | |
| "combined_loss": 1.8323596715927124, | |
| "distill_loss": 1.2323403358459473, | |
| "epoch": 1.373727012967983, | |
| "step": 37500, | |
| "student_mlm_loss": 2.4323790073394775 | |
| }, | |
| { | |
| "epoch": 1.3773902850025643, | |
| "grad_norm": 10.13337516784668, | |
| "learning_rate": 2.7209622330884342e-05, | |
| "loss": 2.9805, | |
| "step": 37600 | |
| }, | |
| { | |
| "combined_loss": 2.7236733436584473, | |
| "distill_loss": 1.2598845958709717, | |
| "epoch": 1.3773902850025643, | |
| "step": 37600, | |
| "student_mlm_loss": 4.187462329864502 | |
| }, | |
| { | |
| "epoch": 1.3810535570371456, | |
| "grad_norm": 22.098358154296875, | |
| "learning_rate": 2.7148192741479716e-05, | |
| "loss": 3.1095, | |
| "step": 37700 | |
| }, | |
| { | |
| "combined_loss": 1.7910634279251099, | |
| "distill_loss": 1.271672010421753, | |
| "epoch": 1.3810535570371456, | |
| "step": 37700, | |
| "student_mlm_loss": 2.310454845428467 | |
| }, | |
| { | |
| "epoch": 1.3847168290717269, | |
| "grad_norm": 233.01779174804688, | |
| "learning_rate": 2.7086763152075094e-05, | |
| "loss": 3.0334, | |
| "step": 37800 | |
| }, | |
| { | |
| "combined_loss": 2.449730396270752, | |
| "distill_loss": 1.343329906463623, | |
| "epoch": 1.3847168290717269, | |
| "step": 37800, | |
| "student_mlm_loss": 3.556130886077881 | |
| }, | |
| { | |
| "epoch": 1.3883801011063082, | |
| "grad_norm": 7.459797382354736, | |
| "learning_rate": 2.702533356267047e-05, | |
| "loss": 5.0088, | |
| "step": 37900 | |
| }, | |
| { | |
| "combined_loss": 2.047302722930908, | |
| "distill_loss": 1.2358465194702148, | |
| "epoch": 1.3883801011063082, | |
| "step": 37900, | |
| "student_mlm_loss": 2.8587586879730225 | |
| }, | |
| { | |
| "epoch": 1.3920433731408894, | |
| "grad_norm": 3.9627275466918945, | |
| "learning_rate": 2.6963903973265843e-05, | |
| "loss": 2.7476, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.3920433731408894, | |
| "eval_loss": 4.346156120300293, | |
| "eval_runtime": 1.974, | |
| "eval_samples_per_second": 3544.088, | |
| "eval_steps_per_second": 14.184, | |
| "step": 38000 | |
| }, | |
| { | |
| "combined_loss": 2.4468555450439453, | |
| "distill_loss": 1.166190505027771, | |
| "epoch": 1.3920433731408894, | |
| "step": 38000, | |
| "student_mlm_loss": 3.72752046585083 | |
| }, | |
| { | |
| "epoch": 1.3957066451754707, | |
| "grad_norm": 11.812987327575684, | |
| "learning_rate": 2.690247438386122e-05, | |
| "loss": 3.8226, | |
| "step": 38100 | |
| }, | |
| { | |
| "combined_loss": 2.274935245513916, | |
| "distill_loss": 1.3503799438476562, | |
| "epoch": 1.3957066451754707, | |
| "step": 38100, | |
| "student_mlm_loss": 3.199490785598755 | |
| }, | |
| { | |
| "epoch": 1.399369917210052, | |
| "grad_norm": 6.545460224151611, | |
| "learning_rate": 2.6841044794456592e-05, | |
| "loss": 4.1598, | |
| "step": 38200 | |
| }, | |
| { | |
| "combined_loss": 2.1577343940734863, | |
| "distill_loss": 1.2623993158340454, | |
| "epoch": 1.399369917210052, | |
| "step": 38200, | |
| "student_mlm_loss": 3.0530693531036377 | |
| }, | |
| { | |
| "epoch": 1.4030331892446333, | |
| "grad_norm": 7.286951541900635, | |
| "learning_rate": 2.6779615205051973e-05, | |
| "loss": 3.8211, | |
| "step": 38300 | |
| }, | |
| { | |
| "combined_loss": 2.479806900024414, | |
| "distill_loss": 1.2152717113494873, | |
| "epoch": 1.4030331892446333, | |
| "step": 38300, | |
| "student_mlm_loss": 3.74434232711792 | |
| }, | |
| { | |
| "epoch": 1.4066964612792145, | |
| "grad_norm": 18.360294342041016, | |
| "learning_rate": 2.6718185615647344e-05, | |
| "loss": 3.3871, | |
| "step": 38400 | |
| }, | |
| { | |
| "combined_loss": 1.7289254665374756, | |
| "distill_loss": 1.3171356916427612, | |
| "epoch": 1.4066964612792145, | |
| "step": 38400, | |
| "student_mlm_loss": 2.1407151222229004 | |
| }, | |
| { | |
| "epoch": 1.4103597333137958, | |
| "grad_norm": 8.086026191711426, | |
| "learning_rate": 2.6656756026242726e-05, | |
| "loss": 2.6337, | |
| "step": 38500 | |
| }, | |
| { | |
| "combined_loss": 1.9621633291244507, | |
| "distill_loss": 1.3215687274932861, | |
| "epoch": 1.4103597333137958, | |
| "step": 38500, | |
| "student_mlm_loss": 2.6027579307556152 | |
| }, | |
| { | |
| "epoch": 1.414023005348377, | |
| "grad_norm": 13.378824234008789, | |
| "learning_rate": 2.6595326436838097e-05, | |
| "loss": 3.4032, | |
| "step": 38600 | |
| }, | |
| { | |
| "combined_loss": 37.448326110839844, | |
| "distill_loss": 1.2198776006698608, | |
| "epoch": 1.414023005348377, | |
| "step": 38600, | |
| "student_mlm_loss": 73.67677307128906 | |
| }, | |
| { | |
| "epoch": 1.4176862773829584, | |
| "grad_norm": 5.834230422973633, | |
| "learning_rate": 2.653389684743347e-05, | |
| "loss": 6.724, | |
| "step": 38700 | |
| }, | |
| { | |
| "combined_loss": 1.8702625036239624, | |
| "distill_loss": 1.2802906036376953, | |
| "epoch": 1.4176862773829584, | |
| "step": 38700, | |
| "student_mlm_loss": 2.4602344036102295 | |
| }, | |
| { | |
| "epoch": 1.4213495494175397, | |
| "grad_norm": 3.5685741901397705, | |
| "learning_rate": 2.647246725802885e-05, | |
| "loss": 3.2721, | |
| "step": 38800 | |
| }, | |
| { | |
| "combined_loss": 1.7411483526229858, | |
| "distill_loss": 1.285083532333374, | |
| "epoch": 1.4213495494175397, | |
| "step": 38800, | |
| "student_mlm_loss": 2.1972131729125977 | |
| }, | |
| { | |
| "epoch": 1.4250128214521212, | |
| "grad_norm": 8.644251823425293, | |
| "learning_rate": 2.6411037668624223e-05, | |
| "loss": 13.6859, | |
| "step": 38900 | |
| }, | |
| { | |
| "combined_loss": 3.234241008758545, | |
| "distill_loss": 1.2654619216918945, | |
| "epoch": 1.4250128214521212, | |
| "step": 38900, | |
| "student_mlm_loss": 5.203020095825195 | |
| }, | |
| { | |
| "epoch": 1.4286760934867022, | |
| "grad_norm": 15.043992042541504, | |
| "learning_rate": 2.63496080792196e-05, | |
| "loss": 4.3161, | |
| "step": 39000 | |
| }, | |
| { | |
| "combined_loss": 2.013312339782715, | |
| "distill_loss": 1.2555652856826782, | |
| "epoch": 1.4286760934867022, | |
| "step": 39000, | |
| "student_mlm_loss": 2.771059274673462 | |
| }, | |
| { | |
| "epoch": 1.4323393655212837, | |
| "grad_norm": 35.315345764160156, | |
| "learning_rate": 2.6288178489814976e-05, | |
| "loss": 6.3089, | |
| "step": 39100 | |
| }, | |
| { | |
| "combined_loss": 1.7854509353637695, | |
| "distill_loss": 1.2994376420974731, | |
| "epoch": 1.4323393655212837, | |
| "step": 39100, | |
| "student_mlm_loss": 2.2714641094207764 | |
| }, | |
| { | |
| "epoch": 1.4360026375558648, | |
| "grad_norm": 8.155647277832031, | |
| "learning_rate": 2.6226748900410353e-05, | |
| "loss": 3.3881, | |
| "step": 39200 | |
| }, | |
| { | |
| "combined_loss": 1.8790473937988281, | |
| "distill_loss": 1.2656193971633911, | |
| "epoch": 1.4360026375558648, | |
| "step": 39200, | |
| "student_mlm_loss": 2.4924752712249756 | |
| }, | |
| { | |
| "epoch": 1.4396659095904463, | |
| "grad_norm": 4.777060508728027, | |
| "learning_rate": 2.6165319311005725e-05, | |
| "loss": 3.0181, | |
| "step": 39300 | |
| }, | |
| { | |
| "combined_loss": 2.2714784145355225, | |
| "distill_loss": 1.2724400758743286, | |
| "epoch": 1.4396659095904463, | |
| "step": 39300, | |
| "student_mlm_loss": 3.270516872406006 | |
| }, | |
| { | |
| "epoch": 1.4433291816250275, | |
| "grad_norm": 3.7660317420959473, | |
| "learning_rate": 2.6103889721601106e-05, | |
| "loss": 3.3045, | |
| "step": 39400 | |
| }, | |
| { | |
| "combined_loss": 1.9759800434112549, | |
| "distill_loss": 1.1767717599868774, | |
| "epoch": 1.4433291816250275, | |
| "step": 39400, | |
| "student_mlm_loss": 2.775188446044922 | |
| }, | |
| { | |
| "epoch": 1.4469924536596088, | |
| "grad_norm": 55.78919982910156, | |
| "learning_rate": 2.6042460132196477e-05, | |
| "loss": 3.5094, | |
| "step": 39500 | |
| }, | |
| { | |
| "combined_loss": 2.5586395263671875, | |
| "distill_loss": 1.3177176713943481, | |
| "epoch": 1.4469924536596088, | |
| "step": 39500, | |
| "student_mlm_loss": 3.7995612621307373 | |
| }, | |
| { | |
| "epoch": 1.45065572569419, | |
| "grad_norm": 11.648473739624023, | |
| "learning_rate": 2.598103054279185e-05, | |
| "loss": 6.3066, | |
| "step": 39600 | |
| }, | |
| { | |
| "combined_loss": 1.8263496160507202, | |
| "distill_loss": 1.2649195194244385, | |
| "epoch": 1.45065572569419, | |
| "step": 39600, | |
| "student_mlm_loss": 2.387779712677002 | |
| }, | |
| { | |
| "epoch": 1.4543189977287714, | |
| "grad_norm": 4.982020378112793, | |
| "learning_rate": 2.591960095338723e-05, | |
| "loss": 3.1475, | |
| "step": 39700 | |
| }, | |
| { | |
| "combined_loss": 4.95673131942749, | |
| "distill_loss": 1.2415388822555542, | |
| "epoch": 1.4543189977287714, | |
| "step": 39700, | |
| "student_mlm_loss": 8.671923637390137 | |
| }, | |
| { | |
| "epoch": 1.4579822697633527, | |
| "grad_norm": 4.551340103149414, | |
| "learning_rate": 2.5858171363982604e-05, | |
| "loss": 6.0043, | |
| "step": 39800 | |
| }, | |
| { | |
| "combined_loss": 2.124246597290039, | |
| "distill_loss": 1.197386384010315, | |
| "epoch": 1.4579822697633527, | |
| "step": 39800, | |
| "student_mlm_loss": 3.0511069297790527 | |
| }, | |
| { | |
| "epoch": 1.461645541797934, | |
| "grad_norm": 41.217533111572266, | |
| "learning_rate": 2.579674177457798e-05, | |
| "loss": 2.7216, | |
| "step": 39900 | |
| }, | |
| { | |
| "combined_loss": 1.8579926490783691, | |
| "distill_loss": 1.1948734521865845, | |
| "epoch": 1.461645541797934, | |
| "step": 39900, | |
| "student_mlm_loss": 2.5211119651794434 | |
| }, | |
| { | |
| "epoch": 1.4653088138325152, | |
| "grad_norm": 3.3428897857666016, | |
| "learning_rate": 2.5735312185173356e-05, | |
| "loss": 3.5888, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.4653088138325152, | |
| "eval_loss": 3.433469295501709, | |
| "eval_runtime": 2.0987, | |
| "eval_samples_per_second": 3333.452, | |
| "eval_steps_per_second": 13.341, | |
| "step": 40000 | |
| }, | |
| { | |
| "combined_loss": 3.9790029525756836, | |
| "distill_loss": 1.2571158409118652, | |
| "epoch": 1.4653088138325152, | |
| "step": 40000, | |
| "student_mlm_loss": 6.700890064239502 | |
| }, | |
| { | |
| "epoch": 1.4689720858670965, | |
| "grad_norm": 24.387128829956055, | |
| "learning_rate": 2.5673882595768734e-05, | |
| "loss": 3.3546, | |
| "step": 40100 | |
| }, | |
| { | |
| "combined_loss": 2.113370418548584, | |
| "distill_loss": 1.2904696464538574, | |
| "epoch": 1.4689720858670965, | |
| "step": 40100, | |
| "student_mlm_loss": 2.9362711906433105 | |
| }, | |
| { | |
| "epoch": 1.4726353579016778, | |
| "grad_norm": 11.271422386169434, | |
| "learning_rate": 2.5612453006364108e-05, | |
| "loss": 9.1182, | |
| "step": 40200 | |
| }, | |
| { | |
| "combined_loss": 1.7249795198440552, | |
| "distill_loss": 1.2220125198364258, | |
| "epoch": 1.4726353579016778, | |
| "step": 40200, | |
| "student_mlm_loss": 2.2279465198516846 | |
| }, | |
| { | |
| "epoch": 1.476298629936259, | |
| "grad_norm": 88.92086029052734, | |
| "learning_rate": 2.555102341695948e-05, | |
| "loss": 5.5622, | |
| "step": 40300 | |
| }, | |
| { | |
| "combined_loss": 3.5107364654541016, | |
| "distill_loss": 1.2663298845291138, | |
| "epoch": 1.476298629936259, | |
| "step": 40300, | |
| "student_mlm_loss": 5.755143165588379 | |
| }, | |
| { | |
| "epoch": 1.4799619019708403, | |
| "grad_norm": 4.677048683166504, | |
| "learning_rate": 2.5489593827554857e-05, | |
| "loss": 5.3278, | |
| "step": 40400 | |
| }, | |
| { | |
| "combined_loss": 3.5298116207122803, | |
| "distill_loss": 1.1846145391464233, | |
| "epoch": 1.4799619019708403, | |
| "step": 40400, | |
| "student_mlm_loss": 5.875008583068848 | |
| }, | |
| { | |
| "epoch": 1.4836251740054216, | |
| "grad_norm": 21.207704544067383, | |
| "learning_rate": 2.542816423815023e-05, | |
| "loss": 2.9588, | |
| "step": 40500 | |
| }, | |
| { | |
| "combined_loss": 2.6109657287597656, | |
| "distill_loss": 1.2608091831207275, | |
| "epoch": 1.4836251740054216, | |
| "step": 40500, | |
| "student_mlm_loss": 3.9611220359802246 | |
| }, | |
| { | |
| "epoch": 1.4872884460400029, | |
| "grad_norm": 7.7415876388549805, | |
| "learning_rate": 2.536673464874561e-05, | |
| "loss": 2.706, | |
| "step": 40600 | |
| }, | |
| { | |
| "combined_loss": 2.455023765563965, | |
| "distill_loss": 1.3175585269927979, | |
| "epoch": 1.4872884460400029, | |
| "step": 40600, | |
| "student_mlm_loss": 3.5924887657165527 | |
| }, | |
| { | |
| "epoch": 1.4909517180745842, | |
| "grad_norm": 19.366378784179688, | |
| "learning_rate": 2.5305305059340984e-05, | |
| "loss": 2.7981, | |
| "step": 40700 | |
| }, | |
| { | |
| "combined_loss": 3.624007225036621, | |
| "distill_loss": 1.1402699947357178, | |
| "epoch": 1.4909517180745842, | |
| "step": 40700, | |
| "student_mlm_loss": 6.1077446937561035 | |
| }, | |
| { | |
| "epoch": 1.4946149901091654, | |
| "grad_norm": 7.310671806335449, | |
| "learning_rate": 2.524387546993636e-05, | |
| "loss": 29.272, | |
| "step": 40800 | |
| }, | |
| { | |
| "combined_loss": 2.2329726219177246, | |
| "distill_loss": 1.303555965423584, | |
| "epoch": 1.4946149901091654, | |
| "step": 40800, | |
| "student_mlm_loss": 3.1623895168304443 | |
| }, | |
| { | |
| "epoch": 1.4982782621437467, | |
| "grad_norm": 48.7297477722168, | |
| "learning_rate": 2.5182445880531736e-05, | |
| "loss": 3.1319, | |
| "step": 40900 | |
| }, | |
| { | |
| "combined_loss": 1.8255285024642944, | |
| "distill_loss": 1.1643202304840088, | |
| "epoch": 1.4982782621437467, | |
| "step": 40900, | |
| "student_mlm_loss": 2.48673677444458 | |
| }, | |
| { | |
| "epoch": 1.5019415341783282, | |
| "grad_norm": 32.60409927368164, | |
| "learning_rate": 2.5121016291127114e-05, | |
| "loss": 8.524, | |
| "step": 41000 | |
| }, | |
| { | |
| "combined_loss": 2.896923542022705, | |
| "distill_loss": 1.3571655750274658, | |
| "epoch": 1.5019415341783282, | |
| "step": 41000, | |
| "student_mlm_loss": 4.436681747436523 | |
| }, | |
| { | |
| "epoch": 1.5056048062129093, | |
| "grad_norm": 4.127974510192871, | |
| "learning_rate": 2.5059586701722488e-05, | |
| "loss": 6.3087, | |
| "step": 41100 | |
| }, | |
| { | |
| "combined_loss": 2.145819664001465, | |
| "distill_loss": 1.2983198165893555, | |
| "epoch": 1.5056048062129093, | |
| "step": 41100, | |
| "student_mlm_loss": 2.993319511413574 | |
| }, | |
| { | |
| "epoch": 1.5092680782474908, | |
| "grad_norm": 3.873206853866577, | |
| "learning_rate": 2.4998157112317863e-05, | |
| "loss": 5.279, | |
| "step": 41200 | |
| }, | |
| { | |
| "combined_loss": 4.8266730308532715, | |
| "distill_loss": 1.1676665544509888, | |
| "epoch": 1.5092680782474908, | |
| "step": 41200, | |
| "student_mlm_loss": 8.485679626464844 | |
| }, | |
| { | |
| "epoch": 1.5129313502820718, | |
| "grad_norm": 6.902312755584717, | |
| "learning_rate": 2.493672752291324e-05, | |
| "loss": 5.3583, | |
| "step": 41300 | |
| }, | |
| { | |
| "combined_loss": 1.7068848609924316, | |
| "distill_loss": 1.1335561275482178, | |
| "epoch": 1.5129313502820718, | |
| "step": 41300, | |
| "student_mlm_loss": 2.2802135944366455 | |
| }, | |
| { | |
| "epoch": 1.5165946223166533, | |
| "grad_norm": 17.415306091308594, | |
| "learning_rate": 2.487529793350861e-05, | |
| "loss": 2.8319, | |
| "step": 41400 | |
| }, | |
| { | |
| "combined_loss": 1.5696630477905273, | |
| "distill_loss": 1.152633786201477, | |
| "epoch": 1.5165946223166533, | |
| "step": 41400, | |
| "student_mlm_loss": 1.9866924285888672 | |
| }, | |
| { | |
| "epoch": 1.5202578943512344, | |
| "grad_norm": 11.67779541015625, | |
| "learning_rate": 2.481386834410399e-05, | |
| "loss": 3.0117, | |
| "step": 41500 | |
| }, | |
| { | |
| "combined_loss": 1.9209272861480713, | |
| "distill_loss": 1.2611881494522095, | |
| "epoch": 1.5202578943512344, | |
| "step": 41500, | |
| "student_mlm_loss": 2.5806663036346436 | |
| }, | |
| { | |
| "epoch": 1.5239211663858159, | |
| "grad_norm": 9.814743041992188, | |
| "learning_rate": 2.4752438754699364e-05, | |
| "loss": 2.8479, | |
| "step": 41600 | |
| }, | |
| { | |
| "combined_loss": 4.1822404861450195, | |
| "distill_loss": 1.254117488861084, | |
| "epoch": 1.5239211663858159, | |
| "step": 41600, | |
| "student_mlm_loss": 7.110363960266113 | |
| }, | |
| { | |
| "epoch": 1.5275844384203972, | |
| "grad_norm": 11.7344970703125, | |
| "learning_rate": 2.4691009165294742e-05, | |
| "loss": 3.2502, | |
| "step": 41700 | |
| }, | |
| { | |
| "combined_loss": 1.7558622360229492, | |
| "distill_loss": 1.1821727752685547, | |
| "epoch": 1.5275844384203972, | |
| "step": 41700, | |
| "student_mlm_loss": 2.3295516967773438 | |
| }, | |
| { | |
| "epoch": 1.5312477104549784, | |
| "grad_norm": 8.426025390625, | |
| "learning_rate": 2.4629579575890116e-05, | |
| "loss": 3.3169, | |
| "step": 41800 | |
| }, | |
| { | |
| "combined_loss": 1.843000054359436, | |
| "distill_loss": 1.1456735134124756, | |
| "epoch": 1.5312477104549784, | |
| "step": 41800, | |
| "student_mlm_loss": 2.5403265953063965 | |
| }, | |
| { | |
| "epoch": 1.5349109824895597, | |
| "grad_norm": 3.654872417449951, | |
| "learning_rate": 2.456814998648549e-05, | |
| "loss": 2.6259, | |
| "step": 41900 | |
| }, | |
| { | |
| "combined_loss": 1.7651002407073975, | |
| "distill_loss": 1.1741529703140259, | |
| "epoch": 1.5349109824895597, | |
| "step": 41900, | |
| "student_mlm_loss": 2.3560476303100586 | |
| }, | |
| { | |
| "epoch": 1.538574254524141, | |
| "grad_norm": 18.605615615844727, | |
| "learning_rate": 2.450672039708087e-05, | |
| "loss": 2.4854, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.538574254524141, | |
| "eval_loss": 3.4032058715820312, | |
| "eval_runtime": 1.8747, | |
| "eval_samples_per_second": 3731.788, | |
| "eval_steps_per_second": 14.936, | |
| "step": 42000 | |
| }, | |
| { | |
| "combined_loss": 2.60400390625, | |
| "distill_loss": 1.2034615278244019, | |
| "epoch": 1.538574254524141, | |
| "step": 42000, | |
| "student_mlm_loss": 4.004546165466309 | |
| }, | |
| { | |
| "epoch": 1.5422375265587223, | |
| "grad_norm": 6.775146484375, | |
| "learning_rate": 2.4445290807676243e-05, | |
| "loss": 2.8405, | |
| "step": 42100 | |
| }, | |
| { | |
| "combined_loss": 1.7485601902008057, | |
| "distill_loss": 1.1682909727096558, | |
| "epoch": 1.5422375265587223, | |
| "step": 42100, | |
| "student_mlm_loss": 2.328829288482666 | |
| }, | |
| { | |
| "epoch": 1.5459007985933035, | |
| "grad_norm": 24.79000473022461, | |
| "learning_rate": 2.4383861218271617e-05, | |
| "loss": 2.9811, | |
| "step": 42200 | |
| }, | |
| { | |
| "combined_loss": 2.2294323444366455, | |
| "distill_loss": 1.262848138809204, | |
| "epoch": 1.5459007985933035, | |
| "step": 42200, | |
| "student_mlm_loss": 3.196016550064087 | |
| }, | |
| { | |
| "epoch": 1.5495640706278848, | |
| "grad_norm": 11.027627944946289, | |
| "learning_rate": 2.4322431628866992e-05, | |
| "loss": 3.7109, | |
| "step": 42300 | |
| }, | |
| { | |
| "combined_loss": 1.8129802942276, | |
| "distill_loss": 1.205324411392212, | |
| "epoch": 1.5495640706278848, | |
| "step": 42300, | |
| "student_mlm_loss": 2.4206361770629883 | |
| }, | |
| { | |
| "epoch": 1.553227342662466, | |
| "grad_norm": 6.328401565551758, | |
| "learning_rate": 2.426100203946237e-05, | |
| "loss": 31.168, | |
| "step": 42400 | |
| }, | |
| { | |
| "combined_loss": 2.391860246658325, | |
| "distill_loss": 1.1356655359268188, | |
| "epoch": 1.553227342662466, | |
| "step": 42400, | |
| "student_mlm_loss": 3.648054838180542 | |
| }, | |
| { | |
| "epoch": 1.5568906146970474, | |
| "grad_norm": 26.61184310913086, | |
| "learning_rate": 2.4199572450057744e-05, | |
| "loss": 6.4259, | |
| "step": 42500 | |
| }, | |
| { | |
| "combined_loss": 3.222200870513916, | |
| "distill_loss": 1.3243845701217651, | |
| "epoch": 1.5568906146970474, | |
| "step": 42500, | |
| "student_mlm_loss": 5.120017051696777 | |
| }, | |
| { | |
| "epoch": 1.5605538867316286, | |
| "grad_norm": 78.89910888671875, | |
| "learning_rate": 2.4138142860653122e-05, | |
| "loss": 3.3441, | |
| "step": 42600 | |
| }, | |
| { | |
| "combined_loss": 1.7442145347595215, | |
| "distill_loss": 1.282542109489441, | |
| "epoch": 1.5605538867316286, | |
| "step": 42600, | |
| "student_mlm_loss": 2.2058870792388916 | |
| }, | |
| { | |
| "epoch": 1.56421715876621, | |
| "grad_norm": 88.92566680908203, | |
| "learning_rate": 2.4076713271248496e-05, | |
| "loss": 2.8234, | |
| "step": 42700 | |
| }, | |
| { | |
| "combined_loss": 2.366835117340088, | |
| "distill_loss": 1.1711124181747437, | |
| "epoch": 1.56421715876621, | |
| "step": 42700, | |
| "student_mlm_loss": 3.5625579357147217 | |
| }, | |
| { | |
| "epoch": 1.5678804308007912, | |
| "grad_norm": 6.83758544921875, | |
| "learning_rate": 2.4015283681843874e-05, | |
| "loss": 5.4491, | |
| "step": 42800 | |
| }, | |
| { | |
| "combined_loss": 4.174956798553467, | |
| "distill_loss": 1.0669249296188354, | |
| "epoch": 1.5678804308007912, | |
| "step": 42800, | |
| "student_mlm_loss": 7.282988548278809 | |
| }, | |
| { | |
| "epoch": 1.5715437028353727, | |
| "grad_norm": 5.723924160003662, | |
| "learning_rate": 2.395385409243925e-05, | |
| "loss": 3.1108, | |
| "step": 42900 | |
| }, | |
| { | |
| "combined_loss": 2.3197238445281982, | |
| "distill_loss": 1.2763570547103882, | |
| "epoch": 1.5715437028353727, | |
| "step": 42900, | |
| "student_mlm_loss": 3.3630905151367188 | |
| }, | |
| { | |
| "epoch": 1.5752069748699538, | |
| "grad_norm": 14.807353973388672, | |
| "learning_rate": 2.3892424503034623e-05, | |
| "loss": 6.4113, | |
| "step": 43000 | |
| }, | |
| { | |
| "combined_loss": 1.7868092060089111, | |
| "distill_loss": 1.1304634809494019, | |
| "epoch": 1.5752069748699538, | |
| "step": 43000, | |
| "student_mlm_loss": 2.44315505027771 | |
| }, | |
| { | |
| "epoch": 1.5788702469045353, | |
| "grad_norm": 8.68276596069336, | |
| "learning_rate": 2.3830994913629998e-05, | |
| "loss": 5.1213, | |
| "step": 43100 | |
| }, | |
| { | |
| "combined_loss": 19.46100425720215, | |
| "distill_loss": 1.259545087814331, | |
| "epoch": 1.5788702469045353, | |
| "step": 43100, | |
| "student_mlm_loss": 37.6624641418457 | |
| }, | |
| { | |
| "epoch": 1.5825335189391163, | |
| "grad_norm": 4.91242790222168, | |
| "learning_rate": 2.3769565324225372e-05, | |
| "loss": 3.2674, | |
| "step": 43200 | |
| }, | |
| { | |
| "combined_loss": 1.797656536102295, | |
| "distill_loss": 1.3039189577102661, | |
| "epoch": 1.5825335189391163, | |
| "step": 43200, | |
| "student_mlm_loss": 2.2913942337036133 | |
| }, | |
| { | |
| "epoch": 1.5861967909736978, | |
| "grad_norm": 52.68294906616211, | |
| "learning_rate": 2.370813573482075e-05, | |
| "loss": 3.7711, | |
| "step": 43300 | |
| }, | |
| { | |
| "combined_loss": 1.8017528057098389, | |
| "distill_loss": 1.1734706163406372, | |
| "epoch": 1.5861967909736978, | |
| "step": 43300, | |
| "student_mlm_loss": 2.43003511428833 | |
| }, | |
| { | |
| "epoch": 1.5898600630082789, | |
| "grad_norm": 11.869544982910156, | |
| "learning_rate": 2.3646706145416124e-05, | |
| "loss": 9.8177, | |
| "step": 43400 | |
| }, | |
| { | |
| "combined_loss": 2.760119915008545, | |
| "distill_loss": 1.2446471452713013, | |
| "epoch": 1.5898600630082789, | |
| "step": 43400, | |
| "student_mlm_loss": 4.275592803955078 | |
| }, | |
| { | |
| "epoch": 1.5935233350428604, | |
| "grad_norm": 3.7819387912750244, | |
| "learning_rate": 2.3585276556011502e-05, | |
| "loss": 4.6552, | |
| "step": 43500 | |
| }, | |
| { | |
| "combined_loss": 4.660012245178223, | |
| "distill_loss": 1.1187530755996704, | |
| "epoch": 1.5935233350428604, | |
| "step": 43500, | |
| "student_mlm_loss": 8.201271057128906 | |
| }, | |
| { | |
| "epoch": 1.5971866070774414, | |
| "grad_norm": 21.269559860229492, | |
| "learning_rate": 2.3523846966606877e-05, | |
| "loss": 8.5404, | |
| "step": 43600 | |
| }, | |
| { | |
| "combined_loss": 2.3045759201049805, | |
| "distill_loss": 1.3545589447021484, | |
| "epoch": 1.5971866070774414, | |
| "step": 43600, | |
| "student_mlm_loss": 3.2545931339263916 | |
| }, | |
| { | |
| "epoch": 1.600849879112023, | |
| "grad_norm": 8.289508819580078, | |
| "learning_rate": 2.3462417377202254e-05, | |
| "loss": 2.7135, | |
| "step": 43700 | |
| }, | |
| { | |
| "combined_loss": 3.0867691040039062, | |
| "distill_loss": 1.1124651432037354, | |
| "epoch": 1.600849879112023, | |
| "step": 43700, | |
| "student_mlm_loss": 5.061073303222656 | |
| }, | |
| { | |
| "epoch": 1.6045131511466042, | |
| "grad_norm": 22.303661346435547, | |
| "learning_rate": 2.3400987787797625e-05, | |
| "loss": 3.6364, | |
| "step": 43800 | |
| }, | |
| { | |
| "combined_loss": 1.7930564880371094, | |
| "distill_loss": 1.2114512920379639, | |
| "epoch": 1.6045131511466042, | |
| "step": 43800, | |
| "student_mlm_loss": 2.374661684036255 | |
| }, | |
| { | |
| "epoch": 1.6081764231811855, | |
| "grad_norm": 4.351790904998779, | |
| "learning_rate": 2.3339558198393003e-05, | |
| "loss": 5.6887, | |
| "step": 43900 | |
| }, | |
| { | |
| "combined_loss": 1.7365663051605225, | |
| "distill_loss": 1.2089755535125732, | |
| "epoch": 1.6081764231811855, | |
| "step": 43900, | |
| "student_mlm_loss": 2.2641570568084717 | |
| }, | |
| { | |
| "epoch": 1.6118396952157668, | |
| "grad_norm": 13.450850486755371, | |
| "learning_rate": 2.3278128608988378e-05, | |
| "loss": 3.6702, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.6118396952157668, | |
| "eval_loss": 3.194415330886841, | |
| "eval_runtime": 1.9274, | |
| "eval_samples_per_second": 3629.828, | |
| "eval_steps_per_second": 14.528, | |
| "step": 44000 | |
| }, | |
| { | |
| "combined_loss": 1.760496735572815, | |
| "distill_loss": 1.1514201164245605, | |
| "epoch": 1.6118396952157668, | |
| "step": 44000, | |
| "student_mlm_loss": 2.3695733547210693 | |
| }, | |
| { | |
| "epoch": 1.615502967250348, | |
| "grad_norm": 7.381774425506592, | |
| "learning_rate": 2.3216699019583756e-05, | |
| "loss": 2.9269, | |
| "step": 44100 | |
| }, | |
| { | |
| "combined_loss": 4.663776397705078, | |
| "distill_loss": 1.307958722114563, | |
| "epoch": 1.615502967250348, | |
| "step": 44100, | |
| "student_mlm_loss": 8.019594192504883 | |
| }, | |
| { | |
| "epoch": 1.6191662392849293, | |
| "grad_norm": 10.999051094055176, | |
| "learning_rate": 2.315526943017913e-05, | |
| "loss": 3.0334, | |
| "step": 44200 | |
| }, | |
| { | |
| "combined_loss": 1.9191560745239258, | |
| "distill_loss": 1.3481658697128296, | |
| "epoch": 1.6191662392849293, | |
| "step": 44200, | |
| "student_mlm_loss": 2.4901461601257324 | |
| }, | |
| { | |
| "epoch": 1.6228295113195106, | |
| "grad_norm": 6.187446594238281, | |
| "learning_rate": 2.3093839840774504e-05, | |
| "loss": 30.6923, | |
| "step": 44300 | |
| }, | |
| { | |
| "combined_loss": 12.122703552246094, | |
| "distill_loss": 1.1659897565841675, | |
| "epoch": 1.6228295113195106, | |
| "step": 44300, | |
| "student_mlm_loss": 23.079418182373047 | |
| }, | |
| { | |
| "epoch": 1.6264927833540919, | |
| "grad_norm": 6.142828941345215, | |
| "learning_rate": 2.3032410251369882e-05, | |
| "loss": 7.4162, | |
| "step": 44400 | |
| }, | |
| { | |
| "combined_loss": 1.9456160068511963, | |
| "distill_loss": 1.257858157157898, | |
| "epoch": 1.6264927833540919, | |
| "step": 44400, | |
| "student_mlm_loss": 2.633373737335205 | |
| }, | |
| { | |
| "epoch": 1.6301560553886731, | |
| "grad_norm": 15.393942832946777, | |
| "learning_rate": 2.2970980661965257e-05, | |
| "loss": 4.8003, | |
| "step": 44500 | |
| }, | |
| { | |
| "combined_loss": 2.7578635215759277, | |
| "distill_loss": 1.1640808582305908, | |
| "epoch": 1.6301560553886731, | |
| "step": 44500, | |
| "student_mlm_loss": 4.351646423339844 | |
| }, | |
| { | |
| "epoch": 1.6338193274232544, | |
| "grad_norm": 18.73512077331543, | |
| "learning_rate": 2.290955107256063e-05, | |
| "loss": 5.3592, | |
| "step": 44600 | |
| }, | |
| { | |
| "combined_loss": 3.758654832839966, | |
| "distill_loss": 1.260606288909912, | |
| "epoch": 1.6338193274232544, | |
| "step": 44600, | |
| "student_mlm_loss": 6.2567033767700195 | |
| }, | |
| { | |
| "epoch": 1.6374825994578357, | |
| "grad_norm": 6.1570048332214355, | |
| "learning_rate": 2.2848121483156006e-05, | |
| "loss": 10.8594, | |
| "step": 44700 | |
| }, | |
| { | |
| "combined_loss": 3.205047845840454, | |
| "distill_loss": 1.1495074033737183, | |
| "epoch": 1.6374825994578357, | |
| "step": 44700, | |
| "student_mlm_loss": 5.2605881690979 | |
| }, | |
| { | |
| "epoch": 1.641145871492417, | |
| "grad_norm": 8.748614311218262, | |
| "learning_rate": 2.2786691893751383e-05, | |
| "loss": 2.611, | |
| "step": 44800 | |
| }, | |
| { | |
| "combined_loss": 2.7548794746398926, | |
| "distill_loss": 1.153849482536316, | |
| "epoch": 1.641145871492417, | |
| "step": 44800, | |
| "student_mlm_loss": 4.35590934753418 | |
| }, | |
| { | |
| "epoch": 1.6448091435269983, | |
| "grad_norm": 9.594339370727539, | |
| "learning_rate": 2.2725262304346758e-05, | |
| "loss": 3.621, | |
| "step": 44900 | |
| }, | |
| { | |
| "combined_loss": 2.63676381111145, | |
| "distill_loss": 1.144437313079834, | |
| "epoch": 1.6448091435269983, | |
| "step": 44900, | |
| "student_mlm_loss": 4.129090309143066 | |
| }, | |
| { | |
| "epoch": 1.6484724155615798, | |
| "grad_norm": 8.756010055541992, | |
| "learning_rate": 2.2663832714942136e-05, | |
| "loss": 5.0762, | |
| "step": 45000 | |
| }, | |
| { | |
| "combined_loss": 2.0047507286071777, | |
| "distill_loss": 1.203262209892273, | |
| "epoch": 1.6484724155615798, | |
| "step": 45000, | |
| "student_mlm_loss": 2.806239366531372 | |
| }, | |
| { | |
| "epoch": 1.6521356875961608, | |
| "grad_norm": 16.163911819458008, | |
| "learning_rate": 2.260240312553751e-05, | |
| "loss": 3.1675, | |
| "step": 45100 | |
| }, | |
| { | |
| "combined_loss": 1.822305679321289, | |
| "distill_loss": 1.187317967414856, | |
| "epoch": 1.6521356875961608, | |
| "step": 45100, | |
| "student_mlm_loss": 2.4572935104370117 | |
| }, | |
| { | |
| "epoch": 1.6557989596307423, | |
| "grad_norm": 4.047428607940674, | |
| "learning_rate": 2.2540973536132888e-05, | |
| "loss": 2.6406, | |
| "step": 45200 | |
| }, | |
| { | |
| "combined_loss": 2.431349039077759, | |
| "distill_loss": 1.2643455266952515, | |
| "epoch": 1.6557989596307423, | |
| "step": 45200, | |
| "student_mlm_loss": 3.5983526706695557 | |
| }, | |
| { | |
| "epoch": 1.6594622316653234, | |
| "grad_norm": 28.598485946655273, | |
| "learning_rate": 2.247954394672826e-05, | |
| "loss": 3.7667, | |
| "step": 45300 | |
| }, | |
| { | |
| "combined_loss": 2.274944543838501, | |
| "distill_loss": 1.266087293624878, | |
| "epoch": 1.6594622316653234, | |
| "step": 45300, | |
| "student_mlm_loss": 3.283801794052124 | |
| }, | |
| { | |
| "epoch": 1.6631255036999049, | |
| "grad_norm": 11.642946243286133, | |
| "learning_rate": 2.2418114357323637e-05, | |
| "loss": 3.0131, | |
| "step": 45400 | |
| }, | |
| { | |
| "combined_loss": 2.064805507659912, | |
| "distill_loss": 1.2423893213272095, | |
| "epoch": 1.6631255036999049, | |
| "step": 45400, | |
| "student_mlm_loss": 2.8872218132019043 | |
| }, | |
| { | |
| "epoch": 1.666788775734486, | |
| "grad_norm": 7.227854251861572, | |
| "learning_rate": 2.235668476791901e-05, | |
| "loss": 7.556, | |
| "step": 45500 | |
| }, | |
| { | |
| "combined_loss": 1.8626993894577026, | |
| "distill_loss": 1.153686761856079, | |
| "epoch": 1.666788775734486, | |
| "step": 45500, | |
| "student_mlm_loss": 2.571712017059326 | |
| }, | |
| { | |
| "epoch": 1.6704520477690674, | |
| "grad_norm": 11.972105026245117, | |
| "learning_rate": 2.229525517851439e-05, | |
| "loss": 3.9606, | |
| "step": 45600 | |
| }, | |
| { | |
| "combined_loss": 1.7529842853546143, | |
| "distill_loss": 1.2637630701065063, | |
| "epoch": 1.6704520477690674, | |
| "step": 45600, | |
| "student_mlm_loss": 2.2422056198120117 | |
| }, | |
| { | |
| "epoch": 1.6741153198036485, | |
| "grad_norm": 4.263253211975098, | |
| "learning_rate": 2.2233825589109764e-05, | |
| "loss": 3.0922, | |
| "step": 45700 | |
| }, | |
| { | |
| "combined_loss": 2.6089985370635986, | |
| "distill_loss": 1.2136098146438599, | |
| "epoch": 1.6741153198036485, | |
| "step": 45700, | |
| "student_mlm_loss": 4.004387378692627 | |
| }, | |
| { | |
| "epoch": 1.67777859183823, | |
| "grad_norm": 24.4074764251709, | |
| "learning_rate": 2.2172395999705138e-05, | |
| "loss": 3.2329, | |
| "step": 45800 | |
| }, | |
| { | |
| "combined_loss": 1.6919562816619873, | |
| "distill_loss": 1.139168381690979, | |
| "epoch": 1.67777859183823, | |
| "step": 45800, | |
| "student_mlm_loss": 2.244744300842285 | |
| }, | |
| { | |
| "epoch": 1.6814418638728112, | |
| "grad_norm": 5.1518778800964355, | |
| "learning_rate": 2.2110966410300516e-05, | |
| "loss": 9.4019, | |
| "step": 45900 | |
| }, | |
| { | |
| "combined_loss": 2.1822292804718018, | |
| "distill_loss": 1.3423482179641724, | |
| "epoch": 1.6814418638728112, | |
| "step": 45900, | |
| "student_mlm_loss": 3.0221104621887207 | |
| }, | |
| { | |
| "epoch": 1.6851051359073925, | |
| "grad_norm": 18.045368194580078, | |
| "learning_rate": 2.204953682089589e-05, | |
| "loss": 3.3662, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.6851051359073925, | |
| "eval_loss": 3.070533275604248, | |
| "eval_runtime": 1.9768, | |
| "eval_samples_per_second": 3539.063, | |
| "eval_steps_per_second": 14.164, | |
| "step": 46000 | |
| }, | |
| { | |
| "combined_loss": 1.8376495838165283, | |
| "distill_loss": 1.261283278465271, | |
| "epoch": 1.6851051359073925, | |
| "step": 46000, | |
| "student_mlm_loss": 2.414015769958496 | |
| }, | |
| { | |
| "epoch": 1.6887684079419738, | |
| "grad_norm": 5.69982385635376, | |
| "learning_rate": 2.1988107231491265e-05, | |
| "loss": 3.3451, | |
| "step": 46100 | |
| }, | |
| { | |
| "combined_loss": 1.7916219234466553, | |
| "distill_loss": 1.2525031566619873, | |
| "epoch": 1.6887684079419738, | |
| "step": 46100, | |
| "student_mlm_loss": 2.3307406902313232 | |
| }, | |
| { | |
| "epoch": 1.692431679976555, | |
| "grad_norm": 27.134151458740234, | |
| "learning_rate": 2.192667764208664e-05, | |
| "loss": 9.1006, | |
| "step": 46200 | |
| }, | |
| { | |
| "combined_loss": 59.0687141418457, | |
| "distill_loss": 1.1848413944244385, | |
| "epoch": 1.692431679976555, | |
| "step": 46200, | |
| "student_mlm_loss": 116.95258331298828 | |
| }, | |
| { | |
| "epoch": 1.6960949520111364, | |
| "grad_norm": 6.624229431152344, | |
| "learning_rate": 2.1865248052682017e-05, | |
| "loss": 3.0016, | |
| "step": 46300 | |
| }, | |
| { | |
| "combined_loss": 2.7997608184814453, | |
| "distill_loss": 1.1524275541305542, | |
| "epoch": 1.6960949520111364, | |
| "step": 46300, | |
| "student_mlm_loss": 4.447093963623047 | |
| }, | |
| { | |
| "epoch": 1.6997582240457176, | |
| "grad_norm": 5.472049236297607, | |
| "learning_rate": 2.180381846327739e-05, | |
| "loss": 20.0915, | |
| "step": 46400 | |
| }, | |
| { | |
| "combined_loss": 1.7153997421264648, | |
| "distill_loss": 1.237658143043518, | |
| "epoch": 1.6997582240457176, | |
| "step": 46400, | |
| "student_mlm_loss": 2.193141460418701 | |
| }, | |
| { | |
| "epoch": 1.703421496080299, | |
| "grad_norm": 14.290247917175293, | |
| "learning_rate": 2.174238887387277e-05, | |
| "loss": 4.5936, | |
| "step": 46500 | |
| }, | |
| { | |
| "combined_loss": 1.709627628326416, | |
| "distill_loss": 1.2791212797164917, | |
| "epoch": 1.703421496080299, | |
| "step": 46500, | |
| "student_mlm_loss": 2.140133857727051 | |
| }, | |
| { | |
| "epoch": 1.7070847681148802, | |
| "grad_norm": 17.962997436523438, | |
| "learning_rate": 2.1680959284468144e-05, | |
| "loss": 3.3627, | |
| "step": 46600 | |
| }, | |
| { | |
| "combined_loss": 7.8201751708984375, | |
| "distill_loss": 1.3012824058532715, | |
| "epoch": 1.7070847681148802, | |
| "step": 46600, | |
| "student_mlm_loss": 14.339067459106445 | |
| }, | |
| { | |
| "epoch": 1.7107480401494615, | |
| "grad_norm": 6.800339698791504, | |
| "learning_rate": 2.161952969506352e-05, | |
| "loss": 6.7955, | |
| "step": 46700 | |
| }, | |
| { | |
| "combined_loss": 1.809753656387329, | |
| "distill_loss": 1.2891262769699097, | |
| "epoch": 1.7107480401494615, | |
| "step": 46700, | |
| "student_mlm_loss": 2.330381155014038 | |
| }, | |
| { | |
| "epoch": 1.7144113121840427, | |
| "grad_norm": 12.281099319458008, | |
| "learning_rate": 2.1558100105658896e-05, | |
| "loss": 10.3436, | |
| "step": 46800 | |
| }, | |
| { | |
| "combined_loss": 3.3808600902557373, | |
| "distill_loss": 1.2777303457260132, | |
| "epoch": 1.7144113121840427, | |
| "step": 46800, | |
| "student_mlm_loss": 5.483989715576172 | |
| }, | |
| { | |
| "epoch": 1.718074584218624, | |
| "grad_norm": 3.3210408687591553, | |
| "learning_rate": 2.149667051625427e-05, | |
| "loss": 2.8055, | |
| "step": 46900 | |
| }, | |
| { | |
| "combined_loss": 2.1092348098754883, | |
| "distill_loss": 1.2058593034744263, | |
| "epoch": 1.718074584218624, | |
| "step": 46900, | |
| "student_mlm_loss": 3.0126101970672607 | |
| }, | |
| { | |
| "epoch": 1.7217378562532053, | |
| "grad_norm": 11.694738388061523, | |
| "learning_rate": 2.1435240926849645e-05, | |
| "loss": 4.6311, | |
| "step": 47000 | |
| }, | |
| { | |
| "combined_loss": 2.2222890853881836, | |
| "distill_loss": 1.218597173690796, | |
| "epoch": 1.7217378562532053, | |
| "step": 47000, | |
| "student_mlm_loss": 3.2259812355041504 | |
| }, | |
| { | |
| "epoch": 1.7254011282877868, | |
| "grad_norm": 23.036334991455078, | |
| "learning_rate": 2.137381133744502e-05, | |
| "loss": 2.5923, | |
| "step": 47100 | |
| }, | |
| { | |
| "combined_loss": 1.882810354232788, | |
| "distill_loss": 1.2441027164459229, | |
| "epoch": 1.7254011282877868, | |
| "step": 47100, | |
| "student_mlm_loss": 2.5215179920196533 | |
| }, | |
| { | |
| "epoch": 1.7290644003223679, | |
| "grad_norm": 65.06354522705078, | |
| "learning_rate": 2.1312381748040397e-05, | |
| "loss": 3.3375, | |
| "step": 47200 | |
| }, | |
| { | |
| "combined_loss": 1.84983229637146, | |
| "distill_loss": 1.224557876586914, | |
| "epoch": 1.7290644003223679, | |
| "step": 47200, | |
| "student_mlm_loss": 2.475106716156006 | |
| }, | |
| { | |
| "epoch": 1.7327276723569494, | |
| "grad_norm": 9.202945709228516, | |
| "learning_rate": 2.1250952158635772e-05, | |
| "loss": 3.0094, | |
| "step": 47300 | |
| }, | |
| { | |
| "combined_loss": 1.6417255401611328, | |
| "distill_loss": 1.2296794652938843, | |
| "epoch": 1.7327276723569494, | |
| "step": 47300, | |
| "student_mlm_loss": 2.053771734237671 | |
| }, | |
| { | |
| "epoch": 1.7363909443915304, | |
| "grad_norm": 7.1568193435668945, | |
| "learning_rate": 2.118952256923115e-05, | |
| "loss": 3.3413, | |
| "step": 47400 | |
| }, | |
| { | |
| "combined_loss": 2.165384531021118, | |
| "distill_loss": 1.2572156190872192, | |
| "epoch": 1.7363909443915304, | |
| "step": 47400, | |
| "student_mlm_loss": 3.0735535621643066 | |
| }, | |
| { | |
| "epoch": 1.740054216426112, | |
| "grad_norm": 39.054439544677734, | |
| "learning_rate": 2.1128092979826524e-05, | |
| "loss": 4.8522, | |
| "step": 47500 | |
| }, | |
| { | |
| "combined_loss": 2.6122236251831055, | |
| "distill_loss": 1.1487023830413818, | |
| "epoch": 1.740054216426112, | |
| "step": 47500, | |
| "student_mlm_loss": 4.07574462890625 | |
| }, | |
| { | |
| "epoch": 1.743717488460693, | |
| "grad_norm": 3.18758487701416, | |
| "learning_rate": 2.1066663390421902e-05, | |
| "loss": 4.3993, | |
| "step": 47600 | |
| }, | |
| { | |
| "combined_loss": 6.344114303588867, | |
| "distill_loss": 1.1341725587844849, | |
| "epoch": 1.743717488460693, | |
| "step": 47600, | |
| "student_mlm_loss": 11.554056167602539 | |
| }, | |
| { | |
| "epoch": 1.7473807604952745, | |
| "grad_norm": 9.418896675109863, | |
| "learning_rate": 2.1005233801017273e-05, | |
| "loss": 8.7279, | |
| "step": 47700 | |
| }, | |
| { | |
| "combined_loss": 2.8721518516540527, | |
| "distill_loss": 1.2175838947296143, | |
| "epoch": 1.7473807604952745, | |
| "step": 47700, | |
| "student_mlm_loss": 4.526719570159912 | |
| }, | |
| { | |
| "epoch": 1.7510440325298555, | |
| "grad_norm": 4.730939865112305, | |
| "learning_rate": 2.094380421161265e-05, | |
| "loss": 2.74, | |
| "step": 47800 | |
| }, | |
| { | |
| "combined_loss": 1.8483730554580688, | |
| "distill_loss": 1.2789607048034668, | |
| "epoch": 1.7510440325298555, | |
| "step": 47800, | |
| "student_mlm_loss": 2.417785406112671 | |
| }, | |
| { | |
| "epoch": 1.754707304564437, | |
| "grad_norm": 4.566458225250244, | |
| "learning_rate": 2.0882374622208025e-05, | |
| "loss": 2.63, | |
| "step": 47900 | |
| }, | |
| { | |
| "combined_loss": 1.8073049783706665, | |
| "distill_loss": 1.3073413372039795, | |
| "epoch": 1.754707304564437, | |
| "step": 47900, | |
| "student_mlm_loss": 2.3072686195373535 | |
| }, | |
| { | |
| "epoch": 1.7583705765990183, | |
| "grad_norm": 14.967068672180176, | |
| "learning_rate": 2.0820945032803403e-05, | |
| "loss": 2.5821, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.7583705765990183, | |
| "eval_loss": 3.2400870323181152, | |
| "eval_runtime": 1.8322, | |
| "eval_samples_per_second": 3818.29, | |
| "eval_steps_per_second": 15.282, | |
| "step": 48000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 81894, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.7150683130961408e+16, | |
| "train_batch_size": 256, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |