| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 585, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "all_correct": 0.625, |
| "all_wrong": 0.0, |
| "completion_length": 360.09375, |
| "epoch": 0.0017094017094017094, |
| "grad_norm": 1.9826301268041722, |
| "kl": 0.0, |
| "learning_rate": 4.999963950687845e-07, |
| "loss": 0.0, |
| "reward": 2.6583333015441895, |
| "reward_std": 0.8285301923751831, |
| "rewards/accuracy_reward": 1.7989583015441895, |
| "rewards/format_reward": 0.53125, |
| "step": 1, |
| "temporal_rewards": 0.75 |
| }, |
| { |
| "all_correct": 0.75, |
| "all_wrong": 0.0, |
| "completion_length": 460.71875, |
| "epoch": 0.003418803418803419, |
| "grad_norm": 1.783036944219682, |
| "kl": 0.00041484832763671875, |
| "learning_rate": 4.999855803791026e-07, |
| "loss": 0.0, |
| "reward": 3.078125, |
| "reward_std": 0.714574933052063, |
| "rewards/accuracy_reward": 2.090625047683716, |
| "rewards/format_reward": 0.65625, |
| "step": 2, |
| "temporal_rewards": 0.75 |
| }, |
| { |
| "all_correct": 0.75, |
| "all_wrong": 0.0, |
| "completion_length": 458.53125, |
| "epoch": 0.005128205128205128, |
| "grad_norm": 1.9140893572712068, |
| "kl": 0.004611968994140625, |
| "learning_rate": 4.999675562428436e-07, |
| "loss": 0.0002, |
| "reward": 3.1812500953674316, |
| "reward_std": 0.6964144110679626, |
| "rewards/accuracy_reward": 2.0531249046325684, |
| "rewards/format_reward": 0.78125, |
| "step": 3, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 0.625, |
| "all_wrong": 0.125, |
| "completion_length": 342.34375, |
| "epoch": 0.006837606837606838, |
| "grad_norm": 1.6435177595568693, |
| "kl": 0.0047607421875, |
| "learning_rate": 4.99942323179814e-07, |
| "loss": 0.0002, |
| "reward": 3.0593748092651367, |
| "reward_std": 0.4561102092266083, |
| "rewards/accuracy_reward": 1.834375023841858, |
| "rewards/format_reward": 0.9375, |
| "step": 4, |
| "temporal_rewards": 0.625 |
| }, |
| { |
| "all_correct": 0.75, |
| "all_wrong": 0.0, |
| "completion_length": 369.84375, |
| "epoch": 0.008547008547008548, |
| "grad_norm": 1.9133739405680183, |
| "kl": 0.004207611083984375, |
| "learning_rate": 4.999098819177214e-07, |
| "loss": 0.0002, |
| "reward": 3.379166603088379, |
| "reward_std": 0.6761210560798645, |
| "rewards/accuracy_reward": 2.0791666507720947, |
| "rewards/format_reward": 0.9375, |
| "step": 5, |
| "temporal_rewards": 0.75 |
| }, |
| { |
| "all_correct": 0.625, |
| "all_wrong": 0.0, |
| "completion_length": 351.4375, |
| "epoch": 0.010256410256410256, |
| "grad_norm": 1.7373006031428762, |
| "kl": 0.008636474609375, |
| "learning_rate": 4.998702333921537e-07, |
| "loss": 0.0003, |
| "reward": 3.2947916984558105, |
| "reward_std": 0.7285541296005249, |
| "rewards/accuracy_reward": 1.988541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 6, |
| "temporal_rewards": 0.75 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 330.90625, |
| "epoch": 0.011965811965811967, |
| "grad_norm": 1.81484083295842, |
| "kl": 0.00798797607421875, |
| "learning_rate": 4.998233787465529e-07, |
| "loss": 0.0003, |
| "reward": 3.910416603088379, |
| "reward_std": 0.41429048776626587, |
| "rewards/accuracy_reward": 2.519791603088379, |
| "rewards/format_reward": 1.0, |
| "step": 7, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.75, |
| "all_wrong": 0.125, |
| "completion_length": 337.34375, |
| "epoch": 0.013675213675213675, |
| "grad_norm": 1.5267078479800362, |
| "kl": 0.00785064697265625, |
| "learning_rate": 4.99769319332181e-07, |
| "loss": 0.0003, |
| "reward": 3.3531250953674316, |
| "reward_std": 0.3145015239715576, |
| "rewards/accuracy_reward": 2.034374952316284, |
| "rewards/format_reward": 1.0, |
| "step": 8, |
| "temporal_rewards": 0.75 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 335.0625, |
| "epoch": 0.015384615384615385, |
| "grad_norm": 1.685582407194424, |
| "kl": 0.00934600830078125, |
| "learning_rate": 4.997080567080816e-07, |
| "loss": 0.0004, |
| "reward": 3.6895830631256104, |
| "reward_std": 0.36123037338256836, |
| "rewards/accuracy_reward": 2.3427083492279053, |
| "rewards/format_reward": 0.96875, |
| "step": 9, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 353.875, |
| "epoch": 0.017094017094017096, |
| "grad_norm": 1.5329400178924226, |
| "kl": 0.0093841552734375, |
| "learning_rate": 4.996395926410354e-07, |
| "loss": 0.0004, |
| "reward": 3.886458396911621, |
| "reward_std": 0.3707513213157654, |
| "rewards/accuracy_reward": 2.4739582538604736, |
| "rewards/format_reward": 0.96875, |
| "step": 10, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 351.0, |
| "epoch": 0.018803418803418803, |
| "grad_norm": 1.549766282148727, |
| "kl": 0.0126953125, |
| "learning_rate": 4.995639291055083e-07, |
| "loss": 0.0005, |
| "reward": 4.042708396911621, |
| "reward_std": 0.2290562391281128, |
| "rewards/accuracy_reward": 2.586458206176758, |
| "rewards/format_reward": 1.0, |
| "step": 11, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.75, |
| "all_wrong": 0.0, |
| "completion_length": 358.1875, |
| "epoch": 0.020512820512820513, |
| "grad_norm": 1.496305479107766, |
| "kl": 0.016632080078125, |
| "learning_rate": 4.994810682835951e-07, |
| "loss": 0.0007, |
| "reward": 3.6510417461395264, |
| "reward_std": 0.5492165088653564, |
| "rewards/accuracy_reward": 2.3135416507720947, |
| "rewards/format_reward": 0.96875, |
| "step": 12, |
| "temporal_rewards": 0.75 |
| }, |
| { |
| "all_correct": 0.75, |
| "all_wrong": 0.0, |
| "completion_length": 368.4375, |
| "epoch": 0.022222222222222223, |
| "grad_norm": 1.541763401661613, |
| "kl": 0.0149078369140625, |
| "learning_rate": 4.99391012564956e-07, |
| "loss": 0.0006, |
| "reward": 3.7291665077209473, |
| "reward_std": 0.4612278342247009, |
| "rewards/accuracy_reward": 2.351041793823242, |
| "rewards/format_reward": 0.96875, |
| "step": 13, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 374.09375, |
| "epoch": 0.023931623931623933, |
| "grad_norm": 1.548862572081062, |
| "kl": 0.0169219970703125, |
| "learning_rate": 4.992937645467487e-07, |
| "loss": 0.0007, |
| "reward": 3.936458110809326, |
| "reward_std": 0.32494786381721497, |
| "rewards/accuracy_reward": 2.5614583492279053, |
| "rewards/format_reward": 0.96875, |
| "step": 14, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 374.21875, |
| "epoch": 0.02564102564102564, |
| "grad_norm": 1.3211685700244122, |
| "kl": 0.0189056396484375, |
| "learning_rate": 4.991893270335525e-07, |
| "loss": 0.0008, |
| "reward": 3.8302083015441895, |
| "reward_std": 0.275448203086853, |
| "rewards/accuracy_reward": 2.386458396911621, |
| "rewards/format_reward": 1.0, |
| "step": 15, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 390.65625, |
| "epoch": 0.02735042735042735, |
| "grad_norm": 1.0827777389563216, |
| "kl": 0.0201568603515625, |
| "learning_rate": 4.990777030372877e-07, |
| "loss": 0.0008, |
| "reward": 3.918750047683716, |
| "reward_std": 0.27283355593681335, |
| "rewards/accuracy_reward": 2.46875, |
| "rewards/format_reward": 1.0, |
| "step": 16, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 396.5, |
| "epoch": 0.02905982905982906, |
| "grad_norm": 1.4719325483552113, |
| "kl": 0.0209808349609375, |
| "learning_rate": 4.989588957771289e-07, |
| "loss": 0.0008, |
| "reward": 4.057291030883789, |
| "reward_std": 0.38743653893470764, |
| "rewards/accuracy_reward": 2.613541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 17, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.15625, |
| "epoch": 0.03076923076923077, |
| "grad_norm": 1.3897844926570597, |
| "kl": 0.0229339599609375, |
| "learning_rate": 4.988329086794122e-07, |
| "loss": 0.0009, |
| "reward": 4.282291412353516, |
| "reward_std": 0.2059958279132843, |
| "rewards/accuracy_reward": 2.8260416984558105, |
| "rewards/format_reward": 1.0, |
| "step": 18, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 394.75, |
| "epoch": 0.03247863247863248, |
| "grad_norm": 1.4025810108116477, |
| "kl": 0.0246124267578125, |
| "learning_rate": 4.98699745377536e-07, |
| "loss": 0.001, |
| "reward": 3.6979165077209473, |
| "reward_std": 0.22571580111980438, |
| "rewards/accuracy_reward": 2.285416603088379, |
| "rewards/format_reward": 1.0, |
| "step": 19, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 395.375, |
| "epoch": 0.03418803418803419, |
| "grad_norm": 1.3422743689560948, |
| "kl": 0.0244293212890625, |
| "learning_rate": 4.98559409711857e-07, |
| "loss": 0.001, |
| "reward": 4.1895833015441895, |
| "reward_std": 0.34896954894065857, |
| "rewards/accuracy_reward": 2.764583110809326, |
| "rewards/format_reward": 1.0, |
| "step": 20, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.5625, |
| "epoch": 0.035897435897435895, |
| "grad_norm": 1.3706326401655777, |
| "kl": 0.0267791748046875, |
| "learning_rate": 4.984119057295782e-07, |
| "loss": 0.0011, |
| "reward": 4.2083330154418945, |
| "reward_std": 0.21551749110221863, |
| "rewards/accuracy_reward": 2.714583396911621, |
| "rewards/format_reward": 1.0, |
| "step": 21, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 416.375, |
| "epoch": 0.037606837606837605, |
| "grad_norm": 1.3430039575658776, |
| "kl": 0.0264739990234375, |
| "learning_rate": 4.982572376846336e-07, |
| "loss": 0.0011, |
| "reward": 4.012499809265137, |
| "reward_std": 0.36676347255706787, |
| "rewards/accuracy_reward": 2.5687501430511475, |
| "rewards/format_reward": 1.0, |
| "step": 22, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 414.5, |
| "epoch": 0.039316239316239315, |
| "grad_norm": 1.0704324370355776, |
| "kl": 0.027374267578125, |
| "learning_rate": 4.980954100375641e-07, |
| "loss": 0.0011, |
| "reward": 3.9510416984558105, |
| "reward_std": 0.21451421082019806, |
| "rewards/accuracy_reward": 2.476041793823242, |
| "rewards/format_reward": 1.0, |
| "step": 23, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.75, |
| "all_wrong": 0.125, |
| "completion_length": 385.6875, |
| "epoch": 0.041025641025641026, |
| "grad_norm": 1.319994732732891, |
| "kl": 0.0298309326171875, |
| "learning_rate": 4.979264274553905e-07, |
| "loss": 0.0012, |
| "reward": 3.832291603088379, |
| "reward_std": 0.3102354407310486, |
| "rewards/accuracy_reward": 2.4510416984558105, |
| "rewards/format_reward": 1.0, |
| "step": 24, |
| "temporal_rewards": 0.75 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 417.15625, |
| "epoch": 0.042735042735042736, |
| "grad_norm": 1.2440324072815687, |
| "kl": 0.025665283203125, |
| "learning_rate": 4.977502948114771e-07, |
| "loss": 0.001, |
| "reward": 4.2677083015441895, |
| "reward_std": 0.11905767768621445, |
| "rewards/accuracy_reward": 2.7989585399627686, |
| "rewards/format_reward": 1.0, |
| "step": 25, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 397.78125, |
| "epoch": 0.044444444444444446, |
| "grad_norm": 1.3286663602585649, |
| "kl": 0.02685546875, |
| "learning_rate": 4.975670171853925e-07, |
| "loss": 0.0011, |
| "reward": 3.9354166984558105, |
| "reward_std": 0.29135236144065857, |
| "rewards/accuracy_reward": 2.522916793823242, |
| "rewards/format_reward": 1.0, |
| "step": 26, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 429.0, |
| "epoch": 0.046153846153846156, |
| "grad_norm": 1.301323110823831, |
| "kl": 0.0247650146484375, |
| "learning_rate": 4.973765998627628e-07, |
| "loss": 0.001, |
| "reward": 3.8374998569488525, |
| "reward_std": 0.3257061839103699, |
| "rewards/accuracy_reward": 2.4000000953674316, |
| "rewards/format_reward": 1.0, |
| "step": 27, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.28125, |
| "epoch": 0.04786324786324787, |
| "grad_norm": 1.2554920983763502, |
| "kl": 0.029266357421875, |
| "learning_rate": 4.971790483351185e-07, |
| "loss": 0.0012, |
| "reward": 4.2864580154418945, |
| "reward_std": 0.131798654794693, |
| "rewards/accuracy_reward": 2.7989583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 28, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.75, |
| "all_wrong": 0.0, |
| "completion_length": 394.0625, |
| "epoch": 0.04957264957264957, |
| "grad_norm": 0.9339974580407308, |
| "kl": 0.0288543701171875, |
| "learning_rate": 4.969743682997371e-07, |
| "loss": 0.0012, |
| "reward": 3.6687498092651367, |
| "reward_std": 0.45044267177581787, |
| "rewards/accuracy_reward": 2.25, |
| "rewards/format_reward": 1.0, |
| "step": 29, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.4375, |
| "epoch": 0.05128205128205128, |
| "grad_norm": 1.2262654726579296, |
| "kl": 0.0308685302734375, |
| "learning_rate": 4.967625656594781e-07, |
| "loss": 0.0012, |
| "reward": 4.2395830154418945, |
| "reward_std": 0.16444088518619537, |
| "rewards/accuracy_reward": 2.7708332538604736, |
| "rewards/format_reward": 1.0, |
| "step": 30, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.25, |
| "epoch": 0.05299145299145299, |
| "grad_norm": 1.2573879496841676, |
| "kl": 0.0293731689453125, |
| "learning_rate": 4.965436465226134e-07, |
| "loss": 0.0012, |
| "reward": 4.016666412353516, |
| "reward_std": 0.14875400066375732, |
| "rewards/accuracy_reward": 2.535416603088379, |
| "rewards/format_reward": 1.0, |
| "step": 31, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 368.6875, |
| "epoch": 0.0547008547008547, |
| "grad_norm": 1.2564985420337802, |
| "kl": 0.030120849609375, |
| "learning_rate": 4.963176172026501e-07, |
| "loss": 0.0012, |
| "reward": 3.9968748092651367, |
| "reward_std": 0.3774029016494751, |
| "rewards/accuracy_reward": 2.559375047683716, |
| "rewards/format_reward": 1.0, |
| "step": 32, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 424.0625, |
| "epoch": 0.05641025641025641, |
| "grad_norm": 0.9694026424422826, |
| "kl": 0.033355712890625, |
| "learning_rate": 4.960844842181495e-07, |
| "loss": 0.0013, |
| "reward": 4.047916412353516, |
| "reward_std": 0.3180779814720154, |
| "rewards/accuracy_reward": 2.616666555404663, |
| "rewards/format_reward": 1.0, |
| "step": 33, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.03125, |
| "epoch": 0.05811965811965812, |
| "grad_norm": 1.1514296295303623, |
| "kl": 0.03143310546875, |
| "learning_rate": 4.958442542925385e-07, |
| "loss": 0.0013, |
| "reward": 3.9666666984558105, |
| "reward_std": 0.14849570393562317, |
| "rewards/accuracy_reward": 2.4791667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 34, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 362.71875, |
| "epoch": 0.05982905982905983, |
| "grad_norm": 1.1335397647489873, |
| "kl": 0.0335693359375, |
| "learning_rate": 4.955969343539162e-07, |
| "loss": 0.0013, |
| "reward": 4.120833396911621, |
| "reward_std": 0.1538814902305603, |
| "rewards/accuracy_reward": 2.6583333015441895, |
| "rewards/format_reward": 1.0, |
| "step": 35, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 385.40625, |
| "epoch": 0.06153846153846154, |
| "grad_norm": 1.3288920319551032, |
| "kl": 0.03155517578125, |
| "learning_rate": 4.953425315348533e-07, |
| "loss": 0.0013, |
| "reward": 4.229166507720947, |
| "reward_std": 0.2994784116744995, |
| "rewards/accuracy_reward": 2.7760415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 36, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 407.46875, |
| "epoch": 0.06324786324786325, |
| "grad_norm": 1.3491529940946705, |
| "kl": 0.0291900634765625, |
| "learning_rate": 4.950810531721873e-07, |
| "loss": 0.0012, |
| "reward": 4.1583333015441895, |
| "reward_std": 0.08171668648719788, |
| "rewards/accuracy_reward": 2.727083206176758, |
| "rewards/format_reward": 1.0, |
| "step": 37, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.375, |
| "epoch": 0.06495726495726496, |
| "grad_norm": 1.4603865780151248, |
| "kl": 0.02972412109375, |
| "learning_rate": 4.948125068068102e-07, |
| "loss": 0.0012, |
| "reward": 4.2864580154418945, |
| "reward_std": 0.11952318251132965, |
| "rewards/accuracy_reward": 2.811458110809326, |
| "rewards/format_reward": 1.0, |
| "step": 38, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 476.25, |
| "epoch": 0.06666666666666667, |
| "grad_norm": 1.3332368328871798, |
| "kl": 0.02838134765625, |
| "learning_rate": 4.945369001834514e-07, |
| "loss": 0.0011, |
| "reward": 4.363541603088379, |
| "reward_std": 0.16119801998138428, |
| "rewards/accuracy_reward": 2.8885416984558105, |
| "rewards/format_reward": 1.0, |
| "step": 39, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.90625, |
| "epoch": 0.06837606837606838, |
| "grad_norm": 1.1501096034585943, |
| "kl": 0.0298004150390625, |
| "learning_rate": 4.942542412504542e-07, |
| "loss": 0.0012, |
| "reward": 4.179166316986084, |
| "reward_std": 0.09968569874763489, |
| "rewards/accuracy_reward": 2.6854166984558105, |
| "rewards/format_reward": 1.0, |
| "step": 40, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.78125, |
| "epoch": 0.07008547008547009, |
| "grad_norm": 1.3385150586201982, |
| "kl": 0.0303802490234375, |
| "learning_rate": 4.939645381595469e-07, |
| "loss": 0.0012, |
| "reward": 4.415625095367432, |
| "reward_std": 0.12376689910888672, |
| "rewards/accuracy_reward": 2.9468750953674316, |
| "rewards/format_reward": 1.0, |
| "step": 41, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.125, |
| "epoch": 0.07179487179487179, |
| "grad_norm": 1.5092483520373827, |
| "kl": 0.032257080078125, |
| "learning_rate": 4.93667799265607e-07, |
| "loss": 0.0013, |
| "reward": 4.179166793823242, |
| "reward_std": 0.24446402490139008, |
| "rewards/accuracy_reward": 2.7291665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 42, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 413.6875, |
| "epoch": 0.0735042735042735, |
| "grad_norm": 0.8198637140300333, |
| "kl": 0.0333251953125, |
| "learning_rate": 4.933640331264209e-07, |
| "loss": 0.0013, |
| "reward": 4.356249809265137, |
| "reward_std": 0.02693380042910576, |
| "rewards/accuracy_reward": 2.862499952316284, |
| "rewards/format_reward": 1.0, |
| "step": 43, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.34375, |
| "epoch": 0.07521367521367521, |
| "grad_norm": 1.362239639644523, |
| "kl": 0.03790283203125, |
| "learning_rate": 4.930532485024371e-07, |
| "loss": 0.0015, |
| "reward": 4.239583492279053, |
| "reward_std": 0.15847893059253693, |
| "rewards/accuracy_reward": 2.7645833492279053, |
| "rewards/format_reward": 1.0, |
| "step": 44, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.34375, |
| "epoch": 0.07692307692307693, |
| "grad_norm": 1.348031471721414, |
| "kl": 0.03143310546875, |
| "learning_rate": 4.92735454356513e-07, |
| "loss": 0.0013, |
| "reward": 4.290624618530273, |
| "reward_std": 0.0891985222697258, |
| "rewards/accuracy_reward": 2.796875, |
| "rewards/format_reward": 1.0, |
| "step": 45, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.8125, |
| "epoch": 0.07863247863247863, |
| "grad_norm": 1.0447875433549505, |
| "kl": 0.0330810546875, |
| "learning_rate": 4.924106598536569e-07, |
| "loss": 0.0013, |
| "reward": 4.339583396911621, |
| "reward_std": 0.0612565279006958, |
| "rewards/accuracy_reward": 2.858333110809326, |
| "rewards/format_reward": 1.0, |
| "step": 46, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.03125, |
| "epoch": 0.08034188034188035, |
| "grad_norm": 1.1567990099669652, |
| "kl": 0.03277587890625, |
| "learning_rate": 4.920788743607635e-07, |
| "loss": 0.0013, |
| "reward": 4.3927083015441895, |
| "reward_std": 0.13107708096504211, |
| "rewards/accuracy_reward": 2.9114582538604736, |
| "rewards/format_reward": 1.0, |
| "step": 47, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 403.8125, |
| "epoch": 0.08205128205128205, |
| "grad_norm": 1.0369718009838986, |
| "kl": 0.03460693359375, |
| "learning_rate": 4.917401074463441e-07, |
| "loss": 0.0014, |
| "reward": 4.4270830154418945, |
| "reward_std": 0.13362948596477509, |
| "rewards/accuracy_reward": 2.9270834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 48, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.25, |
| "epoch": 0.08376068376068375, |
| "grad_norm": 0.7093588876792561, |
| "kl": 0.037353515625, |
| "learning_rate": 4.913943688802497e-07, |
| "loss": 0.0015, |
| "reward": 3.8843750953674316, |
| "reward_std": 0.1348297894001007, |
| "rewards/accuracy_reward": 2.390625, |
| "rewards/format_reward": 1.0, |
| "step": 49, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 415.625, |
| "epoch": 0.08547008547008547, |
| "grad_norm": 0.9797316320763304, |
| "kl": 0.035736083984375, |
| "learning_rate": 4.910416686333906e-07, |
| "loss": 0.0014, |
| "reward": 3.855208396911621, |
| "reward_std": 0.03958335518836975, |
| "rewards/accuracy_reward": 2.3614583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 50, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 402.71875, |
| "epoch": 0.08717948717948718, |
| "grad_norm": 0.7267487162340365, |
| "kl": 0.034515380859375, |
| "learning_rate": 4.906820168774477e-07, |
| "loss": 0.0014, |
| "reward": 4.229166507720947, |
| "reward_std": 0.04166668653488159, |
| "rewards/accuracy_reward": 2.7291665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 51, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 408.65625, |
| "epoch": 0.08888888888888889, |
| "grad_norm": 0.9894251393442087, |
| "kl": 0.0360107421875, |
| "learning_rate": 4.903154239845797e-07, |
| "loss": 0.0014, |
| "reward": 4.327083587646484, |
| "reward_std": 0.09055028110742569, |
| "rewards/accuracy_reward": 2.8270833492279053, |
| "rewards/format_reward": 1.0, |
| "step": 52, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 403.21875, |
| "epoch": 0.0905982905982906, |
| "grad_norm": 1.2134526914387425, |
| "kl": 0.0322265625, |
| "learning_rate": 4.899419005271241e-07, |
| "loss": 0.0013, |
| "reward": 4.413541793823242, |
| "reward_std": 0.1331278383731842, |
| "rewards/accuracy_reward": 2.9197916984558105, |
| "rewards/format_reward": 1.0, |
| "step": 53, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 428.34375, |
| "epoch": 0.09230769230769231, |
| "grad_norm": 1.1445171148845454, |
| "kl": 0.035369873046875, |
| "learning_rate": 4.895614572772916e-07, |
| "loss": 0.0014, |
| "reward": 4.2208333015441895, |
| "reward_std": 0.23445481061935425, |
| "rewards/accuracy_reward": 2.745833396911621, |
| "rewards/format_reward": 1.0, |
| "step": 54, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 404.46875, |
| "epoch": 0.09401709401709402, |
| "grad_norm": 0.9479258360176128, |
| "kl": 0.036468505859375, |
| "learning_rate": 4.891741052068563e-07, |
| "loss": 0.0015, |
| "reward": 4.464583396911621, |
| "reward_std": 0.05090419948101044, |
| "rewards/accuracy_reward": 2.977083206176758, |
| "rewards/format_reward": 1.0, |
| "step": 55, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 419.21875, |
| "epoch": 0.09572649572649573, |
| "grad_norm": 1.185262142708624, |
| "kl": 0.0364990234375, |
| "learning_rate": 4.887798554868387e-07, |
| "loss": 0.0015, |
| "reward": 4.202083110809326, |
| "reward_std": 0.08526714146137238, |
| "rewards/accuracy_reward": 2.7333333492279053, |
| "rewards/format_reward": 1.0, |
| "step": 56, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 427.78125, |
| "epoch": 0.09743589743589744, |
| "grad_norm": 0.6149452604974177, |
| "kl": 0.033416748046875, |
| "learning_rate": 4.883787194871841e-07, |
| "loss": 0.0013, |
| "reward": 4.359375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.871875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 57, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.4375, |
| "epoch": 0.09914529914529914, |
| "grad_norm": 1.1440282896265581, |
| "kl": 0.039398193359375, |
| "learning_rate": 4.879707087764336e-07, |
| "loss": 0.0016, |
| "reward": 4.335416793823242, |
| "reward_std": 0.06786316633224487, |
| "rewards/accuracy_reward": 2.835416793823242, |
| "rewards/format_reward": 1.0, |
| "step": 58, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 403.34375, |
| "epoch": 0.10085470085470086, |
| "grad_norm": 1.1667079878189268, |
| "kl": 0.0361328125, |
| "learning_rate": 4.875558351213917e-07, |
| "loss": 0.0014, |
| "reward": 4.196874618530273, |
| "reward_std": 0.07863453030586243, |
| "rewards/accuracy_reward": 2.7093749046325684, |
| "rewards/format_reward": 1.0, |
| "step": 59, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 402.3125, |
| "epoch": 0.10256410256410256, |
| "grad_norm": 0.7257762494267078, |
| "kl": 0.04315185546875, |
| "learning_rate": 4.871341104867864e-07, |
| "loss": 0.0017, |
| "reward": 4.106249809265137, |
| "reward_std": 0.03750002384185791, |
| "rewards/accuracy_reward": 2.606250047683716, |
| "rewards/format_reward": 1.0, |
| "step": 60, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 390.375, |
| "epoch": 0.10427350427350428, |
| "grad_norm": 1.246730170179853, |
| "kl": 0.038177490234375, |
| "learning_rate": 4.86705547034924e-07, |
| "loss": 0.0015, |
| "reward": 4.362500190734863, |
| "reward_std": 0.18642225861549377, |
| "rewards/accuracy_reward": 2.9124999046325684, |
| "rewards/format_reward": 1.0, |
| "step": 61, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.5625, |
| "epoch": 0.10598290598290598, |
| "grad_norm": 0.757303226225468, |
| "kl": 0.0382080078125, |
| "learning_rate": 4.862701571253386e-07, |
| "loss": 0.0015, |
| "reward": 4.3645830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.8645832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 62, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 403.84375, |
| "epoch": 0.1076923076923077, |
| "grad_norm": 0.7830222478805806, |
| "kl": 0.0361328125, |
| "learning_rate": 4.858279533144357e-07, |
| "loss": 0.0014, |
| "reward": 4.366666793823242, |
| "reward_std": 0.016666710376739502, |
| "rewards/accuracy_reward": 2.866666793823242, |
| "rewards/format_reward": 1.0, |
| "step": 63, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.0, |
| "epoch": 0.1094017094017094, |
| "grad_norm": 0.7856721207258216, |
| "kl": 0.037872314453125, |
| "learning_rate": 4.853789483551299e-07, |
| "loss": 0.0015, |
| "reward": 4.328125, |
| "reward_std": 0.07114279270172119, |
| "rewards/accuracy_reward": 2.828125, |
| "rewards/format_reward": 1.0, |
| "step": 64, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.6875, |
| "epoch": 0.1111111111111111, |
| "grad_norm": 0.5081050946401335, |
| "kl": 0.040435791015625, |
| "learning_rate": 4.849231551964771e-07, |
| "loss": 0.0016, |
| "reward": 4.213541507720947, |
| "reward_std": 0.059839192777872086, |
| "rewards/accuracy_reward": 2.7135415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 65, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.4375, |
| "epoch": 0.11282051282051282, |
| "grad_norm": 0.7425251071608985, |
| "kl": 0.041046142578125, |
| "learning_rate": 4.844605869833011e-07, |
| "loss": 0.0016, |
| "reward": 4.2239580154418945, |
| "reward_std": 0.05208335071802139, |
| "rewards/accuracy_reward": 2.7239584922790527, |
| "rewards/format_reward": 1.0, |
| "step": 66, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.59375, |
| "epoch": 0.11452991452991453, |
| "grad_norm": 1.7762232281747519, |
| "kl": 0.03778076171875, |
| "learning_rate": 4.839912570558147e-07, |
| "loss": 0.0015, |
| "reward": 4.457291603088379, |
| "reward_std": 0.08541667461395264, |
| "rewards/accuracy_reward": 2.9635417461395264, |
| "rewards/format_reward": 1.0, |
| "step": 67, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.96875, |
| "epoch": 0.11623931623931624, |
| "grad_norm": 1.0171107252467595, |
| "kl": 0.039794921875, |
| "learning_rate": 4.835151789492348e-07, |
| "loss": 0.0016, |
| "reward": 4.327083587646484, |
| "reward_std": 0.06022907793521881, |
| "rewards/accuracy_reward": 2.8333334922790527, |
| "rewards/format_reward": 1.0, |
| "step": 68, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.0, |
| "epoch": 0.11794871794871795, |
| "grad_norm": 0.8418226397769828, |
| "kl": 0.038909912109375, |
| "learning_rate": 4.830323663933919e-07, |
| "loss": 0.0016, |
| "reward": 4.474999904632568, |
| "reward_std": 0.034150637686252594, |
| "rewards/accuracy_reward": 2.981250047683716, |
| "rewards/format_reward": 1.0, |
| "step": 69, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 358.40625, |
| "epoch": 0.11965811965811966, |
| "grad_norm": 1.2733531515130359, |
| "kl": 0.036468505859375, |
| "learning_rate": 4.825428333123346e-07, |
| "loss": 0.0015, |
| "reward": 4.4375, |
| "reward_std": 0.06786306202411652, |
| "rewards/accuracy_reward": 2.968750238418579, |
| "rewards/format_reward": 1.0, |
| "step": 70, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.875, |
| "epoch": 0.12136752136752137, |
| "grad_norm": 1.1124941217119864, |
| "kl": 0.039459228515625, |
| "learning_rate": 4.820465938239273e-07, |
| "loss": 0.0016, |
| "reward": 4.077083110809326, |
| "reward_std": 0.09583336114883423, |
| "rewards/accuracy_reward": 2.5833332538604736, |
| "rewards/format_reward": 1.0, |
| "step": 71, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.0625, |
| "epoch": 0.12307692307692308, |
| "grad_norm": 0.7611058988515808, |
| "kl": 0.037384033203125, |
| "learning_rate": 4.815436622394441e-07, |
| "loss": 0.0015, |
| "reward": 4.3333330154418945, |
| "reward_std": 0.07452815771102905, |
| "rewards/accuracy_reward": 2.8333330154418945, |
| "rewards/format_reward": 1.0, |
| "step": 72, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 401.3125, |
| "epoch": 0.12478632478632479, |
| "grad_norm": 0.7580960727925862, |
| "kl": 0.03826904296875, |
| "learning_rate": 4.810340530631549e-07, |
| "loss": 0.0015, |
| "reward": 4.196874618530273, |
| "reward_std": 0.042802631855010986, |
| "rewards/accuracy_reward": 2.7093749046325684, |
| "rewards/format_reward": 1.0, |
| "step": 73, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.8125, |
| "epoch": 0.1264957264957265, |
| "grad_norm": 1.0374945029632214, |
| "kl": 0.036468505859375, |
| "learning_rate": 4.805177809919081e-07, |
| "loss": 0.0015, |
| "reward": 4.40625, |
| "reward_std": 0.13188013434410095, |
| "rewards/accuracy_reward": 2.9124999046325684, |
| "rewards/format_reward": 1.0, |
| "step": 74, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 369.5, |
| "epoch": 0.1282051282051282, |
| "grad_norm": 1.4163910687642582, |
| "kl": 0.0360107421875, |
| "learning_rate": 4.799948609147061e-07, |
| "loss": 0.0014, |
| "reward": 4.358333110809326, |
| "reward_std": 0.20628556609153748, |
| "rewards/accuracy_reward": 2.9083333015441895, |
| "rewards/format_reward": 0.96875, |
| "step": 75, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 376.75, |
| "epoch": 0.12991452991452992, |
| "grad_norm": 0.9270824940096077, |
| "kl": 0.03839111328125, |
| "learning_rate": 4.794653079122759e-07, |
| "loss": 0.0015, |
| "reward": 3.981250047683716, |
| "reward_std": 0.3762458562850952, |
| "rewards/accuracy_reward": 2.53125, |
| "rewards/format_reward": 1.0, |
| "step": 76, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.625, |
| "epoch": 0.13162393162393163, |
| "grad_norm": 0.9270824940096077, |
| "kl": 0.036865234375, |
| "learning_rate": 4.794653079122759e-07, |
| "loss": 0.0015, |
| "reward": 4.4666666984558105, |
| "reward_std": 0.053745806217193604, |
| "rewards/accuracy_reward": 2.972916603088379, |
| "rewards/format_reward": 1.0, |
| "step": 77, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 365.625, |
| "epoch": 0.13333333333333333, |
| "grad_norm": 1.1283643584028937, |
| "kl": 0.03497314453125, |
| "learning_rate": 4.789291372566351e-07, |
| "loss": 0.0014, |
| "reward": 4.336458206176758, |
| "reward_std": 0.06416243314743042, |
| "rewards/accuracy_reward": 2.8489584922790527, |
| "rewards/format_reward": 1.0, |
| "step": 78, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.78125, |
| "epoch": 0.13504273504273503, |
| "grad_norm": 1.0511802041200045, |
| "kl": 0.03485107421875, |
| "learning_rate": 4.783863644106502e-07, |
| "loss": 0.0014, |
| "reward": 4.3385419845581055, |
| "reward_std": 0.05041898041963577, |
| "rewards/accuracy_reward": 2.851041793823242, |
| "rewards/format_reward": 1.0, |
| "step": 79, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.0625, |
| "epoch": 0.13675213675213677, |
| "grad_norm": 1.1818496933235974, |
| "kl": 0.0379638671875, |
| "learning_rate": 4.778370050275913e-07, |
| "loss": 0.0015, |
| "reward": 4.327083110809326, |
| "reward_std": 0.055192895233631134, |
| "rewards/accuracy_reward": 2.8458333015441895, |
| "rewards/format_reward": 1.0, |
| "step": 80, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.21875, |
| "epoch": 0.13846153846153847, |
| "grad_norm": 0.9747451365148834, |
| "kl": 0.032684326171875, |
| "learning_rate": 4.772810749506809e-07, |
| "loss": 0.0013, |
| "reward": 4.213541507720947, |
| "reward_std": 0.05453469604253769, |
| "rewards/accuracy_reward": 2.7197914123535156, |
| "rewards/format_reward": 1.0, |
| "step": 81, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.21875, |
| "epoch": 0.14017094017094017, |
| "grad_norm": 0.5883256070908801, |
| "kl": 0.031829833984375, |
| "learning_rate": 4.767185902126363e-07, |
| "loss": 0.0013, |
| "reward": 4.368750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 2.875, |
| "rewards/format_reward": 1.0, |
| "step": 82, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 365.8125, |
| "epoch": 0.14188034188034188, |
| "grad_norm": 1.0703457007134716, |
| "kl": 0.038055419921875, |
| "learning_rate": 4.7614956703520804e-07, |
| "loss": 0.0015, |
| "reward": 4.228124618530273, |
| "reward_std": 0.15567252039909363, |
| "rewards/accuracy_reward": 2.734375, |
| "rewards/format_reward": 1.0, |
| "step": 83, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 359.59375, |
| "epoch": 0.14358974358974358, |
| "grad_norm": 1.2506164162586872, |
| "kl": 0.03045654296875, |
| "learning_rate": 4.755740218287112e-07, |
| "loss": 0.0012, |
| "reward": 4.460416793823242, |
| "reward_std": 0.06860043108463287, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 84, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.9375, |
| "epoch": 0.1452991452991453, |
| "grad_norm": 0.7246056285956093, |
| "kl": 0.0301971435546875, |
| "learning_rate": 4.74991971191553e-07, |
| "loss": 0.0012, |
| "reward": 4.366666793823242, |
| "reward_std": 0.016666710376739502, |
| "rewards/accuracy_reward": 2.866666555404663, |
| "rewards/format_reward": 1.0, |
| "step": 85, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.53125, |
| "epoch": 0.147008547008547, |
| "grad_norm": 1.1668199192545485, |
| "kl": 0.0360107421875, |
| "learning_rate": 4.7440343190975353e-07, |
| "loss": 0.0014, |
| "reward": 4.4395833015441895, |
| "reward_std": 0.0936460942029953, |
| "rewards/accuracy_reward": 2.9583334922790527, |
| "rewards/format_reward": 1.0, |
| "step": 86, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.75, |
| "epoch": 0.14871794871794872, |
| "grad_norm": 0.9704193227590118, |
| "kl": 0.03289794921875, |
| "learning_rate": 4.738084209564617e-07, |
| "loss": 0.0013, |
| "reward": 4.3177080154418945, |
| "reward_std": 0.08490978181362152, |
| "rewards/accuracy_reward": 2.8302083015441895, |
| "rewards/format_reward": 1.0, |
| "step": 87, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 361.1875, |
| "epoch": 0.15042735042735042, |
| "grad_norm": 1.1259678958812716, |
| "kl": 0.0352783203125, |
| "learning_rate": 4.73206955491466e-07, |
| "loss": 0.0014, |
| "reward": 4.262499809265137, |
| "reward_std": 0.09999996423721313, |
| "rewards/accuracy_reward": 2.78125, |
| "rewards/format_reward": 1.0, |
| "step": 88, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.15625, |
| "epoch": 0.15213675213675212, |
| "grad_norm": 1.1425644306497342, |
| "kl": 0.035125732421875, |
| "learning_rate": 4.7259905286069954e-07, |
| "loss": 0.0014, |
| "reward": 4.217708587646484, |
| "reward_std": 0.05812295526266098, |
| "rewards/accuracy_reward": 2.7177083492279053, |
| "rewards/format_reward": 1.0, |
| "step": 89, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.21875, |
| "epoch": 0.15384615384615385, |
| "grad_norm": 0.9074640334431748, |
| "kl": 0.03875732421875, |
| "learning_rate": 4.7198473059573974e-07, |
| "loss": 0.0015, |
| "reward": 4.353124618530273, |
| "reward_std": 0.04375004023313522, |
| "rewards/accuracy_reward": 2.859375, |
| "rewards/format_reward": 1.0, |
| "step": 90, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.125, |
| "epoch": 0.15555555555555556, |
| "grad_norm": 0.7003982378481962, |
| "kl": 0.032257080078125, |
| "learning_rate": 4.7136400641330245e-07, |
| "loss": 0.0013, |
| "reward": 4.480208396911621, |
| "reward_std": 0.03958336263895035, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 91, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.71875, |
| "epoch": 0.15726495726495726, |
| "grad_norm": 0.875300468522747, |
| "kl": 0.033599853515625, |
| "learning_rate": 4.707368982147317e-07, |
| "loss": 0.0013, |
| "reward": 4.135416507720947, |
| "reward_std": 0.10416668653488159, |
| "rewards/accuracy_reward": 2.6354165077209473, |
| "rewards/format_reward": 1.0, |
| "step": 92, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.4375, |
| "epoch": 0.15897435897435896, |
| "grad_norm": 1.3612654393709456, |
| "kl": 0.03582763671875, |
| "learning_rate": 4.7010342408548287e-07, |
| "loss": 0.0014, |
| "reward": 4.120833396911621, |
| "reward_std": 0.17232877016067505, |
| "rewards/accuracy_reward": 2.664583206176758, |
| "rewards/format_reward": 1.0, |
| "step": 93, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.28125, |
| "epoch": 0.1606837606837607, |
| "grad_norm": 0.5201581820004612, |
| "kl": 0.037445068359375, |
| "learning_rate": 4.6946360229460114e-07, |
| "loss": 0.0015, |
| "reward": 4.34375, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.84375, |
| "rewards/format_reward": 1.0, |
| "step": 94, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.8125, |
| "epoch": 0.1623931623931624, |
| "grad_norm": 0.681909030921276, |
| "kl": 0.034759521484375, |
| "learning_rate": 4.6881745129419493e-07, |
| "loss": 0.0014, |
| "reward": 4.340624809265137, |
| "reward_std": 0.06875002384185791, |
| "rewards/accuracy_reward": 2.840625047683716, |
| "rewards/format_reward": 1.0, |
| "step": 95, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 376.03125, |
| "epoch": 0.1641025641025641, |
| "grad_norm": 1.1733277547649887, |
| "kl": 0.0390625, |
| "learning_rate": 4.6816498971890357e-07, |
| "loss": 0.0016, |
| "reward": 4.235416412353516, |
| "reward_std": 0.2851067781448364, |
| "rewards/accuracy_reward": 2.7916667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 96, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.25, |
| "epoch": 0.1658119658119658, |
| "grad_norm": 1.2608932989375246, |
| "kl": 0.037841796875, |
| "learning_rate": 4.675062363853598e-07, |
| "loss": 0.0015, |
| "reward": 4.422916412353516, |
| "reward_std": 0.146120086312294, |
| "rewards/accuracy_reward": 2.922916889190674, |
| "rewards/format_reward": 1.0, |
| "step": 97, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 369.21875, |
| "epoch": 0.1675213675213675, |
| "grad_norm": 0.9647820810612151, |
| "kl": 0.034912109375, |
| "learning_rate": 4.668412102916473e-07, |
| "loss": 0.0014, |
| "reward": 4.3302083015441895, |
| "reward_std": 0.04382467269897461, |
| "rewards/accuracy_reward": 2.855208396911621, |
| "rewards/format_reward": 1.0, |
| "step": 98, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.125, |
| "epoch": 0.16923076923076924, |
| "grad_norm": 1.2632460448619276, |
| "kl": 0.036651611328125, |
| "learning_rate": 4.661699306167527e-07, |
| "loss": 0.0015, |
| "reward": 4.460416793823242, |
| "reward_std": 0.06192883849143982, |
| "rewards/accuracy_reward": 2.972916603088379, |
| "rewards/format_reward": 1.0, |
| "step": 99, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.25, |
| "epoch": 0.17094017094017094, |
| "grad_norm": 0.8583887047833437, |
| "kl": 0.0360107421875, |
| "learning_rate": 4.6549241672001225e-07, |
| "loss": 0.0014, |
| "reward": 4.269791603088379, |
| "reward_std": 0.08541667461395264, |
| "rewards/accuracy_reward": 2.7760415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 100, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.5, |
| "epoch": 0.17264957264957265, |
| "grad_norm": 1.1101859505758365, |
| "kl": 0.03741455078125, |
| "learning_rate": 4.648086881405542e-07, |
| "loss": 0.0015, |
| "reward": 3.9375, |
| "reward_std": 0.30000001192092896, |
| "rewards/accuracy_reward": 2.481250047683716, |
| "rewards/format_reward": 1.0, |
| "step": 101, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 378.625, |
| "epoch": 0.17435897435897435, |
| "grad_norm": 1.247638864286834, |
| "kl": 0.045440673828125, |
| "learning_rate": 4.6411876459673425e-07, |
| "loss": 0.0018, |
| "reward": 4.053124904632568, |
| "reward_std": 0.3468588590621948, |
| "rewards/accuracy_reward": 2.621875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 102, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.90625, |
| "epoch": 0.17606837606837608, |
| "grad_norm": 0.5504424525596744, |
| "kl": 0.039276123046875, |
| "learning_rate": 4.634226659855681e-07, |
| "loss": 0.0016, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 103, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.09375, |
| "epoch": 0.17777777777777778, |
| "grad_norm": 0.10526479007567158, |
| "kl": 0.03863525390625, |
| "learning_rate": 4.6272041238215624e-07, |
| "loss": 0.0015, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 104, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 397.59375, |
| "epoch": 0.1794871794871795, |
| "grad_norm": 0.7304258125371498, |
| "kl": 0.036346435546875, |
| "learning_rate": 4.6201202403910643e-07, |
| "loss": 0.0015, |
| "reward": 4.356249809265137, |
| "reward_std": 0.2124999761581421, |
| "rewards/accuracy_reward": 2.90625, |
| "rewards/format_reward": 1.0, |
| "step": 105, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.3125, |
| "epoch": 0.1811965811965812, |
| "grad_norm": 1.3375206675543057, |
| "kl": 0.0443115234375, |
| "learning_rate": 4.612975213859487e-07, |
| "loss": 0.0018, |
| "reward": 4.321874618530273, |
| "reward_std": 0.22130155563354492, |
| "rewards/accuracy_reward": 2.8343749046325684, |
| "rewards/format_reward": 1.0, |
| "step": 106, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.0, |
| "epoch": 0.1829059829059829, |
| "grad_norm": 1.395717497711829, |
| "kl": 0.042724609375, |
| "learning_rate": 4.6057692502854615e-07, |
| "loss": 0.0017, |
| "reward": 4.465624809265137, |
| "reward_std": 0.06875002384185791, |
| "rewards/accuracy_reward": 2.965625047683716, |
| "rewards/format_reward": 1.0, |
| "step": 107, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 407.03125, |
| "epoch": 0.18461538461538463, |
| "grad_norm": 0.4761809503163303, |
| "kl": 0.0377197265625, |
| "learning_rate": 4.5985025574850147e-07, |
| "loss": 0.0015, |
| "reward": 4.4979166984558105, |
| "reward_std": 0.004166662693023682, |
| "rewards/accuracy_reward": 2.9979166984558105, |
| "rewards/format_reward": 1.0, |
| "step": 108, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 409.375, |
| "epoch": 0.18632478632478633, |
| "grad_norm": 0.7405328781008026, |
| "kl": 0.039459228515625, |
| "learning_rate": 4.591175345025566e-07, |
| "loss": 0.0016, |
| "reward": 4.4270830154418945, |
| "reward_std": 0.09300213307142258, |
| "rewards/accuracy_reward": 2.9270834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 109, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.28125, |
| "epoch": 0.18803418803418803, |
| "grad_norm": 0.4422606573426667, |
| "kl": 0.04058837890625, |
| "learning_rate": 4.5837878242198936e-07, |
| "loss": 0.0016, |
| "reward": 4.46875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 110, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.78125, |
| "epoch": 0.18974358974358974, |
| "grad_norm": 0.7932812380553261, |
| "kl": 0.04156494140625, |
| "learning_rate": 4.576340208120029e-07, |
| "loss": 0.0017, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 111, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.625, |
| "epoch": 0.19145299145299147, |
| "grad_norm": 0.9371190698328027, |
| "kl": 0.042938232421875, |
| "learning_rate": 4.568832711511125e-07, |
| "loss": 0.0017, |
| "reward": 4.453125, |
| "reward_std": 0.08570331335067749, |
| "rewards/accuracy_reward": 2.965625047683716, |
| "rewards/format_reward": 1.0, |
| "step": 112, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 417.09375, |
| "epoch": 0.19316239316239317, |
| "grad_norm": 0.9352268527879245, |
| "kl": 0.039825439453125, |
| "learning_rate": 4.56126555090525e-07, |
| "loss": 0.0016, |
| "reward": 4.378125190734863, |
| "reward_std": 0.2437499761581421, |
| "rewards/accuracy_reward": 2.8968749046325684, |
| "rewards/format_reward": 1.0, |
| "step": 113, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 411.0625, |
| "epoch": 0.19487179487179487, |
| "grad_norm": 1.0061361213200832, |
| "kl": 0.048492431640625, |
| "learning_rate": 4.5536389445351543e-07, |
| "loss": 0.0019, |
| "reward": 4.324999809265137, |
| "reward_std": 0.22499999403953552, |
| "rewards/accuracy_reward": 2.8687500953674316, |
| "rewards/format_reward": 0.96875, |
| "step": 114, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 399.40625, |
| "epoch": 0.19658119658119658, |
| "grad_norm": 0.7838565061379232, |
| "kl": 0.037445068359375, |
| "learning_rate": 4.5459531123479673e-07, |
| "loss": 0.0015, |
| "reward": 4.407291889190674, |
| "reward_std": 0.1695673167705536, |
| "rewards/accuracy_reward": 2.913541555404663, |
| "rewards/format_reward": 1.0, |
| "step": 115, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.3125, |
| "epoch": 0.19829059829059828, |
| "grad_norm": 0.7434533516049402, |
| "kl": 0.05072021484375, |
| "learning_rate": 4.5382082759988605e-07, |
| "loss": 0.002, |
| "reward": 4.337500095367432, |
| "reward_std": 0.07500002533197403, |
| "rewards/accuracy_reward": 2.8375000953674316, |
| "rewards/format_reward": 1.0, |
| "step": 116, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 399.3125, |
| "epoch": 0.2, |
| "grad_norm": 1.0066718401998942, |
| "kl": 0.040252685546875, |
| "learning_rate": 4.530404658844653e-07, |
| "loss": 0.0016, |
| "reward": 4.273958206176758, |
| "reward_std": 0.22102615237236023, |
| "rewards/accuracy_reward": 2.8177080154418945, |
| "rewards/format_reward": 1.0, |
| "step": 117, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.5625, |
| "epoch": 0.20170940170940171, |
| "grad_norm": 1.2338773011379278, |
| "kl": 0.040985107421875, |
| "learning_rate": 4.5225424859373684e-07, |
| "loss": 0.0016, |
| "reward": 4.435416221618652, |
| "reward_std": 0.12916666269302368, |
| "rewards/accuracy_reward": 2.9479165077209473, |
| "rewards/format_reward": 1.0, |
| "step": 118, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.6875, |
| "epoch": 0.20341880341880342, |
| "grad_norm": 1.2606475694446262, |
| "kl": 0.044189453125, |
| "learning_rate": 4.5146219840177475e-07, |
| "loss": 0.0018, |
| "reward": 4.474999904632568, |
| "reward_std": 0.050000011920928955, |
| "rewards/accuracy_reward": 2.987499952316284, |
| "rewards/format_reward": 1.0, |
| "step": 119, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 410.34375, |
| "epoch": 0.20512820512820512, |
| "grad_norm": 1.0930625377031935, |
| "kl": 0.034515380859375, |
| "learning_rate": 4.506643381508707e-07, |
| "loss": 0.0014, |
| "reward": 4.4635419845581055, |
| "reward_std": 0.07291674613952637, |
| "rewards/accuracy_reward": 2.9635417461395264, |
| "rewards/format_reward": 1.0, |
| "step": 120, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 408.71875, |
| "epoch": 0.20683760683760682, |
| "grad_norm": 1.1859878555606564, |
| "kl": 0.041656494140625, |
| "learning_rate": 4.498606908508753e-07, |
| "loss": 0.0017, |
| "reward": 4.470832824707031, |
| "reward_std": 0.058333415538072586, |
| "rewards/accuracy_reward": 2.9708335399627686, |
| "rewards/format_reward": 1.0, |
| "step": 121, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 412.5, |
| "epoch": 0.20854700854700856, |
| "grad_norm": 0.854920065845963, |
| "kl": 0.036956787109375, |
| "learning_rate": 4.490512796785344e-07, |
| "loss": 0.0015, |
| "reward": 4.475000381469727, |
| "reward_std": 0.050000011920928955, |
| "rewards/accuracy_reward": 2.981250047683716, |
| "rewards/format_reward": 1.0, |
| "step": 122, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 404.125, |
| "epoch": 0.21025641025641026, |
| "grad_norm": 1.2913323108581982, |
| "kl": 0.04022216796875, |
| "learning_rate": 4.4823612797682087e-07, |
| "loss": 0.0016, |
| "reward": 4.4083333015441895, |
| "reward_std": 0.10176850110292435, |
| "rewards/accuracy_reward": 2.914583206176758, |
| "rewards/format_reward": 1.0, |
| "step": 123, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 408.75, |
| "epoch": 0.21196581196581196, |
| "grad_norm": 1.2109364265824558, |
| "kl": 0.042877197265625, |
| "learning_rate": 4.474152592542612e-07, |
| "loss": 0.0017, |
| "reward": 4.381249904632568, |
| "reward_std": 0.22619642317295074, |
| "rewards/accuracy_reward": 2.887500047683716, |
| "rewards/format_reward": 1.0, |
| "step": 124, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 404.03125, |
| "epoch": 0.21367521367521367, |
| "grad_norm": 0.7902156024479733, |
| "kl": 0.0445556640625, |
| "learning_rate": 4.4658869718425774e-07, |
| "loss": 0.0018, |
| "reward": 4.4864583015441895, |
| "reward_std": 0.027083376422524452, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 125, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 402.03125, |
| "epoch": 0.2153846153846154, |
| "grad_norm": 1.0575353123534599, |
| "kl": 0.044830322265625, |
| "learning_rate": 4.457564656044056e-07, |
| "loss": 0.0018, |
| "reward": 4.453125, |
| "reward_std": 0.06609740853309631, |
| "rewards/accuracy_reward": 2.9656248092651367, |
| "rewards/format_reward": 1.0, |
| "step": 126, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 397.375, |
| "epoch": 0.2170940170940171, |
| "grad_norm": 1.0839676629160238, |
| "kl": 0.043975830078125, |
| "learning_rate": 4.4491858851580553e-07, |
| "loss": 0.0018, |
| "reward": 4.472916603088379, |
| "reward_std": 0.05416667461395264, |
| "rewards/accuracy_reward": 2.9854166507720947, |
| "rewards/format_reward": 1.0, |
| "step": 127, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 399.09375, |
| "epoch": 0.2188034188034188, |
| "grad_norm": 1.4967525172984775, |
| "kl": 0.043731689453125, |
| "learning_rate": 4.4407509008237196e-07, |
| "loss": 0.0018, |
| "reward": 4.415625095367432, |
| "reward_std": 0.12062296271324158, |
| "rewards/accuracy_reward": 2.921875, |
| "rewards/format_reward": 1.0, |
| "step": 128, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.65625, |
| "epoch": 0.2205128205128205, |
| "grad_norm": 1.156753232859856, |
| "kl": 0.045379638671875, |
| "learning_rate": 4.4322599463013545e-07, |
| "loss": 0.0018, |
| "reward": 4.4510416984558105, |
| "reward_std": 0.07613600790500641, |
| "rewards/accuracy_reward": 2.9510416984558105, |
| "rewards/format_reward": 1.0, |
| "step": 129, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 400.375, |
| "epoch": 0.2222222222222222, |
| "grad_norm": 0.7981149668305919, |
| "kl": 0.04595947265625, |
| "learning_rate": 4.4237132664654147e-07, |
| "loss": 0.0018, |
| "reward": 4.368750095367432, |
| "reward_std": 0.1907489001750946, |
| "rewards/accuracy_reward": 2.875, |
| "rewards/format_reward": 1.0, |
| "step": 130, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 397.0625, |
| "epoch": 0.22393162393162394, |
| "grad_norm": 0.8783345359959521, |
| "kl": 0.046112060546875, |
| "learning_rate": 4.415111107797445e-07, |
| "loss": 0.0018, |
| "reward": 4.3729166984558105, |
| "reward_std": 0.17916670441627502, |
| "rewards/accuracy_reward": 2.910416841506958, |
| "rewards/format_reward": 1.0, |
| "step": 131, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.375, |
| "epoch": 0.22564102564102564, |
| "grad_norm": 0.7538049450716533, |
| "kl": 0.047943115234375, |
| "learning_rate": 4.4064537183789675e-07, |
| "loss": 0.0019, |
| "reward": 4.487500190734863, |
| "reward_std": 0.01821785606443882, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 132, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.78125, |
| "epoch": 0.22735042735042735, |
| "grad_norm": 1.1842869997989454, |
| "kl": 0.04400634765625, |
| "learning_rate": 4.397741347884328e-07, |
| "loss": 0.0018, |
| "reward": 4.432291507720947, |
| "reward_std": 0.0720747858285904, |
| "rewards/accuracy_reward": 2.976041793823242, |
| "rewards/format_reward": 1.0, |
| "step": 133, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.5625, |
| "epoch": 0.22905982905982905, |
| "grad_norm": 1.1785282734794618, |
| "kl": 0.05255126953125, |
| "learning_rate": 4.3889742475735e-07, |
| "loss": 0.0021, |
| "reward": 4.413541316986084, |
| "reward_std": 0.17291665077209473, |
| "rewards/accuracy_reward": 2.9322915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 134, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.40625, |
| "epoch": 0.23076923076923078, |
| "grad_norm": 0.8559474662788366, |
| "kl": 0.0535888671875, |
| "learning_rate": 4.38015267028483e-07, |
| "loss": 0.0021, |
| "reward": 4.487500190734863, |
| "reward_std": 0.025000017136335373, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 135, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 384.65625, |
| "epoch": 0.23247863247863249, |
| "grad_norm": 1.229565072425211, |
| "kl": 0.05072021484375, |
| "learning_rate": 4.3712768704277524e-07, |
| "loss": 0.002, |
| "reward": 4.3072919845581055, |
| "reward_std": 0.3026726245880127, |
| "rewards/accuracy_reward": 2.851041793823242, |
| "rewards/format_reward": 1.0, |
| "step": 136, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.46875, |
| "epoch": 0.2341880341880342, |
| "grad_norm": 1.0543514102617846, |
| "kl": 0.051483154296875, |
| "learning_rate": 4.3623471039754525e-07, |
| "loss": 0.0021, |
| "reward": 4.4822916984558105, |
| "reward_std": 0.035416703671216965, |
| "rewards/accuracy_reward": 2.988541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 137, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 375.9375, |
| "epoch": 0.2358974358974359, |
| "grad_norm": 0.9569773264572374, |
| "kl": 0.0533447265625, |
| "learning_rate": 4.3533636284574796e-07, |
| "loss": 0.0021, |
| "reward": 4.259374618530273, |
| "reward_std": 0.1562500149011612, |
| "rewards/accuracy_reward": 2.809375047683716, |
| "rewards/format_reward": 1.0, |
| "step": 138, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.71875, |
| "epoch": 0.2376068376068376, |
| "grad_norm": 0.9824411737431323, |
| "kl": 0.055755615234375, |
| "learning_rate": 4.3443267029523254e-07, |
| "loss": 0.0022, |
| "reward": 4.480208396911621, |
| "reward_std": 0.03958336263895035, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 139, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.59375, |
| "epoch": 0.23931623931623933, |
| "grad_norm": 1.0420267070552791, |
| "kl": 0.048675537109375, |
| "learning_rate": 4.335236588079948e-07, |
| "loss": 0.0019, |
| "reward": 4.477083206176758, |
| "reward_std": 0.04583332687616348, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 140, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.34375, |
| "epoch": 0.24102564102564103, |
| "grad_norm": 0.7563242853625025, |
| "kl": 0.04864501953125, |
| "learning_rate": 4.326093545994258e-07, |
| "loss": 0.0019, |
| "reward": 4.493750095367432, |
| "reward_std": 0.01250004768371582, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 141, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.625, |
| "epoch": 0.24273504273504273, |
| "grad_norm": 0.8629864588588781, |
| "kl": 0.047576904296875, |
| "learning_rate": 4.316897840375558e-07, |
| "loss": 0.0019, |
| "reward": 4.472916603088379, |
| "reward_std": 0.05416667461395264, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 142, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.40625, |
| "epoch": 0.24444444444444444, |
| "grad_norm": 0.8960359847522277, |
| "kl": 0.047332763671875, |
| "learning_rate": 4.307649736422938e-07, |
| "loss": 0.0019, |
| "reward": 4.358333110809326, |
| "reward_std": 0.03333333879709244, |
| "rewards/accuracy_reward": 2.8645834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 143, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 400.15625, |
| "epoch": 0.24615384615384617, |
| "grad_norm": 0.7057007703502248, |
| "kl": 0.045379638671875, |
| "learning_rate": 4.2983495008466273e-07, |
| "loss": 0.0018, |
| "reward": 4.454166412353516, |
| "reward_std": 0.08357105404138565, |
| "rewards/accuracy_reward": 2.991666793823242, |
| "rewards/format_reward": 0.96875, |
| "step": 144, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.625, |
| "epoch": 0.24786324786324787, |
| "grad_norm": 1.1400069761424125, |
| "kl": 0.0487060546875, |
| "learning_rate": 4.2889974018603024e-07, |
| "loss": 0.0019, |
| "reward": 4.257291793823242, |
| "reward_std": 0.20642347633838654, |
| "rewards/accuracy_reward": 2.7760415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 145, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.375, |
| "epoch": 0.24957264957264957, |
| "grad_norm": 0.5078458420795325, |
| "kl": 0.045196533203125, |
| "learning_rate": 4.279593709173351e-07, |
| "loss": 0.0018, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833352580666542, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 146, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 401.03125, |
| "epoch": 0.2512820512820513, |
| "grad_norm": 0.9708960933046578, |
| "kl": 0.048797607421875, |
| "learning_rate": 4.2701386939830964e-07, |
| "loss": 0.002, |
| "reward": 4.426041603088379, |
| "reward_std": 0.1076958030462265, |
| "rewards/accuracy_reward": 2.9697916507720947, |
| "rewards/format_reward": 0.96875, |
| "step": 147, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 399.28125, |
| "epoch": 0.252991452991453, |
| "grad_norm": 0.7150957515941181, |
| "kl": 0.047882080078125, |
| "learning_rate": 4.2606326289669737e-07, |
| "loss": 0.0019, |
| "reward": 4.465624809265137, |
| "reward_std": 0.06875002384185791, |
| "rewards/accuracy_reward": 2.965625047683716, |
| "rewards/format_reward": 1.0, |
| "step": 148, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.59375, |
| "epoch": 0.2547008547008547, |
| "grad_norm": 0.940492080718513, |
| "kl": 0.047332763671875, |
| "learning_rate": 4.251075788274666e-07, |
| "loss": 0.0019, |
| "reward": 4.4791669845581055, |
| "reward_std": 0.036383602768182755, |
| "rewards/accuracy_reward": 2.9791667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 149, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.65625, |
| "epoch": 0.2564102564102564, |
| "grad_norm": 1.1493825902639314, |
| "kl": 0.04644775390625, |
| "learning_rate": 4.241468447520201e-07, |
| "loss": 0.0019, |
| "reward": 4.480208396911621, |
| "reward_std": 0.03958338499069214, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 150, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 398.28125, |
| "epoch": 0.25811965811965815, |
| "grad_norm": 1.459476642452973, |
| "kl": 0.043121337890625, |
| "learning_rate": 4.2318108837739986e-07, |
| "loss": 0.0017, |
| "reward": 4.234375, |
| "reward_std": 0.41515326499938965, |
| "rewards/accuracy_reward": 2.7906250953674316, |
| "rewards/format_reward": 1.0, |
| "step": 151, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 411.5, |
| "epoch": 0.25982905982905985, |
| "grad_norm": 1.110652347817343, |
| "kl": 0.050811767578125, |
| "learning_rate": 4.222103375554883e-07, |
| "loss": 0.002, |
| "reward": 4.445833206176758, |
| "reward_std": 0.06399547308683395, |
| "rewards/accuracy_reward": 2.952083110809326, |
| "rewards/format_reward": 1.0, |
| "step": 152, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.125, |
| "epoch": 0.26153846153846155, |
| "grad_norm": 1.0785519824350434, |
| "kl": 0.0518798828125, |
| "learning_rate": 4.21234620282205e-07, |
| "loss": 0.0021, |
| "reward": 4.2552080154418945, |
| "reward_std": 0.1653914898633957, |
| "rewards/accuracy_reward": 2.7552084922790527, |
| "rewards/format_reward": 1.0, |
| "step": 153, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 406.75, |
| "epoch": 0.26324786324786326, |
| "grad_norm": 1.0721439274563058, |
| "kl": 0.04229736328125, |
| "learning_rate": 4.2025396469669926e-07, |
| "loss": 0.0017, |
| "reward": 4.454166412353516, |
| "reward_std": 0.07500007003545761, |
| "rewards/accuracy_reward": 2.9541664123535156, |
| "rewards/format_reward": 1.0, |
| "step": 154, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 404.46875, |
| "epoch": 0.26495726495726496, |
| "grad_norm": 0.9988184997997093, |
| "kl": 0.043060302734375, |
| "learning_rate": 4.1926839908053847e-07, |
| "loss": 0.0017, |
| "reward": 4.479166507720947, |
| "reward_std": 0.04166668653488159, |
| "rewards/accuracy_reward": 2.9854166507720947, |
| "rewards/format_reward": 1.0, |
| "step": 155, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.5625, |
| "epoch": 0.26666666666666666, |
| "grad_norm": 0.7831640694765154, |
| "kl": 0.045196533203125, |
| "learning_rate": 4.182779518568925e-07, |
| "loss": 0.0018, |
| "reward": 4.4197916984558105, |
| "reward_std": 0.0699336901307106, |
| "rewards/accuracy_reward": 2.9322915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 156, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 391.125, |
| "epoch": 0.26837606837606837, |
| "grad_norm": 0.9630033697673518, |
| "kl": 0.046722412109375, |
| "learning_rate": 4.172826515897145e-07, |
| "loss": 0.0019, |
| "reward": 4.3125, |
| "reward_std": 0.24396438896656036, |
| "rewards/accuracy_reward": 2.828125, |
| "rewards/format_reward": 1.0, |
| "step": 157, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.0625, |
| "epoch": 0.27008547008547007, |
| "grad_norm": 1.17921986026744, |
| "kl": 0.041473388671875, |
| "learning_rate": 4.1628252698291643e-07, |
| "loss": 0.0017, |
| "reward": 4.467708587646484, |
| "reward_std": 0.064583420753479, |
| "rewards/accuracy_reward": 2.9677083492279053, |
| "rewards/format_reward": 1.0, |
| "step": 158, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.125, |
| "epoch": 0.2717948717948718, |
| "grad_norm": 1.0249267708010807, |
| "kl": 0.042877197265625, |
| "learning_rate": 4.1527760687954154e-07, |
| "loss": 0.0017, |
| "reward": 4.4739580154418945, |
| "reward_std": 0.05208338797092438, |
| "rewards/accuracy_reward": 2.9739582538604736, |
| "rewards/format_reward": 1.0, |
| "step": 159, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.34375, |
| "epoch": 0.27350427350427353, |
| "grad_norm": 0.77719285849163, |
| "kl": 0.041015625, |
| "learning_rate": 4.142679202609327e-07, |
| "loss": 0.0016, |
| "reward": 4.354166507720947, |
| "reward_std": 0.04166668653488159, |
| "rewards/accuracy_reward": 2.8541667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 160, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.25, |
| "epoch": 0.27521367521367524, |
| "grad_norm": 0.7752980246081294, |
| "kl": 0.039459228515625, |
| "learning_rate": 4.132534962458962e-07, |
| "loss": 0.0016, |
| "reward": 4.4270830154418945, |
| "reward_std": 0.06305919587612152, |
| "rewards/accuracy_reward": 2.9270834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 161, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 375.75, |
| "epoch": 0.27692307692307694, |
| "grad_norm": 1.1993765246662056, |
| "kl": 0.047943115234375, |
| "learning_rate": 4.122343640898627e-07, |
| "loss": 0.0019, |
| "reward": 4.353125095367432, |
| "reward_std": 0.20582912862300873, |
| "rewards/accuracy_reward": 2.903125047683716, |
| "rewards/format_reward": 1.0, |
| "step": 162, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.03125, |
| "epoch": 0.27863247863247864, |
| "grad_norm": 0.59014945249797, |
| "kl": 0.041290283203125, |
| "learning_rate": 4.112105531840426e-07, |
| "loss": 0.0017, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 163, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 397.625, |
| "epoch": 0.28034188034188035, |
| "grad_norm": 0.5356413668339389, |
| "kl": 0.040740966796875, |
| "learning_rate": 4.101820930545791e-07, |
| "loss": 0.0016, |
| "reward": 4.496874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 164, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.96875, |
| "epoch": 0.28205128205128205, |
| "grad_norm": 0.13294410521207053, |
| "kl": 0.04388427734375, |
| "learning_rate": 4.0914901336169636e-07, |
| "loss": 0.0018, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 165, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.40625, |
| "epoch": 0.28376068376068375, |
| "grad_norm": 0.5764716736438367, |
| "kl": 0.0455322265625, |
| "learning_rate": 4.081113438988443e-07, |
| "loss": 0.0018, |
| "reward": 4.496874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 166, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.625, |
| "epoch": 0.28547008547008546, |
| "grad_norm": 1.465826987702493, |
| "kl": 0.045806884765625, |
| "learning_rate": 4.0706911459183915e-07, |
| "loss": 0.0018, |
| "reward": 4.4041666984558105, |
| "reward_std": 0.17083334922790527, |
| "rewards/accuracy_reward": 2.910416603088379, |
| "rewards/format_reward": 1.0, |
| "step": 167, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.78125, |
| "epoch": 0.28717948717948716, |
| "grad_norm": 0.5935301744550797, |
| "kl": 0.047821044921875, |
| "learning_rate": 4.060223554980007e-07, |
| "loss": 0.0019, |
| "reward": 4.46875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 168, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 494.09375, |
| "epoch": 0.28888888888888886, |
| "grad_norm": 0.8204426630893709, |
| "kl": 0.045166015625, |
| "learning_rate": 4.049710968052851e-07, |
| "loss": 0.0018, |
| "reward": 4.272916793823242, |
| "reward_std": 0.2933366894721985, |
| "rewards/accuracy_reward": 2.8541667461395264, |
| "rewards/format_reward": 0.96875, |
| "step": 169, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.21875, |
| "epoch": 0.2905982905982906, |
| "grad_norm": 0.6213443899630995, |
| "kl": 0.04119873046875, |
| "learning_rate": 4.039153688314145e-07, |
| "loss": 0.0016, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 170, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.34375, |
| "epoch": 0.2923076923076923, |
| "grad_norm": 0.7861053846314547, |
| "kl": 0.041229248046875, |
| "learning_rate": 4.0285520202300304e-07, |
| "loss": 0.0016, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 171, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 367.75, |
| "epoch": 0.294017094017094, |
| "grad_norm": 1.2307384278158895, |
| "kl": 0.04510498046875, |
| "learning_rate": 4.017906269546778e-07, |
| "loss": 0.0018, |
| "reward": 4.4354166984558105, |
| "reward_std": 0.10271236300468445, |
| "rewards/accuracy_reward": 2.960416793823242, |
| "rewards/format_reward": 1.0, |
| "step": 172, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.53125, |
| "epoch": 0.29572649572649573, |
| "grad_norm": 0.9676974134756903, |
| "kl": 0.046539306640625, |
| "learning_rate": 4.0072167432819804e-07, |
| "loss": 0.0019, |
| "reward": 4.477083206176758, |
| "reward_std": 0.036256518214941025, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 173, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.96875, |
| "epoch": 0.29743589743589743, |
| "grad_norm": 1.2274057316899676, |
| "kl": 0.04266357421875, |
| "learning_rate": 3.996483749715693e-07, |
| "loss": 0.0017, |
| "reward": 4.465624809265137, |
| "reward_std": 0.04625225439667702, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 174, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.46875, |
| "epoch": 0.29914529914529914, |
| "grad_norm": 1.2412182079168792, |
| "kl": 0.03826904296875, |
| "learning_rate": 3.9857075983815435e-07, |
| "loss": 0.0015, |
| "reward": 4.462499618530273, |
| "reward_std": 0.06443369388580322, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 175, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.0, |
| "epoch": 0.30085470085470084, |
| "grad_norm": 0.609179641547218, |
| "kl": 0.04107666015625, |
| "learning_rate": 3.974888600057807e-07, |
| "loss": 0.0016, |
| "reward": 4.478125095367432, |
| "reward_std": 0.029536345973610878, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 176, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.90625, |
| "epoch": 0.30256410256410254, |
| "grad_norm": 1.190308130875681, |
| "kl": 0.042327880859375, |
| "learning_rate": 3.964027066758442e-07, |
| "loss": 0.0017, |
| "reward": 4.469791412353516, |
| "reward_std": 0.0508398711681366, |
| "rewards/accuracy_reward": 2.9822916984558105, |
| "rewards/format_reward": 1.0, |
| "step": 177, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.375, |
| "epoch": 0.30427350427350425, |
| "grad_norm": 0.8508866922383489, |
| "kl": 0.041595458984375, |
| "learning_rate": 3.9531233117240916e-07, |
| "loss": 0.0017, |
| "reward": 4.487500190734863, |
| "reward_std": 0.02499997615814209, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 178, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.6875, |
| "epoch": 0.305982905982906, |
| "grad_norm": 1.0986022812237966, |
| "kl": 0.0469970703125, |
| "learning_rate": 3.942177649413051e-07, |
| "loss": 0.0019, |
| "reward": 4.416666507720947, |
| "reward_std": 0.16666670143604279, |
| "rewards/accuracy_reward": 2.9166667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 179, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.28125, |
| "epoch": 0.3076923076923077, |
| "grad_norm": 0.9041233903377462, |
| "kl": 0.039581298828125, |
| "learning_rate": 3.931190395492198e-07, |
| "loss": 0.0016, |
| "reward": 4.428125381469727, |
| "reward_std": 0.14375001192092896, |
| "rewards/accuracy_reward": 2.934375047683716, |
| "rewards/format_reward": 1.0, |
| "step": 180, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.5, |
| "epoch": 0.3094017094017094, |
| "grad_norm": 1.0826453818165487, |
| "kl": 0.040740966796875, |
| "learning_rate": 3.920161866827889e-07, |
| "loss": 0.0016, |
| "reward": 4.436458587646484, |
| "reward_std": 0.1042187437415123, |
| "rewards/accuracy_reward": 2.9427082538604736, |
| "rewards/format_reward": 1.0, |
| "step": 181, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 414.375, |
| "epoch": 0.3111111111111111, |
| "grad_norm": 0.904258739786522, |
| "kl": 0.0513916015625, |
| "learning_rate": 3.909092381476824e-07, |
| "loss": 0.0021, |
| "reward": 4.332291603088379, |
| "reward_std": 0.24280627071857452, |
| "rewards/accuracy_reward": 2.8697915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 182, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 411.875, |
| "epoch": 0.3128205128205128, |
| "grad_norm": 0.42269488554491963, |
| "kl": 0.045135498046875, |
| "learning_rate": 3.8979822586768666e-07, |
| "loss": 0.0018, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 183, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 418.53125, |
| "epoch": 0.3145299145299145, |
| "grad_norm": 0.7604195134349618, |
| "kl": 0.0445556640625, |
| "learning_rate": 3.886831818837847e-07, |
| "loss": 0.0018, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 184, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 415.90625, |
| "epoch": 0.3162393162393162, |
| "grad_norm": 1.1493790993662951, |
| "kl": 0.036895751953125, |
| "learning_rate": 3.875641383532313e-07, |
| "loss": 0.0015, |
| "reward": 4.392707824707031, |
| "reward_std": 0.20166242122650146, |
| "rewards/accuracy_reward": 2.9114584922790527, |
| "rewards/format_reward": 1.0, |
| "step": 185, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 429.375, |
| "epoch": 0.31794871794871793, |
| "grad_norm": 0.4992140699455253, |
| "kl": 0.039215087890625, |
| "learning_rate": 3.864411275486261e-07, |
| "loss": 0.0016, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 186, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 432.4375, |
| "epoch": 0.31965811965811963, |
| "grad_norm": 0.7979965621901377, |
| "kl": 0.03582763671875, |
| "learning_rate": 3.8531418185698286e-07, |
| "loss": 0.0014, |
| "reward": 4.440625190734863, |
| "reward_std": 0.08476538956165314, |
| "rewards/accuracy_reward": 2.953125, |
| "rewards/format_reward": 1.0, |
| "step": 187, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 420.28125, |
| "epoch": 0.3213675213675214, |
| "grad_norm": 0.780631126170187, |
| "kl": 0.03973388671875, |
| "learning_rate": 3.8418333377879503e-07, |
| "loss": 0.0016, |
| "reward": 4.362500190734863, |
| "reward_std": 0.02499997615814209, |
| "rewards/accuracy_reward": 2.875, |
| "rewards/format_reward": 1.0, |
| "step": 188, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 399.3125, |
| "epoch": 0.3230769230769231, |
| "grad_norm": 0.7047170411411621, |
| "kl": 0.041778564453125, |
| "learning_rate": 3.8304861592709904e-07, |
| "loss": 0.0017, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 189, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.0625, |
| "epoch": 0.3247863247863248, |
| "grad_norm": 0.47632381457177264, |
| "kl": 0.038116455078125, |
| "learning_rate": 3.8191006102653317e-07, |
| "loss": 0.0015, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 190, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 404.34375, |
| "epoch": 0.3264957264957265, |
| "grad_norm": 0.4836863827599763, |
| "kl": 0.040771484375, |
| "learning_rate": 3.8076770191239436e-07, |
| "loss": 0.0016, |
| "reward": 4.496874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 191, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.3125, |
| "epoch": 0.3282051282051282, |
| "grad_norm": 1.0078286777321448, |
| "kl": 0.03765869140625, |
| "learning_rate": 3.796215715296909e-07, |
| "loss": 0.0015, |
| "reward": 4.480208396911621, |
| "reward_std": 0.03958336263895035, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 192, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 410.3125, |
| "epoch": 0.3299145299145299, |
| "grad_norm": 0.5610391713071925, |
| "kl": 0.038177490234375, |
| "learning_rate": 3.7847170293219216e-07, |
| "loss": 0.0015, |
| "reward": 4.462500095367432, |
| "reward_std": 0.059511907398700714, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 193, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 402.03125, |
| "epoch": 0.3316239316239316, |
| "grad_norm": 1.1299949442190973, |
| "kl": 0.043731689453125, |
| "learning_rate": 3.7731812928147586e-07, |
| "loss": 0.0017, |
| "reward": 4.447916507720947, |
| "reward_std": 0.10416672378778458, |
| "rewards/accuracy_reward": 2.9479167461395264, |
| "rewards/format_reward": 1.0, |
| "step": 194, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.90625, |
| "epoch": 0.3333333333333333, |
| "grad_norm": 0.10446066137589627, |
| "kl": 0.036376953125, |
| "learning_rate": 3.761608838459713e-07, |
| "loss": 0.0015, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 195, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 387.28125, |
| "epoch": 0.335042735042735, |
| "grad_norm": 0.7300977881818299, |
| "kl": 0.038238525390625, |
| "learning_rate": 3.75e-07, |
| "loss": 0.0015, |
| "reward": 4.258333206176758, |
| "reward_std": 0.308286190032959, |
| "rewards/accuracy_reward": 2.8020832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 196, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.15625, |
| "epoch": 0.3367521367521368, |
| "grad_norm": 0.12871250708865678, |
| "kl": 0.036651611328125, |
| "learning_rate": 3.7383551122281333e-07, |
| "loss": 0.0015, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 197, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 386.75, |
| "epoch": 0.3384615384615385, |
| "grad_norm": 0.9165095170318228, |
| "kl": 0.04473876953125, |
| "learning_rate": 3.7266745109762667e-07, |
| "loss": 0.0018, |
| "reward": 4.324999809265137, |
| "reward_std": 0.2749999761581421, |
| "rewards/accuracy_reward": 2.875, |
| "rewards/format_reward": 1.0, |
| "step": 198, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.375, |
| "epoch": 0.3401709401709402, |
| "grad_norm": 0.789501213084791, |
| "kl": 0.04150390625, |
| "learning_rate": 3.7149585331065145e-07, |
| "loss": 0.0017, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 199, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.03125, |
| "epoch": 0.3418803418803419, |
| "grad_norm": 1.11306921071518, |
| "kl": 0.037689208984375, |
| "learning_rate": 3.7032075165012317e-07, |
| "loss": 0.0015, |
| "reward": 4.446874618530273, |
| "reward_std": 0.06790292263031006, |
| "rewards/accuracy_reward": 2.9593749046325684, |
| "rewards/format_reward": 1.0, |
| "step": 200, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.875, |
| "epoch": 0.3435897435897436, |
| "grad_norm": 0.7761111972471315, |
| "kl": 0.04083251953125, |
| "learning_rate": 3.6914218000532694e-07, |
| "loss": 0.0016, |
| "reward": 4.472916603088379, |
| "reward_std": 0.05416666343808174, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 201, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.21875, |
| "epoch": 0.3452991452991453, |
| "grad_norm": 0.10020362663647286, |
| "kl": 0.036346435546875, |
| "learning_rate": 3.679601723656205e-07, |
| "loss": 0.0015, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 202, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.75, |
| "epoch": 0.347008547008547, |
| "grad_norm": 0.7893461647051647, |
| "kl": 0.035858154296875, |
| "learning_rate": 3.6677476281945383e-07, |
| "loss": 0.0014, |
| "reward": 4.4739580154418945, |
| "reward_std": 0.05208335071802139, |
| "rewards/accuracy_reward": 2.9739582538604736, |
| "rewards/format_reward": 1.0, |
| "step": 203, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.40625, |
| "epoch": 0.3487179487179487, |
| "grad_norm": 1.9248208875026689, |
| "kl": 0.0372314453125, |
| "learning_rate": 3.655859855533858e-07, |
| "loss": 0.0015, |
| "reward": 4.461458206176758, |
| "reward_std": 0.06416241079568863, |
| "rewards/accuracy_reward": 2.9739582538604736, |
| "rewards/format_reward": 1.0, |
| "step": 204, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.21875, |
| "epoch": 0.3504273504273504, |
| "grad_norm": 1.1205344384928364, |
| "kl": 0.041900634765625, |
| "learning_rate": 3.6439387485109883e-07, |
| "loss": 0.0017, |
| "reward": 4.352083206176758, |
| "reward_std": 0.1530117392539978, |
| "rewards/accuracy_reward": 2.8958334922790527, |
| "rewards/format_reward": 1.0, |
| "step": 205, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.5625, |
| "epoch": 0.35213675213675216, |
| "grad_norm": 0.14604512652457577, |
| "kl": 0.03472900390625, |
| "learning_rate": 3.6319846509240935e-07, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 206, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 369.375, |
| "epoch": 0.35384615384615387, |
| "grad_norm": 0.9121169092040915, |
| "kl": 0.037811279296875, |
| "learning_rate": 3.6199979075227703e-07, |
| "loss": 0.0015, |
| "reward": 4.4583330154418945, |
| "reward_std": 0.08333337306976318, |
| "rewards/accuracy_reward": 2.9583334922790527, |
| "rewards/format_reward": 1.0, |
| "step": 207, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.6875, |
| "epoch": 0.35555555555555557, |
| "grad_norm": 1.1060318359964751, |
| "kl": 0.04547119140625, |
| "learning_rate": 3.6079788639981036e-07, |
| "loss": 0.0018, |
| "reward": 4.464583396911621, |
| "reward_std": 0.060267072170972824, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 208, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 403.28125, |
| "epoch": 0.3572649572649573, |
| "grad_norm": 0.5438742329843329, |
| "kl": 0.035552978515625, |
| "learning_rate": 3.595927866972693e-07, |
| "loss": 0.0014, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 209, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.21875, |
| "epoch": 0.358974358974359, |
| "grad_norm": 0.7804890857933848, |
| "kl": 0.03814697265625, |
| "learning_rate": 3.5838452639906636e-07, |
| "loss": 0.0015, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 210, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.28125, |
| "epoch": 0.3606837606837607, |
| "grad_norm": 1.0411411893808047, |
| "kl": 0.0460205078125, |
| "learning_rate": 3.571731403507635e-07, |
| "loss": 0.0018, |
| "reward": 4.451041221618652, |
| "reward_std": 0.0706593245267868, |
| "rewards/accuracy_reward": 2.9635417461395264, |
| "rewards/format_reward": 1.0, |
| "step": 211, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.53125, |
| "epoch": 0.3623931623931624, |
| "grad_norm": 1.3492992049831198, |
| "kl": 0.041290283203125, |
| "learning_rate": 3.5595866348806784e-07, |
| "loss": 0.0017, |
| "reward": 4.449999809265137, |
| "reward_std": 0.06233753263950348, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 212, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 371.375, |
| "epoch": 0.3641025641025641, |
| "grad_norm": 0.9541327328256634, |
| "kl": 0.038970947265625, |
| "learning_rate": 3.547411308358238e-07, |
| "loss": 0.0016, |
| "reward": 4.477083206176758, |
| "reward_std": 0.04583332687616348, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 213, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.625, |
| "epoch": 0.3658119658119658, |
| "grad_norm": 0.9048739923450602, |
| "kl": 0.041656494140625, |
| "learning_rate": 3.53520577507003e-07, |
| "loss": 0.0017, |
| "reward": 4.481249809265137, |
| "reward_std": 0.03750000521540642, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 214, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.1875, |
| "epoch": 0.36752136752136755, |
| "grad_norm": 0.7146024062835775, |
| "kl": 0.03594970703125, |
| "learning_rate": 3.522970387016919e-07, |
| "loss": 0.0014, |
| "reward": 4.4739580154418945, |
| "reward_std": 0.05208335071802139, |
| "rewards/accuracy_reward": 2.9739584922790527, |
| "rewards/format_reward": 1.0, |
| "step": 215, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 403.3125, |
| "epoch": 0.36923076923076925, |
| "grad_norm": 0.7588941964472184, |
| "kl": 0.0311737060546875, |
| "learning_rate": 3.510705497060762e-07, |
| "loss": 0.0012, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 216, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.0625, |
| "epoch": 0.37094017094017095, |
| "grad_norm": 1.0066377974377885, |
| "kl": 0.040618896484375, |
| "learning_rate": 3.498411458914238e-07, |
| "loss": 0.0016, |
| "reward": 4.4822916984558105, |
| "reward_std": 0.03541666269302368, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 217, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 372.46875, |
| "epoch": 0.37264957264957266, |
| "grad_norm": 1.0197795186320238, |
| "kl": 0.040863037109375, |
| "learning_rate": 3.4860886271306425e-07, |
| "loss": 0.0016, |
| "reward": 4.327083587646484, |
| "reward_std": 0.310569703578949, |
| "rewards/accuracy_reward": 2.8645832538604736, |
| "rewards/format_reward": 0.96875, |
| "step": 218, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.0, |
| "epoch": 0.37435897435897436, |
| "grad_norm": 0.9536918082343873, |
| "kl": 0.037261962890625, |
| "learning_rate": 3.473737357093662e-07, |
| "loss": 0.0015, |
| "reward": 4.4739580154418945, |
| "reward_std": 0.03916243463754654, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 219, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.5, |
| "epoch": 0.37606837606837606, |
| "grad_norm": 1.357829764444905, |
| "kl": 0.040740966796875, |
| "learning_rate": 3.4613580050071274e-07, |
| "loss": 0.0016, |
| "reward": 4.414583206176758, |
| "reward_std": 0.1612565666437149, |
| "rewards/accuracy_reward": 2.9208333492279053, |
| "rewards/format_reward": 1.0, |
| "step": 220, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.5625, |
| "epoch": 0.37777777777777777, |
| "grad_norm": 0.7225751289221025, |
| "kl": 0.04107666015625, |
| "learning_rate": 3.4489509278847413e-07, |
| "loss": 0.0016, |
| "reward": 4.478124618530273, |
| "reward_std": 0.04375000670552254, |
| "rewards/accuracy_reward": 2.984375238418579, |
| "rewards/format_reward": 1.0, |
| "step": 221, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.90625, |
| "epoch": 0.37948717948717947, |
| "grad_norm": 0.5205268133629182, |
| "kl": 0.0357666015625, |
| "learning_rate": 3.43651648353978e-07, |
| "loss": 0.0014, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 222, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 400.9375, |
| "epoch": 0.3811965811965812, |
| "grad_norm": 0.11832902053787207, |
| "kl": 0.03363037109375, |
| "learning_rate": 3.4240550305747775e-07, |
| "loss": 0.0013, |
| "reward": 4.375, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.875, |
| "rewards/format_reward": 1.0, |
| "step": 223, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 400.5625, |
| "epoch": 0.38290598290598293, |
| "grad_norm": 1.2289186408610433, |
| "kl": 0.03338623046875, |
| "learning_rate": 3.411566928371179e-07, |
| "loss": 0.0013, |
| "reward": 4.476041793823242, |
| "reward_std": 0.0479167103767395, |
| "rewards/accuracy_reward": 2.988541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 224, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.46875, |
| "epoch": 0.38461538461538464, |
| "grad_norm": 0.833083778353936, |
| "kl": 0.035369873046875, |
| "learning_rate": 3.399052537078979e-07, |
| "loss": 0.0014, |
| "reward": 4.471875190734863, |
| "reward_std": 0.044946372509002686, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 225, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.46875, |
| "epoch": 0.38632478632478634, |
| "grad_norm": 0.8212684084277662, |
| "kl": 0.04425048828125, |
| "learning_rate": 3.3865122176063385e-07, |
| "loss": 0.0018, |
| "reward": 4.4822916984558105, |
| "reward_std": 0.025839870795607567, |
| "rewards/accuracy_reward": 2.988541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 226, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.5625, |
| "epoch": 0.38803418803418804, |
| "grad_norm": 2.281771763704506, |
| "kl": 0.039581298828125, |
| "learning_rate": 3.3739463316091694e-07, |
| "loss": 0.0016, |
| "reward": 4.445833206176758, |
| "reward_std": 0.10833341628313065, |
| "rewards/accuracy_reward": 2.952083110809326, |
| "rewards/format_reward": 1.0, |
| "step": 227, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 384.1875, |
| "epoch": 0.38974358974358975, |
| "grad_norm": 1.1279986015302106, |
| "kl": 0.039031982421875, |
| "learning_rate": 3.361355241480709e-07, |
| "loss": 0.0016, |
| "reward": 4.329166412353516, |
| "reward_std": 0.25882017612457275, |
| "rewards/accuracy_reward": 2.879166603088379, |
| "rewards/format_reward": 1.0, |
| "step": 228, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.59375, |
| "epoch": 0.39145299145299145, |
| "grad_norm": 1.2133644943426865, |
| "kl": 0.04425048828125, |
| "learning_rate": 3.348739310341068e-07, |
| "loss": 0.0018, |
| "reward": 4.440625190734863, |
| "reward_std": 0.11132083833217621, |
| "rewards/accuracy_reward": 2.9468748569488525, |
| "rewards/format_reward": 1.0, |
| "step": 229, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.53125, |
| "epoch": 0.39316239316239315, |
| "grad_norm": 0.9498275646544599, |
| "kl": 0.03912353515625, |
| "learning_rate": 3.3360989020267577e-07, |
| "loss": 0.0016, |
| "reward": 4.480208396911621, |
| "reward_std": 0.03958336263895035, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 230, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 400.4375, |
| "epoch": 0.39487179487179486, |
| "grad_norm": 0.5639041842584143, |
| "kl": 0.04180908203125, |
| "learning_rate": 3.323434381080199e-07, |
| "loss": 0.0017, |
| "reward": 4.496874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 231, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.4375, |
| "epoch": 0.39658119658119656, |
| "grad_norm": 1.0578332982670637, |
| "kl": 0.038970947265625, |
| "learning_rate": 3.3107461127392067e-07, |
| "loss": 0.0016, |
| "reward": 4.419791221618652, |
| "reward_std": 0.12802158296108246, |
| "rewards/accuracy_reward": 2.926041603088379, |
| "rewards/format_reward": 1.0, |
| "step": 232, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.9375, |
| "epoch": 0.39829059829059826, |
| "grad_norm": 1.1144973425665088, |
| "kl": 0.04278564453125, |
| "learning_rate": 3.2980344629264576e-07, |
| "loss": 0.0017, |
| "reward": 4.452083110809326, |
| "reward_std": 0.06614777445793152, |
| "rewards/accuracy_reward": 2.9520833492279053, |
| "rewards/format_reward": 1.0, |
| "step": 233, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.78125, |
| "epoch": 0.4, |
| "grad_norm": 0.6621292037188405, |
| "kl": 0.04400634765625, |
| "learning_rate": 3.285299798238937e-07, |
| "loss": 0.0018, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 234, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.78125, |
| "epoch": 0.4017094017094017, |
| "grad_norm": 1.0939550473590092, |
| "kl": 0.0455322265625, |
| "learning_rate": 3.272542485937368e-07, |
| "loss": 0.0018, |
| "reward": 4.474999904632568, |
| "reward_std": 0.05000005289912224, |
| "rewards/accuracy_reward": 2.9750001430511475, |
| "rewards/format_reward": 1.0, |
| "step": 235, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.0625, |
| "epoch": 0.40341880341880343, |
| "grad_norm": 1.116721303420348, |
| "kl": 0.04193115234375, |
| "learning_rate": 3.259762893935617e-07, |
| "loss": 0.0017, |
| "reward": 4.453125, |
| "reward_std": 0.05560123175382614, |
| "rewards/accuracy_reward": 2.9593749046325684, |
| "rewards/format_reward": 1.0, |
| "step": 236, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.40625, |
| "epoch": 0.40512820512820513, |
| "grad_norm": 0.7336037527089339, |
| "kl": 0.04364013671875, |
| "learning_rate": 3.2469613907900844e-07, |
| "loss": 0.0017, |
| "reward": 4.484375, |
| "reward_std": 0.031250037252902985, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 237, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.40625, |
| "epoch": 0.40683760683760684, |
| "grad_norm": 1.0249556634834451, |
| "kl": 0.041595458984375, |
| "learning_rate": 3.234138345689077e-07, |
| "loss": 0.0017, |
| "reward": 4.479166507720947, |
| "reward_std": 0.04166674613952637, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 238, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.96875, |
| "epoch": 0.40854700854700854, |
| "grad_norm": 1.0297579044404537, |
| "kl": 0.038970947265625, |
| "learning_rate": 3.221294128442159e-07, |
| "loss": 0.0016, |
| "reward": 4.4864583015441895, |
| "reward_std": 0.027083396911621094, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 239, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.65625, |
| "epoch": 0.41025641025641024, |
| "grad_norm": 1.1366054973046644, |
| "kl": 0.037109375, |
| "learning_rate": 3.2084291094694877e-07, |
| "loss": 0.0015, |
| "reward": 4.471874713897705, |
| "reward_std": 0.05625009536743164, |
| "rewards/accuracy_reward": 2.971874952316284, |
| "rewards/format_reward": 1.0, |
| "step": 240, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.03125, |
| "epoch": 0.41196581196581195, |
| "grad_norm": 0.816127496687785, |
| "kl": 0.035430908203125, |
| "learning_rate": 3.1955436597911315e-07, |
| "loss": 0.0014, |
| "reward": 4.490624904632568, |
| "reward_std": 0.018750011920928955, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 241, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.65625, |
| "epoch": 0.41367521367521365, |
| "grad_norm": 0.9177988741903882, |
| "kl": 0.0390625, |
| "learning_rate": 3.182638151016369e-07, |
| "loss": 0.0016, |
| "reward": 4.46875, |
| "reward_std": 0.042391255497932434, |
| "rewards/accuracy_reward": 2.968749761581421, |
| "rewards/format_reward": 1.0, |
| "step": 242, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.0625, |
| "epoch": 0.4153846153846154, |
| "grad_norm": 0.09569801333724084, |
| "kl": 0.034759521484375, |
| "learning_rate": 3.16971295533297e-07, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 243, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.875, |
| "epoch": 0.4170940170940171, |
| "grad_norm": 1.056398886307418, |
| "kl": 0.03924560546875, |
| "learning_rate": 3.1567684454964674e-07, |
| "loss": 0.0016, |
| "reward": 4.472916603088379, |
| "reward_std": 0.045361533761024475, |
| "rewards/accuracy_reward": 2.972916841506958, |
| "rewards/format_reward": 1.0, |
| "step": 244, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.15625, |
| "epoch": 0.4188034188034188, |
| "grad_norm": 0.9602785771876751, |
| "kl": 0.045501708984375, |
| "learning_rate": 3.1438049948194e-07, |
| "loss": 0.0018, |
| "reward": 4.476041793823242, |
| "reward_std": 0.03833986073732376, |
| "rewards/accuracy_reward": 2.988541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 245, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.71875, |
| "epoch": 0.4205128205128205, |
| "grad_norm": 0.9869505927258133, |
| "kl": 0.04656982421875, |
| "learning_rate": 3.130822977160554e-07, |
| "loss": 0.0019, |
| "reward": 4.452083110809326, |
| "reward_std": 0.0803452804684639, |
| "rewards/accuracy_reward": 2.9583332538604736, |
| "rewards/format_reward": 1.0, |
| "step": 246, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.5625, |
| "epoch": 0.4222222222222222, |
| "grad_norm": 0.5694726965482299, |
| "kl": 0.037200927734375, |
| "learning_rate": 3.117822766914174e-07, |
| "loss": 0.0015, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 247, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.5, |
| "epoch": 0.4239316239316239, |
| "grad_norm": 0.9716575381117358, |
| "kl": 0.038238525390625, |
| "learning_rate": 3.104804738999169e-07, |
| "loss": 0.0015, |
| "reward": 4.351041793823242, |
| "reward_std": 0.0479167103767395, |
| "rewards/accuracy_reward": 2.851041793823242, |
| "rewards/format_reward": 1.0, |
| "step": 248, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.1875, |
| "epoch": 0.4256410256410256, |
| "grad_norm": 0.9842518166866518, |
| "kl": 0.0413818359375, |
| "learning_rate": 3.091769268848302e-07, |
| "loss": 0.0017, |
| "reward": 4.316666603088379, |
| "reward_std": 0.10375991463661194, |
| "rewards/accuracy_reward": 2.8229165077209473, |
| "rewards/format_reward": 1.0, |
| "step": 249, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.65625, |
| "epoch": 0.42735042735042733, |
| "grad_norm": 1.278001712618916, |
| "kl": 0.042877197265625, |
| "learning_rate": 3.078716732397358e-07, |
| "loss": 0.0017, |
| "reward": 4.3885416984558105, |
| "reward_std": 0.20983925461769104, |
| "rewards/accuracy_reward": 2.894791603088379, |
| "rewards/format_reward": 1.0, |
| "step": 250, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.0, |
| "epoch": 0.42905982905982903, |
| "grad_norm": 1.1688274691177465, |
| "kl": 0.050018310546875, |
| "learning_rate": 3.065647506074306e-07, |
| "loss": 0.002, |
| "reward": 4.448958396911621, |
| "reward_std": 0.08484545350074768, |
| "rewards/accuracy_reward": 2.9552083015441895, |
| "rewards/format_reward": 1.0, |
| "step": 251, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.65625, |
| "epoch": 0.4307692307692308, |
| "grad_norm": 0.9872122914899957, |
| "kl": 0.04180908203125, |
| "learning_rate": 3.0525619667884406e-07, |
| "loss": 0.0017, |
| "reward": 4.469791412353516, |
| "reward_std": 0.048212792724370956, |
| "rewards/accuracy_reward": 2.976041555404663, |
| "rewards/format_reward": 1.0, |
| "step": 252, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.5625, |
| "epoch": 0.4324786324786325, |
| "grad_norm": 0.7672081800546268, |
| "kl": 0.046142578125, |
| "learning_rate": 3.0394604919195157e-07, |
| "loss": 0.0018, |
| "reward": 4.317708492279053, |
| "reward_std": 0.08067251741886139, |
| "rewards/accuracy_reward": 2.8177082538604736, |
| "rewards/format_reward": 1.0, |
| "step": 253, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.875, |
| "epoch": 0.4341880341880342, |
| "grad_norm": 0.9698339884690548, |
| "kl": 0.0462646484375, |
| "learning_rate": 3.026343459306856e-07, |
| "loss": 0.0019, |
| "reward": 4.462500095367432, |
| "reward_std": 0.07499998807907104, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 254, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.8125, |
| "epoch": 0.4358974358974359, |
| "grad_norm": 0.7482162088172302, |
| "kl": 0.040283203125, |
| "learning_rate": 3.0132112472384646e-07, |
| "loss": 0.0016, |
| "reward": 4.484375, |
| "reward_std": 0.024467820301651955, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 255, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.21875, |
| "epoch": 0.4376068376068376, |
| "grad_norm": 0.5797544385226397, |
| "kl": 0.03729248046875, |
| "learning_rate": 3.000064234440111e-07, |
| "loss": 0.0015, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 256, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.59375, |
| "epoch": 0.4393162393162393, |
| "grad_norm": 0.5365371682569317, |
| "kl": 0.039337158203125, |
| "learning_rate": 2.98690280006441e-07, |
| "loss": 0.0016, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 257, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.90625, |
| "epoch": 0.441025641025641, |
| "grad_norm": 1.2492723527722578, |
| "kl": 0.037811279296875, |
| "learning_rate": 2.973727323679887e-07, |
| "loss": 0.0015, |
| "reward": 4.382291793823242, |
| "reward_std": 0.22722449898719788, |
| "rewards/accuracy_reward": 2.894791841506958, |
| "rewards/format_reward": 1.0, |
| "step": 258, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.5, |
| "epoch": 0.4427350427350427, |
| "grad_norm": 1.26066913404296, |
| "kl": 0.04364013671875, |
| "learning_rate": 2.9605381852600284e-07, |
| "loss": 0.0017, |
| "reward": 4.379166603088379, |
| "reward_std": 0.2093537300825119, |
| "rewards/accuracy_reward": 2.8916664123535156, |
| "rewards/format_reward": 1.0, |
| "step": 259, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.03125, |
| "epoch": 0.4444444444444444, |
| "grad_norm": 0.7554523113224114, |
| "kl": 0.03900146484375, |
| "learning_rate": 2.947335765172332e-07, |
| "loss": 0.0016, |
| "reward": 4.483333587646484, |
| "reward_std": 0.03333333879709244, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 260, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.875, |
| "epoch": 0.4461538461538462, |
| "grad_norm": 0.9144306336964317, |
| "kl": 0.0413818359375, |
| "learning_rate": 2.934120444167326e-07, |
| "loss": 0.0017, |
| "reward": 4.447916507720947, |
| "reward_std": 0.08289644867181778, |
| "rewards/accuracy_reward": 2.9541666507720947, |
| "rewards/format_reward": 1.0, |
| "step": 261, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.15625, |
| "epoch": 0.4478632478632479, |
| "grad_norm": 1.2006861780363822, |
| "kl": 0.0380859375, |
| "learning_rate": 2.920892603367596e-07, |
| "loss": 0.0015, |
| "reward": 4.485416412353516, |
| "reward_std": 0.029166698455810547, |
| "rewards/accuracy_reward": 2.991666793823242, |
| "rewards/format_reward": 1.0, |
| "step": 262, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.375, |
| "epoch": 0.4495726495726496, |
| "grad_norm": 0.6264774336128276, |
| "kl": 0.038055419921875, |
| "learning_rate": 2.9076526242567934e-07, |
| "loss": 0.0015, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 263, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.03125, |
| "epoch": 0.4512820512820513, |
| "grad_norm": 0.9285367833144407, |
| "kl": 0.03717041015625, |
| "learning_rate": 2.894400888668628e-07, |
| "loss": 0.0015, |
| "reward": 4.460416793823242, |
| "reward_std": 0.0791667103767395, |
| "rewards/accuracy_reward": 2.991666555404663, |
| "rewards/format_reward": 0.96875, |
| "step": 264, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.875, |
| "epoch": 0.452991452991453, |
| "grad_norm": 0.7830763693612015, |
| "kl": 0.044525146484375, |
| "learning_rate": 2.881137778775863e-07, |
| "loss": 0.0018, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 265, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.40625, |
| "epoch": 0.4547008547008547, |
| "grad_norm": 0.9316521232276594, |
| "kl": 0.037506103515625, |
| "learning_rate": 2.86786367707929e-07, |
| "loss": 0.0015, |
| "reward": 4.480208396911621, |
| "reward_std": 0.03000655397772789, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 266, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.96875, |
| "epoch": 0.4564102564102564, |
| "grad_norm": 0.9675437072980656, |
| "kl": 0.036773681640625, |
| "learning_rate": 2.854578966396697e-07, |
| "loss": 0.0015, |
| "reward": 4.464583396911621, |
| "reward_std": 0.057912446558475494, |
| "rewards/accuracy_reward": 2.9708333015441895, |
| "rewards/format_reward": 1.0, |
| "step": 267, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.59375, |
| "epoch": 0.4581196581196581, |
| "grad_norm": 1.1220859220665398, |
| "kl": 0.03741455078125, |
| "learning_rate": 2.841284029851829e-07, |
| "loss": 0.0015, |
| "reward": 4.4666666984558105, |
| "reward_std": 0.057089872658252716, |
| "rewards/accuracy_reward": 2.972916603088379, |
| "rewards/format_reward": 1.0, |
| "step": 268, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.875, |
| "epoch": 0.4598290598290598, |
| "grad_norm": 0.5689642749646727, |
| "kl": 0.03631591796875, |
| "learning_rate": 2.827979250863341e-07, |
| "loss": 0.0015, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012500028125941753, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 269, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.1875, |
| "epoch": 0.46153846153846156, |
| "grad_norm": 0.9826672817363714, |
| "kl": 0.0389404296875, |
| "learning_rate": 2.814665013133737e-07, |
| "loss": 0.0016, |
| "reward": 4.476041793823242, |
| "reward_std": 0.03833986073732376, |
| "rewards/accuracy_reward": 2.988541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 270, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.8125, |
| "epoch": 0.46324786324786327, |
| "grad_norm": 0.8299425945656934, |
| "kl": 0.039581298828125, |
| "learning_rate": 2.801341700638307e-07, |
| "loss": 0.0016, |
| "reward": 4.465624809265137, |
| "reward_std": 0.06875002384185791, |
| "rewards/accuracy_reward": 2.965625047683716, |
| "rewards/format_reward": 1.0, |
| "step": 271, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.875, |
| "epoch": 0.46495726495726497, |
| "grad_norm": 0.8021355694486639, |
| "kl": 0.03424072265625, |
| "learning_rate": 2.788009697614053e-07, |
| "loss": 0.0014, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 272, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.96875, |
| "epoch": 0.4666666666666667, |
| "grad_norm": 0.5179000391744707, |
| "kl": 0.038330078125, |
| "learning_rate": 2.774669388548604e-07, |
| "loss": 0.0015, |
| "reward": 4.487500190734863, |
| "reward_std": 0.014433743432164192, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 273, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 362.53125, |
| "epoch": 0.4683760683760684, |
| "grad_norm": 0.15075527427477844, |
| "kl": 0.035400390625, |
| "learning_rate": 2.761321158169134e-07, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 274, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.625, |
| "epoch": 0.4700854700854701, |
| "grad_norm": 0.6023840972420389, |
| "kl": 0.04217529296875, |
| "learning_rate": 2.7479653914312606e-07, |
| "loss": 0.0017, |
| "reward": 4.487500190734863, |
| "reward_std": 0.014433743432164192, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 275, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 365.34375, |
| "epoch": 0.4717948717948718, |
| "grad_norm": 1.0588371671437944, |
| "kl": 0.034088134765625, |
| "learning_rate": 2.7346024735079484e-07, |
| "loss": 0.0014, |
| "reward": 4.4666666984558105, |
| "reward_std": 0.06666667759418488, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 276, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.875, |
| "epoch": 0.4735042735042735, |
| "grad_norm": 0.9311754494662231, |
| "kl": 0.0364990234375, |
| "learning_rate": 2.721232789778396e-07, |
| "loss": 0.0015, |
| "reward": 4.46875, |
| "reward_std": 0.06250005960464478, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 277, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.6875, |
| "epoch": 0.4752136752136752, |
| "grad_norm": 0.7622217392129294, |
| "kl": 0.036468505859375, |
| "learning_rate": 2.707856725816926e-07, |
| "loss": 0.0015, |
| "reward": 4.4822916984558105, |
| "reward_std": 0.024850429967045784, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 278, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.21875, |
| "epoch": 0.47692307692307695, |
| "grad_norm": 0.07605318199841635, |
| "kl": 0.036376953125, |
| "learning_rate": 2.694474667381862e-07, |
| "loss": 0.0015, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 279, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.40625, |
| "epoch": 0.47863247863247865, |
| "grad_norm": 0.08281321697663974, |
| "kl": 0.031402587890625, |
| "learning_rate": 2.681087000404406e-07, |
| "loss": 0.0013, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 280, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.9375, |
| "epoch": 0.48034188034188036, |
| "grad_norm": 0.9158024249719325, |
| "kl": 0.035797119140625, |
| "learning_rate": 2.667694110977506e-07, |
| "loss": 0.0014, |
| "reward": 4.428124904632568, |
| "reward_std": 0.14375001192092896, |
| "rewards/accuracy_reward": 2.9281249046325684, |
| "rewards/format_reward": 1.0, |
| "step": 281, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.875, |
| "epoch": 0.48205128205128206, |
| "grad_norm": 1.2380224918264358, |
| "kl": 0.035797119140625, |
| "learning_rate": 2.6542963853447236e-07, |
| "loss": 0.0014, |
| "reward": 4.409374713897705, |
| "reward_std": 0.18125000596046448, |
| "rewards/accuracy_reward": 2.921875238418579, |
| "rewards/format_reward": 1.0, |
| "step": 282, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.0, |
| "epoch": 0.48376068376068376, |
| "grad_norm": 0.8805302625019681, |
| "kl": 0.039886474609375, |
| "learning_rate": 2.6408942098890937e-07, |
| "loss": 0.0016, |
| "reward": 4.478125095367432, |
| "reward_std": 0.04375004768371582, |
| "rewards/accuracy_reward": 2.9781250953674316, |
| "rewards/format_reward": 1.0, |
| "step": 283, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.84375, |
| "epoch": 0.48547008547008547, |
| "grad_norm": 0.7527809954073914, |
| "kl": 0.03570556640625, |
| "learning_rate": 2.627487971121981e-07, |
| "loss": 0.0014, |
| "reward": 4.4739580154418945, |
| "reward_std": 0.03987946733832359, |
| "rewards/accuracy_reward": 2.9739582538604736, |
| "rewards/format_reward": 1.0, |
| "step": 284, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.5625, |
| "epoch": 0.48717948717948717, |
| "grad_norm": 0.09612785925134994, |
| "kl": 0.037109375, |
| "learning_rate": 2.6140780556719347e-07, |
| "loss": 0.0015, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 285, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 371.9375, |
| "epoch": 0.4888888888888889, |
| "grad_norm": 1.0426281181628902, |
| "kl": 0.037139892578125, |
| "learning_rate": 2.600664850273538e-07, |
| "loss": 0.0015, |
| "reward": 4.431250095367432, |
| "reward_std": 0.12285532057285309, |
| "rewards/accuracy_reward": 2.981250047683716, |
| "rewards/format_reward": 0.96875, |
| "step": 286, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.6875, |
| "epoch": 0.4905982905982906, |
| "grad_norm": 0.6014376687211012, |
| "kl": 0.031768798828125, |
| "learning_rate": 2.5872487417562527e-07, |
| "loss": 0.0013, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833352580666542, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 287, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.0625, |
| "epoch": 0.49230769230769234, |
| "grad_norm": 0.7733776094186604, |
| "kl": 0.03582763671875, |
| "learning_rate": 2.573830117033266e-07, |
| "loss": 0.0014, |
| "reward": 4.478125095367432, |
| "reward_std": 0.043749988079071045, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 288, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.71875, |
| "epoch": 0.49401709401709404, |
| "grad_norm": 0.7138068342578241, |
| "kl": 0.035797119140625, |
| "learning_rate": 2.5604093630903305e-07, |
| "loss": 0.0014, |
| "reward": 4.479166507720947, |
| "reward_std": 0.04166668653488159, |
| "rewards/accuracy_reward": 2.9791667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 289, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.5625, |
| "epoch": 0.49572649572649574, |
| "grad_norm": 0.7761774380810611, |
| "kl": 0.031585693359375, |
| "learning_rate": 2.546986866974606e-07, |
| "loss": 0.0013, |
| "reward": 4.363541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.8697915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 290, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.78125, |
| "epoch": 0.49743589743589745, |
| "grad_norm": 0.8551880159617881, |
| "kl": 0.0345458984375, |
| "learning_rate": 2.5335630157834935e-07, |
| "loss": 0.0014, |
| "reward": 4.463541507720947, |
| "reward_std": 0.05353529751300812, |
| "rewards/accuracy_reward": 2.9697914123535156, |
| "rewards/format_reward": 1.0, |
| "step": 291, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.09375, |
| "epoch": 0.49914529914529915, |
| "grad_norm": 0.5529263640378508, |
| "kl": 0.03204345703125, |
| "learning_rate": 2.520138196653475e-07, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 292, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.53125, |
| "epoch": 0.5008547008547009, |
| "grad_norm": 0.9791061277473894, |
| "kl": 0.032501220703125, |
| "learning_rate": 2.506712796748946e-07, |
| "loss": 0.0013, |
| "reward": 4.474999904632568, |
| "reward_std": 0.03943371772766113, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 293, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.0, |
| "epoch": 0.5025641025641026, |
| "grad_norm": 0.8579863225514387, |
| "kl": 0.044525146484375, |
| "learning_rate": 2.4932872032510537e-07, |
| "loss": 0.0018, |
| "reward": 4.453125, |
| "reward_std": 0.09375003725290298, |
| "rewards/accuracy_reward": 2.953125, |
| "rewards/format_reward": 1.0, |
| "step": 294, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.8125, |
| "epoch": 0.5042735042735043, |
| "grad_norm": 0.8092357571514325, |
| "kl": 0.0313720703125, |
| "learning_rate": 2.4798618033465255e-07, |
| "loss": 0.0013, |
| "reward": 4.478125095367432, |
| "reward_std": 0.03244640305638313, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 295, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 365.84375, |
| "epoch": 0.505982905982906, |
| "grad_norm": 0.47095499450394307, |
| "kl": 0.03460693359375, |
| "learning_rate": 2.466436984216507e-07, |
| "loss": 0.0014, |
| "reward": 4.478125095367432, |
| "reward_std": 0.029536345973610878, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 296, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.0, |
| "epoch": 0.5076923076923077, |
| "grad_norm": 0.531091931616097, |
| "kl": 0.02972412109375, |
| "learning_rate": 2.453013133025394e-07, |
| "loss": 0.0012, |
| "reward": 4.484375, |
| "reward_std": 0.019946394488215446, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 297, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 365.8125, |
| "epoch": 0.5094017094017094, |
| "grad_norm": 1.2659627324757727, |
| "kl": 0.0391845703125, |
| "learning_rate": 2.43959063690967e-07, |
| "loss": 0.0016, |
| "reward": 4.417708396911621, |
| "reward_std": 0.15726101398468018, |
| "rewards/accuracy_reward": 2.930208206176758, |
| "rewards/format_reward": 1.0, |
| "step": 298, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.9375, |
| "epoch": 0.5111111111111111, |
| "grad_norm": 0.7314429482901976, |
| "kl": 0.042449951171875, |
| "learning_rate": 2.4261698829667347e-07, |
| "loss": 0.0017, |
| "reward": 4.479166507720947, |
| "reward_std": 0.041666705161333084, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 299, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.125, |
| "epoch": 0.5128205128205128, |
| "grad_norm": 0.09120795369226402, |
| "kl": 0.029754638671875, |
| "learning_rate": 2.412751258243748e-07, |
| "loss": 0.0012, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 300, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.90625, |
| "epoch": 0.5145299145299145, |
| "grad_norm": 0.7868332165703579, |
| "kl": 0.0433349609375, |
| "learning_rate": 2.399335149726463e-07, |
| "loss": 0.0017, |
| "reward": 4.4822916984558105, |
| "reward_std": 0.025839831680059433, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 301, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.0, |
| "epoch": 0.5162393162393163, |
| "grad_norm": 0.763574402727439, |
| "kl": 0.03216552734375, |
| "learning_rate": 2.3859219443280656e-07, |
| "loss": 0.0013, |
| "reward": 4.490624904632568, |
| "reward_std": 0.018750011920928955, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 302, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.1875, |
| "epoch": 0.517948717948718, |
| "grad_norm": 1.0058943562387195, |
| "kl": 0.03338623046875, |
| "learning_rate": 2.3725120288780184e-07, |
| "loss": 0.0013, |
| "reward": 4.425000190734863, |
| "reward_std": 0.1499999761581421, |
| "rewards/accuracy_reward": 2.9375, |
| "rewards/format_reward": 1.0, |
| "step": 303, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 428.5625, |
| "epoch": 0.5196581196581197, |
| "grad_norm": 0.42116405277150404, |
| "kl": 0.03240966796875, |
| "learning_rate": 2.359105790110906e-07, |
| "loss": 0.0013, |
| "reward": 4.488541603088379, |
| "reward_std": 0.013339842669665813, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 304, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.34375, |
| "epoch": 0.5213675213675214, |
| "grad_norm": 1.0950683758577704, |
| "kl": 0.0372314453125, |
| "learning_rate": 2.3457036146552762e-07, |
| "loss": 0.0015, |
| "reward": 4.431250095367432, |
| "reward_std": 0.11542317271232605, |
| "rewards/accuracy_reward": 2.9437499046325684, |
| "rewards/format_reward": 1.0, |
| "step": 305, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.09375, |
| "epoch": 0.5230769230769231, |
| "grad_norm": 0.7371614253995173, |
| "kl": 0.03411865234375, |
| "learning_rate": 2.3323058890224938e-07, |
| "loss": 0.0014, |
| "reward": 4.432291507720947, |
| "reward_std": 0.1354166865348816, |
| "rewards/accuracy_reward": 2.9322915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 306, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.71875, |
| "epoch": 0.5247863247863248, |
| "grad_norm": 0.5134694091227199, |
| "kl": 0.032379150390625, |
| "learning_rate": 2.3189129995955942e-07, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 307, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.28125, |
| "epoch": 0.5264957264957265, |
| "grad_norm": 0.8781524864900261, |
| "kl": 0.038330078125, |
| "learning_rate": 2.305525332618138e-07, |
| "loss": 0.0015, |
| "reward": 4.482291221618652, |
| "reward_std": 0.03541666269302368, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 308, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 399.0, |
| "epoch": 0.5282051282051282, |
| "grad_norm": 1.0415017947017724, |
| "kl": 0.033416748046875, |
| "learning_rate": 2.292143274183074e-07, |
| "loss": 0.0013, |
| "reward": 4.460416793823242, |
| "reward_std": 0.0791667103767395, |
| "rewards/accuracy_reward": 2.960416793823242, |
| "rewards/format_reward": 1.0, |
| "step": 309, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.875, |
| "epoch": 0.5299145299145299, |
| "grad_norm": 0.6008838011788349, |
| "kl": 0.03973388671875, |
| "learning_rate": 2.278767210221604e-07, |
| "loss": 0.0016, |
| "reward": 4.484375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 310, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.71875, |
| "epoch": 0.5316239316239316, |
| "grad_norm": 0.732823233771545, |
| "kl": 0.04248046875, |
| "learning_rate": 2.265397526492052e-07, |
| "loss": 0.0017, |
| "reward": 4.4583330154418945, |
| "reward_std": 0.08333335071802139, |
| "rewards/accuracy_reward": 2.9583334922790527, |
| "rewards/format_reward": 1.0, |
| "step": 311, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 403.90625, |
| "epoch": 0.5333333333333333, |
| "grad_norm": 0.8066497268246617, |
| "kl": 0.037872314453125, |
| "learning_rate": 2.2520346085687397e-07, |
| "loss": 0.0015, |
| "reward": 4.351041793823242, |
| "reward_std": 0.22291666269302368, |
| "rewards/accuracy_reward": 2.9010415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 312, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 405.625, |
| "epoch": 0.535042735042735, |
| "grad_norm": 0.5736301292855377, |
| "kl": 0.033050537109375, |
| "learning_rate": 2.2386788418308665e-07, |
| "loss": 0.0013, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 313, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 402.34375, |
| "epoch": 0.5367521367521367, |
| "grad_norm": 0.92886366537301, |
| "kl": 0.04010009765625, |
| "learning_rate": 2.225330611451396e-07, |
| "loss": 0.0016, |
| "reward": 4.475000381469727, |
| "reward_std": 0.050000011920928955, |
| "rewards/accuracy_reward": 2.981250047683716, |
| "rewards/format_reward": 1.0, |
| "step": 314, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 424.21875, |
| "epoch": 0.5384615384615384, |
| "grad_norm": 0.8095666212677034, |
| "kl": 0.035491943359375, |
| "learning_rate": 2.2119903023859473e-07, |
| "loss": 0.0014, |
| "reward": 4.462500095367432, |
| "reward_std": 0.07500001788139343, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 315, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 396.75, |
| "epoch": 0.5401709401709401, |
| "grad_norm": 0.9069226760525908, |
| "kl": 0.033599853515625, |
| "learning_rate": 2.1986582993616925e-07, |
| "loss": 0.0013, |
| "reward": 4.3458333015441895, |
| "reward_std": 0.23333333432674408, |
| "rewards/accuracy_reward": 2.8958334922790527, |
| "rewards/format_reward": 1.0, |
| "step": 316, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.4375, |
| "epoch": 0.5418803418803418, |
| "grad_norm": 0.08823704142856094, |
| "kl": 0.03680419921875, |
| "learning_rate": 2.1853349868662632e-07, |
| "loss": 0.0015, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 317, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.65625, |
| "epoch": 0.5435897435897435, |
| "grad_norm": 0.731770212008259, |
| "kl": 0.039154052734375, |
| "learning_rate": 2.1720207491366595e-07, |
| "loss": 0.0016, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 318, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 406.25, |
| "epoch": 0.5452991452991452, |
| "grad_norm": 0.12136352666188718, |
| "kl": 0.04296875, |
| "learning_rate": 2.1587159701481713e-07, |
| "loss": 0.0017, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 319, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 400.125, |
| "epoch": 0.5470085470085471, |
| "grad_norm": 0.5122046662863808, |
| "kl": 0.039886474609375, |
| "learning_rate": 2.1454210336033038e-07, |
| "loss": 0.0016, |
| "reward": 4.496874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 320, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 473.65625, |
| "epoch": 0.5487179487179488, |
| "grad_norm": 0.7720865025563447, |
| "kl": 0.037506103515625, |
| "learning_rate": 2.1321363229207094e-07, |
| "loss": 0.0015, |
| "reward": 4.46875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.987499952316284, |
| "rewards/format_reward": 1.0, |
| "step": 321, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 403.46875, |
| "epoch": 0.5504273504273505, |
| "grad_norm": 0.7221011874275695, |
| "kl": 0.039031982421875, |
| "learning_rate": 2.1188622212241363e-07, |
| "loss": 0.0016, |
| "reward": 4.4375, |
| "reward_std": 0.125, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 0.96875, |
| "step": 322, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 406.4375, |
| "epoch": 0.5521367521367522, |
| "grad_norm": 0.540236345777918, |
| "kl": 0.03485107421875, |
| "learning_rate": 2.1055991113313716e-07, |
| "loss": 0.0014, |
| "reward": 4.365624904632568, |
| "reward_std": 0.018750011920928955, |
| "rewards/accuracy_reward": 2.8656249046325684, |
| "rewards/format_reward": 1.0, |
| "step": 323, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.84375, |
| "epoch": 0.5538461538461539, |
| "grad_norm": 0.11607538632372809, |
| "kl": 0.03997802734375, |
| "learning_rate": 2.092347375743207e-07, |
| "loss": 0.0016, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 324, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.03125, |
| "epoch": 0.5555555555555556, |
| "grad_norm": 0.5056705769715667, |
| "kl": 0.039154052734375, |
| "learning_rate": 2.0791073966324034e-07, |
| "loss": 0.0016, |
| "reward": 4.307291507720947, |
| "reward_std": 0.1354166716337204, |
| "rewards/accuracy_reward": 2.8072915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 325, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.375, |
| "epoch": 0.5572649572649573, |
| "grad_norm": 0.9995739010662272, |
| "kl": 0.03607177734375, |
| "learning_rate": 2.065879555832674e-07, |
| "loss": 0.0014, |
| "reward": 4.459374904632568, |
| "reward_std": 0.07446783035993576, |
| "rewards/accuracy_reward": 2.965625047683716, |
| "rewards/format_reward": 1.0, |
| "step": 326, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.03125, |
| "epoch": 0.558974358974359, |
| "grad_norm": 0.5593797794480546, |
| "kl": 0.037933349609375, |
| "learning_rate": 2.052664234827668e-07, |
| "loss": 0.0015, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833352580666542, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 327, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.40625, |
| "epoch": 0.5606837606837607, |
| "grad_norm": 0.7643758219950748, |
| "kl": 0.0341796875, |
| "learning_rate": 2.039461814739971e-07, |
| "loss": 0.0014, |
| "reward": 4.484375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.9906249046325684, |
| "rewards/format_reward": 1.0, |
| "step": 328, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.84375, |
| "epoch": 0.5623931623931624, |
| "grad_norm": 0.5355887210870947, |
| "kl": 0.0362548828125, |
| "learning_rate": 2.0262726763201138e-07, |
| "loss": 0.0015, |
| "reward": 4.371874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.871875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 329, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.9375, |
| "epoch": 0.5641025641025641, |
| "grad_norm": 0.08255482149118182, |
| "kl": 0.030914306640625, |
| "learning_rate": 2.0130971999355901e-07, |
| "loss": 0.0012, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 330, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.5, |
| "epoch": 0.5658119658119658, |
| "grad_norm": 0.4549853870360058, |
| "kl": 0.03265380859375, |
| "learning_rate": 1.9999357655598891e-07, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 331, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.1875, |
| "epoch": 0.5675213675213675, |
| "grad_norm": 1.0004535826942826, |
| "kl": 0.04522705078125, |
| "learning_rate": 1.9867887527615357e-07, |
| "loss": 0.0018, |
| "reward": 4.446874618530273, |
| "reward_std": 0.10625002533197403, |
| "rewards/accuracy_reward": 2.953125, |
| "rewards/format_reward": 1.0, |
| "step": 332, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.46875, |
| "epoch": 0.5692307692307692, |
| "grad_norm": 0.9598847700617524, |
| "kl": 0.03875732421875, |
| "learning_rate": 1.9736565406931444e-07, |
| "loss": 0.0015, |
| "reward": 4.418749809265137, |
| "reward_std": 0.16250000894069672, |
| "rewards/accuracy_reward": 2.9312500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 333, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 390.09375, |
| "epoch": 0.5709401709401709, |
| "grad_norm": 0.5355616265414169, |
| "kl": 0.03277587890625, |
| "learning_rate": 1.960539508080485e-07, |
| "loss": 0.0013, |
| "reward": 4.356249809265137, |
| "reward_std": 0.19618761539459229, |
| "rewards/accuracy_reward": 2.90625, |
| "rewards/format_reward": 1.0, |
| "step": 334, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.625, |
| "epoch": 0.5726495726495726, |
| "grad_norm": 0.9278958288143269, |
| "kl": 0.033538818359375, |
| "learning_rate": 1.9474380332115597e-07, |
| "loss": 0.0013, |
| "reward": 4.477083206176758, |
| "reward_std": 0.045833367854356766, |
| "rewards/accuracy_reward": 2.9833333492279053, |
| "rewards/format_reward": 1.0, |
| "step": 335, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.78125, |
| "epoch": 0.5743589743589743, |
| "grad_norm": 0.7140882017996247, |
| "kl": 0.03717041015625, |
| "learning_rate": 1.934352493925695e-07, |
| "loss": 0.0015, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 336, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.28125, |
| "epoch": 0.576068376068376, |
| "grad_norm": 0.7140882017996247, |
| "kl": 0.03448486328125, |
| "learning_rate": 1.934352493925695e-07, |
| "loss": 0.0014, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 337, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.5, |
| "epoch": 0.5777777777777777, |
| "grad_norm": 0.8126061578453346, |
| "kl": 0.03192138671875, |
| "learning_rate": 1.9212832676026427e-07, |
| "loss": 0.0013, |
| "reward": 4.483333587646484, |
| "reward_std": 0.03333333879709244, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 338, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.96875, |
| "epoch": 0.5794871794871795, |
| "grad_norm": 0.7335705389742384, |
| "kl": 0.035125732421875, |
| "learning_rate": 1.9082307311516984e-07, |
| "loss": 0.0014, |
| "reward": 4.456250190734863, |
| "reward_std": 0.07693374156951904, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 339, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.4375, |
| "epoch": 0.5811965811965812, |
| "grad_norm": 0.7657935514777842, |
| "kl": 0.034881591796875, |
| "learning_rate": 1.895195261000831e-07, |
| "loss": 0.0014, |
| "reward": 4.484375, |
| "reward_std": 0.031250037252902985, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 340, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.71875, |
| "epoch": 0.582905982905983, |
| "grad_norm": 0.09547936317636307, |
| "kl": 0.034759521484375, |
| "learning_rate": 1.8821772330858257e-07, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 341, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.0625, |
| "epoch": 0.5846153846153846, |
| "grad_norm": 0.9453045319770538, |
| "kl": 0.03839111328125, |
| "learning_rate": 1.8691770228394454e-07, |
| "loss": 0.0015, |
| "reward": 4.477083206176758, |
| "reward_std": 0.04583340883255005, |
| "rewards/accuracy_reward": 2.977083206176758, |
| "rewards/format_reward": 1.0, |
| "step": 342, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.78125, |
| "epoch": 0.5863247863247864, |
| "grad_norm": 0.7927846448695134, |
| "kl": 0.03741455078125, |
| "learning_rate": 1.856195005180599e-07, |
| "loss": 0.0015, |
| "reward": 4.479166507720947, |
| "reward_std": 0.04166668653488159, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 343, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.5625, |
| "epoch": 0.588034188034188, |
| "grad_norm": 0.523415665615348, |
| "kl": 0.0307769775390625, |
| "learning_rate": 1.8432315545035327e-07, |
| "loss": 0.0012, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 344, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 408.28125, |
| "epoch": 0.5897435897435898, |
| "grad_norm": 0.07967194840458601, |
| "kl": 0.030059814453125, |
| "learning_rate": 1.8302870446670298e-07, |
| "loss": 0.0012, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 345, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.34375, |
| "epoch": 0.5914529914529915, |
| "grad_norm": 0.6739955777442904, |
| "kl": 0.032501220703125, |
| "learning_rate": 1.8173618489836313e-07, |
| "loss": 0.0013, |
| "reward": 4.452083110809326, |
| "reward_std": 0.04834301769733429, |
| "rewards/accuracy_reward": 2.9583334922790527, |
| "rewards/format_reward": 1.0, |
| "step": 346, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.21875, |
| "epoch": 0.5931623931623932, |
| "grad_norm": 0.9588346689039634, |
| "kl": 0.03369140625, |
| "learning_rate": 1.8044563402088682e-07, |
| "loss": 0.0013, |
| "reward": 4.463541507720947, |
| "reward_std": 0.04878613352775574, |
| "rewards/accuracy_reward": 2.9697914123535156, |
| "rewards/format_reward": 1.0, |
| "step": 347, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.78125, |
| "epoch": 0.5948717948717949, |
| "grad_norm": 0.91793025589706, |
| "kl": 0.0355224609375, |
| "learning_rate": 1.791570890530512e-07, |
| "loss": 0.0014, |
| "reward": 4.40625, |
| "reward_std": 0.1875000298023224, |
| "rewards/accuracy_reward": 2.90625, |
| "rewards/format_reward": 1.0, |
| "step": 348, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.9375, |
| "epoch": 0.5965811965811966, |
| "grad_norm": 0.5238665028970252, |
| "kl": 0.0275115966796875, |
| "learning_rate": 1.7787058715578413e-07, |
| "loss": 0.0011, |
| "reward": 4.479166507720947, |
| "reward_std": 0.029462780803442, |
| "rewards/accuracy_reward": 2.9791667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 349, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.625, |
| "epoch": 0.5982905982905983, |
| "grad_norm": 0.8802153456160818, |
| "kl": 0.0352783203125, |
| "learning_rate": 1.7658616543109234e-07, |
| "loss": 0.0014, |
| "reward": 4.485416889190674, |
| "reward_std": 0.029166698455810547, |
| "rewards/accuracy_reward": 2.991666793823242, |
| "rewards/format_reward": 1.0, |
| "step": 350, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.5, |
| "epoch": 0.6, |
| "grad_norm": 0.6080026072792618, |
| "kl": 0.033416748046875, |
| "learning_rate": 1.7530386092099156e-07, |
| "loss": 0.0013, |
| "reward": 4.478125095367432, |
| "reward_std": 0.025769436731934547, |
| "rewards/accuracy_reward": 2.9781248569488525, |
| "rewards/format_reward": 1.0, |
| "step": 351, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.84375, |
| "epoch": 0.6017094017094017, |
| "grad_norm": 0.6052317335576483, |
| "kl": 0.031951904296875, |
| "learning_rate": 1.7402371060643827e-07, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 352, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.46875, |
| "epoch": 0.6034188034188034, |
| "grad_norm": 0.5873898518946016, |
| "kl": 0.031982421875, |
| "learning_rate": 1.7274575140626315e-07, |
| "loss": 0.0013, |
| "reward": 4.46875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 353, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.5625, |
| "epoch": 0.6051282051282051, |
| "grad_norm": 0.534965438696097, |
| "kl": 0.03204345703125, |
| "learning_rate": 1.7147002017610623e-07, |
| "loss": 0.0013, |
| "reward": 4.496874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 354, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.125, |
| "epoch": 0.6068376068376068, |
| "grad_norm": 0.19326846885011464, |
| "kl": 0.034912109375, |
| "learning_rate": 1.7019655370735424e-07, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 355, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.78125, |
| "epoch": 0.6085470085470085, |
| "grad_norm": 1.2061684015115641, |
| "kl": 0.0396728515625, |
| "learning_rate": 1.6892538872607933e-07, |
| "loss": 0.0016, |
| "reward": 4.342708587646484, |
| "reward_std": 0.2912980318069458, |
| "rewards/accuracy_reward": 2.8489584922790527, |
| "rewards/format_reward": 1.0, |
| "step": 356, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.3125, |
| "epoch": 0.6102564102564103, |
| "grad_norm": 1.1222016670512132, |
| "kl": 0.03387451171875, |
| "learning_rate": 1.6765656189198011e-07, |
| "loss": 0.0014, |
| "reward": 4.426041603088379, |
| "reward_std": 0.14791671931743622, |
| "rewards/accuracy_reward": 2.926041603088379, |
| "rewards/format_reward": 1.0, |
| "step": 357, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.34375, |
| "epoch": 0.611965811965812, |
| "grad_norm": 0.9056732169837467, |
| "kl": 0.031890869140625, |
| "learning_rate": 1.6639010979732428e-07, |
| "loss": 0.0013, |
| "reward": 4.484375, |
| "reward_std": 0.031250059604644775, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 358, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.3125, |
| "epoch": 0.6136752136752137, |
| "grad_norm": 0.10457460621666292, |
| "kl": 0.0273895263671875, |
| "learning_rate": 1.6512606896589322e-07, |
| "loss": 0.0011, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 359, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 369.34375, |
| "epoch": 0.6153846153846154, |
| "grad_norm": 0.8589108015834329, |
| "kl": 0.03399658203125, |
| "learning_rate": 1.6386447585192908e-07, |
| "loss": 0.0014, |
| "reward": 4.476041793823242, |
| "reward_std": 0.03735045716166496, |
| "rewards/accuracy_reward": 2.988541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 360, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.875, |
| "epoch": 0.6170940170940171, |
| "grad_norm": 0.5237910767837538, |
| "kl": 0.03424072265625, |
| "learning_rate": 1.6260536683908298e-07, |
| "loss": 0.0014, |
| "reward": 4.485416412353516, |
| "reward_std": 0.018162094056606293, |
| "rewards/accuracy_reward": 2.9854166507720947, |
| "rewards/format_reward": 1.0, |
| "step": 361, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.6875, |
| "epoch": 0.6188034188034188, |
| "grad_norm": 0.7206265941802336, |
| "kl": 0.0287017822265625, |
| "learning_rate": 1.6134877823936607e-07, |
| "loss": 0.0011, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 362, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 372.96875, |
| "epoch": 0.6205128205128205, |
| "grad_norm": 1.383167067518889, |
| "kl": 0.034393310546875, |
| "learning_rate": 1.6009474629210202e-07, |
| "loss": 0.0014, |
| "reward": 4.219791412353516, |
| "reward_std": 0.28802648186683655, |
| "rewards/accuracy_reward": 2.788541555404663, |
| "rewards/format_reward": 1.0, |
| "step": 363, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.59375, |
| "epoch": 0.6222222222222222, |
| "grad_norm": 1.1165705187064898, |
| "kl": 0.030120849609375, |
| "learning_rate": 1.5884330716288212e-07, |
| "loss": 0.0012, |
| "reward": 4.449999809265137, |
| "reward_std": 0.07795004546642303, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 364, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.1875, |
| "epoch": 0.6239316239316239, |
| "grad_norm": 0.7203681625804031, |
| "kl": 0.0330810546875, |
| "learning_rate": 1.5759449694252225e-07, |
| "loss": 0.0013, |
| "reward": 4.478125095367432, |
| "reward_std": 0.032446444034576416, |
| "rewards/accuracy_reward": 2.9781250953674316, |
| "rewards/format_reward": 1.0, |
| "step": 365, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 369.9375, |
| "epoch": 0.6256410256410256, |
| "grad_norm": 0.7514346669369655, |
| "kl": 0.035736083984375, |
| "learning_rate": 1.5634835164602196e-07, |
| "loss": 0.0014, |
| "reward": 4.457291603088379, |
| "reward_std": 0.08541668206453323, |
| "rewards/accuracy_reward": 2.9635415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 366, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 364.0, |
| "epoch": 0.6273504273504273, |
| "grad_norm": 0.5135545227710894, |
| "kl": 0.0283050537109375, |
| "learning_rate": 1.551049072115259e-07, |
| "loss": 0.0011, |
| "reward": 4.481249809265137, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 367, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.1875, |
| "epoch": 0.629059829059829, |
| "grad_norm": 0.7832820562893558, |
| "kl": 0.035369873046875, |
| "learning_rate": 1.5386419949928732e-07, |
| "loss": 0.0014, |
| "reward": 4.464583396911621, |
| "reward_std": 0.07083334028720856, |
| "rewards/accuracy_reward": 2.964583396911621, |
| "rewards/format_reward": 1.0, |
| "step": 368, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.1875, |
| "epoch": 0.6307692307692307, |
| "grad_norm": 1.3554980343088359, |
| "kl": 0.03521728515625, |
| "learning_rate": 1.5262626429063382e-07, |
| "loss": 0.0014, |
| "reward": 4.3854169845581055, |
| "reward_std": 0.16212184727191925, |
| "rewards/accuracy_reward": 2.8916666507720947, |
| "rewards/format_reward": 1.0, |
| "step": 369, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.03125, |
| "epoch": 0.6324786324786325, |
| "grad_norm": 1.024651879366642, |
| "kl": 0.032989501953125, |
| "learning_rate": 1.5139113728693572e-07, |
| "loss": 0.0013, |
| "reward": 4.415624618530273, |
| "reward_std": 0.16875003278255463, |
| "rewards/accuracy_reward": 2.921875, |
| "rewards/format_reward": 1.0, |
| "step": 370, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.40625, |
| "epoch": 0.6341880341880342, |
| "grad_norm": 1.0957802284772251, |
| "kl": 0.0343017578125, |
| "learning_rate": 1.5015885410857614e-07, |
| "loss": 0.0014, |
| "reward": 4.446875095367432, |
| "reward_std": 0.10624998807907104, |
| "rewards/accuracy_reward": 2.953125, |
| "rewards/format_reward": 1.0, |
| "step": 371, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 366.65625, |
| "epoch": 0.6358974358974359, |
| "grad_norm": 0.8228003354495702, |
| "kl": 0.03204345703125, |
| "learning_rate": 1.4892945029392378e-07, |
| "loss": 0.0013, |
| "reward": 4.465624809265137, |
| "reward_std": 0.048403505235910416, |
| "rewards/accuracy_reward": 2.971874952316284, |
| "rewards/format_reward": 1.0, |
| "step": 372, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.96875, |
| "epoch": 0.6376068376068376, |
| "grad_norm": 0.7670469397108076, |
| "kl": 0.036468505859375, |
| "learning_rate": 1.4770296129830817e-07, |
| "loss": 0.0015, |
| "reward": 4.481249809265137, |
| "reward_std": 0.03017766959965229, |
| "rewards/accuracy_reward": 2.987499952316284, |
| "rewards/format_reward": 1.0, |
| "step": 373, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 371.875, |
| "epoch": 0.6393162393162393, |
| "grad_norm": 0.5412882759369413, |
| "kl": 0.028717041015625, |
| "learning_rate": 1.4647942249299704e-07, |
| "loss": 0.0011, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 374, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.09375, |
| "epoch": 0.6410256410256411, |
| "grad_norm": 0.7394286144083122, |
| "kl": 0.0311279296875, |
| "learning_rate": 1.4525886916417629e-07, |
| "loss": 0.0012, |
| "reward": 4.356249809265137, |
| "reward_std": 0.03750002384185791, |
| "rewards/accuracy_reward": 2.856250047683716, |
| "rewards/format_reward": 1.0, |
| "step": 375, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.65625, |
| "epoch": 0.6427350427350428, |
| "grad_norm": 0.8768404382198639, |
| "kl": 0.033660888671875, |
| "learning_rate": 1.4404133651193212e-07, |
| "loss": 0.0013, |
| "reward": 4.469791412353516, |
| "reward_std": 0.04280628263950348, |
| "rewards/accuracy_reward": 2.976041793823242, |
| "rewards/format_reward": 1.0, |
| "step": 376, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.25, |
| "epoch": 0.6444444444444445, |
| "grad_norm": 0.7423460450093495, |
| "kl": 0.031463623046875, |
| "learning_rate": 1.428268596492364e-07, |
| "loss": 0.0013, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 377, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.96875, |
| "epoch": 0.6461538461538462, |
| "grad_norm": 0.6563307928419717, |
| "kl": 0.028289794921875, |
| "learning_rate": 1.4161547360093362e-07, |
| "loss": 0.0011, |
| "reward": 4.478125095367432, |
| "reward_std": 0.043749988079071045, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 378, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.6875, |
| "epoch": 0.6478632478632479, |
| "grad_norm": 0.7929408963957806, |
| "kl": 0.0272674560546875, |
| "learning_rate": 1.404072133027306e-07, |
| "loss": 0.0011, |
| "reward": 4.478125095367432, |
| "reward_std": 0.043749988079071045, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 379, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.71875, |
| "epoch": 0.6495726495726496, |
| "grad_norm": 0.7163262109272821, |
| "kl": 0.033660888671875, |
| "learning_rate": 1.392021136001897e-07, |
| "loss": 0.0013, |
| "reward": 4.491666793823242, |
| "reward_std": 0.016666710376739502, |
| "rewards/accuracy_reward": 2.991666793823242, |
| "rewards/format_reward": 1.0, |
| "step": 380, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.84375, |
| "epoch": 0.6512820512820513, |
| "grad_norm": 0.7711914301051815, |
| "kl": 0.033843994140625, |
| "learning_rate": 1.3800020924772292e-07, |
| "loss": 0.0014, |
| "reward": 4.478125095367432, |
| "reward_std": 0.030829090625047684, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 381, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.0625, |
| "epoch": 0.652991452991453, |
| "grad_norm": 0.7855800057850417, |
| "kl": 0.03350830078125, |
| "learning_rate": 1.3680153490759073e-07, |
| "loss": 0.0013, |
| "reward": 4.478125095367432, |
| "reward_std": 0.043749988079071045, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 382, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.4375, |
| "epoch": 0.6547008547008547, |
| "grad_norm": 0.791258474349177, |
| "kl": 0.034576416015625, |
| "learning_rate": 1.3560612514890117e-07, |
| "loss": 0.0014, |
| "reward": 4.483333110809326, |
| "reward_std": 0.02375653013586998, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 383, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.59375, |
| "epoch": 0.6564102564102564, |
| "grad_norm": 0.549626026484795, |
| "kl": 0.03564453125, |
| "learning_rate": 1.3441401444661416e-07, |
| "loss": 0.0014, |
| "reward": 4.428124904632568, |
| "reward_std": 0.14375001192092896, |
| "rewards/accuracy_reward": 2.9281249046325684, |
| "rewards/format_reward": 1.0, |
| "step": 384, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.28125, |
| "epoch": 0.6581196581196581, |
| "grad_norm": 0.9408993865860127, |
| "kl": 0.0305633544921875, |
| "learning_rate": 1.3322523718054612e-07, |
| "loss": 0.0012, |
| "reward": 4.482291221618652, |
| "reward_std": 0.03541666269302368, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 385, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 369.75, |
| "epoch": 0.6598290598290598, |
| "grad_norm": 0.9335523737841157, |
| "kl": 0.032958984375, |
| "learning_rate": 1.320398276343795e-07, |
| "loss": 0.0013, |
| "reward": 4.462500095367432, |
| "reward_std": 0.07500000298023224, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 386, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.5, |
| "epoch": 0.6615384615384615, |
| "grad_norm": 0.49404137427048744, |
| "kl": 0.0316162109375, |
| "learning_rate": 1.30857819994673e-07, |
| "loss": 0.0013, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833352580666542, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 387, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.5, |
| "epoch": 0.6632478632478632, |
| "grad_norm": 0.09086186442266743, |
| "kl": 0.0306854248046875, |
| "learning_rate": 1.2967924834987686e-07, |
| "loss": 0.0012, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 388, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.21875, |
| "epoch": 0.6649572649572649, |
| "grad_norm": 0.9109818811980164, |
| "kl": 0.03759765625, |
| "learning_rate": 1.2850414668934847e-07, |
| "loss": 0.0015, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 389, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.59375, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.7334463778326866, |
| "kl": 0.03118896484375, |
| "learning_rate": 1.2733254890237334e-07, |
| "loss": 0.0012, |
| "reward": 4.478124618530273, |
| "reward_std": 0.030829111114144325, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 390, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.65625, |
| "epoch": 0.6683760683760683, |
| "grad_norm": 0.7840492710187797, |
| "kl": 0.0286407470703125, |
| "learning_rate": 1.2616448877718673e-07, |
| "loss": 0.0011, |
| "reward": 4.487500190734863, |
| "reward_std": 0.025000017136335373, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 391, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.75, |
| "epoch": 0.67008547008547, |
| "grad_norm": 0.9495331007616914, |
| "kl": 0.0318603515625, |
| "learning_rate": 1.2500000000000005e-07, |
| "loss": 0.0013, |
| "reward": 4.441666603088379, |
| "reward_std": 0.10923752188682556, |
| "rewards/accuracy_reward": 2.941666603088379, |
| "rewards/format_reward": 1.0, |
| "step": 392, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.625, |
| "epoch": 0.6717948717948717, |
| "grad_norm": 0.8886420165233839, |
| "kl": 0.031982421875, |
| "learning_rate": 1.238391161540287e-07, |
| "loss": 0.0013, |
| "reward": 4.346875190734863, |
| "reward_std": 0.04866214841604233, |
| "rewards/accuracy_reward": 2.846874952316284, |
| "rewards/format_reward": 1.0, |
| "step": 393, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.03125, |
| "epoch": 0.6735042735042736, |
| "grad_norm": 0.49575490041548353, |
| "kl": 0.03472900390625, |
| "learning_rate": 1.2268187071852417e-07, |
| "loss": 0.0014, |
| "reward": 4.496874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 394, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.75, |
| "epoch": 0.6752136752136753, |
| "grad_norm": 0.5842336321782885, |
| "kl": 0.031982421875, |
| "learning_rate": 1.2152829706780784e-07, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 395, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 399.59375, |
| "epoch": 0.676923076923077, |
| "grad_norm": 0.8273180840153671, |
| "kl": 0.032012939453125, |
| "learning_rate": 1.203784284703091e-07, |
| "loss": 0.0013, |
| "reward": 4.469791889190674, |
| "reward_std": 0.06041668727993965, |
| "rewards/accuracy_reward": 2.976041793823242, |
| "rewards/format_reward": 1.0, |
| "step": 396, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.65625, |
| "epoch": 0.6786324786324787, |
| "grad_norm": 0.09560569515302797, |
| "kl": 0.033966064453125, |
| "learning_rate": 1.1923229808760561e-07, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 397, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.59375, |
| "epoch": 0.6803418803418804, |
| "grad_norm": 1.1765257678004042, |
| "kl": 0.034149169921875, |
| "learning_rate": 1.1808993897346678e-07, |
| "loss": 0.0014, |
| "reward": 4.453125, |
| "reward_std": 0.07613962143659592, |
| "rewards/accuracy_reward": 2.9593749046325684, |
| "rewards/format_reward": 1.0, |
| "step": 398, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.53125, |
| "epoch": 0.6820512820512821, |
| "grad_norm": 0.5454270246051581, |
| "kl": 0.03558349609375, |
| "learning_rate": 1.16951384072901e-07, |
| "loss": 0.0014, |
| "reward": 4.493750095367432, |
| "reward_std": 0.0072169057093560696, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 399, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.28125, |
| "epoch": 0.6837606837606838, |
| "grad_norm": 1.093814427642604, |
| "kl": 0.035003662109375, |
| "learning_rate": 1.1581666622120492e-07, |
| "loss": 0.0014, |
| "reward": 4.474999904632568, |
| "reward_std": 0.05000007152557373, |
| "rewards/accuracy_reward": 2.9750001430511475, |
| "rewards/format_reward": 1.0, |
| "step": 400, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.21875, |
| "epoch": 0.6854700854700855, |
| "grad_norm": 0.7507411819451001, |
| "kl": 0.032745361328125, |
| "learning_rate": 1.1468581814301717e-07, |
| "loss": 0.0013, |
| "reward": 4.464583396911621, |
| "reward_std": 0.07083334028720856, |
| "rewards/accuracy_reward": 2.964583396911621, |
| "rewards/format_reward": 1.0, |
| "step": 401, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.5, |
| "epoch": 0.6871794871794872, |
| "grad_norm": 0.7522102069840583, |
| "kl": 0.032196044921875, |
| "learning_rate": 1.1355887245137383e-07, |
| "loss": 0.0013, |
| "reward": 4.472916603088379, |
| "reward_std": 0.05416666343808174, |
| "rewards/accuracy_reward": 2.9791667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 402, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.53125, |
| "epoch": 0.6888888888888889, |
| "grad_norm": 0.7302779915955037, |
| "kl": 0.033203125, |
| "learning_rate": 1.1243586164676871e-07, |
| "loss": 0.0013, |
| "reward": 4.477083206176758, |
| "reward_std": 0.03407880663871765, |
| "rewards/accuracy_reward": 2.977083206176758, |
| "rewards/format_reward": 1.0, |
| "step": 403, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.875, |
| "epoch": 0.6905982905982906, |
| "grad_norm": 1.1002198477594594, |
| "kl": 0.032806396484375, |
| "learning_rate": 1.1131681811621527e-07, |
| "loss": 0.0013, |
| "reward": 4.471874713897705, |
| "reward_std": 0.03420006483793259, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 404, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.3125, |
| "epoch": 0.6923076923076923, |
| "grad_norm": 0.7461741999370974, |
| "kl": 0.0283203125, |
| "learning_rate": 1.1020177413231332e-07, |
| "loss": 0.0011, |
| "reward": 4.483333587646484, |
| "reward_std": 0.03333333879709244, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 405, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.0625, |
| "epoch": 0.694017094017094, |
| "grad_norm": 0.10058816234853812, |
| "kl": 0.032928466796875, |
| "learning_rate": 1.0909076185231761e-07, |
| "loss": 0.0013, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 406, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.4375, |
| "epoch": 0.6957264957264957, |
| "grad_norm": 0.4593278233452189, |
| "kl": 0.03375244140625, |
| "learning_rate": 1.0798381331721107e-07, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 407, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.90625, |
| "epoch": 0.6974358974358974, |
| "grad_norm": 0.9300004852827248, |
| "kl": 0.029571533203125, |
| "learning_rate": 1.0688096045078022e-07, |
| "loss": 0.0012, |
| "reward": 4.3458333015441895, |
| "reward_std": 0.0583333745598793, |
| "rewards/accuracy_reward": 2.8458333015441895, |
| "rewards/format_reward": 1.0, |
| "step": 408, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.21875, |
| "epoch": 0.6991452991452991, |
| "grad_norm": 0.9315789428494882, |
| "kl": 0.030853271484375, |
| "learning_rate": 1.0578223505869493e-07, |
| "loss": 0.0012, |
| "reward": 4.474999904632568, |
| "reward_std": 0.05000004917383194, |
| "rewards/accuracy_reward": 2.9749999046325684, |
| "rewards/format_reward": 1.0, |
| "step": 409, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.03125, |
| "epoch": 0.7008547008547008, |
| "grad_norm": 0.750030869242921, |
| "kl": 0.03131103515625, |
| "learning_rate": 1.0468766882759092e-07, |
| "loss": 0.0013, |
| "reward": 4.483333587646484, |
| "reward_std": 0.03333337977528572, |
| "rewards/accuracy_reward": 2.9833333492279053, |
| "rewards/format_reward": 1.0, |
| "step": 410, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 401.375, |
| "epoch": 0.7025641025641025, |
| "grad_norm": 0.9122749493870264, |
| "kl": 0.033782958984375, |
| "learning_rate": 1.0359729332415582e-07, |
| "loss": 0.0014, |
| "reward": 4.4239583015441895, |
| "reward_std": 0.1520833820104599, |
| "rewards/accuracy_reward": 2.9239583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 411, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.8125, |
| "epoch": 0.7042735042735043, |
| "grad_norm": 0.08071277485359081, |
| "kl": 0.031341552734375, |
| "learning_rate": 1.0251113999421935e-07, |
| "loss": 0.0013, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 412, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.71875, |
| "epoch": 0.705982905982906, |
| "grad_norm": 0.6055413986472074, |
| "kl": 0.03631591796875, |
| "learning_rate": 1.0142924016184568e-07, |
| "loss": 0.0015, |
| "reward": 4.469791412353516, |
| "reward_std": 0.026434533298015594, |
| "rewards/accuracy_reward": 2.9697916507720947, |
| "rewards/format_reward": 1.0, |
| "step": 413, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.40625, |
| "epoch": 0.7076923076923077, |
| "grad_norm": 0.8151039715648754, |
| "kl": 0.033416748046875, |
| "learning_rate": 1.0035162502843073e-07, |
| "loss": 0.0013, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 414, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 406.40625, |
| "epoch": 0.7094017094017094, |
| "grad_norm": 0.7739588314582155, |
| "kl": 0.032196044921875, |
| "learning_rate": 9.927832567180192e-08, |
| "loss": 0.0013, |
| "reward": 4.471875190734863, |
| "reward_std": 0.05624999850988388, |
| "rewards/accuracy_reward": 2.971874952316284, |
| "rewards/format_reward": 1.0, |
| "step": 415, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.03125, |
| "epoch": 0.7111111111111111, |
| "grad_norm": 1.1556123751806124, |
| "kl": 0.0338134765625, |
| "learning_rate": 9.82093730453222e-08, |
| "loss": 0.0014, |
| "reward": 4.472916603088379, |
| "reward_std": 0.05416671186685562, |
| "rewards/accuracy_reward": 2.9791667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 416, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.75, |
| "epoch": 0.7128205128205128, |
| "grad_norm": 0.9375743065286576, |
| "kl": 0.0308837890625, |
| "learning_rate": 9.714479797699692e-08, |
| "loss": 0.0012, |
| "reward": 4.471875190734863, |
| "reward_std": 0.045683782547712326, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 417, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 397.71875, |
| "epoch": 0.7145299145299145, |
| "grad_norm": 0.5137791697239468, |
| "kl": 0.031402587890625, |
| "learning_rate": 9.608463116858542e-08, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012500028125941753, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 418, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.21875, |
| "epoch": 0.7162393162393162, |
| "grad_norm": 0.9293546876839371, |
| "kl": 0.033905029296875, |
| "learning_rate": 9.50289031947149e-08, |
| "loss": 0.0014, |
| "reward": 4.465624809265137, |
| "reward_std": 0.0565461590886116, |
| "rewards/accuracy_reward": 2.9656248092651367, |
| "rewards/format_reward": 1.0, |
| "step": 419, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.34375, |
| "epoch": 0.717948717948718, |
| "grad_norm": 0.5212991604802728, |
| "kl": 0.035247802734375, |
| "learning_rate": 9.397764450199936e-08, |
| "loss": 0.0014, |
| "reward": 4.490624904632568, |
| "reward_std": 0.018750011920928955, |
| "rewards/accuracy_reward": 2.9906249046325684, |
| "rewards/format_reward": 1.0, |
| "step": 420, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 397.59375, |
| "epoch": 0.7196581196581197, |
| "grad_norm": 0.5197704693590894, |
| "kl": 0.033599853515625, |
| "learning_rate": 9.293088540816079e-08, |
| "loss": 0.0013, |
| "reward": 4.3645830154418945, |
| "reward_std": 0.012028153985738754, |
| "rewards/accuracy_reward": 2.8645832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 421, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 403.71875, |
| "epoch": 0.7213675213675214, |
| "grad_norm": 0.7154972281077171, |
| "kl": 0.0341796875, |
| "learning_rate": 9.18886561011557e-08, |
| "loss": 0.0014, |
| "reward": 4.4864583015441895, |
| "reward_std": 0.027083376422524452, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 422, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.03125, |
| "epoch": 0.7230769230769231, |
| "grad_norm": 0.7894497576328201, |
| "kl": 0.034088134765625, |
| "learning_rate": 9.085098663830365e-08, |
| "loss": 0.0014, |
| "reward": 4.479166507720947, |
| "reward_std": 0.041666705161333084, |
| "rewards/accuracy_reward": 2.9791667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 423, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 397.28125, |
| "epoch": 0.7247863247863248, |
| "grad_norm": 0.790896159678788, |
| "kl": 0.03564453125, |
| "learning_rate": 8.981790694542087e-08, |
| "loss": 0.0014, |
| "reward": 4.478125095367432, |
| "reward_std": 0.04375002905726433, |
| "rewards/accuracy_reward": 2.9781250953674316, |
| "rewards/format_reward": 1.0, |
| "step": 424, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 399.6875, |
| "epoch": 0.7264957264957265, |
| "grad_norm": 0.50330884911545, |
| "kl": 0.033538818359375, |
| "learning_rate": 8.87894468159574e-08, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 425, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.15625, |
| "epoch": 0.7282051282051282, |
| "grad_norm": 0.8186288594902728, |
| "kl": 0.034576416015625, |
| "learning_rate": 8.776563591013728e-08, |
| "loss": 0.0014, |
| "reward": 4.340624809265137, |
| "reward_std": 0.06875002384185791, |
| "rewards/accuracy_reward": 2.840625047683716, |
| "rewards/format_reward": 1.0, |
| "step": 426, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.1875, |
| "epoch": 0.7299145299145299, |
| "grad_norm": 0.6945302885094401, |
| "kl": 0.0308685302734375, |
| "learning_rate": 8.674650375410378e-08, |
| "loss": 0.0012, |
| "reward": 4.479166507720947, |
| "reward_std": 0.04166668653488159, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 427, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.6875, |
| "epoch": 0.7316239316239316, |
| "grad_norm": 0.44261007122030527, |
| "kl": 0.0377197265625, |
| "learning_rate": 8.573207973906735e-08, |
| "loss": 0.0015, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 428, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.6875, |
| "epoch": 0.7333333333333333, |
| "grad_norm": 0.9243327957772149, |
| "kl": 0.0330810546875, |
| "learning_rate": 8.47223931204585e-08, |
| "loss": 0.0013, |
| "reward": 4.46875, |
| "reward_std": 0.06250005960464478, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 429, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 393.875, |
| "epoch": 0.7350427350427351, |
| "grad_norm": 0.5341812028772001, |
| "kl": 0.03497314453125, |
| "learning_rate": 8.371747301708357e-08, |
| "loss": 0.0014, |
| "reward": 4.478125095367432, |
| "reward_std": 0.03590351715683937, |
| "rewards/accuracy_reward": 2.9781250953674316, |
| "rewards/format_reward": 1.0, |
| "step": 430, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 406.25, |
| "epoch": 0.7367521367521368, |
| "grad_norm": 0.7251723433495721, |
| "kl": 0.037750244140625, |
| "learning_rate": 8.271734841028552e-08, |
| "loss": 0.0015, |
| "reward": 4.306249618530273, |
| "reward_std": 0.19895486533641815, |
| "rewards/accuracy_reward": 2.84375, |
| "rewards/format_reward": 1.0, |
| "step": 431, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.875, |
| "epoch": 0.7384615384615385, |
| "grad_norm": 0.7511545208510357, |
| "kl": 0.03411865234375, |
| "learning_rate": 8.17220481431074e-08, |
| "loss": 0.0014, |
| "reward": 4.493750095367432, |
| "reward_std": 0.01250004768371582, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 432, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 405.34375, |
| "epoch": 0.7401709401709402, |
| "grad_norm": 0.6079327745864885, |
| "kl": 0.03179931640625, |
| "learning_rate": 8.073160091946155e-08, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012500028125941753, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 433, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 401.1875, |
| "epoch": 0.7418803418803419, |
| "grad_norm": 0.5700935104266779, |
| "kl": 0.0306243896484375, |
| "learning_rate": 7.974603530330067e-08, |
| "loss": 0.0012, |
| "reward": 4.487500190734863, |
| "reward_std": 0.014433743432164192, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 434, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.53125, |
| "epoch": 0.7435897435897436, |
| "grad_norm": 0.753083570670292, |
| "kl": 0.038360595703125, |
| "learning_rate": 7.876537971779493e-08, |
| "loss": 0.0015, |
| "reward": 4.431250095367432, |
| "reward_std": 0.13749998807907104, |
| "rewards/accuracy_reward": 2.9375, |
| "rewards/format_reward": 1.0, |
| "step": 435, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.53125, |
| "epoch": 0.7452991452991453, |
| "grad_norm": 0.8013844938082755, |
| "kl": 0.0286407470703125, |
| "learning_rate": 7.778966244451168e-08, |
| "loss": 0.0011, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.9895830154418945, |
| "rewards/format_reward": 1.0, |
| "step": 436, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.875, |
| "epoch": 0.747008547008547, |
| "grad_norm": 0.5320691070134214, |
| "kl": 0.03106689453125, |
| "learning_rate": 7.681891162260015e-08, |
| "loss": 0.0012, |
| "reward": 4.484375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 437, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.75, |
| "epoch": 0.7487179487179487, |
| "grad_norm": 0.7052346428951947, |
| "kl": 0.03802490234375, |
| "learning_rate": 7.585315524797998e-08, |
| "loss": 0.0015, |
| "reward": 4.46875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 438, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 406.59375, |
| "epoch": 0.7504273504273504, |
| "grad_norm": 0.8602799183038976, |
| "kl": 0.035736083984375, |
| "learning_rate": 7.489242117253341e-08, |
| "loss": 0.0014, |
| "reward": 4.4114580154418945, |
| "reward_std": 0.17708338797092438, |
| "rewards/accuracy_reward": 2.9114582538604736, |
| "rewards/format_reward": 1.0, |
| "step": 439, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 406.15625, |
| "epoch": 0.7521367521367521, |
| "grad_norm": 1.3190919507909775, |
| "kl": 0.0305633544921875, |
| "learning_rate": 7.39367371033027e-08, |
| "loss": 0.0012, |
| "reward": 4.4322919845581055, |
| "reward_std": 0.13541670143604279, |
| "rewards/accuracy_reward": 2.944791793823242, |
| "rewards/format_reward": 1.0, |
| "step": 440, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.78125, |
| "epoch": 0.7538461538461538, |
| "grad_norm": 0.47304749095078713, |
| "kl": 0.032012939453125, |
| "learning_rate": 7.298613060169034e-08, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 441, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.4375, |
| "epoch": 0.7555555555555555, |
| "grad_norm": 0.5246643007021763, |
| "kl": 0.034881591796875, |
| "learning_rate": 7.204062908266489e-08, |
| "loss": 0.0014, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833352580666542, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 442, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.21875, |
| "epoch": 0.7572649572649572, |
| "grad_norm": 0.09360830328304175, |
| "kl": 0.032196044921875, |
| "learning_rate": 7.110025981396975e-08, |
| "loss": 0.0013, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 443, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 397.875, |
| "epoch": 0.7589743589743589, |
| "grad_norm": 0.9678920676984522, |
| "kl": 0.031280517578125, |
| "learning_rate": 7.016504991533726e-08, |
| "loss": 0.0013, |
| "reward": 4.476041793823242, |
| "reward_std": 0.0366131067276001, |
| "rewards/accuracy_reward": 2.976041316986084, |
| "rewards/format_reward": 1.0, |
| "step": 444, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 397.3125, |
| "epoch": 0.7606837606837606, |
| "grad_norm": 0.8190169960494074, |
| "kl": 0.036529541015625, |
| "learning_rate": 6.923502635770617e-08, |
| "loss": 0.0015, |
| "reward": 4.46875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 445, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.84375, |
| "epoch": 0.7623931623931623, |
| "grad_norm": 0.5344964134250557, |
| "kl": 0.030120849609375, |
| "learning_rate": 6.831021596244424e-08, |
| "loss": 0.0012, |
| "reward": 4.490624904632568, |
| "reward_std": 0.018750011920928955, |
| "rewards/accuracy_reward": 2.9906249046325684, |
| "rewards/format_reward": 1.0, |
| "step": 446, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.125, |
| "epoch": 0.764102564102564, |
| "grad_norm": 0.5104896778148305, |
| "kl": 0.0340576171875, |
| "learning_rate": 6.739064540057424e-08, |
| "loss": 0.0014, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 447, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 394.90625, |
| "epoch": 0.7658119658119659, |
| "grad_norm": 0.923990603996032, |
| "kl": 0.036163330078125, |
| "learning_rate": 6.64763411920053e-08, |
| "loss": 0.0014, |
| "reward": 4.346874713897705, |
| "reward_std": 0.23125003278255463, |
| "rewards/accuracy_reward": 2.890625, |
| "rewards/format_reward": 1.0, |
| "step": 448, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.28125, |
| "epoch": 0.7675213675213676, |
| "grad_norm": 0.7213594899516708, |
| "kl": 0.027252197265625, |
| "learning_rate": 6.556732970476747e-08, |
| "loss": 0.0011, |
| "reward": 4.478125095367432, |
| "reward_std": 0.043749988079071045, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 449, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.84375, |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.1570549185979674, |
| "kl": 0.0416259765625, |
| "learning_rate": 6.466363715425199e-08, |
| "loss": 0.0017, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 450, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.53125, |
| "epoch": 0.770940170940171, |
| "grad_norm": 0.076118115415132, |
| "kl": 0.0299072265625, |
| "learning_rate": 6.376528960245476e-08, |
| "loss": 0.0012, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 451, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.375, |
| "epoch": 0.7726495726495727, |
| "grad_norm": 0.9166985950940988, |
| "kl": 0.03118896484375, |
| "learning_rate": 6.28723129572247e-08, |
| "loss": 0.0012, |
| "reward": 4.457291603088379, |
| "reward_std": 0.08541667461395264, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 0.96875, |
| "step": 452, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.1875, |
| "epoch": 0.7743589743589744, |
| "grad_norm": 0.7765296821892308, |
| "kl": 0.03106689453125, |
| "learning_rate": 6.198473297151704e-08, |
| "loss": 0.0012, |
| "reward": 4.463541507720947, |
| "reward_std": 0.07291668653488159, |
| "rewards/accuracy_reward": 2.9635415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 453, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 390.3125, |
| "epoch": 0.7760683760683761, |
| "grad_norm": 0.08247118622297261, |
| "kl": 0.035369873046875, |
| "learning_rate": 6.110257524264997e-08, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 454, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.1875, |
| "epoch": 0.7777777777777778, |
| "grad_norm": 0.1026029910501858, |
| "kl": 0.0347900390625, |
| "learning_rate": 6.022586521156714e-08, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 455, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.21875, |
| "epoch": 0.7794871794871795, |
| "grad_norm": 0.11223910005504889, |
| "kl": 0.0296478271484375, |
| "learning_rate": 5.935462816210324e-08, |
| "loss": 0.0012, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 456, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 396.46875, |
| "epoch": 0.7811965811965812, |
| "grad_norm": 0.09582940298335704, |
| "kl": 0.03265380859375, |
| "learning_rate": 5.848888922025552e-08, |
| "loss": 0.0013, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 457, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.46875, |
| "epoch": 0.7829059829059829, |
| "grad_norm": 1.0500760370031257, |
| "kl": 0.035552978515625, |
| "learning_rate": 5.76286733534585e-08, |
| "loss": 0.0014, |
| "reward": 4.467708587646484, |
| "reward_std": 0.05401712283492088, |
| "rewards/accuracy_reward": 2.980208396911621, |
| "rewards/format_reward": 1.0, |
| "step": 458, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.4375, |
| "epoch": 0.7846153846153846, |
| "grad_norm": 0.7419885297063772, |
| "kl": 0.02899169921875, |
| "learning_rate": 5.67740053698646e-08, |
| "loss": 0.0012, |
| "reward": 4.452083587646484, |
| "reward_std": 0.05791240185499191, |
| "rewards/accuracy_reward": 2.9583332538604736, |
| "rewards/format_reward": 1.0, |
| "step": 459, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 394.5, |
| "epoch": 0.7863247863247863, |
| "grad_norm": 0.6994476739758024, |
| "kl": 0.0301666259765625, |
| "learning_rate": 5.5924909917627995e-08, |
| "loss": 0.0012, |
| "reward": 4.453125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.953125, |
| "rewards/format_reward": 1.0, |
| "step": 460, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 381.625, |
| "epoch": 0.788034188034188, |
| "grad_norm": 1.1649572465381595, |
| "kl": 0.034759521484375, |
| "learning_rate": 5.508141148419443e-08, |
| "loss": 0.0014, |
| "reward": 4.337499618530273, |
| "reward_std": 0.24119488894939423, |
| "rewards/accuracy_reward": 2.8812499046325684, |
| "rewards/format_reward": 1.0, |
| "step": 461, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.5, |
| "epoch": 0.7897435897435897, |
| "grad_norm": 0.6974141792088103, |
| "kl": 0.028564453125, |
| "learning_rate": 5.424353439559445e-08, |
| "loss": 0.0011, |
| "reward": 4.487500190734863, |
| "reward_std": 0.019716894254088402, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 462, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 375.03125, |
| "epoch": 0.7914529914529914, |
| "grad_norm": 0.8862170174981424, |
| "kl": 0.031463623046875, |
| "learning_rate": 5.3411302815742324e-08, |
| "loss": 0.0013, |
| "reward": 4.3729166984558105, |
| "reward_std": 0.17916667461395264, |
| "rewards/accuracy_reward": 2.9166665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 463, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.46875, |
| "epoch": 0.7931623931623931, |
| "grad_norm": 0.7808535507743032, |
| "kl": 0.033905029296875, |
| "learning_rate": 5.2584740745738766e-08, |
| "loss": 0.0014, |
| "reward": 4.487500190734863, |
| "reward_std": 0.02499997615814209, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 464, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.625, |
| "epoch": 0.7948717948717948, |
| "grad_norm": 0.9890091754072906, |
| "kl": 0.0303192138671875, |
| "learning_rate": 5.176387202317914e-08, |
| "loss": 0.0012, |
| "reward": 4.483333110809326, |
| "reward_std": 0.03333336114883423, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 465, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.90625, |
| "epoch": 0.7965811965811965, |
| "grad_norm": 0.8810236829977253, |
| "kl": 0.03179931640625, |
| "learning_rate": 5.0948720321465605e-08, |
| "loss": 0.0013, |
| "reward": 4.484375, |
| "reward_std": 0.031250059604644775, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 466, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 375.875, |
| "epoch": 0.7982905982905983, |
| "grad_norm": 0.7607492562548063, |
| "kl": 0.031219482421875, |
| "learning_rate": 5.013930914912476e-08, |
| "loss": 0.0012, |
| "reward": 4.368750095367432, |
| "reward_std": 0.1875, |
| "rewards/accuracy_reward": 2.90625, |
| "rewards/format_reward": 1.0, |
| "step": 467, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.5625, |
| "epoch": 0.8, |
| "grad_norm": 0.6990381492224721, |
| "kl": 0.0308837890625, |
| "learning_rate": 4.9335661849129295e-08, |
| "loss": 0.0012, |
| "reward": 4.429166793823242, |
| "reward_std": 0.1416666954755783, |
| "rewards/accuracy_reward": 2.929166793823242, |
| "rewards/format_reward": 1.0, |
| "step": 468, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.59375, |
| "epoch": 0.8017094017094017, |
| "grad_norm": 0.7391548171546465, |
| "kl": 0.0304718017578125, |
| "learning_rate": 4.853780159822521e-08, |
| "loss": 0.0012, |
| "reward": 4.485416412353516, |
| "reward_std": 0.029166698455810547, |
| "rewards/accuracy_reward": 2.991666555404663, |
| "rewards/format_reward": 1.0, |
| "step": 469, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.25, |
| "epoch": 0.8034188034188035, |
| "grad_norm": 1.0602483865953845, |
| "kl": 0.0290985107421875, |
| "learning_rate": 4.774575140626316e-08, |
| "loss": 0.0012, |
| "reward": 4.453125, |
| "reward_std": 0.06269193440675735, |
| "rewards/accuracy_reward": 2.9593753814697266, |
| "rewards/format_reward": 1.0, |
| "step": 470, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.1875, |
| "epoch": 0.8051282051282052, |
| "grad_norm": 0.7802955828215046, |
| "kl": 0.028533935546875, |
| "learning_rate": 4.695953411553466e-08, |
| "loss": 0.0011, |
| "reward": 4.484375, |
| "reward_std": 0.020683767274022102, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 471, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.71875, |
| "epoch": 0.8068376068376069, |
| "grad_norm": 0.7183676034820473, |
| "kl": 0.033111572265625, |
| "learning_rate": 4.617917240011393e-08, |
| "loss": 0.0013, |
| "reward": 4.474999904632568, |
| "reward_std": 0.032993994653224945, |
| "rewards/accuracy_reward": 2.9812498092651367, |
| "rewards/format_reward": 1.0, |
| "step": 472, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.375, |
| "epoch": 0.8085470085470086, |
| "grad_norm": 0.8113707858305483, |
| "kl": 0.0306854248046875, |
| "learning_rate": 4.5404688765203233e-08, |
| "loss": 0.0012, |
| "reward": 4.487500190734863, |
| "reward_std": 0.025000035762786865, |
| "rewards/accuracy_reward": 2.9875001907348633, |
| "rewards/format_reward": 1.0, |
| "step": 473, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 366.96875, |
| "epoch": 0.8102564102564103, |
| "grad_norm": 1.5001336107988816, |
| "kl": 0.0313720703125, |
| "learning_rate": 4.463610554648459e-08, |
| "loss": 0.0013, |
| "reward": 4.327083587646484, |
| "reward_std": 0.25816476345062256, |
| "rewards/accuracy_reward": 2.8489584922790527, |
| "rewards/format_reward": 1.0, |
| "step": 474, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 375.75, |
| "epoch": 0.811965811965812, |
| "grad_norm": 0.9830876497961911, |
| "kl": 0.0305328369140625, |
| "learning_rate": 4.387344490947498e-08, |
| "loss": 0.0012, |
| "reward": 4.349999904632568, |
| "reward_std": 0.22499996423721313, |
| "rewards/accuracy_reward": 2.90625, |
| "rewards/format_reward": 1.0, |
| "step": 475, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.34375, |
| "epoch": 0.8136752136752137, |
| "grad_norm": 0.8463083045702553, |
| "kl": 0.034942626953125, |
| "learning_rate": 4.311672884888756e-08, |
| "loss": 0.0014, |
| "reward": 4.462499618530273, |
| "reward_std": 0.03685124218463898, |
| "rewards/accuracy_reward": 2.968749761581421, |
| "rewards/format_reward": 1.0, |
| "step": 476, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.96875, |
| "epoch": 0.8153846153846154, |
| "grad_norm": 0.8916852982526418, |
| "kl": 0.03509521484375, |
| "learning_rate": 4.2365979187997084e-08, |
| "loss": 0.0014, |
| "reward": 4.457291603088379, |
| "reward_std": 0.08541667461395264, |
| "rewards/accuracy_reward": 2.9635415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 477, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.15625, |
| "epoch": 0.8170940170940171, |
| "grad_norm": 2.5252893617820487, |
| "kl": 0.036285400390625, |
| "learning_rate": 4.162121757801068e-08, |
| "loss": 0.0015, |
| "reward": 4.465624809265137, |
| "reward_std": 0.06132083013653755, |
| "rewards/accuracy_reward": 2.9656248092651367, |
| "rewards/format_reward": 1.0, |
| "step": 478, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.59375, |
| "epoch": 0.8188034188034188, |
| "grad_norm": 0.10243465928061427, |
| "kl": 0.038970947265625, |
| "learning_rate": 4.0882465497443313e-08, |
| "loss": 0.0016, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 479, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.5625, |
| "epoch": 0.8205128205128205, |
| "grad_norm": 0.455371320569445, |
| "kl": 0.0281524658203125, |
| "learning_rate": 4.014974425149853e-08, |
| "loss": 0.0011, |
| "reward": 4.487500190734863, |
| "reward_std": 0.014433743432164192, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 480, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.65625, |
| "epoch": 0.8222222222222222, |
| "grad_norm": 0.8196123722085543, |
| "kl": 0.03131103515625, |
| "learning_rate": 3.942307497145378e-08, |
| "loss": 0.0013, |
| "reward": 4.3302083015441895, |
| "reward_std": 0.07463379204273224, |
| "rewards/accuracy_reward": 2.8489582538604736, |
| "rewards/format_reward": 1.0, |
| "step": 481, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 367.78125, |
| "epoch": 0.8239316239316239, |
| "grad_norm": 0.7022762843056454, |
| "kl": 0.032073974609375, |
| "learning_rate": 3.8702478614051345e-08, |
| "loss": 0.0013, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 482, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.125, |
| "epoch": 0.8256410256410256, |
| "grad_norm": 0.597726990167221, |
| "kl": 0.02642822265625, |
| "learning_rate": 3.798797596089351e-08, |
| "loss": 0.0011, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 483, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.40625, |
| "epoch": 0.8273504273504273, |
| "grad_norm": 0.9212880375324389, |
| "kl": 0.0294189453125, |
| "learning_rate": 3.727958761784375e-08, |
| "loss": 0.0012, |
| "reward": 4.481249809265137, |
| "reward_std": 0.03750000521540642, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 484, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.25, |
| "epoch": 0.8290598290598291, |
| "grad_norm": 0.7980192569673967, |
| "kl": 0.0251007080078125, |
| "learning_rate": 3.6577334014431997e-08, |
| "loss": 0.001, |
| "reward": 4.460416793823242, |
| "reward_std": 0.07112002372741699, |
| "rewards/accuracy_reward": 2.960416555404663, |
| "rewards/format_reward": 1.0, |
| "step": 485, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.21875, |
| "epoch": 0.8307692307692308, |
| "grad_norm": 0.08452297569542742, |
| "kl": 0.031005859375, |
| "learning_rate": 3.588123540326571e-08, |
| "loss": 0.0012, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 486, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.4375, |
| "epoch": 0.8324786324786325, |
| "grad_norm": 0.9056999881313452, |
| "kl": 0.027587890625, |
| "learning_rate": 3.5191311859445795e-08, |
| "loss": 0.0011, |
| "reward": 4.459374904632568, |
| "reward_std": 0.08125001192092896, |
| "rewards/accuracy_reward": 2.965625047683716, |
| "rewards/format_reward": 1.0, |
| "step": 487, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.25, |
| "epoch": 0.8341880341880342, |
| "grad_norm": 0.4452409717412741, |
| "kl": 0.03271484375, |
| "learning_rate": 3.450758327998768e-08, |
| "loss": 0.0013, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833352580666542, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 488, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.46875, |
| "epoch": 0.8358974358974359, |
| "grad_norm": 0.06945328996157386, |
| "kl": 0.032135009765625, |
| "learning_rate": 3.383006938324734e-08, |
| "loss": 0.0013, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 489, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.875, |
| "epoch": 0.8376068376068376, |
| "grad_norm": 1.0104190261642534, |
| "kl": 0.033935546875, |
| "learning_rate": 3.315878970835267e-08, |
| "loss": 0.0014, |
| "reward": 4.445833206176758, |
| "reward_std": 0.09952813386917114, |
| "rewards/accuracy_reward": 2.9583330154418945, |
| "rewards/format_reward": 1.0, |
| "step": 490, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.84375, |
| "epoch": 0.8393162393162393, |
| "grad_norm": 0.5840091013811081, |
| "kl": 0.03118896484375, |
| "learning_rate": 3.249376361464021e-08, |
| "loss": 0.0012, |
| "reward": 4.371874809265137, |
| "reward_std": 0.00625002384185791, |
| "rewards/accuracy_reward": 2.871875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 491, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 367.53125, |
| "epoch": 0.841025641025641, |
| "grad_norm": 0.9683824952769233, |
| "kl": 0.029205322265625, |
| "learning_rate": 3.183501028109642e-08, |
| "loss": 0.0012, |
| "reward": 4.474999904632568, |
| "reward_std": 0.03943371772766113, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 492, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.28125, |
| "epoch": 0.8427350427350427, |
| "grad_norm": 0.9037075245637992, |
| "kl": 0.028076171875, |
| "learning_rate": 3.1182548705805056e-08, |
| "loss": 0.0011, |
| "reward": 4.471875190734863, |
| "reward_std": 0.05624999478459358, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 493, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.6875, |
| "epoch": 0.8444444444444444, |
| "grad_norm": 0.7836588351126467, |
| "kl": 0.026031494140625, |
| "learning_rate": 3.053639770539884e-08, |
| "loss": 0.001, |
| "reward": 4.487500190734863, |
| "reward_std": 0.025000017136335373, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 494, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.59375, |
| "epoch": 0.8461538461538461, |
| "grad_norm": 0.8963328689269091, |
| "kl": 0.0316162109375, |
| "learning_rate": 2.989657591451716e-08, |
| "loss": 0.0013, |
| "reward": 4.456249713897705, |
| "reward_std": 0.07264164090156555, |
| "rewards/accuracy_reward": 2.9625000953674316, |
| "rewards/format_reward": 1.0, |
| "step": 495, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.96875, |
| "epoch": 0.8478632478632478, |
| "grad_norm": 0.4743907243744337, |
| "kl": 0.0283660888671875, |
| "learning_rate": 2.9263101785268252e-08, |
| "loss": 0.0011, |
| "reward": 4.359375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.859375, |
| "rewards/format_reward": 1.0, |
| "step": 496, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.5, |
| "epoch": 0.8495726495726496, |
| "grad_norm": 0.49878949373559706, |
| "kl": 0.028564453125, |
| "learning_rate": 2.863599358669755e-08, |
| "loss": 0.0011, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 497, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.125, |
| "epoch": 0.8512820512820513, |
| "grad_norm": 0.9923991962632441, |
| "kl": 0.0313720703125, |
| "learning_rate": 2.8015269404260327e-08, |
| "loss": 0.0013, |
| "reward": 4.391666412353516, |
| "reward_std": 0.20358917117118835, |
| "rewards/accuracy_reward": 2.897916793823242, |
| "rewards/format_reward": 1.0, |
| "step": 498, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.40625, |
| "epoch": 0.852991452991453, |
| "grad_norm": 0.760643580547958, |
| "kl": 0.03564453125, |
| "learning_rate": 2.740094713930044e-08, |
| "loss": 0.0014, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 499, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 367.4375, |
| "epoch": 0.8547008547008547, |
| "grad_norm": 0.7783159068664107, |
| "kl": 0.0299224853515625, |
| "learning_rate": 2.679304450853401e-08, |
| "loss": 0.0012, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 500, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.03125, |
| "epoch": 0.8564102564102564, |
| "grad_norm": 0.8319195898451789, |
| "kl": 0.0276336669921875, |
| "learning_rate": 2.6191579043538333e-08, |
| "loss": 0.0011, |
| "reward": 4.4270830154418945, |
| "reward_std": 0.14583337306976318, |
| "rewards/accuracy_reward": 2.9270832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 501, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.90625, |
| "epoch": 0.8581196581196581, |
| "grad_norm": 0.5761278935527648, |
| "kl": 0.03009033203125, |
| "learning_rate": 2.5596568090246545e-08, |
| "loss": 0.0012, |
| "reward": 4.484375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 502, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.34375, |
| "epoch": 0.8598290598290599, |
| "grad_norm": 0.5211704779844807, |
| "kl": 0.0301666259765625, |
| "learning_rate": 2.500802880844699e-08, |
| "loss": 0.0012, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 503, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.09375, |
| "epoch": 0.8615384615384616, |
| "grad_norm": 1.2959231439342969, |
| "kl": 0.029937744140625, |
| "learning_rate": 2.4425978171288802e-08, |
| "loss": 0.0012, |
| "reward": 4.444791793823242, |
| "reward_std": 0.07988535612821579, |
| "rewards/accuracy_reward": 2.9635417461395264, |
| "rewards/format_reward": 1.0, |
| "step": 504, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.59375, |
| "epoch": 0.8632478632478633, |
| "grad_norm": 0.4892530739596482, |
| "kl": 0.03399658203125, |
| "learning_rate": 2.3850432964791945e-08, |
| "loss": 0.0014, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 505, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.78125, |
| "epoch": 0.864957264957265, |
| "grad_norm": 0.770760898444276, |
| "kl": 0.03436279296875, |
| "learning_rate": 2.3281409787363648e-08, |
| "loss": 0.0014, |
| "reward": 4.481249809265137, |
| "reward_std": 0.03750002384185791, |
| "rewards/accuracy_reward": 2.981250047683716, |
| "rewards/format_reward": 1.0, |
| "step": 506, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 364.90625, |
| "epoch": 0.8666666666666667, |
| "grad_norm": 1.184501188781967, |
| "kl": 0.028167724609375, |
| "learning_rate": 2.2718925049319048e-08, |
| "loss": 0.0011, |
| "reward": 4.462500095367432, |
| "reward_std": 0.062079109251499176, |
| "rewards/accuracy_reward": 2.9749999046325684, |
| "rewards/format_reward": 1.0, |
| "step": 507, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 398.5625, |
| "epoch": 0.8683760683760684, |
| "grad_norm": 0.5037798400384479, |
| "kl": 0.0311126708984375, |
| "learning_rate": 2.2162994972408643e-08, |
| "loss": 0.0012, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 508, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.15625, |
| "epoch": 0.8700854700854701, |
| "grad_norm": 0.7114656942445862, |
| "kl": 0.029266357421875, |
| "learning_rate": 2.1613635589349756e-08, |
| "loss": 0.0012, |
| "reward": 4.403124809265137, |
| "reward_std": 0.18555781245231628, |
| "rewards/accuracy_reward": 2.903125047683716, |
| "rewards/format_reward": 1.0, |
| "step": 509, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.53125, |
| "epoch": 0.8717948717948718, |
| "grad_norm": 0.9284269082444429, |
| "kl": 0.0289459228515625, |
| "learning_rate": 2.1070862743364836e-08, |
| "loss": 0.0012, |
| "reward": 4.4583330154418945, |
| "reward_std": 0.08333337306976318, |
| "rewards/accuracy_reward": 2.9583332538604736, |
| "rewards/format_reward": 1.0, |
| "step": 510, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.8125, |
| "epoch": 0.8735042735042735, |
| "grad_norm": 1.0818595327849414, |
| "kl": 0.02764892578125, |
| "learning_rate": 2.0534692087724015e-08, |
| "loss": 0.0011, |
| "reward": 4.462499618530273, |
| "reward_std": 0.07500006258487701, |
| "rewards/accuracy_reward": 2.9625000953674316, |
| "rewards/format_reward": 1.0, |
| "step": 511, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 371.59375, |
| "epoch": 0.8752136752136752, |
| "grad_norm": 0.8015844416518322, |
| "kl": 0.032684326171875, |
| "learning_rate": 2.0005139085293942e-08, |
| "loss": 0.0013, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 512, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.4375, |
| "epoch": 0.8769230769230769, |
| "grad_norm": 0.5608605027967921, |
| "kl": 0.0270233154296875, |
| "learning_rate": 1.9482219008091883e-08, |
| "loss": 0.0011, |
| "reward": 4.484375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 513, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.4375, |
| "epoch": 0.8786324786324786, |
| "grad_norm": 1.0197644879391652, |
| "kl": 0.02972412109375, |
| "learning_rate": 1.8965946936845027e-08, |
| "loss": 0.0012, |
| "reward": 4.482291221618652, |
| "reward_std": 0.035416703671216965, |
| "rewards/accuracy_reward": 2.988541603088379, |
| "rewards/format_reward": 1.0, |
| "step": 514, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 489.0625, |
| "epoch": 0.8803418803418803, |
| "grad_norm": 0.29736027233547324, |
| "kl": 0.0288543701171875, |
| "learning_rate": 1.845633776055591e-08, |
| "loss": 0.0012, |
| "reward": 4.331250190734863, |
| "reward_std": 0.26249998807907104, |
| "rewards/accuracy_reward": 2.90625, |
| "rewards/format_reward": 0.96875, |
| "step": 515, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.84375, |
| "epoch": 0.882051282051282, |
| "grad_norm": 1.2676126849679452, |
| "kl": 0.02862548828125, |
| "learning_rate": 1.7953406176072632e-08, |
| "loss": 0.0011, |
| "reward": 4.431250095367432, |
| "reward_std": 0.1286948174238205, |
| "rewards/accuracy_reward": 2.9375, |
| "rewards/format_reward": 1.0, |
| "step": 516, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.71875, |
| "epoch": 0.8837606837606837, |
| "grad_norm": 0.7440933489373224, |
| "kl": 0.031646728515625, |
| "learning_rate": 1.7457166687665447e-08, |
| "loss": 0.0013, |
| "reward": 4.353125095367432, |
| "reward_std": 0.043749988079071045, |
| "rewards/accuracy_reward": 2.859375, |
| "rewards/format_reward": 1.0, |
| "step": 517, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.3125, |
| "epoch": 0.8854700854700854, |
| "grad_norm": 0.7393577005284849, |
| "kl": 0.027435302734375, |
| "learning_rate": 1.6967633606608077e-08, |
| "loss": 0.0011, |
| "reward": 4.484375, |
| "reward_std": 0.031250037252902985, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 518, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.375, |
| "epoch": 0.8871794871794871, |
| "grad_norm": 0.9007448687343432, |
| "kl": 0.033538818359375, |
| "learning_rate": 1.6484821050765207e-08, |
| "loss": 0.0013, |
| "reward": 4.457291603088379, |
| "reward_std": 0.08541667461395264, |
| "rewards/accuracy_reward": 2.9635415077209473, |
| "rewards/format_reward": 1.0, |
| "step": 519, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.4375, |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.8896325906031658, |
| "kl": 0.027740478515625, |
| "learning_rate": 1.600874294418528e-08, |
| "loss": 0.0011, |
| "reward": 4.481249809265137, |
| "reward_std": 0.02693377062678337, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 520, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 395.34375, |
| "epoch": 0.8905982905982905, |
| "grad_norm": 0.4679849808583376, |
| "kl": 0.029876708984375, |
| "learning_rate": 1.553941301669892e-08, |
| "loss": 0.0012, |
| "reward": 4.4864583015441895, |
| "reward_std": 0.01965414360165596, |
| "rewards/accuracy_reward": 2.9864583015441895, |
| "rewards/format_reward": 1.0, |
| "step": 521, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.25, |
| "epoch": 0.8923076923076924, |
| "grad_norm": 0.5524700061127931, |
| "kl": 0.0288848876953125, |
| "learning_rate": 1.507684480352292e-08, |
| "loss": 0.0012, |
| "reward": 4.46875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 522, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 367.5, |
| "epoch": 0.8940170940170941, |
| "grad_norm": 0.7202553581598627, |
| "kl": 0.027191162109375, |
| "learning_rate": 1.4621051644870097e-08, |
| "loss": 0.0011, |
| "reward": 4.487500190734863, |
| "reward_std": 0.02499997615814209, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 523, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.125, |
| "epoch": 0.8957264957264958, |
| "grad_norm": 0.7887618526962882, |
| "kl": 0.0266876220703125, |
| "learning_rate": 1.4172046685564209e-08, |
| "loss": 0.0011, |
| "reward": 4.481249809265137, |
| "reward_std": 0.02693377062678337, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 524, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.6875, |
| "epoch": 0.8974358974358975, |
| "grad_norm": 0.5656571630137281, |
| "kl": 0.0274505615234375, |
| "learning_rate": 1.3729842874661362e-08, |
| "loss": 0.0011, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 525, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.40625, |
| "epoch": 0.8991452991452992, |
| "grad_norm": 0.7540298311550256, |
| "kl": 0.030029296875, |
| "learning_rate": 1.3294452965076031e-08, |
| "loss": 0.0012, |
| "reward": 4.487500190734863, |
| "reward_std": 0.02499997615814209, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 526, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.46875, |
| "epoch": 0.9008547008547009, |
| "grad_norm": 0.8815509637917851, |
| "kl": 0.0297698974609375, |
| "learning_rate": 1.2865889513213628e-08, |
| "loss": 0.0012, |
| "reward": 4.4395833015441895, |
| "reward_std": 0.12083335220813751, |
| "rewards/accuracy_reward": 2.9395835399627686, |
| "rewards/format_reward": 1.0, |
| "step": 527, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.34375, |
| "epoch": 0.9025641025641026, |
| "grad_norm": 1.063031546967266, |
| "kl": 0.0306854248046875, |
| "learning_rate": 1.2444164878608304e-08, |
| "loss": 0.0012, |
| "reward": 4.477083206176758, |
| "reward_std": 0.04583334922790527, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 528, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.4375, |
| "epoch": 0.9042735042735043, |
| "grad_norm": 0.8469698276822571, |
| "kl": 0.028076171875, |
| "learning_rate": 1.2029291223566412e-08, |
| "loss": 0.0011, |
| "reward": 4.487500190734863, |
| "reward_std": 0.02499997615814209, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 529, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.40625, |
| "epoch": 0.905982905982906, |
| "grad_norm": 0.9273190591489977, |
| "kl": 0.0251617431640625, |
| "learning_rate": 1.162128051281594e-08, |
| "loss": 0.001, |
| "reward": 4.479166507720947, |
| "reward_std": 0.041666723787784576, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 530, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 387.125, |
| "epoch": 0.9076923076923077, |
| "grad_norm": 0.7771106350182588, |
| "kl": 0.0276641845703125, |
| "learning_rate": 1.1220144513161195e-08, |
| "loss": 0.0011, |
| "reward": 4.4510416984558105, |
| "reward_std": 0.08833983540534973, |
| "rewards/accuracy_reward": 2.9635417461395264, |
| "rewards/format_reward": 1.0, |
| "step": 531, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.375, |
| "epoch": 0.9094017094017094, |
| "grad_norm": 0.6664581026137811, |
| "kl": 0.02801513671875, |
| "learning_rate": 1.082589479314372e-08, |
| "loss": 0.0011, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 532, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 383.78125, |
| "epoch": 0.9111111111111111, |
| "grad_norm": 0.5498490702554147, |
| "kl": 0.0247955322265625, |
| "learning_rate": 1.0438542722708444e-08, |
| "loss": 0.001, |
| "reward": 4.327083587646484, |
| "reward_std": 0.15728820860385895, |
| "rewards/accuracy_reward": 2.8645832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 533, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 366.21875, |
| "epoch": 0.9128205128205128, |
| "grad_norm": 0.9348854652779771, |
| "kl": 0.03265380859375, |
| "learning_rate": 1.0058099472876003e-08, |
| "loss": 0.0013, |
| "reward": 4.4197916984558105, |
| "reward_std": 0.14692401885986328, |
| "rewards/accuracy_reward": 2.9510416984558105, |
| "rewards/format_reward": 0.96875, |
| "step": 534, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 372.71875, |
| "epoch": 0.9145299145299145, |
| "grad_norm": 0.730567908020696, |
| "kl": 0.026275634765625, |
| "learning_rate": 9.684576015420275e-09, |
| "loss": 0.0011, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 535, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 384.375, |
| "epoch": 0.9162393162393162, |
| "grad_norm": 0.5545120278161326, |
| "kl": 0.02777099609375, |
| "learning_rate": 9.31798312255233e-09, |
| "loss": 0.0011, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 536, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.75, |
| "epoch": 0.9179487179487179, |
| "grad_norm": 0.7880922490785651, |
| "kl": 0.028564453125, |
| "learning_rate": 8.958331366609423e-09, |
| "loss": 0.0011, |
| "reward": 4.431250095367432, |
| "reward_std": 0.13749998807907104, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 0.96875, |
| "step": 537, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.75, |
| "epoch": 0.9196581196581196, |
| "grad_norm": 0.5125805217517629, |
| "kl": 0.0287017822265625, |
| "learning_rate": 8.605631119750295e-09, |
| "loss": 0.0011, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 538, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 480.46875, |
| "epoch": 0.9213675213675213, |
| "grad_norm": 1.0440304610202615, |
| "kl": 0.03271484375, |
| "learning_rate": 8.259892553655945e-09, |
| "loss": 0.0013, |
| "reward": 4.306249618530273, |
| "reward_std": 0.29610198736190796, |
| "rewards/accuracy_reward": 2.9000000953674316, |
| "rewards/format_reward": 0.96875, |
| "step": 539, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 495.625, |
| "epoch": 0.9230769230769231, |
| "grad_norm": 0.805580154456117, |
| "kl": 0.0282135009765625, |
| "learning_rate": 7.921125639236415e-09, |
| "loss": 0.0011, |
| "reward": 4.376041412353516, |
| "reward_std": 0.2258828580379486, |
| "rewards/accuracy_reward": 2.913541793823242, |
| "rewards/format_reward": 0.96875, |
| "step": 540, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 392.46875, |
| "epoch": 0.9247863247863248, |
| "grad_norm": 0.9493367853110416, |
| "kl": 0.031402587890625, |
| "learning_rate": 7.589340146343077e-09, |
| "loss": 0.0013, |
| "reward": 4.4791669845581055, |
| "reward_std": 0.04166668653488159, |
| "rewards/accuracy_reward": 2.9854166507720947, |
| "rewards/format_reward": 1.0, |
| "step": 541, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.84375, |
| "epoch": 0.9264957264957265, |
| "grad_norm": 0.7381117295532499, |
| "kl": 0.031951904296875, |
| "learning_rate": 7.2645456434869965e-09, |
| "loss": 0.0013, |
| "reward": 4.483333110809326, |
| "reward_std": 0.03333333879709244, |
| "rewards/accuracy_reward": 2.9895834922790527, |
| "rewards/format_reward": 1.0, |
| "step": 542, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.0625, |
| "epoch": 0.9282051282051282, |
| "grad_norm": 1.1740784609197572, |
| "kl": 0.03228759765625, |
| "learning_rate": 6.946751497562908e-09, |
| "loss": 0.0013, |
| "reward": 4.4552083015441895, |
| "reward_std": 0.08958329260349274, |
| "rewards/accuracy_reward": 2.9739584922790527, |
| "rewards/format_reward": 1.0, |
| "step": 543, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.375, |
| "epoch": 0.9299145299145299, |
| "grad_norm": 0.5328501688828602, |
| "kl": 0.037506103515625, |
| "learning_rate": 6.635966873579063e-09, |
| "loss": 0.0015, |
| "reward": 4.488541603088379, |
| "reward_std": 0.013339842669665813, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 544, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 369.78125, |
| "epoch": 0.9316239316239316, |
| "grad_norm": 0.5149268240763294, |
| "kl": 0.029327392578125, |
| "learning_rate": 6.332200734393056e-09, |
| "loss": 0.0012, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 545, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.3125, |
| "epoch": 0.9333333333333333, |
| "grad_norm": 1.281928028608375, |
| "kl": 0.035430908203125, |
| "learning_rate": 6.0354618404531156e-09, |
| "loss": 0.0014, |
| "reward": 4.435416221618652, |
| "reward_std": 0.11367866396903992, |
| "rewards/accuracy_reward": 2.941666841506958, |
| "rewards/format_reward": 1.0, |
| "step": 546, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.3125, |
| "epoch": 0.935042735042735, |
| "grad_norm": 0.9215949590119717, |
| "kl": 0.03155517578125, |
| "learning_rate": 5.745758749545749e-09, |
| "loss": 0.0013, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 547, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 369.0625, |
| "epoch": 0.9367521367521368, |
| "grad_norm": 0.7928067990222769, |
| "kl": 0.033538818359375, |
| "learning_rate": 5.463099816548577e-09, |
| "loss": 0.0013, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 548, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.0, |
| "epoch": 0.9384615384615385, |
| "grad_norm": 0.4648407109039686, |
| "kl": 0.031524658203125, |
| "learning_rate": 5.187493193189784e-09, |
| "loss": 0.0013, |
| "reward": 4.432291507720947, |
| "reward_std": 0.12192395329475403, |
| "rewards/accuracy_reward": 2.9322915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 549, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 389.03125, |
| "epoch": 0.9401709401709402, |
| "grad_norm": 0.5271865784036786, |
| "kl": 0.0309295654296875, |
| "learning_rate": 4.918946827812659e-09, |
| "loss": 0.0012, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012500028125941753, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 550, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 386.0, |
| "epoch": 0.9418803418803419, |
| "grad_norm": 0.5330250509324654, |
| "kl": 0.028472900390625, |
| "learning_rate": 4.657468465146641e-09, |
| "loss": 0.0011, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833352580666542, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 551, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.21875, |
| "epoch": 0.9435897435897436, |
| "grad_norm": 0.07650932815833315, |
| "kl": 0.0266265869140625, |
| "learning_rate": 4.4030656460838086e-09, |
| "loss": 0.0011, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 552, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.6875, |
| "epoch": 0.9452991452991453, |
| "grad_norm": 0.6953906482391564, |
| "kl": 0.029205322265625, |
| "learning_rate": 4.155745707461466e-09, |
| "loss": 0.0012, |
| "reward": 4.4895830154418945, |
| "reward_std": 0.020833373069763184, |
| "rewards/accuracy_reward": 2.9895832538604736, |
| "rewards/format_reward": 1.0, |
| "step": 553, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.6875, |
| "epoch": 0.947008547008547, |
| "grad_norm": 0.9205409521267338, |
| "kl": 0.0343017578125, |
| "learning_rate": 3.915515781850564e-09, |
| "loss": 0.0014, |
| "reward": 4.454166412353516, |
| "reward_std": 0.08520621806383133, |
| "rewards/accuracy_reward": 2.960416793823242, |
| "rewards/format_reward": 1.0, |
| "step": 554, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 382.53125, |
| "epoch": 0.9487179487179487, |
| "grad_norm": 0.4991360720043758, |
| "kl": 0.031890869140625, |
| "learning_rate": 3.6823827973499754e-09, |
| "loss": 0.0013, |
| "reward": 4.375, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward": 2.9375, |
| "rewards/format_reward": 0.9375, |
| "step": 555, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 391.25, |
| "epoch": 0.9504273504273504, |
| "grad_norm": 1.159726036834094, |
| "kl": 0.032623291015625, |
| "learning_rate": 3.4563534773866256e-09, |
| "loss": 0.0013, |
| "reward": 4.4791669845581055, |
| "reward_std": 0.04166668653488159, |
| "rewards/accuracy_reward": 2.991666793823242, |
| "rewards/format_reward": 1.0, |
| "step": 556, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 0.875, |
| "all_wrong": 0.0, |
| "completion_length": 385.53125, |
| "epoch": 0.9521367521367521, |
| "grad_norm": 0.8136589121786045, |
| "kl": 0.0294342041015625, |
| "learning_rate": 3.2374343405217884e-09, |
| "loss": 0.0012, |
| "reward": 4.253125190734863, |
| "reward_std": 0.24969010055065155, |
| "rewards/accuracy_reward": 2.809375047683716, |
| "rewards/format_reward": 1.0, |
| "step": 557, |
| "temporal_rewards": 0.875 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 378.71875, |
| "epoch": 0.9538461538461539, |
| "grad_norm": 0.782839713763375, |
| "kl": 0.0260162353515625, |
| "learning_rate": 3.025631700262876e-09, |
| "loss": 0.001, |
| "reward": 4.488541603088379, |
| "reward_std": 0.022916674613952637, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 558, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.4375, |
| "epoch": 0.9555555555555556, |
| "grad_norm": 0.9440948087160835, |
| "kl": 0.0279541015625, |
| "learning_rate": 2.820951664881499e-09, |
| "loss": 0.0011, |
| "reward": 4.471875190734863, |
| "reward_std": 0.05625001713633537, |
| "rewards/accuracy_reward": 2.9781250953674316, |
| "rewards/format_reward": 1.0, |
| "step": 559, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 388.21875, |
| "epoch": 0.9572649572649573, |
| "grad_norm": 0.7473828250827926, |
| "kl": 0.0294342041015625, |
| "learning_rate": 2.6234001372372193e-09, |
| "loss": 0.0012, |
| "reward": 4.484375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.9906249046325684, |
| "rewards/format_reward": 1.0, |
| "step": 560, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 374.09375, |
| "epoch": 0.958974358974359, |
| "grad_norm": 0.8309998398272985, |
| "kl": 0.028839111328125, |
| "learning_rate": 2.4329828146074096e-09, |
| "loss": 0.0012, |
| "reward": 4.462500095367432, |
| "reward_std": 0.07499998807907104, |
| "rewards/accuracy_reward": 2.96875, |
| "rewards/format_reward": 1.0, |
| "step": 561, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 368.4375, |
| "epoch": 0.9606837606837607, |
| "grad_norm": 0.7583845881032686, |
| "kl": 0.034393310546875, |
| "learning_rate": 2.2497051885228824e-09, |
| "loss": 0.0014, |
| "reward": 4.472916603088379, |
| "reward_std": 0.05416667461395264, |
| "rewards/accuracy_reward": 2.9791665077209473, |
| "rewards/format_reward": 1.0, |
| "step": 562, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.46875, |
| "epoch": 0.9623931623931624, |
| "grad_norm": 1.0016959232014684, |
| "kl": 0.029144287109375, |
| "learning_rate": 2.073572544609492e-09, |
| "loss": 0.0012, |
| "reward": 4.481249809265137, |
| "reward_std": 0.03750000521540642, |
| "rewards/accuracy_reward": 2.9937500953674316, |
| "rewards/format_reward": 1.0, |
| "step": 563, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.75, |
| "epoch": 0.9641025641025641, |
| "grad_norm": 0.7923931211654085, |
| "kl": 0.0256805419921875, |
| "learning_rate": 1.904589962435782e-09, |
| "loss": 0.001, |
| "reward": 4.487500190734863, |
| "reward_std": 0.02499997615814209, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 564, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.875, |
| "epoch": 0.9658119658119658, |
| "grad_norm": 0.9030914923749467, |
| "kl": 0.037689208984375, |
| "learning_rate": 1.7427623153664362e-09, |
| "loss": 0.0015, |
| "reward": 4.474999904632568, |
| "reward_std": 0.050000011920928955, |
| "rewards/accuracy_reward": 2.981250047683716, |
| "rewards/format_reward": 1.0, |
| "step": 565, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.3125, |
| "epoch": 0.9675213675213675, |
| "grad_norm": 0.7288360265115156, |
| "kl": 0.028076171875, |
| "learning_rate": 1.5880942704217526e-09, |
| "loss": 0.0011, |
| "reward": 4.483333587646484, |
| "reward_std": 0.03333337977528572, |
| "rewards/accuracy_reward": 2.9833333492279053, |
| "rewards/format_reward": 1.0, |
| "step": 566, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 385.34375, |
| "epoch": 0.9692307692307692, |
| "grad_norm": 0.7433585233182205, |
| "kl": 0.03173828125, |
| "learning_rate": 1.4405902881430287e-09, |
| "loss": 0.0013, |
| "reward": 4.360416412353516, |
| "reward_std": 0.0223845187574625, |
| "rewards/accuracy_reward": 2.866666793823242, |
| "rewards/format_reward": 1.0, |
| "step": 567, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 375.78125, |
| "epoch": 0.9709401709401709, |
| "grad_norm": 0.08909391886324619, |
| "kl": 0.0293426513671875, |
| "learning_rate": 1.3002546224639145e-09, |
| "loss": 0.0012, |
| "reward": 4.375, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.875, |
| "rewards/format_reward": 1.0, |
| "step": 568, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.09375, |
| "epoch": 0.9726495726495726, |
| "grad_norm": 0.10649006652034008, |
| "kl": 0.0347900390625, |
| "learning_rate": 1.167091320587843e-09, |
| "loss": 0.0014, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 569, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.1875, |
| "epoch": 0.9743589743589743, |
| "grad_norm": 0.8990118809823487, |
| "kl": 0.02984619140625, |
| "learning_rate": 1.0411042228711253e-09, |
| "loss": 0.0012, |
| "reward": 4.463541507720947, |
| "reward_std": 0.07291668653488159, |
| "rewards/accuracy_reward": 2.9635417461395264, |
| "rewards/format_reward": 1.0, |
| "step": 570, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 400.4375, |
| "epoch": 0.976068376068376, |
| "grad_norm": 0.8780869416335514, |
| "kl": 0.025970458984375, |
| "learning_rate": 9.222969627123433e-10, |
| "loss": 0.001, |
| "reward": 4.452083587646484, |
| "reward_std": 0.06897321343421936, |
| "rewards/accuracy_reward": 2.9520833492279053, |
| "rewards/format_reward": 1.0, |
| "step": 571, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.03125, |
| "epoch": 0.9777777777777777, |
| "grad_norm": 0.9906545174936237, |
| "kl": 0.0267333984375, |
| "learning_rate": 8.106729664475176e-10, |
| "loss": 0.0011, |
| "reward": 4.466666221618652, |
| "reward_std": 0.05708986893296242, |
| "rewards/accuracy_reward": 2.9791667461395264, |
| "rewards/format_reward": 1.0, |
| "step": 572, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.375, |
| "epoch": 0.9794871794871794, |
| "grad_norm": 1.1320410965972476, |
| "kl": 0.034027099609375, |
| "learning_rate": 7.062354532512416e-10, |
| "loss": 0.0014, |
| "reward": 4.445833206176758, |
| "reward_std": 0.10833339393138885, |
| "rewards/accuracy_reward": 2.952083110809326, |
| "rewards/format_reward": 1.0, |
| "step": 573, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 383.65625, |
| "epoch": 0.9811965811965812, |
| "grad_norm": 0.7180503459998184, |
| "kl": 0.028411865234375, |
| "learning_rate": 6.089874350439505e-10, |
| "loss": 0.0011, |
| "reward": 4.487500190734863, |
| "reward_std": 0.02499997615814209, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 574, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.34375, |
| "epoch": 0.9829059829059829, |
| "grad_norm": 0.558477385748197, |
| "kl": 0.0304718017578125, |
| "learning_rate": 5.189317164049633e-10, |
| "loss": 0.0012, |
| "reward": 4.243750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 2.75, |
| "rewards/format_reward": 1.0, |
| "step": 575, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 379.0625, |
| "epoch": 0.9846153846153847, |
| "grad_norm": 0.5290803575151364, |
| "kl": 0.02960205078125, |
| "learning_rate": 4.36070894491658e-10, |
| "loss": 0.0012, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947915077209473, |
| "rewards/format_reward": 1.0, |
| "step": 576, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 371.6875, |
| "epoch": 0.9863247863247864, |
| "grad_norm": 1.0791845979808021, |
| "kl": 0.027618408203125, |
| "learning_rate": 3.6040735896455953e-10, |
| "loss": 0.0011, |
| "reward": 4.471874713897705, |
| "reward_std": 0.04568374156951904, |
| "rewards/accuracy_reward": 2.996875047683716, |
| "rewards/format_reward": 1.0, |
| "step": 577, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 370.75, |
| "epoch": 0.9880341880341881, |
| "grad_norm": 0.884976221370628, |
| "kl": 0.031036376953125, |
| "learning_rate": 2.9194329191833953e-10, |
| "loss": 0.0012, |
| "reward": 4.467708110809326, |
| "reward_std": 0.06458337604999542, |
| "rewards/accuracy_reward": 2.9739582538604736, |
| "rewards/format_reward": 1.0, |
| "step": 578, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 380.03125, |
| "epoch": 0.9897435897435898, |
| "grad_norm": 0.5538748255975208, |
| "kl": 0.03741455078125, |
| "learning_rate": 2.3068066781908867e-10, |
| "loss": 0.0015, |
| "reward": 4.494791507720947, |
| "reward_std": 0.010416686534881592, |
| "rewards/accuracy_reward": 2.9947917461395264, |
| "rewards/format_reward": 1.0, |
| "step": 579, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 376.15625, |
| "epoch": 0.9914529914529915, |
| "grad_norm": 0.5204349719783644, |
| "kl": 0.03594970703125, |
| "learning_rate": 1.7662125344714008e-10, |
| "loss": 0.0014, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 580, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 377.59375, |
| "epoch": 0.9931623931623932, |
| "grad_norm": 0.07787086970335863, |
| "kl": 0.0251312255859375, |
| "learning_rate": 1.297666078462767e-10, |
| "loss": 0.001, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 581, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 373.71875, |
| "epoch": 0.9948717948717949, |
| "grad_norm": 0.9414974976233311, |
| "kl": 0.0290679931640625, |
| "learning_rate": 9.011808227865625e-11, |
| "loss": 0.0012, |
| "reward": 4.452083110809326, |
| "reward_std": 0.08161969482898712, |
| "rewards/accuracy_reward": 2.9583332538604736, |
| "rewards/format_reward": 1.0, |
| "step": 582, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.9375, |
| "epoch": 0.9965811965811966, |
| "grad_norm": 1.0306901148938268, |
| "kl": 0.032440185546875, |
| "learning_rate": 5.7676820185953434e-11, |
| "loss": 0.0013, |
| "reward": 4.484375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.984375, |
| "rewards/format_reward": 1.0, |
| "step": 583, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 381.8125, |
| "epoch": 0.9982905982905983, |
| "grad_norm": 0.5694885078067841, |
| "kl": 0.02459716796875, |
| "learning_rate": 3.244375715633074e-11, |
| "loss": 0.001, |
| "reward": 4.493750095367432, |
| "reward_std": 0.012499988079071045, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 584, |
| "temporal_rewards": 1.0 |
| }, |
| { |
| "all_correct": 1.0, |
| "all_wrong": 0.0, |
| "completion_length": 344.0, |
| "epoch": 1.0, |
| "grad_norm": 0.7766650645462043, |
| "kl": 0.0360107421875, |
| "learning_rate": 1.4419620897432316e-11, |
| "loss": 0.0014, |
| "reward": 4.449999809265137, |
| "reward_std": 0.09999990463256836, |
| "rewards/accuracy_reward": 3.0, |
| "rewards/format_reward": 1.0, |
| "step": 585, |
| "temporal_rewards": 1.0 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 585, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 290, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|