diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4294 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.005813953488372093, + "eval_steps": 500, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5501.0, + "completions/max_terminated_length": 5501.0, + "completions/mean_length": 3558.09375, + "completions/mean_terminated_length": 3558.09375, + "completions/min_length": 2215.0, + "completions/min_terminated_length": 2215.0, + "epoch": 2.4224806201550387e-05, + "grad_norm": 0.00640977891147893, + "kl": 0.0007143020629882812, + "learning_rate": 0.0, + "loss": 0.0006, + "num_tokens": 568407.0, + "reward": 0.4926603138446808, + "reward_std": 0.08448069542646408, + "rewards/avg_thinking_length_func": 157.22222900390625, + "rewards/confidence_score_reward_func": 0.7339284420013428, + "rewards/correct_answer_reward_func": 0.640625, + "rewards/efficient_thinking_reward_func": 0.9699548628723149, + "rewards/format_and_efficient_reward_func": 0.5214560031890869, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.318666696548462, + "rewards/tool_execution_reward_func": 1.983011245727539, + "rewards/visit_tool_reward_func": 0.9305298328399658, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 4.8449612403100775e-05, + "grad_norm": 0.0064083920341846115, + "kl": 0.0007143020629882812, + "learning_rate": 6.25e-08, + "loss": 0.0006, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 7.267441860465116e-05, + "grad_norm": 0.006447812260611595, + "kl": 0.0007295608520507812, + "learning_rate": 1.25e-07, + "loss": 0.0006, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 9.689922480620155e-05, + "grad_norm": 0.0066225031847143186, + "kl": 0.0007305145263671875, + "learning_rate": 1.875e-07, + "loss": 0.0006, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7891.0, + "completions/max_terminated_length": 7891.0, + "completions/mean_length": 3465.828125, + "completions/mean_terminated_length": 3465.828125, + "completions/min_length": 1264.0, + "completions/min_terminated_length": 1264.0, + "epoch": 0.00012112403100775194, + "grad_norm": 0.011221982806523546, + "kl": 0.0008029937744140625, + "learning_rate": 2.5e-07, + "loss": 0.0003, + "num_tokens": 1050218.0, + "reward": 0.35228461027145386, + "reward_std": 0.11903564631938934, + "rewards/avg_thinking_length_func": 172.3975830078125, + "rewards/confidence_score_reward_func": 0.7573737502098083, + "rewards/correct_answer_reward_func": 0.453125, + "rewards/efficient_thinking_reward_func": 0.8796035517984737, + "rewards/format_and_efficient_reward_func": 0.3536693751811981, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.489912509918213, + "rewards/tool_execution_reward_func": 1.9884867668151855, + "rewards/visit_tool_reward_func": 0.9384097456932068, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00014534883720930232, + "grad_norm": 0.011369566083514073, + "kl": 0.0008258819580078125, + "learning_rate": 3.1249999999999997e-07, + "loss": 0.0003, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0001695736434108527, + "grad_norm": 0.011325781329231437, + "kl": 0.000820159912109375, + "learning_rate": 3.75e-07, + "loss": 0.0003, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0001937984496124031, + "grad_norm": 0.011468177438620898, + "kl": 0.0008134841918945312, + "learning_rate": 4.375e-07, + "loss": 0.0003, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9790.0, + "completions/max_terminated_length": 9790.0, + "completions/mean_length": 4101.421875, + "completions/mean_terminated_length": 4101.421875, + "completions/min_length": 1141.0, + "completions/min_terminated_length": 1141.0, + "epoch": 0.00021802325581395349, + "grad_norm": 0.008533015854175789, + "kl": 0.00080108642578125, + "learning_rate": 5e-07, + "loss": 0.0, + "num_tokens": 1636681.0, + "reward": 0.4183655381202698, + "reward_std": 0.0931699275970459, + "rewards/avg_thinking_length_func": 176.92233276367188, + "rewards/confidence_score_reward_func": 0.7306747436523438, + "rewards/correct_answer_reward_func": 0.546875, + "rewards/efficient_thinking_reward_func": 0.8954936332818751, + "rewards/format_and_efficient_reward_func": 0.4208581745624542, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.53083336353302, + "rewards/tool_execution_reward_func": 1.9508955478668213, + "rewards/visit_tool_reward_func": 0.8424738645553589, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00024224806201550387, + "grad_norm": 0.009520985221391949, + "kl": 0.0007925033569335938, + "learning_rate": 5.625e-07, + "loss": 0.0, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00026647286821705426, + "grad_norm": 0.010085270290120536, + "kl": 0.0011835098266601562, + "learning_rate": 6.249999999999999e-07, + "loss": 0.0, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00029069767441860465, + "grad_norm": 0.008472445513601271, + "kl": 0.0008249282836914062, + "learning_rate": 6.875e-07, + "loss": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7445.0, + "completions/max_terminated_length": 7445.0, + "completions/mean_length": 3379.234375, + "completions/mean_terminated_length": 3379.234375, + "completions/min_length": 1491.0, + "completions/min_terminated_length": 1491.0, + "epoch": 0.00031492248062015503, + "grad_norm": 0.01258823765843166, + "kl": 0.0009145736694335938, + "learning_rate": 7.5e-07, + "loss": -0.0001, + "num_tokens": 2110165.0, + "reward": 0.4067286550998688, + "reward_std": 0.18041250109672546, + "rewards/avg_thinking_length_func": 170.76950073242188, + "rewards/confidence_score_reward_func": 0.763248085975647, + "rewards/correct_answer_reward_func": 0.515625, + "rewards/efficient_thinking_reward_func": 0.8802126246942265, + "rewards/format_and_efficient_reward_func": 0.4241780936717987, + "rewards/format_reward_func": 0.99958336353302, + "rewards/num_xml_reward_func": 1.6099066734313965, + "rewards/tool_execution_reward_func": 1.9751970767974854, + "rewards/visit_tool_reward_func": 0.9391972422599792, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0003391472868217054, + "grad_norm": 0.012500551984662189, + "kl": 0.0009927749633789062, + "learning_rate": 8.125e-07, + "loss": -0.0001, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0003633720930232558, + "grad_norm": 0.012416715432446976, + "kl": 0.0010967254638671875, + "learning_rate": 8.75e-07, + "loss": -0.0001, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0003875968992248062, + "grad_norm": 0.01288145978177755, + "kl": 0.001140594482421875, + "learning_rate": 9.374999999999999e-07, + "loss": -0.0001, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10205.0, + "completions/max_terminated_length": 10205.0, + "completions/mean_length": 4119.96875, + "completions/mean_terminated_length": 4119.96875, + "completions/min_length": 1159.0, + "completions/min_terminated_length": 1159.0, + "epoch": 0.0004118217054263566, + "grad_norm": 0.009407611593031055, + "kl": 0.0011768341064453125, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 2691117.0, + "reward": 0.4201432466506958, + "reward_std": 0.0907188206911087, + "rewards/avg_thinking_length_func": 171.4025115966797, + "rewards/confidence_score_reward_func": 0.7308521270751953, + "rewards/correct_answer_reward_func": 0.546875, + "rewards/efficient_thinking_reward_func": 0.861229582956026, + "rewards/format_and_efficient_reward_func": 0.37111079692840576, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.3203115463256836, + "rewards/tool_execution_reward_func": 1.9717044830322266, + "rewards/visit_tool_reward_func": 0.8859716653823853, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00043604651162790697, + "grad_norm": 0.009347834781139657, + "kl": 0.00139617919921875, + "learning_rate": 1.0625e-06, + "loss": 0.0004, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00046027131782945736, + "grad_norm": 0.00928664951165006, + "kl": 0.00167083740234375, + "learning_rate": 1.125e-06, + "loss": 0.0004, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00048449612403100775, + "grad_norm": 0.009342230945576057, + "kl": 0.00212860107421875, + "learning_rate": 1.1874999999999999e-06, + "loss": 0.0004, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7792.0, + "completions/max_terminated_length": 7792.0, + "completions/mean_length": 3474.78125, + "completions/mean_terminated_length": 3474.78125, + "completions/min_length": 1307.0, + "completions/min_terminated_length": 1307.0, + "epoch": 0.0005087209302325581, + "grad_norm": 0.010737570621935045, + "kl": 0.002620697021484375, + "learning_rate": 1.2499999999999999e-06, + "loss": -0.0, + "num_tokens": 3182962.0, + "reward": 0.3430381715297699, + "reward_std": 0.15257038176059723, + "rewards/avg_thinking_length_func": 163.67486572265625, + "rewards/confidence_score_reward_func": 0.7590060234069824, + "rewards/correct_answer_reward_func": 0.4375, + "rewards/efficient_thinking_reward_func": 0.9094734969614356, + "rewards/format_and_efficient_reward_func": 0.354397177696228, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.4648277759552002, + "rewards/tool_execution_reward_func": 1.9753289222717285, + "rewards/visit_tool_reward_func": 0.9633350968360901, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0005329457364341085, + "grad_norm": 0.010610611287841326, + "kl": 0.003204345703125, + "learning_rate": 1.3125e-06, + "loss": -0.0, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0005571705426356589, + "grad_norm": 0.010883725821996518, + "kl": 0.003814697265625, + "learning_rate": 1.375e-06, + "loss": -0.0, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0005813953488372093, + "grad_norm": 0.010728950563018041, + "kl": 0.00518798828125, + "learning_rate": 1.4375e-06, + "loss": -0.0, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9082.0, + "completions/max_terminated_length": 9082.0, + "completions/mean_length": 4205.453125, + "completions/mean_terminated_length": 4205.453125, + "completions/min_length": 1188.0, + "completions/min_terminated_length": 1188.0, + "epoch": 0.0006056201550387597, + "grad_norm": 0.011708703395331976, + "kl": 0.0054779052734375, + "learning_rate": 1.5e-06, + "loss": 0.0016, + "num_tokens": 3788330.0, + "reward": 0.4100201725959778, + "reward_std": 0.12962010502815247, + "rewards/avg_thinking_length_func": 167.64987182617188, + "rewards/confidence_score_reward_func": 0.7269817590713501, + "rewards/correct_answer_reward_func": 0.53125, + "rewards/efficient_thinking_reward_func": 0.894405090660734, + "rewards/format_and_efficient_reward_func": 0.4734077453613281, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.2609543800354004, + "rewards/tool_execution_reward_func": 1.9624817371368408, + "rewards/visit_tool_reward_func": 0.8933978080749512, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0006298449612403101, + "grad_norm": 0.012029441405275343, + "kl": 0.00725555419921875, + "learning_rate": 1.5624999999999999e-06, + "loss": 0.0016, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0006540697674418605, + "grad_norm": 0.011965973272488425, + "kl": 0.010589599609375, + "learning_rate": 1.625e-06, + "loss": 0.0016, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0006782945736434108, + "grad_norm": 0.018054158629818226, + "kl": 0.017059326171875, + "learning_rate": 1.6875e-06, + "loss": 0.0016, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7466.0, + "completions/max_terminated_length": 7466.0, + "completions/mean_length": 3525.1875, + "completions/mean_terminated_length": 3525.1875, + "completions/min_length": 1458.0, + "completions/min_terminated_length": 1458.0, + "epoch": 0.0007025193798449612, + "grad_norm": 0.011398719184495674, + "kl": 0.013671875, + "learning_rate": 1.75e-06, + "loss": 0.0001, + "num_tokens": 4289196.0, + "reward": 0.3574071526527405, + "reward_std": 0.09749965369701385, + "rewards/avg_thinking_length_func": 163.65969848632812, + "rewards/confidence_score_reward_func": 0.7581030130386353, + "rewards/correct_answer_reward_func": 0.453125, + "rewards/efficient_thinking_reward_func": 0.9089163330381327, + "rewards/format_and_efficient_reward_func": 0.3653510808944702, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.4366014003753662, + "rewards/tool_execution_reward_func": 1.9675538539886475, + "rewards/visit_tool_reward_func": 0.960380494594574, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0007267441860465116, + "grad_norm": 0.010833682609318612, + "kl": 0.015838623046875, + "learning_rate": 1.8125e-06, + "loss": 0.0001, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.000750968992248062, + "grad_norm": 0.0231097533212703, + "kl": 0.022918701171875, + "learning_rate": 1.8749999999999998e-06, + "loss": 0.0001, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0007751937984496124, + "grad_norm": 0.011257228008738334, + "kl": 0.021575927734375, + "learning_rate": 1.9375e-06, + "loss": 0.0001, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8888.0, + "completions/max_terminated_length": 8888.0, + "completions/mean_length": 3804.265625, + "completions/mean_terminated_length": 3804.265625, + "completions/min_length": 1067.0, + "completions/min_terminated_length": 1067.0, + "epoch": 0.0007994186046511628, + "grad_norm": 0.02927961829217277, + "kl": 0.030914306640625, + "learning_rate": 2e-06, + "loss": 0.0005, + "num_tokens": 4848644.0, + "reward": 0.46360254287719727, + "reward_std": 0.10140425711870193, + "rewards/avg_thinking_length_func": 168.85345458984375, + "rewards/confidence_score_reward_func": 0.7187485694885254, + "rewards/correct_answer_reward_func": 0.609375, + "rewards/efficient_thinking_reward_func": 0.8848315117904739, + "rewards/format_and_efficient_reward_func": 0.46383440494537354, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.3748114109039307, + "rewards/tool_execution_reward_func": 1.9836355447769165, + "rewards/visit_tool_reward_func": 0.8981889486312866, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0008236434108527132, + "grad_norm": 0.00984263633299767, + "kl": 0.026763916015625, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0008478682170542636, + "grad_norm": 0.022916321346866338, + "kl": 0.03643798828125, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0008720930232558139, + "grad_norm": 0.010968578899761567, + "kl": 0.03680419921875, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7119.0, + "completions/max_terminated_length": 7119.0, + "completions/mean_length": 3045.9375, + "completions/mean_terminated_length": 3045.9375, + "completions/min_length": 1306.0, + "completions/min_terminated_length": 1306.0, + "epoch": 0.0008963178294573643, + "grad_norm": 0.11556192957878203, + "kl": 0.066314697265625, + "learning_rate": 2e-06, + "loss": 0.0011, + "num_tokens": 5286042.0, + "reward": 0.38059696555137634, + "reward_std": 0.20472648739814758, + "rewards/avg_thinking_length_func": 171.9969024658203, + "rewards/confidence_score_reward_func": 0.7361885905265808, + "rewards/correct_answer_reward_func": 0.5, + "rewards/efficient_thinking_reward_func": 0.8792661921309781, + "rewards/format_and_efficient_reward_func": 0.4069961905479431, + "rewards/format_reward_func": 0.9985389709472656, + "rewards/num_xml_reward_func": 1.7584354877471924, + "rewards/tool_execution_reward_func": 1.9876766204833984, + "rewards/visit_tool_reward_func": 0.926859438419342, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0009205426356589147, + "grad_norm": 0.013991455687567742, + "kl": 0.034393310546875, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0009447674418604651, + "grad_norm": 0.01433251116157902, + "kl": 0.0352783203125, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0009689922480620155, + "grad_norm": 0.01682769595676241, + "kl": 0.0426025390625, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6696.0, + "completions/max_terminated_length": 6696.0, + "completions/mean_length": 3054.78125, + "completions/mean_terminated_length": 3054.78125, + "completions/min_length": 734.0, + "completions/min_terminated_length": 734.0, + "epoch": 0.0009932170542635659, + "grad_norm": 0.034243564194623544, + "kl": 0.0538330078125, + "learning_rate": 2e-06, + "loss": 0.0003, + "num_tokens": 5728827.0, + "reward": 0.5321023464202881, + "reward_std": 0.07992984354496002, + "rewards/avg_thinking_length_func": 185.18777465820312, + "rewards/confidence_score_reward_func": 0.699253261089325, + "rewards/correct_answer_reward_func": 0.734375, + "rewards/efficient_thinking_reward_func": 0.8423659179880447, + "rewards/format_and_efficient_reward_func": 0.5654621124267578, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.836081624031067, + "rewards/tool_execution_reward_func": 1.9795209169387817, + "rewards/visit_tool_reward_func": 0.8331901431083679, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0010174418604651163, + "grad_norm": 0.008357434600682397, + "kl": 0.0467529296875, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0010416666666666667, + "grad_norm": 0.009143109288946688, + "kl": 0.05499267578125, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.001065891472868217, + "grad_norm": 0.018383062802239766, + "kl": 0.07135009765625, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5912.0, + "completions/max_terminated_length": 5912.0, + "completions/mean_length": 2513.34375, + "completions/mean_terminated_length": 2513.34375, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "epoch": 0.0010901162790697674, + "grad_norm": 1.2449457515517797, + "kl": 0.5546875, + "learning_rate": 2e-06, + "loss": 0.0011, + "num_tokens": 6121552.0, + "reward": 0.41406646370887756, + "reward_std": 0.1448429971933365, + "rewards/avg_thinking_length_func": 159.43849182128906, + "rewards/confidence_score_reward_func": 0.7091017961502075, + "rewards/correct_answer_reward_func": 0.5625, + "rewards/efficient_thinking_reward_func": 0.9100999417514477, + "rewards/format_and_efficient_reward_func": 0.40307265520095825, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.7179009914398193, + "rewards/tool_execution_reward_func": 1.9982638359069824, + "rewards/visit_tool_reward_func": 0.8534926772117615, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0011143410852713178, + "grad_norm": 0.04725193167363872, + "kl": 0.0830078125, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0011385658914728682, + "grad_norm": 0.01076799271094143, + "kl": 0.0728759765625, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0011627906976744186, + "grad_norm": 1.2844930338395364, + "kl": 0.594970703125, + "learning_rate": 2e-06, + "loss": 0.0012, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6441.0, + "completions/max_terminated_length": 6441.0, + "completions/mean_length": 2998.25, + "completions/mean_terminated_length": 2998.25, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "epoch": 0.001187015503875969, + "grad_norm": 0.07335407885154412, + "kl": 0.0946044921875, + "learning_rate": 2e-06, + "loss": 0.0003, + "num_tokens": 6632198.0, + "reward": 0.4027416408061981, + "reward_std": 0.18368688225746155, + "rewards/avg_thinking_length_func": 144.1616668701172, + "rewards/confidence_score_reward_func": 0.6523082852363586, + "rewards/correct_answer_reward_func": 0.578125, + "rewards/efficient_thinking_reward_func": 0.8715830269761213, + "rewards/format_and_efficient_reward_func": 0.30888205766677856, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.2804265022277832, + "rewards/tool_execution_reward_func": 1.9967105388641357, + "rewards/visit_tool_reward_func": 0.777007520198822, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0012112403100775194, + "grad_norm": 3382.951158532336, + "kl": 386.0513916015625, + "learning_rate": 2e-06, + "loss": 0.1955, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0012354651162790698, + "grad_norm": 0.04763618115574692, + "kl": 0.111083984375, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0012596899224806201, + "grad_norm": 0.011361146003702229, + "kl": 0.0693359375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5699.0, + "completions/max_terminated_length": 5699.0, + "completions/mean_length": 2702.609375, + "completions/mean_terminated_length": 2702.609375, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "epoch": 0.0012839147286821705, + "grad_norm": 641.0279979496779, + "kl": 340.21875, + "learning_rate": 2e-06, + "loss": 0.3029, + "num_tokens": 7071302.0, + "reward": 0.38491296768188477, + "reward_std": 0.20615670084953308, + "rewards/avg_thinking_length_func": 144.03466796875, + "rewards/confidence_score_reward_func": 0.6775128841400146, + "rewards/correct_answer_reward_func": 0.546875, + "rewards/efficient_thinking_reward_func": 0.8956235775099276, + "rewards/format_and_efficient_reward_func": 0.298817902803421, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.156145691871643, + "rewards/tool_execution_reward_func": 2.0, + "rewards/visit_tool_reward_func": 0.8991793990135193, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.001308139534883721, + "grad_norm": 10.07283016494114, + "kl": 6.52294921875, + "learning_rate": 2e-06, + "loss": 0.0054, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0013323643410852713, + "grad_norm": 0.024178305719161252, + "kl": 0.1031494140625, + "learning_rate": 2e-06, + "loss": -0.0001, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0013565891472868217, + "grad_norm": 0.010123659301215143, + "kl": 0.0853271484375, + "learning_rate": 2e-06, + "loss": -0.0001, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7974.0, + "completions/max_terminated_length": 7974.0, + "completions/mean_length": 3418.6875, + "completions/mean_terminated_length": 3418.6875, + "completions/min_length": 1001.0, + "completions/min_terminated_length": 1001.0, + "epoch": 0.001380813953488372, + "grad_norm": 0.011918704634373778, + "kl": 0.0814208984375, + "learning_rate": 2e-06, + "loss": 0.0006, + "num_tokens": 7618083.0, + "reward": 0.33670923113822937, + "reward_std": 0.2170744389295578, + "rewards/avg_thinking_length_func": 162.87310791015625, + "rewards/confidence_score_reward_func": 0.6380267143249512, + "rewards/correct_answer_reward_func": 0.484375, + "rewards/efficient_thinking_reward_func": 0.8769457565983968, + "rewards/format_and_efficient_reward_func": 0.15387150645256042, + "rewards/format_reward_func": 0.9937499761581421, + "rewards/num_xml_reward_func": 0.7425504326820374, + "rewards/tool_execution_reward_func": 1.984375, + "rewards/visit_tool_reward_func": 0.7900611162185669, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0014050387596899225, + "grad_norm": 0.012656826141930118, + "kl": 0.0870361328125, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0014292635658914728, + "grad_norm": 0.01963879028272825, + "kl": 0.102783203125, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0014534883720930232, + "grad_norm": 0.023803010795812877, + "kl": 0.111328125, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6602.0, + "completions/max_terminated_length": 6602.0, + "completions/mean_length": 2937.375, + "completions/mean_terminated_length": 2937.375, + "completions/min_length": 1124.0, + "completions/min_terminated_length": 1124.0, + "epoch": 0.0014777131782945736, + "grad_norm": 0.6399010033168665, + "kl": 0.328857421875, + "learning_rate": 2e-06, + "loss": 0.0008, + "num_tokens": 8081395.0, + "reward": 0.41028502583503723, + "reward_std": 0.1911381632089615, + "rewards/avg_thinking_length_func": 154.1159210205078, + "rewards/confidence_score_reward_func": 0.6654285192489624, + "rewards/correct_answer_reward_func": 0.59375, + "rewards/efficient_thinking_reward_func": 0.8800399071963756, + "rewards/format_and_efficient_reward_func": 0.1847984343767166, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 0.921923816204071, + "rewards/tool_execution_reward_func": 1.9983552694320679, + "rewards/visit_tool_reward_func": 0.883500337600708, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.001501937984496124, + "grad_norm": 0.011193139638749735, + "kl": 0.091552734375, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0015261627906976744, + "grad_norm": 0.010209194017758182, + "kl": 0.086181640625, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0015503875968992248, + "grad_norm": 0.14653936372168078, + "kl": 0.1170654296875, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8850.0, + "completions/max_terminated_length": 8850.0, + "completions/mean_length": 3542.96875, + "completions/mean_terminated_length": 3542.96875, + "completions/min_length": 880.0, + "completions/min_terminated_length": 880.0, + "epoch": 0.0015746124031007752, + "grad_norm": 0.48126334443141955, + "kl": 0.248046875, + "learning_rate": 2e-06, + "loss": -0.0001, + "num_tokens": 8636201.0, + "reward": 0.39273786544799805, + "reward_std": 0.12296080589294434, + "rewards/avg_thinking_length_func": 150.4586639404297, + "rewards/confidence_score_reward_func": 0.6261853575706482, + "rewards/correct_answer_reward_func": 0.578125, + "rewards/efficient_thinking_reward_func": 0.8429494490638886, + "rewards/format_and_efficient_reward_func": 0.26941436529159546, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 0.9826317429542542, + "rewards/tool_execution_reward_func": 1.9983552694320679, + "rewards/visit_tool_reward_func": 0.8202804327011108, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0015988372093023256, + "grad_norm": 0.011505230665385266, + "kl": 0.087646484375, + "learning_rate": 2e-06, + "loss": -0.0003, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.001623062015503876, + "grad_norm": 0.011219221768431348, + "kl": 0.0863037109375, + "learning_rate": 2e-06, + "loss": -0.0003, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0016472868217054263, + "grad_norm": 0.013493117517357446, + "kl": 0.0845947265625, + "learning_rate": 2e-06, + "loss": -0.0003, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6164.0, + "completions/max_terminated_length": 6164.0, + "completions/mean_length": 3045.984375, + "completions/mean_terminated_length": 3045.984375, + "completions/min_length": 853.0, + "completions/min_terminated_length": 853.0, + "epoch": 0.0016715116279069767, + "grad_norm": 0.015474671509878156, + "kl": 0.085205078125, + "learning_rate": 2e-06, + "loss": -0.0001, + "num_tokens": 9080324.0, + "reward": 0.3584170639514923, + "reward_std": 0.2464786320924759, + "rewards/avg_thinking_length_func": 171.05947875976562, + "rewards/confidence_score_reward_func": 0.6698201298713684, + "rewards/correct_answer_reward_func": 0.515625, + "rewards/efficient_thinking_reward_func": 0.9022617067768229, + "rewards/format_and_efficient_reward_func": 0.18420693278312683, + "rewards/format_reward_func": 0.999218761920929, + "rewards/num_xml_reward_func": 0.9476650953292847, + "rewards/tool_execution_reward_func": 1.9967105388641357, + "rewards/visit_tool_reward_func": 0.922633707523346, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.001695736434108527, + "grad_norm": 0.01314183789857302, + "kl": 0.082275390625, + "learning_rate": 2e-06, + "loss": -0.0001, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0017199612403100775, + "grad_norm": 0.012255008742171034, + "kl": 0.0802001953125, + "learning_rate": 2e-06, + "loss": -0.0001, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0017441860465116279, + "grad_norm": 0.016022338448163764, + "kl": 0.0791015625, + "learning_rate": 2e-06, + "loss": -0.0001, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9687.0, + "completions/max_terminated_length": 9687.0, + "completions/mean_length": 4153.765625, + "completions/mean_terminated_length": 4153.765625, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "epoch": 0.0017684108527131783, + "grad_norm": 0.009771623312563241, + "kl": 0.07470703125, + "learning_rate": 2e-06, + "loss": 0.0005, + "num_tokens": 9647713.0, + "reward": 0.39447835087776184, + "reward_std": 0.1022053211927414, + "rewards/avg_thinking_length_func": 180.9823455810547, + "rewards/confidence_score_reward_func": 0.6325613260269165, + "rewards/correct_answer_reward_func": 0.578125, + "rewards/efficient_thinking_reward_func": 0.8102246632773766, + "rewards/format_and_efficient_reward_func": 0.31101614236831665, + "rewards/format_reward_func": 0.9996874928474426, + "rewards/num_xml_reward_func": 1.1014292240142822, + "rewards/tool_execution_reward_func": 1.9983552694320679, + "rewards/visit_tool_reward_func": 0.9176727533340454, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0017926356589147287, + "grad_norm": 0.009518866209493148, + "kl": 0.0743408203125, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.001816860465116279, + "grad_norm": 0.01107061263145856, + "kl": 0.074462890625, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0018410852713178294, + "grad_norm": 0.010455700609646703, + "kl": 0.0758056640625, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 6094.0, + "completions/max_terminated_length": 6094.0, + "completions/mean_length": 3365.65625, + "completions/mean_terminated_length": 3386.5714285714284, + "completions/min_length": 1457.0, + "completions/min_terminated_length": 1457.0, + "epoch": 0.0018653100775193798, + "grad_norm": 0.010697094262847633, + "kl": 0.07177734375, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 10096064.0, + "reward": 0.4402870833873749, + "reward_std": 0.17748701572418213, + "rewards/avg_thinking_length_func": 184.61854553222656, + "rewards/confidence_score_reward_func": 0.6924824714660645, + "rewards/correct_answer_reward_func": 0.625, + "rewards/efficient_thinking_reward_func": 0.8674089768653666, + "rewards/format_and_efficient_reward_func": 0.46700799465179443, + "rewards/format_reward_func": 0.9821969866752625, + "rewards/num_xml_reward_func": 1.4879558086395264, + "rewards/tool_execution_reward_func": 1.9514802694320679, + "rewards/visit_tool_reward_func": 0.9262524843215942, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0018895348837209302, + "grad_norm": 0.010757613261067228, + "kl": 0.0716552734375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0019137596899224806, + "grad_norm": 0.010687573666984099, + "kl": 0.0711669921875, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.001937984496124031, + "grad_norm": 0.010774872814522038, + "kl": 0.07177734375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9500.0, + "completions/max_terminated_length": 9500.0, + "completions/mean_length": 4230.6875, + "completions/mean_terminated_length": 4230.6875, + "completions/min_length": 1095.0, + "completions/min_terminated_length": 1095.0, + "epoch": 0.0019622093023255816, + "grad_norm": 46.52685366161902, + "kl": 28.5504150390625, + "learning_rate": 2e-06, + "loss": 0.0212, + "num_tokens": 10633304.0, + "reward": 0.4479905962944031, + "reward_std": 0.11886347830295563, + "rewards/avg_thinking_length_func": 196.62542724609375, + "rewards/confidence_score_reward_func": 0.6686310768127441, + "rewards/correct_answer_reward_func": 0.625, + "rewards/efficient_thinking_reward_func": 0.8074578120916676, + "rewards/format_and_efficient_reward_func": 0.4098377823829651, + "rewards/format_reward_func": 0.9993749856948853, + "rewards/num_xml_reward_func": 1.3076300621032715, + "rewards/tool_execution_reward_func": 1.9934210777282715, + "rewards/visit_tool_reward_func": 0.9281606674194336, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0019864341085271318, + "grad_norm": 0.011024444662647613, + "kl": 0.0682373046875, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0020106589147286824, + "grad_norm": 0.0110905273039609, + "kl": 0.0682373046875, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0020348837209302325, + "grad_norm": 0.011161056303561772, + "kl": 0.068359375, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7332.0, + "completions/max_terminated_length": 7332.0, + "completions/mean_length": 3184.28125, + "completions/mean_terminated_length": 3184.28125, + "completions/min_length": 1380.0, + "completions/min_terminated_length": 1380.0, + "epoch": 0.002059108527131783, + "grad_norm": 0.007262566160814956, + "kl": 0.0670166015625, + "learning_rate": 2e-06, + "loss": -0.0, + "num_tokens": 11072119.0, + "reward": 0.48964226245880127, + "reward_std": 0.09526845812797546, + "rewards/avg_thinking_length_func": 183.27981567382812, + "rewards/confidence_score_reward_func": 0.7107405066490173, + "rewards/correct_answer_reward_func": 0.671875, + "rewards/efficient_thinking_reward_func": 0.8552614079949872, + "rewards/format_and_efficient_reward_func": 0.509292721748352, + "rewards/format_reward_func": 0.9996874928474426, + "rewards/num_xml_reward_func": 1.630164384841919, + "rewards/tool_execution_reward_func": 1.9862616062164307, + "rewards/visit_tool_reward_func": 0.9241018295288086, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0020833333333333333, + "grad_norm": 0.007239493682926299, + "kl": 0.0675048828125, + "learning_rate": 2e-06, + "loss": -0.0, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002107558139534884, + "grad_norm": 0.007565680492649283, + "kl": 0.06787109375, + "learning_rate": 2e-06, + "loss": -0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002131782945736434, + "grad_norm": 0.007407335837345995, + "kl": 0.0682373046875, + "learning_rate": 2e-06, + "loss": -0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9640.0, + "completions/max_terminated_length": 9640.0, + "completions/mean_length": 3956.140625, + "completions/mean_terminated_length": 3956.140625, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "epoch": 0.0021560077519379847, + "grad_norm": 0.009630461090198177, + "kl": 0.06591796875, + "learning_rate": 2e-06, + "loss": 0.0004, + "num_tokens": 11589191.0, + "reward": 0.4685676693916321, + "reward_std": 0.08529931306838989, + "rewards/avg_thinking_length_func": 185.34999084472656, + "rewards/confidence_score_reward_func": 0.673518717288971, + "rewards/correct_answer_reward_func": 0.65625, + "rewards/efficient_thinking_reward_func": 0.8117772322905137, + "rewards/format_and_efficient_reward_func": 0.4981999397277832, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.5498807430267334, + "rewards/tool_execution_reward_func": 1.9884867668151855, + "rewards/visit_tool_reward_func": 0.9419025182723999, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002180232558139535, + "grad_norm": 0.010035272389521673, + "kl": 0.0660400390625, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0022044573643410855, + "grad_norm": 0.009886020878154878, + "kl": 0.0653076171875, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0022286821705426356, + "grad_norm": 0.010179048111382292, + "kl": 0.0648193359375, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5889.0, + "completions/max_terminated_length": 5889.0, + "completions/mean_length": 3288.046875, + "completions/mean_terminated_length": 3288.046875, + "completions/min_length": 1106.0, + "completions/min_terminated_length": 1106.0, + "epoch": 0.0022529069767441862, + "grad_norm": 0.36462018525457834, + "kl": 0.1248779296875, + "learning_rate": 2e-06, + "loss": 0.001, + "num_tokens": 12047117.0, + "reward": 0.5035778284072876, + "reward_std": 0.09110674262046814, + "rewards/avg_thinking_length_func": 180.05084228515625, + "rewards/confidence_score_reward_func": 0.7095786333084106, + "rewards/correct_answer_reward_func": 0.6875, + "rewards/efficient_thinking_reward_func": 0.865053232533276, + "rewards/format_and_efficient_reward_func": 0.5739701986312866, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.6645023822784424, + "rewards/tool_execution_reward_func": 1.9736841917037964, + "rewards/visit_tool_reward_func": 0.9475066065788269, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0022771317829457364, + "grad_norm": 0.010049427947341465, + "kl": 0.0684814453125, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002301356589147287, + "grad_norm": 0.008406367137924373, + "kl": 0.067138671875, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002325581395348837, + "grad_norm": 0.008646991679074768, + "kl": 0.0682373046875, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9717.0, + "completions/max_terminated_length": 9717.0, + "completions/mean_length": 4042.09375, + "completions/mean_terminated_length": 4042.09375, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "epoch": 0.0023498062015503878, + "grad_norm": 0.36433347439984676, + "kl": 0.266357421875, + "learning_rate": 2e-06, + "loss": 0.0004, + "num_tokens": 12594675.0, + "reward": 0.4354441165924072, + "reward_std": 0.10702547430992126, + "rewards/avg_thinking_length_func": 178.31576538085938, + "rewards/confidence_score_reward_func": 0.6778514385223389, + "rewards/correct_answer_reward_func": 0.59375, + "rewards/efficient_thinking_reward_func": 0.8262231594607177, + "rewards/format_and_efficient_reward_func": 0.4731639623641968, + "rewards/format_reward_func": 0.9996874928474426, + "rewards/num_xml_reward_func": 1.5230944156646729, + "rewards/tool_execution_reward_func": 1.977658987045288, + "rewards/visit_tool_reward_func": 0.90561443567276, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002374031007751938, + "grad_norm": 0.017653090743062046, + "kl": 0.07763671875, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0023982558139534886, + "grad_norm": 0.009650143183516308, + "kl": 0.066650390625, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0024224806201550387, + "grad_norm": 0.009666383934140476, + "kl": 0.066650390625, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7009.0, + "completions/max_terminated_length": 7009.0, + "completions/mean_length": 3569.1875, + "completions/mean_terminated_length": 3569.1875, + "completions/min_length": 1350.0, + "completions/min_terminated_length": 1350.0, + "epoch": 0.0024467054263565893, + "grad_norm": 0.012628187028225836, + "kl": 0.0160369873046875, + "learning_rate": 2e-06, + "loss": 0.0013, + "num_tokens": 13095521.0, + "reward": 0.4694232642650604, + "reward_std": 0.11920525133609772, + "rewards/avg_thinking_length_func": 166.68763732910156, + "rewards/confidence_score_reward_func": 0.693173885345459, + "rewards/correct_answer_reward_func": 0.640625, + "rewards/efficient_thinking_reward_func": 0.8890269113384983, + "rewards/format_and_efficient_reward_func": 0.52373868227005, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.4931187629699707, + "rewards/tool_execution_reward_func": 1.9407894611358643, + "rewards/visit_tool_reward_func": 0.9543420076370239, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0024709302325581395, + "grad_norm": 0.013764666926511201, + "kl": 0.016693115234375, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00249515503875969, + "grad_norm": 0.015582325932853322, + "kl": 0.017730712890625, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0025193798449612403, + "grad_norm": 0.017864538067072777, + "kl": 0.01995849609375, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11718.0, + "completions/max_terminated_length": 11718.0, + "completions/mean_length": 4337.8125, + "completions/mean_terminated_length": 4337.8125, + "completions/min_length": 1402.0, + "completions/min_terminated_length": 1402.0, + "epoch": 0.002543604651162791, + "grad_norm": 0.011715145428905095, + "kl": 0.023681640625, + "learning_rate": 2e-06, + "loss": 0.0003, + "num_tokens": 13691037.0, + "reward": 0.4581317901611328, + "reward_std": 0.07780471444129944, + "rewards/avg_thinking_length_func": 141.15011596679688, + "rewards/confidence_score_reward_func": 0.6525664925575256, + "rewards/correct_answer_reward_func": 0.65625, + "rewards/efficient_thinking_reward_func": 0.7593332235923487, + "rewards/format_and_efficient_reward_func": 0.45769202709198, + "rewards/format_reward_func": 0.9993749856948853, + "rewards/num_xml_reward_func": 1.3809731006622314, + "rewards/tool_execution_reward_func": 1.9640991687774658, + "rewards/visit_tool_reward_func": 0.9199192523956299, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002567829457364341, + "grad_norm": 0.012478280222631418, + "kl": 0.03009033203125, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0025920542635658917, + "grad_norm": 0.013305867700430574, + "kl": 0.0390625, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002616279069767442, + "grad_norm": 0.0183428412461533, + "kl": 0.0509033203125, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6386.0, + "completions/max_terminated_length": 6386.0, + "completions/mean_length": 3297.5625, + "completions/mean_terminated_length": 3297.5625, + "completions/min_length": 1296.0, + "completions/min_terminated_length": 1296.0, + "epoch": 0.0026405038759689924, + "grad_norm": 0.02337332236690807, + "kl": 0.0616455078125, + "learning_rate": 2e-06, + "loss": 0.001, + "num_tokens": 14184466.0, + "reward": 0.40722835063934326, + "reward_std": 0.14360609650611877, + "rewards/avg_thinking_length_func": 138.28097534179688, + "rewards/confidence_score_reward_func": 0.644202470779419, + "rewards/correct_answer_reward_func": 0.59375, + "rewards/efficient_thinking_reward_func": 0.7607926960767375, + "rewards/format_and_efficient_reward_func": 0.46497124433517456, + "rewards/format_reward_func": 1.0, + "rewards/num_xml_reward_func": 1.4057281017303467, + "rewards/tool_execution_reward_func": 1.9434621334075928, + "rewards/visit_tool_reward_func": 0.9184768199920654, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0026647286821705426, + "grad_norm": 0.012698799447402773, + "kl": 0.06781005859375, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002688953488372093, + "grad_norm": 0.012619226324675306, + "kl": 0.0758056640625, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0027131782945736434, + "grad_norm": 0.013347372933753418, + "kl": 0.0892333984375, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10254.0, + "completions/max_terminated_length": 10254.0, + "completions/mean_length": 4017.296875, + "completions/mean_terminated_length": 4017.296875, + "completions/min_length": 1163.0, + "completions/min_terminated_length": 1163.0, + "epoch": 0.002737403100775194, + "grad_norm": 0.8482856229199331, + "kl": 0.163818359375, + "learning_rate": 2e-06, + "loss": 0.0003, + "num_tokens": 14783302.0, + "reward": 0.3793744742870331, + "reward_std": 0.08317889273166656, + "rewards/avg_thinking_length_func": 96.98873901367188, + "rewards/confidence_score_reward_func": 0.5890461206436157, + "rewards/correct_answer_reward_func": 0.578125, + "rewards/efficient_thinking_reward_func": 0.4956153760102844, + "rewards/format_and_efficient_reward_func": 0.3040567636489868, + "rewards/format_reward_func": 0.991857647895813, + "rewards/num_xml_reward_func": 0.9565892815589905, + "rewards/tool_execution_reward_func": 1.883992075920105, + "rewards/visit_tool_reward_func": 0.6309776306152344, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002761627906976744, + "grad_norm": 2.892668930951565, + "kl": 0.87744140625, + "learning_rate": 2e-06, + "loss": 0.002, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0027858527131782948, + "grad_norm": 0.11540075032392746, + "kl": 0.258544921875, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002810077519379845, + "grad_norm": 0.03602102455529362, + "kl": 0.205078125, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7956.0, + "completions/max_terminated_length": 7956.0, + "completions/mean_length": 3060.15625, + "completions/mean_terminated_length": 3060.15625, + "completions/min_length": 1178.0, + "completions/min_terminated_length": 1178.0, + "epoch": 0.0028343023255813955, + "grad_norm": 0.011947818843430099, + "kl": 0.1031494140625, + "learning_rate": 2e-06, + "loss": 0.0013, + "num_tokens": 15239738.0, + "reward": 0.4257793724536896, + "reward_std": 0.15445315837860107, + "rewards/avg_thinking_length_func": 111.71697235107422, + "rewards/confidence_score_reward_func": 0.6188951730728149, + "rewards/correct_answer_reward_func": 0.671875, + "rewards/efficient_thinking_reward_func": 0.7151743089595498, + "rewards/format_and_efficient_reward_func": 0.3122476637363434, + "rewards/format_reward_func": 0.9918689727783203, + "rewards/num_xml_reward_func": 1.2823729515075684, + "rewards/tool_execution_reward_func": 1.9500064849853516, + "rewards/visit_tool_reward_func": 0.8597963452339172, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0028585271317829457, + "grad_norm": 0.01184782503909529, + "kl": 0.0999755859375, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0028827519379844963, + "grad_norm": 0.01222442223239816, + "kl": 0.099365234375, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0029069767441860465, + "grad_norm": 0.01288408566646706, + "kl": 0.1002197265625, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11579.0, + "completions/max_terminated_length": 11579.0, + "completions/mean_length": 3778.484375, + "completions/mean_terminated_length": 3778.484375, + "completions/min_length": 857.0, + "completions/min_terminated_length": 857.0, + "epoch": 0.002931201550387597, + "grad_norm": 0.013127560986285324, + "kl": 0.170654296875, + "learning_rate": 2e-06, + "loss": 0.0006, + "num_tokens": 15802870.0, + "reward": 0.35960614681243896, + "reward_std": 0.09336411207914352, + "rewards/avg_thinking_length_func": 120.11376953125, + "rewards/confidence_score_reward_func": 0.5505574941635132, + "rewards/correct_answer_reward_func": 0.609375, + "rewards/efficient_thinking_reward_func": 0.5848998658707487, + "rewards/format_and_efficient_reward_func": 0.09069697558879852, + "rewards/format_reward_func": 0.9635053873062134, + "rewards/num_xml_reward_func": 0.6183948516845703, + "rewards/tool_execution_reward_func": 1.921267032623291, + "rewards/visit_tool_reward_func": 0.408791184425354, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0029554263565891472, + "grad_norm": 0.011473925349363189, + "kl": 0.173828125, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002979651162790698, + "grad_norm": 0.010667583254555548, + "kl": 0.1767578125, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003003875968992248, + "grad_norm": 0.010839236682357098, + "kl": 0.1826171875, + "learning_rate": 2e-06, + "loss": 0.0006, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7543.0, + "completions/max_terminated_length": 7543.0, + "completions/mean_length": 3495.625, + "completions/mean_terminated_length": 3495.625, + "completions/min_length": 1272.0, + "completions/min_terminated_length": 1272.0, + "epoch": 0.0030281007751937986, + "grad_norm": 0.014134486127382969, + "kl": 0.136474609375, + "learning_rate": 2e-06, + "loss": 0.0011, + "num_tokens": 16267847.0, + "reward": 0.40116244554519653, + "reward_std": 0.11558952927589417, + "rewards/avg_thinking_length_func": 171.4405975341797, + "rewards/confidence_score_reward_func": 0.592523455619812, + "rewards/correct_answer_reward_func": 0.65625, + "rewards/efficient_thinking_reward_func": 0.78887382548876, + "rewards/format_and_efficient_reward_func": -0.007415967993438244, + "rewards/format_reward_func": 0.9569429159164429, + "rewards/num_xml_reward_func": 0.533742368221283, + "rewards/tool_execution_reward_func": 1.984920620918274, + "rewards/visit_tool_reward_func": 0.8972762823104858, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003052325581395349, + "grad_norm": 0.01438304535919498, + "kl": 0.140625, + "learning_rate": 2e-06, + "loss": 0.0011, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0030765503875968994, + "grad_norm": 0.014656756114246808, + "kl": 0.14794921875, + "learning_rate": 2e-06, + "loss": 0.0011, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0031007751937984496, + "grad_norm": 0.015042904271731165, + "kl": 0.15869140625, + "learning_rate": 2e-06, + "loss": 0.0012, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12404.0, + "completions/max_terminated_length": 12404.0, + "completions/mean_length": 4594.015625, + "completions/mean_terminated_length": 4594.015625, + "completions/min_length": 1214.0, + "completions/min_terminated_length": 1214.0, + "epoch": 0.003125, + "grad_norm": 0.01590013423348445, + "kl": 0.26904296875, + "learning_rate": 2e-06, + "loss": 0.0009, + "num_tokens": 16831957.0, + "reward": 0.3612688183784485, + "reward_std": 0.08134222030639648, + "rewards/avg_thinking_length_func": 189.8800048828125, + "rewards/confidence_score_reward_func": 0.5268421173095703, + "rewards/correct_answer_reward_func": 0.625, + "rewards/efficient_thinking_reward_func": 0.6692969275756135, + "rewards/format_and_efficient_reward_func": -0.032691895961761475, + "rewards/format_reward_func": 0.9466335773468018, + "rewards/num_xml_reward_func": 0.4149753153324127, + "rewards/tool_execution_reward_func": 1.9272011518478394, + "rewards/visit_tool_reward_func": 0.7673778533935547, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0031492248062015503, + "grad_norm": 0.01646874780720208, + "kl": 0.29443359375, + "learning_rate": 2e-06, + "loss": 0.0009, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003173449612403101, + "grad_norm": 0.01694506623714648, + "kl": 0.314453125, + "learning_rate": 2e-06, + "loss": 0.0009, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003197674418604651, + "grad_norm": 0.016867539615718644, + "kl": 0.3271484375, + "learning_rate": 2e-06, + "loss": 0.001, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6944.0, + "completions/max_terminated_length": 6944.0, + "completions/mean_length": 2774.265625, + "completions/mean_terminated_length": 2774.265625, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "epoch": 0.0032218992248062017, + "grad_norm": 0.022617229528507702, + "kl": 0.26953125, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 17249404.0, + "reward": 0.20413580536842346, + "reward_std": 0.05481432378292084, + "rewards/avg_thinking_length_func": 129.03866577148438, + "rewards/confidence_score_reward_func": 0.49319422245025635, + "rewards/correct_answer_reward_func": 0.34375, + "rewards/efficient_thinking_reward_func": 0.7432039407243382, + "rewards/format_and_efficient_reward_func": 0.17171993851661682, + "rewards/format_reward_func": 0.9746097326278687, + "rewards/num_xml_reward_func": 0.8615504503250122, + "rewards/tool_execution_reward_func": 1.9303656816482544, + "rewards/visit_tool_reward_func": 0.9013795852661133, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003246124031007752, + "grad_norm": 0.020143739711233816, + "kl": 0.252685546875, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0032703488372093025, + "grad_norm": 0.01785809415589292, + "kl": 0.227294921875, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0032945736434108527, + "grad_norm": 0.015380281270199666, + "kl": 0.199462890625, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7166.0, + "completions/max_terminated_length": 7166.0, + "completions/mean_length": 3239.453125, + "completions/mean_terminated_length": 3239.453125, + "completions/min_length": 1458.0, + "completions/min_terminated_length": 1458.0, + "epoch": 0.0033187984496124033, + "grad_norm": 0.012800365899215092, + "kl": 0.138427734375, + "learning_rate": 2e-06, + "loss": 0.0004, + "num_tokens": 17686794.0, + "reward": 0.3108579218387604, + "reward_std": 0.13888844847679138, + "rewards/avg_thinking_length_func": 171.369384765625, + "rewards/confidence_score_reward_func": 0.5435695648193359, + "rewards/correct_answer_reward_func": 0.515625, + "rewards/efficient_thinking_reward_func": 0.802592893497664, + "rewards/format_and_efficient_reward_func": 0.2916308343410492, + "rewards/format_reward_func": 0.9913173913955688, + "rewards/num_xml_reward_func": 1.4043910503387451, + "rewards/tool_execution_reward_func": 1.8357443809509277, + "rewards/visit_tool_reward_func": 0.8753163814544678, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0033430232558139534, + "grad_norm": 0.014285839855776115, + "kl": 0.1318359375, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003367248062015504, + "grad_norm": 0.015433812962718682, + "kl": 0.128173828125, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003391472868217054, + "grad_norm": 0.015720560114809618, + "kl": 0.1229248046875, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5424.0, + "completions/max_terminated_length": 5424.0, + "completions/mean_length": 3209.875, + "completions/mean_terminated_length": 3209.875, + "completions/min_length": 1301.0, + "completions/min_terminated_length": 1301.0, + "epoch": 0.003415697674418605, + "grad_norm": 0.009160832793565006, + "kl": 0.089599609375, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 18156998.0, + "reward": 0.2771710753440857, + "reward_std": 0.10209451615810394, + "rewards/avg_thinking_length_func": 144.3570556640625, + "rewards/confidence_score_reward_func": 0.5883906483650208, + "rewards/correct_answer_reward_func": 0.421875, + "rewards/efficient_thinking_reward_func": 0.9227171305298694, + "rewards/format_and_efficient_reward_func": 0.303905189037323, + "rewards/format_reward_func": 0.9965387582778931, + "rewards/num_xml_reward_func": 1.6496015787124634, + "rewards/tool_execution_reward_func": 1.9101753234863281, + "rewards/visit_tool_reward_func": 1.0097795724868774, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003439922480620155, + "grad_norm": 0.009348804877622782, + "kl": 0.0853271484375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0034641472868217056, + "grad_norm": 0.009332442022472659, + "kl": 0.080322265625, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0034883720930232558, + "grad_norm": 0.009512893821144673, + "kl": 0.0767822265625, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7366.0, + "completions/max_terminated_length": 7366.0, + "completions/mean_length": 4435.890625, + "completions/mean_terminated_length": 4435.890625, + "completions/min_length": 1397.0, + "completions/min_terminated_length": 1397.0, + "epoch": 0.0035125968992248064, + "grad_norm": 0.013329760690267301, + "kl": 0.055419921875, + "learning_rate": 2e-06, + "loss": -0.0002, + "num_tokens": 18712707.0, + "reward": 0.4343380331993103, + "reward_std": 0.1319217085838318, + "rewards/avg_thinking_length_func": 213.60223388671875, + "rewards/confidence_score_reward_func": 0.6497268080711365, + "rewards/correct_answer_reward_func": 0.625, + "rewards/efficient_thinking_reward_func": 0.8139017177985812, + "rewards/format_and_efficient_reward_func": 0.4802235960960388, + "rewards/format_reward_func": 0.9989955425262451, + "rewards/num_xml_reward_func": 1.751387119293213, + "rewards/tool_execution_reward_func": 1.9038957357406616, + "rewards/visit_tool_reward_func": 0.9324563145637512, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0035368217054263565, + "grad_norm": 0.013975012703647748, + "kl": 0.0540771484375, + "learning_rate": 2e-06, + "loss": -0.0002, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003561046511627907, + "grad_norm": 0.014076489547319788, + "kl": 0.0531005859375, + "learning_rate": 2e-06, + "loss": -0.0002, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0035852713178294573, + "grad_norm": 0.014165636449546886, + "kl": 0.0531005859375, + "learning_rate": 2e-06, + "loss": -0.0002, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5721.0, + "completions/max_terminated_length": 5721.0, + "completions/mean_length": 3476.890625, + "completions/mean_terminated_length": 3476.890625, + "completions/min_length": 1375.0, + "completions/min_terminated_length": 1375.0, + "epoch": 0.003609496124031008, + "grad_norm": 0.007532410662647794, + "kl": 0.06439208984375, + "learning_rate": 2e-06, + "loss": 0.0001, + "num_tokens": 19224629.0, + "reward": 0.3097182512283325, + "reward_std": 0.06608685851097107, + "rewards/avg_thinking_length_func": 155.74346923828125, + "rewards/confidence_score_reward_func": 0.6070291996002197, + "rewards/correct_answer_reward_func": 0.453125, + "rewards/efficient_thinking_reward_func": 0.9227627606272979, + "rewards/format_and_efficient_reward_func": 0.3381012976169586, + "rewards/format_reward_func": 0.9996874928474426, + "rewards/num_xml_reward_func": 1.6836090087890625, + "rewards/tool_execution_reward_func": 1.8510758876800537, + "rewards/visit_tool_reward_func": 0.8944061994552612, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003633720930232558, + "grad_norm": 0.007379430347015788, + "kl": 0.0645751953125, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0036579457364341087, + "grad_norm": 0.008138518366845196, + "kl": 0.0657958984375, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003682170542635659, + "grad_norm": 0.008284296957527382, + "kl": 0.0673828125, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9618.0, + "completions/max_terminated_length": 9618.0, + "completions/mean_length": 4875.078125, + "completions/mean_terminated_length": 4875.078125, + "completions/min_length": 1847.0, + "completions/min_terminated_length": 1847.0, + "epoch": 0.0037063953488372095, + "grad_norm": 0.014704297852595168, + "kl": 0.05523681640625, + "learning_rate": 2e-06, + "loss": 0.0015, + "num_tokens": 19820623.0, + "reward": 0.428906112909317, + "reward_std": 0.16942133009433746, + "rewards/avg_thinking_length_func": 210.6763916015625, + "rewards/confidence_score_reward_func": 0.6548709869384766, + "rewards/correct_answer_reward_func": 0.609375, + "rewards/efficient_thinking_reward_func": 0.7212743512877299, + "rewards/format_and_efficient_reward_func": 0.4301028251647949, + "rewards/format_reward_func": 0.9975892305374146, + "rewards/num_xml_reward_func": 1.4759665727615356, + "rewards/tool_execution_reward_func": 1.8980989456176758, + "rewards/visit_tool_reward_func": 0.9375091791152954, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0037306201550387596, + "grad_norm": 0.015023473705486283, + "kl": 0.0567626953125, + "learning_rate": 2e-06, + "loss": 0.0015, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0037548449612403102, + "grad_norm": 0.015217500076281755, + "kl": 0.05841064453125, + "learning_rate": 2e-06, + "loss": 0.0015, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0037790697674418604, + "grad_norm": 0.016114636489248848, + "kl": 0.0614013671875, + "learning_rate": 2e-06, + "loss": 0.0015, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6823.0, + "completions/max_terminated_length": 6823.0, + "completions/mean_length": 4384.296875, + "completions/mean_terminated_length": 4384.296875, + "completions/min_length": 1697.0, + "completions/min_terminated_length": 1697.0, + "epoch": 0.003803294573643411, + "grad_norm": 0.006588369322686691, + "kl": 0.0643310546875, + "learning_rate": 2e-06, + "loss": 0.0005, + "num_tokens": 20425222.0, + "reward": 0.34698012471199036, + "reward_std": 0.03517330437898636, + "rewards/avg_thinking_length_func": 178.83392333984375, + "rewards/confidence_score_reward_func": 0.6313294172286987, + "rewards/correct_answer_reward_func": 0.484375, + "rewards/efficient_thinking_reward_func": 0.8650427095882729, + "rewards/format_and_efficient_reward_func": 0.37807154655456543, + "rewards/format_reward_func": 0.9995312690734863, + "rewards/num_xml_reward_func": 1.323744297027588, + "rewards/tool_execution_reward_func": 1.96144700050354, + "rewards/visit_tool_reward_func": 0.9631377458572388, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003827519379844961, + "grad_norm": 0.006972139333963718, + "kl": 0.0670166015625, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003851744186046512, + "grad_norm": 0.0071318562836598836, + "kl": 0.06884765625, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003875968992248062, + "grad_norm": 0.007113091376284595, + "kl": 0.06982421875, + "learning_rate": 2e-06, + "loss": 0.0005, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11894.0, + "completions/max_terminated_length": 11894.0, + "completions/mean_length": 5685.0625, + "completions/mean_terminated_length": 5685.0625, + "completions/min_length": 1886.0, + "completions/min_terminated_length": 1886.0, + "epoch": 0.0039001937984496126, + "grad_norm": 0.01558937344658329, + "kl": 0.06414794921875, + "learning_rate": 2e-06, + "loss": 0.0018, + "num_tokens": 21086786.0, + "reward": 0.4025996923446655, + "reward_std": 0.13449470698833466, + "rewards/avg_thinking_length_func": 254.32508850097656, + "rewards/confidence_score_reward_func": 0.6495309472084045, + "rewards/correct_answer_reward_func": 0.578125, + "rewards/efficient_thinking_reward_func": 0.6637161596148502, + "rewards/format_and_efficient_reward_func": 0.458422988653183, + "rewards/format_reward_func": 0.9998437166213989, + "rewards/num_xml_reward_func": 1.5073208808898926, + "rewards/tool_execution_reward_func": 1.9572367668151855, + "rewards/visit_tool_reward_func": 0.9573923349380493, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003924418604651163, + "grad_norm": 0.016638056430155885, + "kl": 0.064453125, + "learning_rate": 2e-06, + "loss": 0.0018, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003948643410852713, + "grad_norm": 0.01813854752521658, + "kl": 0.06536865234375, + "learning_rate": 2e-06, + "loss": 0.0018, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0039728682170542635, + "grad_norm": 0.01938490985845502, + "kl": 0.06988525390625, + "learning_rate": 2e-06, + "loss": 0.0019, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9060.0, + "completions/max_terminated_length": 9060.0, + "completions/mean_length": 4403.21875, + "completions/mean_terminated_length": 4403.21875, + "completions/min_length": 1390.0, + "completions/min_terminated_length": 1390.0, + "epoch": 0.003997093023255814, + "grad_norm": 0.005449273513524992, + "kl": 0.0662841796875, + "learning_rate": 2e-06, + "loss": 0.0001, + "num_tokens": 21662894.0, + "reward": 0.35001087188720703, + "reward_std": 0.009927155449986458, + "rewards/avg_thinking_length_func": 188.15765380859375, + "rewards/confidence_score_reward_func": 0.6182008981704712, + "rewards/correct_answer_reward_func": 0.5, + "rewards/efficient_thinking_reward_func": 0.8001981107519069, + "rewards/format_and_efficient_reward_func": 0.36673688888549805, + "rewards/format_reward_func": 0.9998437166213989, + "rewards/num_xml_reward_func": 1.4394086599349976, + "rewards/tool_execution_reward_func": 1.993227481842041, + "rewards/visit_tool_reward_func": 0.936252236366272, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004021317829457365, + "grad_norm": 0.00568312787735846, + "kl": 0.068115234375, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0040455426356589145, + "grad_norm": 0.005806971085578714, + "kl": 0.069580078125, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004069767441860465, + "grad_norm": 0.00592190722180043, + "kl": 0.070556640625, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9992.0, + "completions/max_terminated_length": 9992.0, + "completions/mean_length": 5377.4375, + "completions/mean_terminated_length": 5377.4375, + "completions/min_length": 1809.0, + "completions/min_terminated_length": 1809.0, + "epoch": 0.004093992248062016, + "grad_norm": 0.359099649617951, + "kl": 0.1207275390625, + "learning_rate": 2e-06, + "loss": 0.0029, + "num_tokens": 22286431.0, + "reward": 0.40037134289741516, + "reward_std": 0.12838459014892578, + "rewards/avg_thinking_length_func": 245.9459228515625, + "rewards/confidence_score_reward_func": 0.6141020059585571, + "rewards/correct_answer_reward_func": 0.609375, + "rewards/efficient_thinking_reward_func": 0.6361426555187852, + "rewards/format_and_efficient_reward_func": 0.45017051696777344, + "rewards/format_reward_func": 0.9981250166893005, + "rewards/num_xml_reward_func": 1.532149076461792, + "rewards/tool_execution_reward_func": 1.9983552694320679, + "rewards/visit_tool_reward_func": 0.9713033437728882, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004118217054263566, + "grad_norm": 0.0312847460920415, + "kl": 0.08642578125, + "learning_rate": 2e-06, + "loss": 0.0028, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004142441860465116, + "grad_norm": 0.5587996108011728, + "kl": 0.2386474609375, + "learning_rate": 2e-06, + "loss": 0.003, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004166666666666667, + "grad_norm": 0.03228792794627183, + "kl": 0.092529296875, + "learning_rate": 2e-06, + "loss": 0.0028, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7094.0, + "completions/max_terminated_length": 7094.0, + "completions/mean_length": 4163.640625, + "completions/mean_terminated_length": 4163.640625, + "completions/min_length": 1385.0, + "completions/min_terminated_length": 1385.0, + "epoch": 0.004190891472868217, + "grad_norm": 0.008141555436627606, + "kl": 0.1014404296875, + "learning_rate": 2e-06, + "loss": 0.0007, + "num_tokens": 22851695.0, + "reward": 0.31291523575782776, + "reward_std": 0.0387241393327713, + "rewards/avg_thinking_length_func": 150.9978485107422, + "rewards/confidence_score_reward_func": 0.5685818195343018, + "rewards/correct_answer_reward_func": 0.46875, + "rewards/efficient_thinking_reward_func": 0.8065696148258371, + "rewards/format_and_efficient_reward_func": 0.30031993985176086, + "rewards/format_reward_func": 0.9996874928474426, + "rewards/num_xml_reward_func": 1.2274867296218872, + "rewards/tool_execution_reward_func": 1.9928336143493652, + "rewards/visit_tool_reward_func": 0.9787203073501587, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004215116279069768, + "grad_norm": 0.008733677069632446, + "kl": 0.1131591796875, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0042393410852713176, + "grad_norm": 0.009638540295346257, + "kl": 0.12744140625, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004263565891472868, + "grad_norm": 0.010992556993855552, + "kl": 0.142822265625, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8227.0, + "completions/max_terminated_length": 8227.0, + "completions/mean_length": 4541.953125, + "completions/mean_terminated_length": 4541.953125, + "completions/min_length": 1507.0, + "completions/min_terminated_length": 1507.0, + "epoch": 0.004287790697674419, + "grad_norm": 0.1409188461026278, + "kl": 0.265625, + "learning_rate": 2e-06, + "loss": 0.0037, + "num_tokens": 23436250.0, + "reward": 0.3243735730648041, + "reward_std": 0.15356436371803284, + "rewards/avg_thinking_length_func": 171.99826049804688, + "rewards/confidence_score_reward_func": 0.5453901290893555, + "rewards/correct_answer_reward_func": 0.53125, + "rewards/efficient_thinking_reward_func": 0.6924963364887087, + "rewards/format_and_efficient_reward_func": 0.3312879800796509, + "rewards/format_reward_func": 0.998577356338501, + "rewards/num_xml_reward_func": 1.3812510967254639, + "rewards/tool_execution_reward_func": 1.9967105388641357, + "rewards/visit_tool_reward_func": 0.9554424285888672, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004312015503875969, + "grad_norm": 0.05228415250398885, + "kl": 0.201904296875, + "learning_rate": 2e-06, + "loss": 0.0037, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004336240310077519, + "grad_norm": 0.060068767522700996, + "kl": 0.2451171875, + "learning_rate": 2e-06, + "loss": 0.0037, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00436046511627907, + "grad_norm": 0.2730620784971272, + "kl": 0.4716796875, + "learning_rate": 2e-06, + "loss": 0.0041, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8120.0, + "completions/max_terminated_length": 8120.0, + "completions/mean_length": 3361.40625, + "completions/mean_terminated_length": 3361.40625, + "completions/min_length": 1075.0, + "completions/min_terminated_length": 1075.0, + "epoch": 0.00438468992248062, + "grad_norm": 0.05853393969832367, + "kl": 0.5302734375, + "learning_rate": 2e-06, + "loss": 0.0007, + "num_tokens": 23987107.0, + "reward": 0.24436859786510468, + "reward_std": 0.04949303716421127, + "rewards/avg_thinking_length_func": 81.72256469726562, + "rewards/confidence_score_reward_func": 0.45580577850341797, + "rewards/correct_answer_reward_func": 0.453125, + "rewards/efficient_thinking_reward_func": 0.573834842856046, + "rewards/format_and_efficient_reward_func": 0.22879377007484436, + "rewards/format_reward_func": 0.995830774307251, + "rewards/num_xml_reward_func": 1.104771614074707, + "rewards/tool_execution_reward_func": 1.9899488687515259, + "rewards/visit_tool_reward_func": 0.8998211622238159, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004408914728682171, + "grad_norm": 0.07516497327438276, + "kl": 0.6845703125, + "learning_rate": 2e-06, + "loss": 0.0009, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004433139534883721, + "grad_norm": 0.05997132496622212, + "kl": 0.626953125, + "learning_rate": 2e-06, + "loss": 0.0008, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004457364341085271, + "grad_norm": 0.037671767248184135, + "kl": 0.48681640625, + "learning_rate": 2e-06, + "loss": 0.0007, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8178.0, + "completions/max_terminated_length": 8178.0, + "completions/mean_length": 3659.640625, + "completions/mean_terminated_length": 3659.640625, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "epoch": 0.004481589147286822, + "grad_norm": 0.016069114631093232, + "kl": 0.34326171875, + "learning_rate": 2e-06, + "loss": 0.0009, + "num_tokens": 24496297.0, + "reward": 0.2305455505847931, + "reward_std": 0.06948232650756836, + "rewards/avg_thinking_length_func": 111.37628936767578, + "rewards/confidence_score_reward_func": 0.37327370047569275, + "rewards/correct_answer_reward_func": 0.515625, + "rewards/efficient_thinking_reward_func": 0.48277143466617184, + "rewards/format_and_efficient_reward_func": 0.1522754281759262, + "rewards/format_reward_func": 0.9647905230522156, + "rewards/num_xml_reward_func": 0.8915370106697083, + "rewards/tool_execution_reward_func": 1.9581143856048584, + "rewards/visit_tool_reward_func": 0.5689894556999207, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0045058139534883725, + "grad_norm": 0.014918137972398021, + "kl": 0.31640625, + "learning_rate": 2e-06, + "loss": 0.0008, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004530038759689922, + "grad_norm": 0.014560290660972823, + "kl": 0.2958984375, + "learning_rate": 2e-06, + "loss": 0.0008, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004554263565891473, + "grad_norm": 0.014191965162457063, + "kl": 0.27880859375, + "learning_rate": 2e-06, + "loss": 0.0008, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11027.0, + "completions/max_terminated_length": 11027.0, + "completions/mean_length": 2904.71875, + "completions/mean_terminated_length": 2904.71875, + "completions/min_length": 912.0, + "completions/min_terminated_length": 912.0, + "epoch": 0.004578488372093023, + "grad_norm": 0.02465674538761865, + "kl": 0.27099609375, + "learning_rate": 2e-06, + "loss": 0.0001, + "num_tokens": 24960803.0, + "reward": 0.22261814773082733, + "reward_std": 0.04196429252624512, + "rewards/avg_thinking_length_func": 79.28602600097656, + "rewards/confidence_score_reward_func": 0.40539172291755676, + "rewards/correct_answer_reward_func": 0.46875, + "rewards/efficient_thinking_reward_func": 0.4911669222941917, + "rewards/format_and_efficient_reward_func": 0.14570605754852295, + "rewards/format_reward_func": 0.9741340279579163, + "rewards/num_xml_reward_func": 0.884125292301178, + "rewards/tool_execution_reward_func": 1.9560561180114746, + "rewards/visit_tool_reward_func": 0.7019689083099365, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004602713178294574, + "grad_norm": 0.01002216998448175, + "kl": 0.24951171875, + "learning_rate": 2e-06, + "loss": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004626937984496124, + "grad_norm": 0.009283017573166963, + "kl": 0.234619140625, + "learning_rate": 2e-06, + "loss": 0.0, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004651162790697674, + "grad_norm": 0.00871351171533654, + "kl": 0.221435546875, + "learning_rate": 2e-06, + "loss": 0.0, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6760.0, + "completions/max_terminated_length": 6760.0, + "completions/mean_length": 3175.625, + "completions/mean_terminated_length": 3175.625, + "completions/min_length": 1279.0, + "completions/min_terminated_length": 1279.0, + "epoch": 0.004675387596899225, + "grad_norm": 0.01864801542206714, + "kl": 0.200927734375, + "learning_rate": 2e-06, + "loss": 0.0013, + "num_tokens": 25421310.0, + "reward": 0.3337632417678833, + "reward_std": 0.1033831387758255, + "rewards/avg_thinking_length_func": 144.4852752685547, + "rewards/confidence_score_reward_func": 0.5157345533370972, + "rewards/correct_answer_reward_func": 0.609375, + "rewards/efficient_thinking_reward_func": 0.6954727584239813, + "rewards/format_and_efficient_reward_func": 0.2803717255592346, + "rewards/format_reward_func": 0.9838045835494995, + "rewards/num_xml_reward_func": 1.244771957397461, + "rewards/tool_execution_reward_func": 1.9927083253860474, + "rewards/visit_tool_reward_func": 0.8324298858642578, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0046996124031007756, + "grad_norm": 0.018411722840556213, + "kl": 0.193603515625, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004723837209302325, + "grad_norm": 0.018380172856358755, + "kl": 0.189208984375, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004748062015503876, + "grad_norm": 0.018655645496485265, + "kl": 0.1875, + "learning_rate": 2e-06, + "loss": 0.0013, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5256.0, + "completions/max_terminated_length": 5256.0, + "completions/mean_length": 2635.984375, + "completions/mean_terminated_length": 2635.984375, + "completions/min_length": 1134.0, + "completions/min_terminated_length": 1134.0, + "epoch": 0.0047722868217054265, + "grad_norm": 0.004219005229441154, + "kl": 0.1275634765625, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 25833510.0, + "reward": 0.30341458320617676, + "reward_std": 0.014322971925139427, + "rewards/avg_thinking_length_func": 147.2517852783203, + "rewards/confidence_score_reward_func": 0.5635701417922974, + "rewards/correct_answer_reward_func": 0.5, + "rewards/efficient_thinking_reward_func": 0.8586018615751865, + "rewards/format_and_efficient_reward_func": 0.28311923146247864, + "rewards/format_reward_func": 0.9866694808006287, + "rewards/num_xml_reward_func": 1.2634769678115845, + "rewards/tool_execution_reward_func": 1.9635450839996338, + "rewards/visit_tool_reward_func": 0.80121248960495, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004796511627906977, + "grad_norm": 0.004672728095639017, + "kl": 0.130615234375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004820736434108527, + "grad_norm": 0.004950768699918263, + "kl": 0.1329345703125, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0048449612403100775, + "grad_norm": 0.005160418640186133, + "kl": 0.1343994140625, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5779.0, + "completions/max_terminated_length": 5779.0, + "completions/mean_length": 2860.046875, + "completions/mean_terminated_length": 2860.046875, + "completions/min_length": 1125.0, + "completions/min_terminated_length": 1125.0, + "epoch": 0.004869186046511628, + "grad_norm": 0.008829648064201757, + "kl": 0.0411376953125, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 26237865.0, + "reward": 0.45401930809020996, + "reward_std": 0.09410357475280762, + "rewards/avg_thinking_length_func": 182.534423828125, + "rewards/confidence_score_reward_func": 0.5977352857589722, + "rewards/correct_answer_reward_func": 0.734375, + "rewards/efficient_thinking_reward_func": 0.784292215730239, + "rewards/format_and_efficient_reward_func": 0.41676729917526245, + "rewards/format_reward_func": 0.9937513470649719, + "rewards/num_xml_reward_func": 1.5355236530303955, + "rewards/tool_execution_reward_func": 1.9931985139846802, + "rewards/visit_tool_reward_func": 0.8612196445465088, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004893410852713179, + "grad_norm": 0.009196641903985264, + "kl": 0.0400390625, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004917635658914728, + "grad_norm": 0.009490032359266305, + "kl": 0.038818359375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004941860465116279, + "grad_norm": 0.009682454113754367, + "kl": 0.03753662109375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6331.0, + "completions/max_terminated_length": 6331.0, + "completions/mean_length": 2659.453125, + "completions/mean_terminated_length": 2659.453125, + "completions/min_length": 895.0, + "completions/min_terminated_length": 895.0, + "epoch": 0.00496608527131783, + "grad_norm": 0.0037363827479903167, + "kl": 0.03375244140625, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 26646632.0, + "reward": 0.3066054582595825, + "reward_std": 0.03825566917657852, + "rewards/avg_thinking_length_func": 136.1707763671875, + "rewards/confidence_score_reward_func": 0.5777994990348816, + "rewards/correct_answer_reward_func": 0.484375, + "rewards/efficient_thinking_reward_func": 0.786608708417682, + "rewards/format_and_efficient_reward_func": 0.3019195795059204, + "rewards/format_reward_func": 0.9903415441513062, + "rewards/num_xml_reward_func": 1.3805111646652222, + "rewards/tool_execution_reward_func": 1.9650006294250488, + "rewards/visit_tool_reward_func": 0.8477368354797363, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00499031007751938, + "grad_norm": 0.0037822209816054495, + "kl": 0.03350830078125, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00501453488372093, + "grad_norm": 0.0038040246120938713, + "kl": 0.033477783203125, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0050387596899224806, + "grad_norm": 0.0038540122892837783, + "kl": 0.03350830078125, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5045.0, + "completions/max_terminated_length": 5045.0, + "completions/mean_length": 2763.5, + "completions/mean_terminated_length": 2763.5, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "epoch": 0.005062984496124031, + "grad_norm": 0.00548683325475162, + "kl": 0.035003662109375, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 27041016.0, + "reward": 0.45102459192276, + "reward_std": 0.06410035490989685, + "rewards/avg_thinking_length_func": 186.88746643066406, + "rewards/confidence_score_reward_func": 0.6191459894180298, + "rewards/correct_answer_reward_func": 0.6875, + "rewards/efficient_thinking_reward_func": 0.8100582820862734, + "rewards/format_and_efficient_reward_func": 0.44868165254592896, + "rewards/format_reward_func": 0.9952791929244995, + "rewards/num_xml_reward_func": 1.649810552597046, + "rewards/tool_execution_reward_func": 1.9959295988082886, + "rewards/visit_tool_reward_func": 0.8671329021453857, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005087209302325582, + "grad_norm": 0.005414160917662644, + "kl": 0.03399658203125, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0051114341085271315, + "grad_norm": 0.005397000227956369, + "kl": 0.033294677734375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005135658914728682, + "grad_norm": 0.005329822482164869, + "kl": 0.03271484375, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5622.0, + "completions/max_terminated_length": 5622.0, + "completions/mean_length": 2689.5, + "completions/mean_terminated_length": 2689.5, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "epoch": 0.005159883720930233, + "grad_norm": 0.004583885118409577, + "kl": 0.027679443359375, + "learning_rate": 2e-06, + "loss": 0.0001, + "num_tokens": 27444785.0, + "reward": 0.3377038240432739, + "reward_std": 0.03283514827489853, + "rewards/avg_thinking_length_func": 156.95558166503906, + "rewards/confidence_score_reward_func": 0.6069622039794922, + "rewards/correct_answer_reward_func": 0.515625, + "rewards/efficient_thinking_reward_func": 0.8533607950008524, + "rewards/format_and_efficient_reward_func": 0.3490750193595886, + "rewards/format_reward_func": 0.9963964819908142, + "rewards/num_xml_reward_func": 1.565781831741333, + "rewards/tool_execution_reward_func": 1.9799107313156128, + "rewards/visit_tool_reward_func": 0.886849582195282, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005184108527131783, + "grad_norm": 0.004553415372891503, + "kl": 0.02728271484375, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005208333333333333, + "grad_norm": 0.004416753047475649, + "kl": 0.026763916015625, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005232558139534884, + "grad_norm": 0.004302097167180992, + "kl": 0.02606201171875, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6509.0, + "completions/max_terminated_length": 6509.0, + "completions/mean_length": 3170.359375, + "completions/mean_terminated_length": 3170.359375, + "completions/min_length": 1316.0, + "completions/min_terminated_length": 1316.0, + "epoch": 0.005256782945736434, + "grad_norm": 0.00874079090702132, + "kl": 0.03131103515625, + "learning_rate": 2e-06, + "loss": 0.0002, + "num_tokens": 27886438.0, + "reward": 0.4546785354614258, + "reward_std": 0.13061311841011047, + "rewards/avg_thinking_length_func": 185.58987426757812, + "rewards/confidence_score_reward_func": 0.6329280138015747, + "rewards/correct_answer_reward_func": 0.671875, + "rewards/efficient_thinking_reward_func": 0.7895888587130873, + "rewards/format_and_efficient_reward_func": 0.43139761686325073, + "rewards/format_reward_func": 0.9971143007278442, + "rewards/num_xml_reward_func": 1.6065764427185059, + "rewards/tool_execution_reward_func": 1.9975961446762085, + "rewards/visit_tool_reward_func": 0.8967168927192688, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005281007751937985, + "grad_norm": 0.009254919184464793, + "kl": 0.031158447265625, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005305232558139535, + "grad_norm": 0.008540278295280325, + "kl": 0.03131103515625, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005329457364341085, + "grad_norm": 0.009027249196409619, + "kl": 0.031463623046875, + "learning_rate": 2e-06, + "loss": 0.0002, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6125.0, + "completions/max_terminated_length": 6125.0, + "completions/mean_length": 2700.765625, + "completions/mean_terminated_length": 2700.765625, + "completions/min_length": 966.0, + "completions/min_terminated_length": 966.0, + "epoch": 0.005353682170542636, + "grad_norm": 0.001653975947803042, + "kl": 0.0260009765625, + "learning_rate": 2e-06, + "loss": 0.0001, + "num_tokens": 28309543.0, + "reward": 0.3285777270793915, + "reward_std": 0.013459177687764168, + "rewards/avg_thinking_length_func": 149.52700805664062, + "rewards/confidence_score_reward_func": 0.6043996214866638, + "rewards/correct_answer_reward_func": 0.5, + "rewards/efficient_thinking_reward_func": 0.8903335916310755, + "rewards/format_and_efficient_reward_func": 0.3600352108478546, + "rewards/format_reward_func": 0.996889591217041, + "rewards/num_xml_reward_func": 1.5710426568984985, + "rewards/tool_execution_reward_func": 1.9776184558868408, + "rewards/visit_tool_reward_func": 0.9032177925109863, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005377906976744186, + "grad_norm": 0.001652035863632615, + "kl": 0.0264892578125, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005402131782945736, + "grad_norm": 0.0016513159446787636, + "kl": 0.0269775390625, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005426356589147287, + "grad_norm": 0.0020335905228311916, + "kl": 0.027557373046875, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4748.0, + "completions/max_terminated_length": 4748.0, + "completions/mean_length": 2978.90625, + "completions/mean_terminated_length": 2978.90625, + "completions/min_length": 1244.0, + "completions/min_terminated_length": 1244.0, + "epoch": 0.005450581395348837, + "grad_norm": 0.006026935047182901, + "kl": 0.03179931640625, + "learning_rate": 2e-06, + "loss": 0.0003, + "num_tokens": 28740848.0, + "reward": 0.4945339560508728, + "reward_std": 0.0744490772485733, + "rewards/avg_thinking_length_func": 172.45849609375, + "rewards/confidence_score_reward_func": 0.6167193651199341, + "rewards/correct_answer_reward_func": 0.765625, + "rewards/efficient_thinking_reward_func": 0.7966197226027097, + "rewards/format_and_efficient_reward_func": 0.512791097164154, + "rewards/format_reward_func": 0.9983228445053101, + "rewards/num_xml_reward_func": 1.6630462408065796, + "rewards/tool_execution_reward_func": 1.9971591234207153, + "rewards/visit_tool_reward_func": 0.9000678062438965, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005474806201550388, + "grad_norm": 0.005801070538806677, + "kl": 0.0323486328125, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005499031007751938, + "grad_norm": 0.005789539677805553, + "kl": 0.03302001953125, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005523255813953488, + "grad_norm": 0.005731300295942885, + "kl": 0.033935546875, + "learning_rate": 2e-06, + "loss": 0.0003, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5315.0, + "completions/max_terminated_length": 5315.0, + "completions/mean_length": 2718.109375, + "completions/mean_terminated_length": 2718.109375, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "epoch": 0.005547480620155039, + "grad_norm": 0.0027604900450052977, + "kl": 0.03369140625, + "learning_rate": 2e-06, + "loss": 0.0, + "num_tokens": 29177563.0, + "reward": 0.33125773072242737, + "reward_std": 0.012095385231077671, + "rewards/avg_thinking_length_func": 138.28082275390625, + "rewards/confidence_score_reward_func": 0.588701605796814, + "rewards/correct_answer_reward_func": 0.5, + "rewards/efficient_thinking_reward_func": 0.8968424695250805, + "rewards/format_and_efficient_reward_func": 0.36526361107826233, + "rewards/format_reward_func": 0.9942506551742554, + "rewards/num_xml_reward_func": 1.484344720840454, + "rewards/tool_execution_reward_func": 1.9658281803131104, + "rewards/visit_tool_reward_func": 0.9050877094268799, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0055717054263565895, + "grad_norm": 0.0028469369049688264, + "kl": 0.03424072265625, + "learning_rate": 2e-06, + "loss": 0.0, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005595930232558139, + "grad_norm": 0.0029207200987881226, + "kl": 0.03466796875, + "learning_rate": 2e-06, + "loss": 0.0, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.00562015503875969, + "grad_norm": 0.002891989345093088, + "kl": 0.03436279296875, + "learning_rate": 2e-06, + "loss": 0.0, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5430.0, + "completions/max_terminated_length": 5430.0, + "completions/mean_length": 3147.21875, + "completions/mean_terminated_length": 3147.21875, + "completions/min_length": 1208.0, + "completions/min_terminated_length": 1208.0, + "epoch": 0.0056443798449612404, + "grad_norm": 0.008009912903442006, + "kl": 0.039306640625, + "learning_rate": 2e-06, + "loss": 0.0004, + "num_tokens": 29641355.0, + "reward": 0.45486128330230713, + "reward_std": 0.10010581463575363, + "rewards/avg_thinking_length_func": 154.7548828125, + "rewards/confidence_score_reward_func": 0.5910084247589111, + "rewards/correct_answer_reward_func": 0.71875, + "rewards/efficient_thinking_reward_func": 0.79141897353926, + "rewards/format_and_efficient_reward_func": 0.4532102346420288, + "rewards/format_reward_func": 0.9973268508911133, + "rewards/num_xml_reward_func": 1.6137380599975586, + "rewards/tool_execution_reward_func": 1.9840686321258545, + "rewards/visit_tool_reward_func": 0.9216470718383789, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005668604651162791, + "grad_norm": 0.008010434434161435, + "kl": 0.03924560546875, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005692829457364341, + "grad_norm": 0.008059617739522514, + "kl": 0.03936767578125, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005717054263565891, + "grad_norm": 0.008321692756210844, + "kl": 0.0400390625, + "learning_rate": 2e-06, + "loss": 0.0004, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5409.0, + "completions/max_terminated_length": 5409.0, + "completions/mean_length": 2747.0, + "completions/mean_terminated_length": 2747.0, + "completions/min_length": 1028.0, + "completions/min_terminated_length": 1028.0, + "epoch": 0.005741279069767442, + "grad_norm": 0.00607005113841099, + "kl": 0.0380859375, + "learning_rate": 2e-06, + "loss": 0.0001, + "num_tokens": 30090756.0, + "reward": 0.3245881199836731, + "reward_std": 0.030338726937770844, + "rewards/avg_thinking_length_func": 118.96601867675781, + "rewards/confidence_score_reward_func": 0.5715887546539307, + "rewards/correct_answer_reward_func": 0.515625, + "rewards/efficient_thinking_reward_func": 0.7931376609790313, + "rewards/format_and_efficient_reward_func": 0.3051683306694031, + "rewards/format_reward_func": 0.9918498396873474, + "rewards/num_xml_reward_func": 1.335392713546753, + "rewards/tool_execution_reward_func": 1.956681728363037, + "rewards/visit_tool_reward_func": 0.8923399448394775, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005765503875968993, + "grad_norm": 0.006076971580972504, + "kl": 0.03839111328125, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005789728682170542, + "grad_norm": 0.005795692009836339, + "kl": 0.0380859375, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005813953488372093, + "grad_norm": 0.005478655391819232, + "kl": 0.0377197265625, + "learning_rate": 2e-06, + "loss": 0.0001, + "step": 240 + } + ], + "logging_steps": 1, + "max_steps": 640, + "num_input_tokens_seen": 30090756, + "num_train_epochs": 1, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}