Yukang committed on
Commit
7e03ccf
·
verified ·
1 Parent(s): cb5eba0

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-3B-Instruct
3
- datasets: open-r1/OpenR1-Math-220k
4
  library_name: transformers
5
  model_name: Qwen2.5-3B-Open-R1-GRPO
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2.5-3B-Open-R1-GRPO
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) on the [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenyukang2020-nvidia/huggingface/runs/v84ltger)
33
 
34
 
35
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
  base_model: Qwen/Qwen2.5-3B-Instruct
 
3
  library_name: transformers
4
  model_name: Qwen2.5-3B-Open-R1-GRPO
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for Qwen2.5-3B-Open-R1-GRPO
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenyukang2020-nvidia/huggingface/runs/9wwsfr8r)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00012006735727737873,
4
- "train_runtime": 1294.972,
5
  "train_samples": 93733,
6
- "train_samples_per_second": 72.382,
7
- "train_steps_per_second": 2.263
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.00015187089497744325,
4
+ "train_runtime": 1302.9313,
5
  "train_samples": 93733,
6
+ "train_samples_per_second": 71.94,
7
+ "train_steps_per_second": 2.249
8
  }
config.json CHANGED
@@ -22,7 +22,7 @@
22
  "tie_word_embeddings": true,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.52.3",
25
- "use_cache": true,
26
  "use_sliding_window": false,
27
  "vocab_size": 151936
28
  }
 
22
  "tie_word_embeddings": true,
23
  "torch_dtype": "bfloat16",
24
  "transformers_version": "4.52.3",
25
+ "use_cache": false,
26
  "use_sliding_window": false,
27
  "vocab_size": 151936
28
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:475b54e34ea27c5104c1ddd4518b6f609710ff981dc519a420d872c707c357c1
3
  size 4957560304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40274e3ec9e8dc4d8fc7fa0b582f6f3b3ce9d41fdcacd2960d3506f7a944ed4b
3
  size 4957560304
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:118d6868b8fcb22c426966d8b4db5ffce230512ebc5c29e3e801ceedc92dd266
3
  size 1214366696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a25ec01907d0215b10d3faf0841cf152610f26e87c5b5cccfd966513984b58f0
3
  size 1214366696
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00012006735727737873,
4
- "train_runtime": 1294.972,
5
  "train_samples": 93733,
6
- "train_samples_per_second": 72.382,
7
- "train_steps_per_second": 2.263
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.00015187089497744325,
4
+ "train_runtime": 1302.9313,
5
  "train_samples": 93733,
6
+ "train_samples_per_second": 71.94,
7
+ "train_steps_per_second": 2.249
8
  }
trainer_state.json CHANGED
@@ -87024,7 +87024,7 @@
87024
  "completions/min_terminated_length": 371.0,
87025
  "epoch": 0.9903558931467099,
87026
  "frac_reward_zero_std": 0.6875,
87027
- "grad_norm": 0.09052230139899971,
87028
  "kl": 0.07421875,
87029
  "learning_rate": 6.386247842353755e-09,
87030
  "loss": 0.0128,
@@ -87046,23 +87046,23 @@
87046
  "clip_ratio/low_min": 0.0,
87047
  "clip_ratio/region_mean": 0.0,
87048
  "completions/clipped_ratio": -7.0,
87049
- "completions/max_length": 1668.0,
87050
- "completions/max_terminated_length": 1668.0,
87051
- "completions/mean_length": 811.07421875,
87052
- "completions/mean_terminated_length": 811.07421875,
87053
- "completions/min_length": 251.0,
87054
- "completions/min_terminated_length": 251.0,
87055
  "epoch": 0.9906972774601007,
87056
  "frac_reward_zero_std": 0.6875,
87057
- "grad_norm": 0.08715568374725409,
87058
- "kl": 0.0780029296875,
87059
  "learning_rate": 5.967635461854304e-09,
87060
- "loss": 0.0142,
87061
- "num_tokens": 1391197756.0,
87062
- "reward": 2.07958984375,
87063
- "reward_std": 0.10148443281650543,
87064
- "rewards/accuracy_reward/mean": 0.087890625,
87065
- "rewards/accuracy_reward/std": 0.2834126651287079,
87066
  "rewards/format_reward/mean": 0.9921875,
87067
  "rewards/format_reward/std": 0.08812850713729858,
87068
  "rewards/tag_count_reward/mean": 0.99951171875,
@@ -87075,28 +87075,28 @@
87075
  "clip_ratio/low_mean": 0.0,
87076
  "clip_ratio/low_min": 0.0,
87077
  "clip_ratio/region_mean": 0.0,
87078
- "completions/clipped_ratio": -7.0,
87079
- "completions/max_length": 1928.0,
87080
- "completions/max_terminated_length": 1928.0,
87081
- "completions/mean_length": 818.82421875,
87082
- "completions/mean_terminated_length": 818.82421875,
87083
- "completions/min_length": 155.0,
87084
- "completions/min_terminated_length": 155.0,
87085
  "epoch": 0.9910386617734915,
87086
  "frac_reward_zero_std": 0.59375,
87087
- "grad_norm": 0.10523394407924126,
87088
- "kl": 0.077392578125,
87089
  "learning_rate": 5.563207782363078e-09,
87090
- "loss": 0.0098,
87091
- "num_tokens": 1391710914.0,
87092
- "reward": 2.0439453125,
87093
- "reward_std": 0.1362731009721756,
87094
- "rewards/accuracy_reward/mean": 0.056640625,
87095
- "rewards/accuracy_reward/std": 0.23138070106506348,
87096
- "rewards/format_reward/mean": 0.98828125,
87097
- "rewards/format_reward/std": 0.10772226005792618,
87098
- "rewards/tag_count_reward/mean": 0.9990234375,
87099
- "rewards/tag_count_reward/std": 0.015609703958034515,
87100
  "step": 2903
87101
  },
87102
  {
@@ -87106,27 +87106,27 @@
87106
  "clip_ratio/low_min": 0.0,
87107
  "clip_ratio/region_mean": 0.0,
87108
  "completions/clipped_ratio": -7.0,
87109
- "completions/max_length": 1336.0,
87110
- "completions/max_terminated_length": 1336.0,
87111
- "completions/mean_length": 733.837890625,
87112
- "completions/mean_terminated_length": 733.837890625,
87113
- "completions/min_length": 288.0,
87114
- "completions/min_terminated_length": 288.0,
87115
  "epoch": 0.9913800460868823,
87116
- "frac_reward_zero_std": 0.6875,
87117
- "grad_norm": 0.0962644733446894,
87118
- "kl": 0.080078125,
87119
  "learning_rate": 5.172965377890915e-09,
87120
- "loss": 0.0052,
87121
- "num_tokens": 1392171551.0,
87122
- "reward": 2.06201171875,
87123
- "reward_std": 0.09075203537940979,
87124
- "rewards/accuracy_reward/mean": 0.06640625,
87125
- "rewards/accuracy_reward/std": 0.2492343932390213,
87126
- "rewards/format_reward/mean": 0.99609375,
87127
- "rewards/format_reward/std": 0.06243881583213806,
87128
- "rewards/tag_count_reward/mean": 0.99951171875,
87129
- "rewards/tag_count_reward/std": 0.011048543266952038,
87130
  "step": 2904
87131
  },
87132
  {
@@ -87135,28 +87135,28 @@
87135
  "clip_ratio/low_mean": 0.0,
87136
  "clip_ratio/low_min": 0.0,
87137
  "clip_ratio/region_mean": 0.0,
87138
- "completions/clipped_ratio": -6.984375,
87139
- "completions/max_length": 2048.0,
87140
- "completions/max_terminated_length": 1900.0,
87141
- "completions/mean_length": 771.89453125,
87142
- "completions/mean_terminated_length": 769.3972778320312,
87143
- "completions/min_length": 262.0,
87144
- "completions/min_terminated_length": 262.0,
87145
  "epoch": 0.9917214304002732,
87146
- "frac_reward_zero_std": 0.53125,
87147
- "grad_norm": 0.12188297667505647,
87148
- "kl": 0.0731201171875,
87149
  "learning_rate": 4.79690880231587e-09,
87150
- "loss": 0.0159,
87151
- "num_tokens": 1392641337.0,
87152
- "reward": 2.05078125,
87153
- "reward_std": 0.16719159483909607,
87154
- "rewards/accuracy_reward/mean": 0.068359375,
87155
- "rewards/accuracy_reward/std": 0.25260838866233826,
87156
- "rewards/format_reward/mean": 0.984375,
87157
- "rewards/format_reward/std": 0.12414088100194931,
87158
- "rewards/tag_count_reward/mean": 0.998046875,
87159
- "rewards/tag_count_reward/std": 0.022032126784324646,
87160
  "step": 2905
87161
  },
87162
  {
@@ -87166,27 +87166,27 @@
87166
  "clip_ratio/low_min": 0.0,
87167
  "clip_ratio/region_mean": 0.0,
87168
  "completions/clipped_ratio": -7.0,
87169
- "completions/max_length": 1612.0,
87170
- "completions/max_terminated_length": 1612.0,
87171
- "completions/mean_length": 761.6171875,
87172
- "completions/mean_terminated_length": 761.6171875,
87173
- "completions/min_length": 224.0,
87174
- "completions/min_terminated_length": 224.0,
87175
  "epoch": 0.9920628147136639,
87176
- "frac_reward_zero_std": 0.375,
87177
- "grad_norm": 0.12393463207782816,
87178
- "kl": 0.07861328125,
87179
  "learning_rate": 4.435038589380991e-09,
87180
- "loss": 0.0037,
87181
- "num_tokens": 1393116629.0,
87182
- "reward": 2.1787109375,
87183
- "reward_std": 0.20795820653438568,
87184
- "rewards/accuracy_reward/mean": 0.1953125,
87185
- "rewards/accuracy_reward/std": 0.3968288004398346,
87186
- "rewards/format_reward/mean": 0.986328125,
87187
- "rewards/format_reward/std": 0.1162383034825325,
87188
- "rewards/tag_count_reward/mean": 0.9970703125,
87189
- "rewards/tag_count_reward/std": 0.038198307156562805,
87190
  "step": 2906
87191
  },
87192
  {
@@ -87196,27 +87196,27 @@
87196
  "clip_ratio/low_min": 0.0,
87197
  "clip_ratio/region_mean": 0.0,
87198
  "completions/clipped_ratio": -7.0,
87199
- "completions/max_length": 1460.0,
87200
- "completions/max_terminated_length": 1460.0,
87201
- "completions/mean_length": 704.7890625,
87202
- "completions/mean_terminated_length": 704.7890625,
87203
- "completions/min_length": 310.0,
87204
- "completions/min_terminated_length": 310.0,
87205
  "epoch": 0.9924041990270547,
87206
- "frac_reward_zero_std": 0.625,
87207
- "grad_norm": 0.10340246619120935,
87208
- "kl": 0.0853271484375,
87209
  "learning_rate": 4.087355252694325e-09,
87210
- "loss": 0.0031,
87211
- "num_tokens": 1393561353.0,
87212
- "reward": 2.15185546875,
87213
- "reward_std": 0.16270402073860168,
87214
- "rewards/accuracy_reward/mean": 0.162109375,
87215
- "rewards/accuracy_reward/std": 0.3689115643501282,
87216
- "rewards/format_reward/mean": 0.990234375,
87217
- "rewards/format_reward/std": 0.09843364357948303,
87218
- "rewards/tag_count_reward/mean": 0.99951171875,
87219
- "rewards/tag_count_reward/std": 0.011048543266952038,
87220
  "step": 2907
87221
  },
87222
  {
@@ -87225,28 +87225,28 @@
87225
  "clip_ratio/low_mean": 0.0,
87226
  "clip_ratio/low_min": 0.0,
87227
  "clip_ratio/region_mean": 0.0,
87228
- "completions/clipped_ratio": -6.984375,
87229
  "completions/max_length": 2048.0,
87230
- "completions/max_terminated_length": 1991.0,
87231
- "completions/mean_length": 815.96875,
87232
- "completions/mean_terminated_length": 813.5577392578125,
87233
- "completions/min_length": 178.0,
87234
- "completions/min_terminated_length": 178.0,
87235
  "epoch": 0.9927455833404455,
87236
- "frac_reward_zero_std": 0.5,
87237
- "grad_norm": 0.11634370392277571,
87238
- "kl": 0.0787353515625,
87239
  "learning_rate": 3.753859285730022e-09,
87240
- "loss": 0.0138,
87241
- "num_tokens": 1394058921.0,
87242
- "reward": 2.0869140625,
87243
- "reward_std": 0.18937204778194427,
87244
- "rewards/accuracy_reward/mean": 0.109375,
87245
- "rewards/accuracy_reward/std": 0.31241437792778015,
87246
- "rewards/format_reward/mean": 0.98046875,
87247
- "rewards/format_reward/std": 0.1385180652141571,
87248
- "rewards/tag_count_reward/mean": 0.9970703125,
87249
- "rewards/tag_count_reward/std": 0.026930565014481544,
87250
  "step": 2908
87251
  },
87252
  {
@@ -87255,28 +87255,28 @@
87255
  "clip_ratio/low_mean": 0.0,
87256
  "clip_ratio/low_min": 0.0,
87257
  "clip_ratio/region_mean": 0.0,
87258
- "completions/clipped_ratio": -6.984375,
87259
- "completions/max_length": 2048.0,
87260
- "completions/max_terminated_length": 1637.0,
87261
- "completions/mean_length": 776.6796875,
87262
- "completions/mean_terminated_length": 774.1917724609375,
87263
- "completions/min_length": 217.0,
87264
- "completions/min_terminated_length": 217.0,
87265
  "epoch": 0.9930869676538363,
87266
- "frac_reward_zero_std": 0.4375,
87267
- "grad_norm": 0.12612242780687172,
87268
- "kl": 0.0765380859375,
87269
  "learning_rate": 3.4345511618238957e-09,
87270
- "loss": 0.018,
87271
- "num_tokens": 1394542725.0,
87272
- "reward": 2.119140625,
87273
- "reward_std": 0.18908953666687012,
87274
- "rewards/accuracy_reward/mean": 0.12890625,
87275
- "rewards/accuracy_reward/std": 0.33542385697364807,
87276
- "rewards/format_reward/mean": 0.9921875,
87277
- "rewards/format_reward/std": 0.08812850713729858,
87278
- "rewards/tag_count_reward/mean": 0.998046875,
87279
- "rewards/tag_count_reward/std": 0.03491804376244545,
87280
  "step": 2909
87281
  },
87282
  {
@@ -87285,28 +87285,28 @@
87285
  "clip_ratio/low_mean": 0.0,
87286
  "clip_ratio/low_min": 0.0,
87287
  "clip_ratio/region_mean": 0.0,
87288
- "completions/clipped_ratio": -7.0,
87289
- "completions/max_length": 1677.0,
87290
- "completions/max_terminated_length": 1677.0,
87291
- "completions/mean_length": 776.33203125,
87292
- "completions/mean_terminated_length": 776.33203125,
87293
- "completions/min_length": 345.0,
87294
- "completions/min_terminated_length": 345.0,
87295
  "epoch": 0.9934283519672271,
87296
- "frac_reward_zero_std": 0.46875,
87297
- "grad_norm": 0.11521416617457238,
87298
- "kl": 0.0809326171875,
87299
  "learning_rate": 3.129431334175648e-09,
87300
- "loss": 0.0171,
87301
- "num_tokens": 1395031791.0,
87302
- "reward": 2.1220703125,
87303
- "reward_std": 0.20290058851242065,
87304
- "rewards/accuracy_reward/mean": 0.138671875,
87305
- "rewards/accuracy_reward/std": 0.34594178199768066,
87306
- "rewards/format_reward/mean": 0.986328125,
87307
- "rewards/format_reward/std": 0.1162383034825325,
87308
- "rewards/tag_count_reward/mean": 0.9970703125,
87309
- "rewards/tag_count_reward/std": 0.038198307156562805,
87310
  "step": 2910
87311
  },
87312
  {
@@ -87315,26 +87315,26 @@
87315
  "clip_ratio/low_mean": 0.0,
87316
  "clip_ratio/low_min": 0.0,
87317
  "clip_ratio/region_mean": 0.0,
87318
- "completions/clipped_ratio": -6.984375,
87319
- "completions/max_length": 2048.0,
87320
- "completions/max_terminated_length": 1906.0,
87321
- "completions/mean_length": 795.970703125,
87322
- "completions/mean_terminated_length": 793.5205688476562,
87323
- "completions/min_length": 215.0,
87324
- "completions/min_terminated_length": 215.0,
87325
  "epoch": 0.9937697362806179,
87326
- "frac_reward_zero_std": 0.46875,
87327
- "grad_norm": 0.1145051128307554,
87328
- "kl": 0.0772705078125,
87329
  "learning_rate": 2.8385002358466417e-09,
87330
- "loss": 0.0184,
87331
- "num_tokens": 1395529104.0,
87332
- "reward": 2.123046875,
87333
- "reward_std": 0.18341054022312164,
87334
- "rewards/accuracy_reward/mean": 0.1391129046678543,
87335
- "rewards/accuracy_reward/std": 0.3464137017726898,
87336
- "rewards/format_reward/mean": 0.990234375,
87337
- "rewards/format_reward/std": 0.09843364357948303,
87338
  "rewards/tag_count_reward/mean": 0.998046875,
87339
  "rewards/tag_count_reward/std": 0.022032126784324646,
87340
  "step": 2911
@@ -87346,27 +87346,27 @@
87346
  "clip_ratio/low_min": 0.0,
87347
  "clip_ratio/region_mean": 0.0,
87348
  "completions/clipped_ratio": -7.0,
87349
- "completions/max_length": 1722.0,
87350
- "completions/max_terminated_length": 1722.0,
87351
- "completions/mean_length": 716.91796875,
87352
- "completions/mean_terminated_length": 716.91796875,
87353
- "completions/min_length": 199.0,
87354
- "completions/min_terminated_length": 199.0,
87355
  "epoch": 0.9941111205940087,
87356
- "frac_reward_zero_std": 0.40625,
87357
- "grad_norm": 0.32500195710324153,
87358
- "kl": 0.1021728515625,
87359
  "learning_rate": 2.5617582797610174e-09,
87360
- "loss": 0.0096,
87361
- "num_tokens": 1395976998.0,
87362
- "reward": 2.08984375,
87363
- "reward_std": 0.19573727250099182,
87364
- "rewards/accuracy_reward/mean": 0.109375,
87365
- "rewards/accuracy_reward/std": 0.31241437792778015,
87366
- "rewards/format_reward/mean": 0.982421875,
87367
- "rewards/format_reward/std": 0.13154059648513794,
87368
- "rewards/tag_count_reward/mean": 0.998046875,
87369
- "rewards/tag_count_reward/std": 0.022032126784324646,
87370
  "step": 2912
87371
  },
87372
  {
@@ -87376,27 +87376,27 @@
87376
  "clip_ratio/low_min": 0.0,
87377
  "clip_ratio/region_mean": 0.0,
87378
  "completions/clipped_ratio": -7.0,
87379
- "completions/max_length": 1732.0,
87380
- "completions/max_terminated_length": 1732.0,
87381
- "completions/mean_length": 803.76953125,
87382
- "completions/mean_terminated_length": 803.76953125,
87383
- "completions/min_length": 232.0,
87384
- "completions/min_terminated_length": 232.0,
87385
  "epoch": 0.9944525049073996,
87386
- "frac_reward_zero_std": 0.65625,
87387
- "grad_norm": 0.09947710674034467,
87388
- "kl": 0.07958984375,
87389
  "learning_rate": 2.299205858702358e-09,
87390
- "loss": 0.0011,
87391
- "num_tokens": 1396464736.0,
87392
- "reward": 2.08056640625,
87393
- "reward_std": 0.13412871956825256,
87394
- "rewards/accuracy_reward/mean": 0.0947580635547638,
87395
- "rewards/accuracy_reward/std": 0.29317617416381836,
87396
- "rewards/format_reward/mean": 0.990234375,
87397
- "rewards/format_reward/std": 0.09843364357948303,
87398
- "rewards/tag_count_reward/mean": 0.99853515625,
87399
- "rewards/tag_count_reward/std": 0.019099153578281403,
87400
  "step": 2913
87401
  },
87402
  {
@@ -87405,28 +87405,28 @@
87405
  "clip_ratio/low_mean": 0.0,
87406
  "clip_ratio/low_min": 0.0,
87407
  "clip_ratio/region_mean": 0.0,
87408
- "completions/clipped_ratio": -7.0,
87409
- "completions/max_length": 2024.0,
87410
- "completions/max_terminated_length": 2024.0,
87411
- "completions/mean_length": 813.1953125,
87412
- "completions/mean_terminated_length": 813.1953125,
87413
- "completions/min_length": 236.0,
87414
- "completions/min_terminated_length": 236.0,
87415
  "epoch": 0.9947938892207903,
87416
- "frac_reward_zero_std": 0.59375,
87417
- "grad_norm": 0.08836543570664221,
87418
- "kl": 0.0740966796875,
87419
  "learning_rate": 2.0508433453170218e-09,
87420
- "loss": 0.0143,
87421
- "num_tokens": 1396964116.0,
87422
- "reward": 2.1201171875,
87423
- "reward_std": 0.15555161237716675,
87424
- "rewards/accuracy_reward/mean": 0.12890625,
87425
- "rewards/accuracy_reward/std": 0.33542385697364807,
87426
- "rewards/format_reward/mean": 0.9921875,
87427
- "rewards/format_reward/std": 0.08812850713729858,
87428
- "rewards/tag_count_reward/mean": 0.9990234375,
87429
- "rewards/tag_count_reward/std": 0.015609703958034515,
87430
  "step": 2914
87431
  },
87432
  {
@@ -87436,25 +87436,25 @@
87436
  "clip_ratio/low_min": 0.0,
87437
  "clip_ratio/region_mean": 0.0,
87438
  "completions/clipped_ratio": -7.0,
87439
- "completions/max_length": 1848.0,
87440
- "completions/max_terminated_length": 1848.0,
87441
- "completions/mean_length": 815.22265625,
87442
- "completions/mean_terminated_length": 815.22265625,
87443
- "completions/min_length": 123.0,
87444
- "completions/min_terminated_length": 123.0,
87445
  "epoch": 0.9951352735341811,
87446
- "frac_reward_zero_std": 0.71875,
87447
- "grad_norm": 0.08489306927332194,
87448
- "kl": 0.0733642578125,
87449
  "learning_rate": 1.8166710921097008e-09,
87450
- "loss": 0.0036,
87451
- "num_tokens": 1397459142.0,
87452
- "reward": 2.07861328125,
87453
- "reward_std": 0.1113491952419281,
87454
- "rewards/accuracy_reward/mean": 0.08984375,
87455
- "rewards/accuracy_reward/std": 0.2862374484539032,
87456
- "rewards/format_reward/mean": 0.990234375,
87457
- "rewards/format_reward/std": 0.09843364357948303,
87458
  "rewards/tag_count_reward/mean": 0.99853515625,
87459
  "rewards/tag_count_reward/std": 0.019099153578281403,
87460
  "step": 2915
@@ -87467,26 +87467,26 @@
87467
  "clip_ratio/region_mean": 0.0,
87468
  "completions/clipped_ratio": -6.984375,
87469
  "completions/max_length": 2048.0,
87470
- "completions/max_terminated_length": 1898.0,
87471
- "completions/mean_length": 832.837890625,
87472
- "completions/mean_terminated_length": 830.4598999023438,
87473
- "completions/min_length": 329.0,
87474
- "completions/min_terminated_length": 329.0,
87475
  "epoch": 0.9954766578475719,
87476
  "frac_reward_zero_std": 0.59375,
87477
- "grad_norm": 0.09716514163001252,
87478
- "kl": 0.0833740234375,
87479
  "learning_rate": 1.5966894314456416e-09,
87480
- "loss": 0.0136,
87481
- "num_tokens": 1397976707.0,
87482
- "reward": 2.07763671875,
87483
- "reward_std": 0.15054047107696533,
87484
- "rewards/accuracy_reward/mean": 0.091796875,
87485
- "rewards/accuracy_reward/std": 0.289021372795105,
87486
- "rewards/format_reward/mean": 0.990234375,
87487
- "rewards/format_reward/std": 0.09843364357948303,
87488
- "rewards/tag_count_reward/mean": 0.99560546875,
87489
- "rewards/tag_count_reward/std": 0.04260620102286339,
87490
  "step": 2916
87491
  },
87492
  {
@@ -87496,25 +87496,25 @@
87496
  "clip_ratio/low_min": 0.0,
87497
  "clip_ratio/region_mean": 0.0,
87498
  "completions/clipped_ratio": -7.0,
87499
- "completions/max_length": 1680.0,
87500
- "completions/max_terminated_length": 1680.0,
87501
- "completions/mean_length": 727.982421875,
87502
- "completions/mean_terminated_length": 727.982421875,
87503
- "completions/min_length": 173.0,
87504
- "completions/min_terminated_length": 173.0,
87505
  "epoch": 0.9958180421609627,
87506
- "frac_reward_zero_std": 0.65625,
87507
- "grad_norm": 0.09975321839818838,
87508
- "kl": 0.087890625,
87509
  "learning_rate": 1.3908986755473142e-09,
87510
- "loss": 0.0116,
87511
- "num_tokens": 1398431482.0,
87512
- "reward": 2.0810546875,
87513
- "reward_std": 0.1253216564655304,
87514
- "rewards/accuracy_reward/mean": 0.09375,
87515
- "rewards/accuracy_reward/std": 0.29176566004753113,
87516
- "rewards/format_reward/mean": 0.98828125,
87517
- "rewards/format_reward/std": 0.10772226005792618,
87518
  "rewards/tag_count_reward/mean": 0.9990234375,
87519
  "rewards/tag_count_reward/std": 0.015609703958034515,
87520
  "step": 2917
@@ -87526,27 +87526,27 @@
87526
  "clip_ratio/low_min": 0.0,
87527
  "clip_ratio/region_mean": 0.0,
87528
  "completions/clipped_ratio": -7.0,
87529
- "completions/max_length": 1584.0,
87530
- "completions/max_terminated_length": 1584.0,
87531
- "completions/mean_length": 781.779296875,
87532
- "completions/mean_terminated_length": 781.779296875,
87533
- "completions/min_length": 312.0,
87534
- "completions/min_terminated_length": 312.0,
87535
  "epoch": 0.9961594264743535,
87536
- "frac_reward_zero_std": 0.71875,
87537
- "grad_norm": 0.08017537622311298,
87538
- "kl": 0.0828857421875,
87539
  "learning_rate": 1.199299116497743e-09,
87540
- "loss": 0.0152,
87541
- "num_tokens": 1398908953.0,
87542
- "reward": 2.05615234375,
87543
- "reward_std": 0.09608898311853409,
87544
- "rewards/accuracy_reward/mean": 0.0625,
87545
- "rewards/accuracy_reward/std": 0.2422981858253479,
87546
- "rewards/format_reward/mean": 0.994140625,
87547
- "rewards/format_reward/std": 0.07639661431312561,
87548
- "rewards/tag_count_reward/mean": 0.99951171875,
87549
- "rewards/tag_count_reward/std": 0.011048543266952038,
87550
  "step": 2918
87551
  },
87552
  {
@@ -87556,27 +87556,27 @@
87556
  "clip_ratio/low_min": 0.0,
87557
  "clip_ratio/region_mean": 0.0,
87558
  "completions/clipped_ratio": -7.0,
87559
- "completions/max_length": 1876.0,
87560
- "completions/max_terminated_length": 1876.0,
87561
- "completions/mean_length": 794.0,
87562
- "completions/mean_terminated_length": 794.0,
87563
- "completions/min_length": 268.0,
87564
- "completions/min_terminated_length": 268.0,
87565
  "epoch": 0.9965008107877443,
87566
- "frac_reward_zero_std": 0.40625,
87567
- "grad_norm": 0.11566190296325213,
87568
- "kl": 0.0745849609375,
87569
  "learning_rate": 1.0218910262371762e-09,
87570
- "loss": 0.0017,
87571
- "num_tokens": 1399397769.0,
87572
- "reward": 2.1748046875,
87573
- "reward_std": 0.20263367891311646,
87574
- "rewards/accuracy_reward/mean": 0.181640625,
87575
- "rewards/accuracy_reward/std": 0.38592514395713806,
87576
- "rewards/format_reward/mean": 0.994140625,
87577
- "rewards/format_reward/std": 0.07639661431312561,
87578
- "rewards/tag_count_reward/mean": 0.9990234375,
87579
- "rewards/tag_count_reward/std": 0.015609703958034515,
87580
  "step": 2919
87581
  },
87582
  {
@@ -87586,25 +87586,25 @@
87586
  "clip_ratio/low_min": 0.0,
87587
  "clip_ratio/region_mean": 0.0,
87588
  "completions/clipped_ratio": -7.0,
87589
- "completions/max_length": 1765.0,
87590
- "completions/max_terminated_length": 1765.0,
87591
- "completions/mean_length": 826.298828125,
87592
- "completions/mean_terminated_length": 826.298828125,
87593
- "completions/min_length": 250.0,
87594
- "completions/min_terminated_length": 250.0,
87595
  "epoch": 0.9968421951011351,
87596
  "frac_reward_zero_std": 0.53125,
87597
- "grad_norm": 0.09891209808935245,
87598
- "kl": 0.0767822265625,
87599
  "learning_rate": 8.586746565641957e-10,
87600
- "loss": 0.0099,
87601
- "num_tokens": 1399917122.0,
87602
- "reward": 2.13134765625,
87603
- "reward_std": 0.18901605904102325,
87604
- "rewards/accuracy_reward/mean": 0.138671875,
87605
- "rewards/accuracy_reward/std": 0.34594178199768066,
87606
- "rewards/format_reward/mean": 0.994140625,
87607
- "rewards/format_reward/std": 0.07639661431312561,
87608
  "rewards/tag_count_reward/mean": 0.99853515625,
87609
  "rewards/tag_count_reward/std": 0.019099153578281403,
87610
  "step": 2920
@@ -87616,27 +87616,27 @@
87616
  "clip_ratio/low_min": 0.0,
87617
  "clip_ratio/region_mean": 0.0,
87618
  "completions/clipped_ratio": -7.0,
87619
- "completions/max_length": 1708.0,
87620
- "completions/max_terminated_length": 1708.0,
87621
- "completions/mean_length": 753.177734375,
87622
- "completions/mean_terminated_length": 753.177734375,
87623
- "completions/min_length": 260.0,
87624
- "completions/min_terminated_length": 260.0,
87625
  "epoch": 0.997183579414526,
87626
  "frac_reward_zero_std": 0.59375,
87627
- "grad_norm": 0.10619904424152342,
87628
- "kl": 0.078857421875,
87629
  "learning_rate": 7.096502391346072e-10,
87630
- "loss": 0.0124,
87631
- "num_tokens": 1400383885.0,
87632
- "reward": 2.08154296875,
87633
- "reward_std": 0.16373801231384277,
87634
- "rewards/accuracy_reward/mean": 0.08984375,
87635
- "rewards/accuracy_reward/std": 0.2862374484539032,
87636
  "rewards/format_reward/mean": 0.9921875,
87637
  "rewards/format_reward/std": 0.08812850713729858,
87638
- "rewards/tag_count_reward/mean": 0.99951171875,
87639
- "rewards/tag_count_reward/std": 0.011048543266952038,
87640
  "step": 2921
87641
  },
87642
  {
@@ -87646,27 +87646,27 @@
87646
  "clip_ratio/low_min": 0.0,
87647
  "clip_ratio/region_mean": 0.0,
87648
  "completions/clipped_ratio": -7.0,
87649
- "completions/max_length": 1729.0,
87650
- "completions/max_terminated_length": 1729.0,
87651
- "completions/mean_length": 793.041015625,
87652
- "completions/mean_terminated_length": 793.041015625,
87653
- "completions/min_length": 272.0,
87654
- "completions/min_terminated_length": 272.0,
87655
  "epoch": 0.9975249637279167,
87656
- "frac_reward_zero_std": 0.46875,
87657
- "grad_norm": 0.10599137915386056,
87658
- "kl": 0.0753173828125,
87659
  "learning_rate": 5.748179854614399e-10,
87660
- "loss": 0.0149,
87661
- "num_tokens": 1400873378.0,
87662
- "reward": 2.0498046875,
87663
- "reward_std": 0.1855737864971161,
87664
- "rewards/accuracy_reward/mean": 0.072265625,
87665
- "rewards/accuracy_reward/std": 0.2591804563999176,
87666
- "rewards/format_reward/mean": 0.98046875,
87667
- "rewards/format_reward/std": 0.1385180652141571,
87668
  "rewards/tag_count_reward/mean": 0.9970703125,
87669
- "rewards/tag_count_reward/std": 0.026930565014481544,
87670
  "step": 2922
87671
  },
87672
  {
@@ -87676,27 +87676,27 @@
87676
  "clip_ratio/low_min": 0.0,
87677
  "clip_ratio/region_mean": 0.0,
87678
  "completions/clipped_ratio": -7.0,
87679
- "completions/max_length": 1602.0,
87680
- "completions/max_terminated_length": 1602.0,
87681
- "completions/mean_length": 735.794921875,
87682
- "completions/mean_terminated_length": 735.794921875,
87683
- "completions/min_length": 195.0,
87684
- "completions/min_terminated_length": 195.0,
87685
  "epoch": 0.9978663480413075,
87686
- "frac_reward_zero_std": 0.6875,
87687
- "grad_norm": 0.09767874624378871,
87688
- "kl": 0.079833984375,
87689
  "learning_rate": 4.541780869138368e-10,
87690
- "loss": 0.0138,
87691
- "num_tokens": 1401325961.0,
87692
- "reward": 2.0703125,
87693
- "reward_std": 0.11659518629312515,
87694
- "rewards/accuracy_reward/mean": 0.080078125,
87695
- "rewards/accuracy_reward/std": 0.271679550409317,
87696
  "rewards/format_reward/mean": 0.990234375,
87697
  "rewards/format_reward/std": 0.09843364357948303,
87698
- "rewards/tag_count_reward/mean": 1.0,
87699
- "rewards/tag_count_reward/std": 0.0,
87700
  "step": 2923
87701
  },
87702
  {
@@ -87706,27 +87706,27 @@
87706
  "clip_ratio/low_min": 0.0,
87707
  "clip_ratio/region_mean": 0.0,
87708
  "completions/clipped_ratio": -7.0,
87709
- "completions/max_length": 1751.0,
87710
- "completions/max_terminated_length": 1751.0,
87711
- "completions/mean_length": 797.919921875,
87712
- "completions/mean_terminated_length": 797.919921875,
87713
- "completions/min_length": 211.0,
87714
- "completions/min_terminated_length": 211.0,
87715
  "epoch": 0.9982077323546983,
87716
- "frac_reward_zero_std": 0.53125,
87717
- "grad_norm": 0.11418503362700216,
87718
- "kl": 0.080322265625,
87719
  "learning_rate": 3.477307147192743e-10,
87720
- "loss": 0.0051,
87721
- "num_tokens": 1401810896.0,
87722
- "reward": 2.154296875,
87723
- "reward_std": 0.19188588857650757,
87724
- "rewards/accuracy_reward/mean": 0.16796875,
87725
- "rewards/accuracy_reward/std": 0.374204158782959,
87726
  "rewards/format_reward/mean": 0.98828125,
87727
  "rewards/format_reward/std": 0.10772226005792618,
87728
  "rewards/tag_count_reward/mean": 0.998046875,
87729
- "rewards/tag_count_reward/std": 0.02701912261545658,
87730
  "step": 2924
87731
  },
87732
  {
@@ -87736,27 +87736,27 @@
87736
  "clip_ratio/low_min": 0.0,
87737
  "clip_ratio/region_mean": 0.0,
87738
  "completions/clipped_ratio": -7.0,
87739
- "completions/max_length": 1490.0,
87740
- "completions/max_terminated_length": 1490.0,
87741
- "completions/mean_length": 795.6015625,
87742
- "completions/mean_terminated_length": 795.6015625,
87743
- "completions/min_length": 393.0,
87744
- "completions/min_terminated_length": 393.0,
87745
  "epoch": 0.9985491166680891,
87746
- "frac_reward_zero_std": 0.40625,
87747
- "grad_norm": 0.12543432676122176,
87748
- "kl": 0.0797119140625,
87749
  "learning_rate": 2.5547601995912216e-10,
87750
- "loss": 0.0177,
87751
- "num_tokens": 1402303908.0,
87752
- "reward": 2.14697265625,
87753
- "reward_std": 0.23765350878238678,
87754
- "rewards/accuracy_reward/mean": 0.166015625,
87755
- "rewards/accuracy_reward/std": 0.3724585771560669,
87756
- "rewards/format_reward/mean": 0.984375,
87757
- "rewards/format_reward/std": 0.12414088100194931,
87758
- "rewards/tag_count_reward/mean": 0.99658203125,
87759
- "rewards/tag_count_reward/std": 0.03972800448536873,
87760
  "step": 2925
87761
  },
87762
  {
@@ -87765,28 +87765,28 @@
87765
  "clip_ratio/low_mean": 0.0,
87766
  "clip_ratio/low_min": 0.0,
87767
  "clip_ratio/region_mean": 0.0,
87768
- "completions/clipped_ratio": -6.984375,
87769
- "completions/max_length": 1363.0,
87770
- "completions/max_terminated_length": 1363.0,
87771
- "completions/mean_length": 705.349609375,
87772
- "completions/mean_terminated_length": 704.0626220703125,
87773
- "completions/min_length": 255.0,
87774
- "completions/min_terminated_length": 255.0,
87775
  "epoch": 0.9988905009814799,
87776
  "frac_reward_zero_std": 0.5,
87777
- "grad_norm": 0.22959447791895354,
87778
- "kl": 0.2906494140625,
87779
  "learning_rate": 1.7741413357197367e-10,
87780
- "loss": 0.0225,
87781
- "num_tokens": 1402745271.0,
87782
- "reward": 2.0859375,
87783
- "reward_std": 0.16681620478630066,
87784
- "rewards/accuracy_reward/mean": 0.103515625,
87785
- "rewards/accuracy_reward/std": 0.30492907762527466,
87786
- "rewards/format_reward/mean": 0.986328125,
87787
- "rewards/format_reward/std": 0.1162383034825325,
87788
- "rewards/tag_count_reward/mean": 0.99609375,
87789
- "rewards/tag_count_reward/std": 0.04930410906672478,
87790
  "step": 2926
87791
  },
87792
  {
@@ -87796,27 +87796,27 @@
87796
  "clip_ratio/low_min": 0.0,
87797
  "clip_ratio/region_mean": 0.0,
87798
  "completions/clipped_ratio": -7.0,
87799
- "completions/max_length": 1917.0,
87800
- "completions/max_terminated_length": 1917.0,
87801
- "completions/mean_length": 838.294921875,
87802
- "completions/mean_terminated_length": 838.294921875,
87803
- "completions/min_length": 339.0,
87804
- "completions/min_terminated_length": 339.0,
87805
  "epoch": 0.9992318852948707,
87806
- "frac_reward_zero_std": 0.6875,
87807
- "grad_norm": 0.09099736855996839,
87808
  "kl": 0.0772705078125,
87809
  "learning_rate": 1.1354516635364577e-10,
87810
- "loss": 0.0035,
87811
- "num_tokens": 1403254030.0,
87812
- "reward": 2.048828125,
87813
- "reward_std": 0.09261970221996307,
87814
- "rewards/accuracy_reward/mean": 0.05859375,
87815
- "rewards/accuracy_reward/std": 0.23509246110916138,
87816
- "rewards/format_reward/mean": 0.9921875,
87817
- "rewards/format_reward/std": 0.08812850713729858,
87818
- "rewards/tag_count_reward/mean": 0.998046875,
87819
- "rewards/tag_count_reward/std": 0.022032126784324646,
87820
  "step": 2927
87821
  },
87822
  {
@@ -87825,28 +87825,28 @@
87825
  "clip_ratio/low_mean": 0.0,
87826
  "clip_ratio/low_min": 0.0,
87827
  "clip_ratio/region_mean": 0.0,
87828
- "completions/clipped_ratio": -6.984375,
87829
  "completions/max_length": 2048.0,
87830
- "completions/max_terminated_length": 1740.0,
87831
- "completions/mean_length": 769.6796875,
87832
- "completions/mean_terminated_length": 767.1781005859375,
87833
- "completions/min_length": 271.0,
87834
- "completions/min_terminated_length": 271.0,
87835
  "epoch": 0.9995732696082615,
87836
- "frac_reward_zero_std": 0.40625,
87837
- "grad_norm": 0.12120526316644997,
87838
- "kl": 0.077880859375,
87839
  "learning_rate": 6.386920895384841e-11,
87840
- "loss": 0.032,
87841
- "num_tokens": 1403723962.0,
87842
- "reward": 2.09521484375,
87843
- "reward_std": 0.20090395212173462,
87844
- "rewards/accuracy_reward/mean": 0.12298387289047241,
87845
- "rewards/accuracy_reward/std": 0.32875028252601624,
87846
  "rewards/format_reward/mean": 0.98046875,
87847
  "rewards/format_reward/std": 0.1385180652141571,
87848
- "rewards/tag_count_reward/mean": 0.99560546875,
87849
- "rewards/tag_count_reward/std": 0.050489041954278946,
87850
  "step": 2928
87851
  },
87852
  {
@@ -87856,42 +87856,42 @@
87856
  "clip_ratio/low_min": 0.0,
87857
  "clip_ratio/region_mean": 0.0,
87858
  "completions/clipped_ratio": -7.0,
87859
- "completions/max_length": 1504.0,
87860
- "completions/max_terminated_length": 1504.0,
87861
- "completions/mean_length": 782.98828125,
87862
- "completions/mean_terminated_length": 782.98828125,
87863
- "completions/min_length": 234.0,
87864
- "completions/min_terminated_length": 234.0,
87865
  "epoch": 0.9999146539216524,
87866
- "frac_reward_zero_std": 0.5625,
87867
- "grad_norm": 0.10721380774650786,
87868
- "kl": 0.0792236328125,
87869
  "learning_rate": 2.838633187729478e-11,
87870
- "loss": 0.0172,
87871
- "num_tokens": 1404212612.0,
87872
- "reward": 2.09619140625,
87873
- "reward_std": 0.18561691045761108,
87874
- "rewards/accuracy_reward/mean": 0.12109375,
87875
- "rewards/accuracy_reward/std": 0.3265552520751953,
87876
- "rewards/format_reward/mean": 0.978515625,
87877
- "rewards/format_reward/std": 0.14513419568538666,
87878
- "rewards/tag_count_reward/mean": 0.99658203125,
87879
- "rewards/tag_count_reward/std": 0.029059575870633125,
87880
  "step": 2929
87881
  },
87882
  {
87883
  "epoch": 0.9999146539216524,
87884
  "step": 2929,
87885
  "total_flos": 0.0,
87886
- "train_loss": 0.00012006735727737873,
87887
- "train_runtime": 1294.972,
87888
- "train_samples_per_second": 72.382,
87889
- "train_steps_per_second": 2.263
87890
  }
87891
  ],
87892
  "logging_steps": 1,
87893
  "max_steps": 2930,
87894
- "num_input_tokens_seen": 1404212612,
87895
  "num_train_epochs": 1,
87896
  "save_steps": 100,
87897
  "stateful_callbacks": {
 
87024
  "completions/min_terminated_length": 371.0,
87025
  "epoch": 0.9903558931467099,
87026
  "frac_reward_zero_std": 0.6875,
87027
+ "grad_norm": 0.09051816360814072,
87028
  "kl": 0.07421875,
87029
  "learning_rate": 6.386247842353755e-09,
87030
  "loss": 0.0128,
 
87046
  "clip_ratio/low_min": 0.0,
87047
  "clip_ratio/region_mean": 0.0,
87048
  "completions/clipped_ratio": -7.0,
87049
+ "completions/max_length": 1555.0,
87050
+ "completions/max_terminated_length": 1555.0,
87051
+ "completions/mean_length": 802.6328125,
87052
+ "completions/mean_terminated_length": 802.6328125,
87053
+ "completions/min_length": 182.0,
87054
+ "completions/min_terminated_length": 182.0,
87055
  "epoch": 0.9906972774601007,
87056
  "frac_reward_zero_std": 0.6875,
87057
+ "grad_norm": 0.12488894096087517,
87058
+ "kl": 0.0802001953125,
87059
  "learning_rate": 5.967635461854304e-09,
87060
+ "loss": 0.0146,
87061
+ "num_tokens": 1391193434.0,
87062
+ "reward": 2.07763671875,
87063
+ "reward_std": 0.09719854593276978,
87064
+ "rewards/accuracy_reward/mean": 0.0859375,
87065
+ "rewards/accuracy_reward/std": 0.28054583072662354,
87066
  "rewards/format_reward/mean": 0.9921875,
87067
  "rewards/format_reward/std": 0.08812850713729858,
87068
  "rewards/tag_count_reward/mean": 0.99951171875,
 
87075
  "clip_ratio/low_mean": 0.0,
87076
  "clip_ratio/low_min": 0.0,
87077
  "clip_ratio/region_mean": 0.0,
87078
+ "completions/clipped_ratio": -6.984375,
87079
+ "completions/max_length": 2048.0,
87080
+ "completions/max_terminated_length": 2002.0,
87081
+ "completions/mean_length": 817.966796875,
87082
+ "completions/mean_terminated_length": 815.5596923828125,
87083
+ "completions/min_length": 143.0,
87084
+ "completions/min_terminated_length": 143.0,
87085
  "epoch": 0.9910386617734915,
87086
  "frac_reward_zero_std": 0.59375,
87087
+ "grad_norm": 0.10402330505958786,
87088
+ "kl": 0.0775146484375,
87089
  "learning_rate": 5.563207782363078e-09,
87090
+ "loss": 0.0097,
87091
+ "num_tokens": 1391706153.0,
87092
+ "reward": 2.048828125,
87093
+ "reward_std": 0.13361230492591858,
87094
+ "rewards/accuracy_reward/mean": 0.0625,
87095
+ "rewards/accuracy_reward/std": 0.2422981858253479,
87096
+ "rewards/format_reward/mean": 0.990234375,
87097
+ "rewards/format_reward/std": 0.09843364357948303,
87098
+ "rewards/tag_count_reward/mean": 0.99609375,
87099
+ "rewards/tag_count_reward/std": 0.04119514673948288,
87100
  "step": 2903
87101
  },
87102
  {
 
87106
  "clip_ratio/low_min": 0.0,
87107
  "clip_ratio/region_mean": 0.0,
87108
  "completions/clipped_ratio": -7.0,
87109
+ "completions/max_length": 1835.0,
87110
+ "completions/max_terminated_length": 1835.0,
87111
+ "completions/mean_length": 742.251953125,
87112
+ "completions/mean_terminated_length": 742.251953125,
87113
+ "completions/min_length": 220.0,
87114
+ "completions/min_terminated_length": 220.0,
87115
  "epoch": 0.9913800460868823,
87116
+ "frac_reward_zero_std": 0.53125,
87117
+ "grad_norm": 0.12250308762985097,
87118
+ "kl": 0.08251953125,
87119
  "learning_rate": 5.172965377890915e-09,
87120
+ "loss": 0.0111,
87121
+ "num_tokens": 1392171098.0,
87122
+ "reward": 2.06396484375,
87123
+ "reward_std": 0.16945436596870422,
87124
+ "rewards/accuracy_reward/mean": 0.0859375,
87125
+ "rewards/accuracy_reward/std": 0.28054583072662354,
87126
+ "rewards/format_reward/mean": 0.98046875,
87127
+ "rewards/format_reward/std": 0.1385180652141571,
87128
+ "rewards/tag_count_reward/mean": 0.99755859375,
87129
+ "rewards/tag_count_reward/std": 0.03659820929169655,
87130
  "step": 2904
87131
  },
87132
  {
 
87135
  "clip_ratio/low_mean": 0.0,
87136
  "clip_ratio/low_min": 0.0,
87137
  "clip_ratio/region_mean": 0.0,
87138
+ "completions/clipped_ratio": -7.0,
87139
+ "completions/max_length": 1859.0,
87140
+ "completions/max_terminated_length": 1859.0,
87141
+ "completions/mean_length": 780.9609375,
87142
+ "completions/mean_terminated_length": 780.9609375,
87143
+ "completions/min_length": 309.0,
87144
+ "completions/min_terminated_length": 309.0,
87145
  "epoch": 0.9917214304002732,
87146
+ "frac_reward_zero_std": 0.5625,
87147
+ "grad_norm": 0.09705909340956163,
87148
+ "kl": 0.072509765625,
87149
  "learning_rate": 4.79690880231587e-09,
87150
+ "loss": 0.0189,
87151
+ "num_tokens": 1392645526.0,
87152
+ "reward": 2.05029296875,
87153
+ "reward_std": 0.15216603875160217,
87154
+ "rewards/accuracy_reward/mean": 0.064453125,
87155
+ "rewards/accuracy_reward/std": 0.24579854309558868,
87156
+ "rewards/format_reward/mean": 0.986328125,
87157
+ "rewards/format_reward/std": 0.1162383034825325,
87158
+ "rewards/tag_count_reward/mean": 0.99951171875,
87159
+ "rewards/tag_count_reward/std": 0.011048543266952038,
87160
  "step": 2905
87161
  },
87162
  {
 
87166
  "clip_ratio/low_min": 0.0,
87167
  "clip_ratio/region_mean": 0.0,
87168
  "completions/clipped_ratio": -7.0,
87169
+ "completions/max_length": 1650.0,
87170
+ "completions/max_terminated_length": 1650.0,
87171
+ "completions/mean_length": 745.669921875,
87172
+ "completions/mean_terminated_length": 745.669921875,
87173
+ "completions/min_length": 221.0,
87174
+ "completions/min_terminated_length": 221.0,
87175
  "epoch": 0.9920628147136639,
87176
+ "frac_reward_zero_std": 0.53125,
87177
+ "grad_norm": 0.12680546398444922,
87178
+ "kl": 0.08154296875,
87179
  "learning_rate": 4.435038589380991e-09,
87180
+ "loss": 0.0099,
87181
+ "num_tokens": 1393112653.0,
87182
+ "reward": 2.18994140625,
87183
+ "reward_std": 0.16484864056110382,
87184
+ "rewards/accuracy_reward/mean": 0.203125,
87185
+ "rewards/accuracy_reward/std": 0.4027182459831238,
87186
+ "rewards/format_reward/mean": 0.98828125,
87187
+ "rewards/format_reward/std": 0.10772226005792618,
87188
+ "rewards/tag_count_reward/mean": 0.99853515625,
87189
+ "rewards/tag_count_reward/std": 0.019099153578281403,
87190
  "step": 2906
87191
  },
87192
  {
 
87196
  "clip_ratio/low_min": 0.0,
87197
  "clip_ratio/region_mean": 0.0,
87198
  "completions/clipped_ratio": -7.0,
87199
+ "completions/max_length": 1381.0,
87200
+ "completions/max_terminated_length": 1381.0,
87201
+ "completions/mean_length": 694.4453125,
87202
+ "completions/mean_terminated_length": 694.4453125,
87203
+ "completions/min_length": 176.0,
87204
+ "completions/min_terminated_length": 176.0,
87205
  "epoch": 0.9924041990270547,
87206
+ "frac_reward_zero_std": 0.53125,
87207
+ "grad_norm": 0.11603922474180073,
87208
+ "kl": 0.0872802734375,
87209
  "learning_rate": 4.087355252694325e-09,
87210
+ "loss": 0.0072,
87211
+ "num_tokens": 1393552081.0,
87212
+ "reward": 2.12353515625,
87213
+ "reward_std": 0.18750609457492828,
87214
+ "rewards/accuracy_reward/mean": 0.138671875,
87215
+ "rewards/accuracy_reward/std": 0.34594178199768066,
87216
+ "rewards/format_reward/mean": 0.986328125,
87217
+ "rewards/format_reward/std": 0.1162383034825325,
87218
+ "rewards/tag_count_reward/mean": 0.99853515625,
87219
+ "rewards/tag_count_reward/std": 0.019099153578281403,
87220
  "step": 2907
87221
  },
87222
  {
 
87225
  "clip_ratio/low_mean": 0.0,
87226
  "clip_ratio/low_min": 0.0,
87227
  "clip_ratio/region_mean": 0.0,
87228
+ "completions/clipped_ratio": -6.953125,
87229
  "completions/max_length": 2048.0,
87230
+ "completions/max_terminated_length": 1959.0,
87231
+ "completions/mean_length": 825.453125,
87232
+ "completions/mean_terminated_length": 818.24755859375,
87233
+ "completions/min_length": 169.0,
87234
+ "completions/min_terminated_length": 169.0,
87235
  "epoch": 0.9927455833404455,
87236
+ "frac_reward_zero_std": 0.375,
87237
+ "grad_norm": 0.12487904460610164,
87238
+ "kl": 0.077392578125,
87239
  "learning_rate": 3.753859285730022e-09,
87240
+ "loss": 0.0293,
87241
+ "num_tokens": 1394054505.0,
87242
+ "reward": 2.09375,
87243
+ "reward_std": 0.20864024758338928,
87244
+ "rewards/accuracy_reward/mean": 0.119140625,
87245
+ "rewards/accuracy_reward/std": 0.32427072525024414,
87246
+ "rewards/format_reward/mean": 0.978515625,
87247
+ "rewards/format_reward/std": 0.14513419568538666,
87248
+ "rewards/tag_count_reward/mean": 0.99609375,
87249
+ "rewards/tag_count_reward/std": 0.031035220250487328,
87250
  "step": 2908
87251
  },
87252
  {
 
87255
  "clip_ratio/low_mean": 0.0,
87256
  "clip_ratio/low_min": 0.0,
87257
  "clip_ratio/region_mean": 0.0,
87258
+ "completions/clipped_ratio": -7.0,
87259
+ "completions/max_length": 1732.0,
87260
+ "completions/max_terminated_length": 1732.0,
87261
+ "completions/mean_length": 787.2265625,
87262
+ "completions/mean_terminated_length": 787.2265625,
87263
+ "completions/min_length": 254.0,
87264
+ "completions/min_terminated_length": 254.0,
87265
  "epoch": 0.9930869676538363,
87266
+ "frac_reward_zero_std": 0.5,
87267
+ "grad_norm": 0.11516269657952301,
87268
+ "kl": 0.0797119140625,
87269
  "learning_rate": 3.4345511618238957e-09,
87270
+ "loss": 0.0174,
87271
+ "num_tokens": 1394543709.0,
87272
+ "reward": 2.09228515625,
87273
+ "reward_std": 0.17656370997428894,
87274
+ "rewards/accuracy_reward/mean": 0.115234375,
87275
+ "rewards/accuracy_reward/std": 0.3196168541908264,
87276
+ "rewards/format_reward/mean": 0.98046875,
87277
+ "rewards/format_reward/std": 0.1385180652141571,
87278
+ "rewards/tag_count_reward/mean": 0.99658203125,
87279
+ "rewards/tag_count_reward/std": 0.03972800448536873,
87280
  "step": 2909
87281
  },
87282
  {
 
87285
  "clip_ratio/low_mean": 0.0,
87286
  "clip_ratio/low_min": 0.0,
87287
  "clip_ratio/region_mean": 0.0,
87288
+ "completions/clipped_ratio": -6.984375,
87289
+ "completions/max_length": 1820.0,
87290
+ "completions/max_terminated_length": 1820.0,
87291
+ "completions/mean_length": 778.107421875,
87292
+ "completions/mean_terminated_length": 776.7182006835938,
87293
+ "completions/min_length": 219.0,
87294
+ "completions/min_terminated_length": 219.0,
87295
  "epoch": 0.9934283519672271,
87296
+ "frac_reward_zero_std": 0.4375,
87297
+ "grad_norm": 0.3443649766769348,
87298
+ "kl": 0.0982666015625,
87299
  "learning_rate": 3.129431334175648e-09,
87300
+ "loss": 0.0329,
87301
+ "num_tokens": 1395033684.0,
87302
+ "reward": 2.11474609375,
87303
+ "reward_std": 0.20991787314414978,
87304
+ "rewards/accuracy_reward/mean": 0.146484375,
87305
+ "rewards/accuracy_reward/std": 0.35393697023391724,
87306
+ "rewards/format_reward/mean": 0.97265625,
87307
+ "rewards/format_reward/std": 0.16324250400066376,
87308
+ "rewards/tag_count_reward/mean": 0.99560546875,
87309
+ "rewards/tag_count_reward/std": 0.04260620102286339,
87310
  "step": 2910
87311
  },
87312
  {
 
87315
  "clip_ratio/low_mean": 0.0,
87316
  "clip_ratio/low_min": 0.0,
87317
  "clip_ratio/region_mean": 0.0,
87318
+ "completions/clipped_ratio": -7.0,
87319
+ "completions/max_length": 1908.0,
87320
+ "completions/max_terminated_length": 1908.0,
87321
+ "completions/mean_length": 782.6015625,
87322
+ "completions/mean_terminated_length": 782.6015625,
87323
+ "completions/min_length": 188.0,
87324
+ "completions/min_terminated_length": 188.0,
87325
  "epoch": 0.9937697362806179,
87326
+ "frac_reward_zero_std": 0.53125,
87327
+ "grad_norm": 0.12301114676381447,
87328
+ "kl": 0.0780029296875,
87329
  "learning_rate": 2.8385002358466417e-09,
87330
+ "loss": 0.0053,
87331
+ "num_tokens": 1395524152.0,
87332
+ "reward": 2.103515625,
87333
+ "reward_std": 0.171902135014534,
87334
+ "rewards/accuracy_reward/mean": 0.12096773833036423,
87335
+ "rewards/accuracy_reward/std": 0.32641899585723877,
87336
+ "rewards/format_reward/mean": 0.98828125,
87337
+ "rewards/format_reward/std": 0.10772226005792618,
87338
  "rewards/tag_count_reward/mean": 0.998046875,
87339
  "rewards/tag_count_reward/std": 0.022032126784324646,
87340
  "step": 2911
 
87346
  "clip_ratio/low_min": 0.0,
87347
  "clip_ratio/region_mean": 0.0,
87348
  "completions/clipped_ratio": -7.0,
87349
+ "completions/max_length": 1561.0,
87350
+ "completions/max_terminated_length": 1561.0,
87351
+ "completions/mean_length": 714.876953125,
87352
+ "completions/mean_terminated_length": 714.876953125,
87353
+ "completions/min_length": 184.0,
87354
+ "completions/min_terminated_length": 184.0,
87355
  "epoch": 0.9941111205940087,
87356
+ "frac_reward_zero_std": 0.53125,
87357
+ "grad_norm": 0.11373851773465948,
87358
+ "kl": 0.08544921875,
87359
  "learning_rate": 2.5617582797610174e-09,
87360
+ "loss": 0.026,
87361
+ "num_tokens": 1395971001.0,
87362
+ "reward": 2.08349609375,
87363
+ "reward_std": 0.17116190493106842,
87364
+ "rewards/accuracy_reward/mean": 0.1015625,
87365
+ "rewards/accuracy_reward/std": 0.30236753821372986,
87366
+ "rewards/format_reward/mean": 0.984375,
87367
+ "rewards/format_reward/std": 0.12414088100194931,
87368
+ "rewards/tag_count_reward/mean": 0.99755859375,
87369
+ "rewards/tag_count_reward/std": 0.024608410894870758,
87370
  "step": 2912
87371
  },
87372
  {
 
87376
  "clip_ratio/low_min": 0.0,
87377
  "clip_ratio/region_mean": 0.0,
87378
  "completions/clipped_ratio": -7.0,
87379
+ "completions/max_length": 1902.0,
87380
+ "completions/max_terminated_length": 1902.0,
87381
+ "completions/mean_length": 794.6015625,
87382
+ "completions/mean_terminated_length": 794.6015625,
87383
+ "completions/min_length": 281.0,
87384
+ "completions/min_terminated_length": 281.0,
87385
  "epoch": 0.9944525049073996,
87386
+ "frac_reward_zero_std": 0.5625,
87387
+ "grad_norm": 0.11380928184475643,
87388
+ "kl": 0.0804443359375,
87389
  "learning_rate": 2.299205858702358e-09,
87390
+ "loss": 0.0148,
87391
+ "num_tokens": 1396454045.0,
87392
+ "reward": 2.11669921875,
87393
+ "reward_std": 0.15615960955619812,
87394
+ "rewards/accuracy_reward/mean": 0.13709677755832672,
87395
+ "rewards/accuracy_reward/std": 0.34429675340652466,
87396
+ "rewards/format_reward/mean": 0.984375,
87397
+ "rewards/format_reward/std": 0.12414088100194931,
87398
+ "rewards/tag_count_reward/mean": 0.99951171875,
87399
+ "rewards/tag_count_reward/std": 0.011048543266952038,
87400
  "step": 2913
87401
  },
87402
  {
 
87405
  "clip_ratio/low_mean": 0.0,
87406
  "clip_ratio/low_min": 0.0,
87407
  "clip_ratio/region_mean": 0.0,
87408
+ "completions/clipped_ratio": -6.984375,
87409
+ "completions/max_length": 2048.0,
87410
+ "completions/max_terminated_length": 2010.0,
87411
+ "completions/mean_length": 835.59765625,
87412
+ "completions/mean_terminated_length": 833.2250366210938,
87413
+ "completions/min_length": 245.0,
87414
+ "completions/min_terminated_length": 245.0,
87415
  "epoch": 0.9947938892207903,
87416
+ "frac_reward_zero_std": 0.5,
87417
+ "grad_norm": 0.10149774742284243,
87418
+ "kl": 0.074462890625,
87419
  "learning_rate": 2.0508433453170218e-09,
87420
+ "loss": 0.0245,
87421
+ "num_tokens": 1396964895.0,
87422
+ "reward": 2.08154296875,
87423
+ "reward_std": 0.1786787509918213,
87424
+ "rewards/accuracy_reward/mean": 0.10546875,
87425
+ "rewards/accuracy_reward/std": 0.3074568510055542,
87426
+ "rewards/format_reward/mean": 0.98046875,
87427
+ "rewards/format_reward/std": 0.1385180652141571,
87428
+ "rewards/tag_count_reward/mean": 0.99560546875,
87429
+ "rewards/tag_count_reward/std": 0.050489041954278946,
87430
  "step": 2914
87431
  },
87432
  {
 
87436
  "clip_ratio/low_min": 0.0,
87437
  "clip_ratio/region_mean": 0.0,
87438
  "completions/clipped_ratio": -7.0,
87439
+ "completions/max_length": 1628.0,
87440
+ "completions/max_terminated_length": 1628.0,
87441
+ "completions/mean_length": 808.962890625,
87442
+ "completions/mean_terminated_length": 808.962890625,
87443
+ "completions/min_length": 257.0,
87444
+ "completions/min_terminated_length": 257.0,
87445
  "epoch": 0.9951352735341811,
87446
+ "frac_reward_zero_std": 0.625,
87447
+ "grad_norm": 0.09026605662138586,
87448
+ "kl": 0.0748291015625,
87449
  "learning_rate": 1.8166710921097008e-09,
87450
+ "loss": 0.0103,
87451
+ "num_tokens": 1397456716.0,
87452
+ "reward": 2.08642578125,
87453
+ "reward_std": 0.14233165979385376,
87454
+ "rewards/accuracy_reward/mean": 0.099609375,
87455
+ "rewards/accuracy_reward/std": 0.29977133870124817,
87456
+ "rewards/format_reward/mean": 0.98828125,
87457
+ "rewards/format_reward/std": 0.10772226005792618,
87458
  "rewards/tag_count_reward/mean": 0.99853515625,
87459
  "rewards/tag_count_reward/std": 0.019099153578281403,
87460
  "step": 2915
 
87467
  "clip_ratio/region_mean": 0.0,
87468
  "completions/clipped_ratio": -6.984375,
87469
  "completions/max_length": 2048.0,
87470
+ "completions/max_terminated_length": 1839.0,
87471
+ "completions/mean_length": 838.197265625,
87472
+ "completions/mean_terminated_length": 835.8297119140625,
87473
+ "completions/min_length": 341.0,
87474
+ "completions/min_terminated_length": 341.0,
87475
  "epoch": 0.9954766578475719,
87476
  "frac_reward_zero_std": 0.59375,
87477
+ "grad_norm": 0.0990546564513383,
87478
+ "kl": 0.0794677734375,
87479
  "learning_rate": 1.5966894314456416e-09,
87480
+ "loss": 0.0194,
87481
+ "num_tokens": 1397977025.0,
87482
+ "reward": 2.06494140625,
87483
+ "reward_std": 0.1546308547258377,
87484
+ "rewards/accuracy_reward/mean": 0.080078125,
87485
+ "rewards/accuracy_reward/std": 0.271679550409317,
87486
+ "rewards/format_reward/mean": 0.986328125,
87487
+ "rewards/format_reward/std": 0.1162383034825325,
87488
+ "rewards/tag_count_reward/mean": 0.99853515625,
87489
+ "rewards/tag_count_reward/std": 0.019099153578281403,
87490
  "step": 2916
87491
  },
87492
  {
 
87496
  "clip_ratio/low_min": 0.0,
87497
  "clip_ratio/region_mean": 0.0,
87498
  "completions/clipped_ratio": -7.0,
87499
+ "completions/max_length": 1592.0,
87500
+ "completions/max_terminated_length": 1592.0,
87501
+ "completions/mean_length": 735.02734375,
87502
+ "completions/mean_terminated_length": 735.02734375,
87503
+ "completions/min_length": 160.0,
87504
+ "completions/min_terminated_length": 160.0,
87505
  "epoch": 0.9958180421609627,
87506
+ "frac_reward_zero_std": 0.6875,
87507
+ "grad_norm": 0.09306644198204554,
87508
+ "kl": 0.08642578125,
87509
  "learning_rate": 1.3908986755473142e-09,
87510
+ "loss": 0.0144,
87511
+ "num_tokens": 1398435407.0,
87512
+ "reward": 2.0849609375,
87513
+ "reward_std": 0.1204758882522583,
87514
+ "rewards/accuracy_reward/mean": 0.099609375,
87515
+ "rewards/accuracy_reward/std": 0.29977133870124817,
87516
+ "rewards/format_reward/mean": 0.986328125,
87517
+ "rewards/format_reward/std": 0.1162383034825325,
87518
  "rewards/tag_count_reward/mean": 0.9990234375,
87519
  "rewards/tag_count_reward/std": 0.015609703958034515,
87520
  "step": 2917
 
87526
  "clip_ratio/low_min": 0.0,
87527
  "clip_ratio/region_mean": 0.0,
87528
  "completions/clipped_ratio": -7.0,
87529
+ "completions/max_length": 1498.0,
87530
+ "completions/max_terminated_length": 1498.0,
87531
+ "completions/mean_length": 774.666015625,
87532
+ "completions/mean_terminated_length": 774.666015625,
87533
+ "completions/min_length": 360.0,
87534
+ "completions/min_terminated_length": 360.0,
87535
  "epoch": 0.9961594264743535,
87536
+ "frac_reward_zero_std": 0.5625,
87537
+ "grad_norm": 0.11717990565862879,
87538
+ "kl": 0.08349609375,
87539
  "learning_rate": 1.199299116497743e-09,
87540
+ "loss": 0.0232,
87541
+ "num_tokens": 1398909236.0,
87542
+ "reward": 2.056640625,
87543
+ "reward_std": 0.13060790300369263,
87544
+ "rewards/accuracy_reward/mean": 0.0703125,
87545
+ "rewards/accuracy_reward/std": 0.25592297315597534,
87546
+ "rewards/format_reward/mean": 0.98828125,
87547
+ "rewards/format_reward/std": 0.10772226005792618,
87548
+ "rewards/tag_count_reward/mean": 0.998046875,
87549
+ "rewards/tag_count_reward/std": 0.022032126784324646,
87550
  "step": 2918
87551
  },
87552
  {
 
87556
  "clip_ratio/low_min": 0.0,
87557
  "clip_ratio/region_mean": 0.0,
87558
  "completions/clipped_ratio": -7.0,
87559
+ "completions/max_length": 1840.0,
87560
+ "completions/max_terminated_length": 1840.0,
87561
+ "completions/mean_length": 794.30078125,
87562
+ "completions/mean_terminated_length": 794.30078125,
87563
+ "completions/min_length": 239.0,
87564
+ "completions/min_terminated_length": 239.0,
87565
  "epoch": 0.9965008107877443,
87566
+ "frac_reward_zero_std": 0.375,
87567
+ "grad_norm": 0.11873912143398956,
87568
+ "kl": 0.0755615234375,
87569
  "learning_rate": 1.0218910262371762e-09,
87570
+ "loss": 0.0179,
87571
+ "num_tokens": 1399398206.0,
87572
+ "reward": 2.14208984375,
87573
+ "reward_std": 0.23764190077781677,
87574
+ "rewards/accuracy_reward/mean": 0.162109375,
87575
+ "rewards/accuracy_reward/std": 0.3689115643501282,
87576
+ "rewards/format_reward/mean": 0.982421875,
87577
+ "rewards/format_reward/std": 0.13154059648513794,
87578
+ "rewards/tag_count_reward/mean": 0.99755859375,
87579
+ "rewards/tag_count_reward/std": 0.03659820929169655,
87580
  "step": 2919
87581
  },
87582
  {
 
87586
  "clip_ratio/low_min": 0.0,
87587
  "clip_ratio/region_mean": 0.0,
87588
  "completions/clipped_ratio": -7.0,
87589
+ "completions/max_length": 2016.0,
87590
+ "completions/max_terminated_length": 2016.0,
87591
+ "completions/mean_length": 833.984375,
87592
+ "completions/mean_terminated_length": 833.984375,
87593
+ "completions/min_length": 292.0,
87594
+ "completions/min_terminated_length": 292.0,
87595
  "epoch": 0.9968421951011351,
87596
  "frac_reward_zero_std": 0.53125,
87597
+ "grad_norm": 0.10545064088363902,
87598
+ "kl": 0.0784912109375,
87599
  "learning_rate": 8.586746565641957e-10,
87600
+ "loss": 0.0003,
87601
+ "num_tokens": 1399921494.0,
87602
+ "reward": 2.12548828125,
87603
+ "reward_std": 0.1864640712738037,
87604
+ "rewards/accuracy_reward/mean": 0.13671875,
87605
+ "rewards/accuracy_reward/std": 0.3438861668109894,
87606
+ "rewards/format_reward/mean": 0.990234375,
87607
+ "rewards/format_reward/std": 0.09843364357948303,
87608
  "rewards/tag_count_reward/mean": 0.99853515625,
87609
  "rewards/tag_count_reward/std": 0.019099153578281403,
87610
  "step": 2920
 
87616
  "clip_ratio/low_min": 0.0,
87617
  "clip_ratio/region_mean": 0.0,
87618
  "completions/clipped_ratio": -7.0,
87619
+ "completions/max_length": 1390.0,
87620
+ "completions/max_terminated_length": 1390.0,
87621
+ "completions/mean_length": 740.46484375,
87622
+ "completions/mean_terminated_length": 740.46484375,
87623
+ "completions/min_length": 262.0,
87624
+ "completions/min_terminated_length": 262.0,
87625
  "epoch": 0.997183579414526,
87626
  "frac_reward_zero_std": 0.59375,
87627
+ "grad_norm": 0.10125262138426178,
87628
+ "kl": 0.0797119140625,
87629
  "learning_rate": 7.096502391346072e-10,
87630
+ "loss": 0.0078,
87631
+ "num_tokens": 1400381748.0,
87632
+ "reward": 2.06103515625,
87633
+ "reward_std": 0.15187877416610718,
87634
+ "rewards/accuracy_reward/mean": 0.0703125,
87635
+ "rewards/accuracy_reward/std": 0.25592297315597534,
87636
  "rewards/format_reward/mean": 0.9921875,
87637
  "rewards/format_reward/std": 0.08812850713729858,
87638
+ "rewards/tag_count_reward/mean": 0.99853515625,
87639
+ "rewards/tag_count_reward/std": 0.019099153578281403,
87640
  "step": 2921
87641
  },
87642
  {
 
87646
  "clip_ratio/low_min": 0.0,
87647
  "clip_ratio/region_mean": 0.0,
87648
  "completions/clipped_ratio": -7.0,
87649
+ "completions/max_length": 1748.0,
87650
+ "completions/max_terminated_length": 1748.0,
87651
+ "completions/mean_length": 813.0234375,
87652
+ "completions/mean_terminated_length": 813.0234375,
87653
+ "completions/min_length": 291.0,
87654
+ "completions/min_terminated_length": 291.0,
87655
  "epoch": 0.9975249637279167,
87656
+ "frac_reward_zero_std": 0.5625,
87657
+ "grad_norm": 0.09890654551815842,
87658
+ "kl": 0.0748291015625,
87659
  "learning_rate": 5.748179854614399e-10,
87660
+ "loss": 0.0172,
87661
+ "num_tokens": 1400881472.0,
87662
+ "reward": 2.0908203125,
87663
+ "reward_std": 0.1693429946899414,
87664
+ "rewards/accuracy_reward/mean": 0.107421875,
87665
+ "rewards/accuracy_reward/std": 0.30995169281959534,
87666
+ "rewards/format_reward/mean": 0.986328125,
87667
+ "rewards/format_reward/std": 0.1162383034825325,
87668
  "rewards/tag_count_reward/mean": 0.9970703125,
87669
+ "rewards/tag_count_reward/std": 0.038198307156562805,
87670
  "step": 2922
87671
  },
87672
  {
 
87676
  "clip_ratio/low_min": 0.0,
87677
  "clip_ratio/region_mean": 0.0,
87678
  "completions/clipped_ratio": -7.0,
87679
+ "completions/max_length": 1460.0,
87680
+ "completions/max_terminated_length": 1460.0,
87681
+ "completions/mean_length": 737.50390625,
87682
+ "completions/mean_terminated_length": 737.50390625,
87683
+ "completions/min_length": 209.0,
87684
+ "completions/min_terminated_length": 209.0,
87685
  "epoch": 0.9978663480413075,
87686
+ "frac_reward_zero_std": 0.625,
87687
+ "grad_norm": 0.10499893723997014,
87688
+ "kl": 0.081787109375,
87689
  "learning_rate": 4.541780869138368e-10,
87690
+ "loss": 0.0114,
87691
+ "num_tokens": 1401334930.0,
87692
+ "reward": 2.06103515625,
87693
+ "reward_std": 0.1196913868188858,
87694
+ "rewards/accuracy_reward/mean": 0.072265625,
87695
+ "rewards/accuracy_reward/std": 0.2591804563999176,
87696
  "rewards/format_reward/mean": 0.990234375,
87697
  "rewards/format_reward/std": 0.09843364357948303,
87698
+ "rewards/tag_count_reward/mean": 0.99853515625,
87699
+ "rewards/tag_count_reward/std": 0.019099153578281403,
87700
  "step": 2923
87701
  },
87702
  {
 
87706
  "clip_ratio/low_min": 0.0,
87707
  "clip_ratio/region_mean": 0.0,
87708
  "completions/clipped_ratio": -7.0,
87709
+ "completions/max_length": 1754.0,
87710
+ "completions/max_terminated_length": 1754.0,
87711
+ "completions/mean_length": 797.484375,
87712
+ "completions/mean_terminated_length": 797.484375,
87713
+ "completions/min_length": 212.0,
87714
+ "completions/min_terminated_length": 212.0,
87715
  "epoch": 0.9982077323546983,
87716
+ "frac_reward_zero_std": 0.5,
87717
+ "grad_norm": 0.10729524502444683,
87718
+ "kl": 0.080078125,
87719
  "learning_rate": 3.477307147192743e-10,
87720
+ "loss": 0.016,
87721
+ "num_tokens": 1401819642.0,
87722
+ "reward": 2.13671875,
87723
+ "reward_std": 0.1730148196220398,
87724
+ "rewards/accuracy_reward/mean": 0.150390625,
87725
+ "rewards/accuracy_reward/std": 0.35780346393585205,
87726
  "rewards/format_reward/mean": 0.98828125,
87727
  "rewards/format_reward/std": 0.10772226005792618,
87728
  "rewards/tag_count_reward/mean": 0.998046875,
87729
+ "rewards/tag_count_reward/std": 0.022032126784324646,
87730
  "step": 2924
87731
  },
87732
  {
 
87736
  "clip_ratio/low_min": 0.0,
87737
  "clip_ratio/region_mean": 0.0,
87738
  "completions/clipped_ratio": -7.0,
87739
+ "completions/max_length": 1937.0,
87740
+ "completions/max_terminated_length": 1937.0,
87741
+ "completions/mean_length": 811.166015625,
87742
+ "completions/mean_terminated_length": 811.166015625,
87743
+ "completions/min_length": 327.0,
87744
+ "completions/min_terminated_length": 327.0,
87745
  "epoch": 0.9985491166680891,
87746
+ "frac_reward_zero_std": 0.5,
87747
+ "grad_norm": 0.10537359711068275,
87748
+ "kl": 0.081298828125,
87749
  "learning_rate": 2.5547601995912216e-10,
87750
+ "loss": 0.0178,
87751
+ "num_tokens": 1402320623.0,
87752
+ "reward": 2.14892578125,
87753
+ "reward_std": 0.21511411666870117,
87754
+ "rewards/accuracy_reward/mean": 0.162109375,
87755
+ "rewards/accuracy_reward/std": 0.3689115643501282,
87756
+ "rewards/format_reward/mean": 0.98828125,
87757
+ "rewards/format_reward/std": 0.10772226005792618,
87758
+ "rewards/tag_count_reward/mean": 0.99853515625,
87759
+ "rewards/tag_count_reward/std": 0.019099153578281403,
87760
  "step": 2925
87761
  },
87762
  {
 
87765
  "clip_ratio/low_mean": 0.0,
87766
  "clip_ratio/low_min": 0.0,
87767
  "clip_ratio/region_mean": 0.0,
87768
+ "completions/clipped_ratio": -7.0,
87769
+ "completions/max_length": 1430.0,
87770
+ "completions/max_terminated_length": 1430.0,
87771
+ "completions/mean_length": 703.416015625,
87772
+ "completions/mean_terminated_length": 703.416015625,
87773
+ "completions/min_length": 203.0,
87774
+ "completions/min_terminated_length": 203.0,
87775
  "epoch": 0.9988905009814799,
87776
  "frac_reward_zero_std": 0.5,
87777
+ "grad_norm": 0.1300151179262908,
87778
+ "kl": 0.0833740234375,
87779
  "learning_rate": 1.7741413357197367e-10,
87780
+ "loss": 0.0125,
87781
+ "num_tokens": 1402760996.0,
87782
+ "reward": 2.080078125,
87783
+ "reward_std": 0.17316466569900513,
87784
+ "rewards/accuracy_reward/mean": 0.099609375,
87785
+ "rewards/accuracy_reward/std": 0.29977133870124817,
87786
+ "rewards/format_reward/mean": 0.982421875,
87787
+ "rewards/format_reward/std": 0.13154059648513794,
87788
+ "rewards/tag_count_reward/mean": 0.998046875,
87789
+ "rewards/tag_count_reward/std": 0.022032126784324646,
87790
  "step": 2926
87791
  },
87792
  {
 
87796
  "clip_ratio/low_min": 0.0,
87797
  "clip_ratio/region_mean": 0.0,
87798
  "completions/clipped_ratio": -7.0,
87799
+ "completions/max_length": 1723.0,
87800
+ "completions/max_terminated_length": 1723.0,
87801
+ "completions/mean_length": 840.955078125,
87802
+ "completions/mean_terminated_length": 840.955078125,
87803
+ "completions/min_length": 352.0,
87804
+ "completions/min_terminated_length": 352.0,
87805
  "epoch": 0.9992318852948707,
87806
+ "frac_reward_zero_std": 0.5625,
87807
+ "grad_norm": 0.10865170665818756,
87808
  "kl": 0.0772705078125,
87809
  "learning_rate": 1.1354516635364577e-10,
87810
+ "loss": 0.0192,
87811
+ "num_tokens": 1403271117.0,
87812
+ "reward": 2.0263671875,
87813
+ "reward_std": 0.13820995390415192,
87814
+ "rewards/accuracy_reward/mean": 0.048828125,
87815
+ "rewards/accuracy_reward/std": 0.2157193273305893,
87816
+ "rewards/format_reward/mean": 0.98046875,
87817
+ "rewards/format_reward/std": 0.1385180652141571,
87818
+ "rewards/tag_count_reward/mean": 0.9970703125,
87819
+ "rewards/tag_count_reward/std": 0.026930565014481544,
87820
  "step": 2927
87821
  },
87822
  {
 
87825
  "clip_ratio/low_mean": 0.0,
87826
  "clip_ratio/low_min": 0.0,
87827
  "clip_ratio/region_mean": 0.0,
87828
+ "completions/clipped_ratio": -6.953125,
87829
  "completions/max_length": 2048.0,
87830
+ "completions/max_terminated_length": 1749.0,
87831
+ "completions/mean_length": 767.625,
87832
+ "completions/mean_terminated_length": 760.07861328125,
87833
+ "completions/min_length": 279.0,
87834
+ "completions/min_terminated_length": 279.0,
87835
  "epoch": 0.9995732696082615,
87836
+ "frac_reward_zero_std": 0.53125,
87837
+ "grad_norm": 0.1098002562236282,
87838
+ "kl": 0.0780029296875,
87839
  "learning_rate": 6.386920895384841e-11,
87840
+ "loss": 0.0139,
87841
+ "num_tokens": 1403739997.0,
87842
+ "reward": 2.08935546875,
87843
+ "reward_std": 0.16179436445236206,
87844
+ "rewards/accuracy_reward/mean": 0.11895161122083664,
87845
+ "rewards/accuracy_reward/std": 0.3240584135055542,
87846
  "rewards/format_reward/mean": 0.98046875,
87847
  "rewards/format_reward/std": 0.1385180652141571,
87848
+ "rewards/tag_count_reward/mean": 0.99365234375,
87849
+ "rewards/tag_count_reward/std": 0.05493048578500748,
87850
  "step": 2928
87851
  },
87852
  {
 
87856
  "clip_ratio/low_min": 0.0,
87857
  "clip_ratio/region_mean": 0.0,
87858
  "completions/clipped_ratio": -7.0,
87859
+ "completions/max_length": 1440.0,
87860
+ "completions/max_terminated_length": 1440.0,
87861
+ "completions/mean_length": 793.716796875,
87862
+ "completions/mean_terminated_length": 793.716796875,
87863
+ "completions/min_length": 171.0,
87864
+ "completions/min_terminated_length": 171.0,
87865
  "epoch": 0.9999146539216524,
87866
+ "frac_reward_zero_std": 0.5,
87867
+ "grad_norm": 0.11492651391052233,
87868
+ "kl": 0.0770263671875,
87869
  "learning_rate": 2.838633187729478e-11,
87870
+ "loss": 0.0089,
87871
+ "num_tokens": 1404234140.0,
87872
+ "reward": 2.09912109375,
87873
+ "reward_std": 0.16686061024665833,
87874
+ "rewards/accuracy_reward/mean": 0.115234375,
87875
+ "rewards/accuracy_reward/std": 0.3196168541908264,
87876
+ "rewards/format_reward/mean": 0.986328125,
87877
+ "rewards/format_reward/std": 0.1162383034825325,
87878
+ "rewards/tag_count_reward/mean": 0.99755859375,
87879
+ "rewards/tag_count_reward/std": 0.024608410894870758,
87880
  "step": 2929
87881
  },
87882
  {
87883
  "epoch": 0.9999146539216524,
87884
  "step": 2929,
87885
  "total_flos": 0.0,
87886
+ "train_loss": 0.00015187089497744325,
87887
+ "train_runtime": 1302.9313,
87888
+ "train_samples_per_second": 71.94,
87889
+ "train_steps_per_second": 2.249
87890
  }
87891
  ],
87892
  "logging_steps": 1,
87893
  "max_steps": 2930,
87894
+ "num_input_tokens_seen": 1404234140,
87895
  "num_train_epochs": 1,
87896
  "save_steps": 100,
87897
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce4599805e4a46a67aa4e3afb0862b0625c1ee9e61ed3b82c608ce522b65f009
3
  size 8504
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df64e4c9b10422927491885e2146cbeed575ec0206ed018c170dc3bb7d57cf3c
3
  size 8504